In [None]:
def sentences_to_padded_index_sequences(datasets):
    """
    Annotate datasets with feature vectors. Adding right-sided padding.

    A word and character vocabulary is first collected from every example of
    every dataset. Each example is then annotated in place: for each of the
    two sentence fields (``sentence1_binary_parse`` and
    ``sentence2_binary_parse``) three entries are stored on the example under
    ``<field> + suffix``:

    * ``_index_sequence``: int32 vector of word indices,
    * ``_inverse_term_frequency``: float32 vector holding
      ``1 / (corpus count + 1)`` for each token and 0 for padding,
    * ``_char_index``: int32 matrix of character indices, one row per token.

    Sentences longer than ``FIXED_PARAMETERS["seq_length"]`` are truncated and
    shorter ones are padded on the right; words longer than
    ``config.char_in_word_size`` characters are truncated the same way.

    Relies on the module-level names ``PADDING``, ``FIXED_PARAMETERS`` and
    ``config``.

    Args:
        datasets: collection of datasets that can be iterated twice (once to
            build the vocabulary, once to annotate). Every example must
            support string-keyed item assignment (e.g. a dict) and contain
            both sentence fields.
            NOTE(review): the previous revision read ``example[0]`` and
            ``example[1]`` while counting words but wrote string keys while
            annotating; the sentence keys are now used in both passes --
            confirm this matches the intended input format.

    Returns:
        Tuple ``(indices_to_words, word_indices, char_indices,
        indices_to_char)`` of the lookup dictionaries that were built.
    """
    sentence_keys = ('sentence1_binary_parse', 'sentence2_binary_parse')
    seq_length = FIXED_PARAMETERS["seq_length"]
    char_in_word_size = config.char_in_word_size
    replace_rare_words = config.embedding_replacing_rare_word_with_UNK

    def tokenize(string):
        # Drop the parentheses of the binary parse, then split on whitespace.
        string = re.sub(r'\(|\)', '', string)
        return string.split()

    # Pass 1: count words and characters over all datasets.
    word_counter = collections.Counter()
    char_counter = collections.Counter()
    for dataset in datasets:
        for example in tqdm(dataset):
            for key in sentence_keys:
                tokens = tokenize(example[key])
                word_counter.update(tokens)
                for word in tokens:
                    char_counter.update(word)

    # First-seen order keeps the index assignment reproducible across runs
    # (iterating a set of strings is not stable between interpreter sessions).
    if replace_rare_words:
        vocabulary = [PADDING, "<UNK>"] + list(word_counter)
    else:
        vocabulary = [PADDING] + list(word_counter)

    word_indices = dict(zip(vocabulary, range(len(vocabulary))))
    indices_to_words = {v: k for k, v in word_indices.items()}
    char_vocab = [PADDING] + list(char_counter)
    char_indices = dict(zip(char_vocab, range(len(char_vocab))))
    indices_to_char = {v: k for k, v in char_indices.items()}

    # Pass 2: attach the padded index arrays to every example.
    for dataset in datasets:
        for example in tqdm(dataset):
            for key in sentence_keys:
                # Tokens beyond seq_length never reach the output arrays.
                tokens = tokenize(example[key])[:seq_length]

                index_sequence = np.full(seq_length, word_indices[PADDING], dtype=np.int32)
                inverse_term_frequency = np.zeros(seq_length, dtype=np.float32)
                # Rows/columns left at 0 correspond to PADDING in char_indices.
                char_index = np.zeros((seq_length, char_in_word_size), dtype=np.int32)

                for position, token in enumerate(tokens):
                    count = word_counter[token]
                    if replace_rare_words and count < config.UNK_threshold:
                        index_sequence[position] = word_indices["<UNK>"]
                    else:
                        index_sequence[position] = word_indices[token]
                    inverse_term_frequency[position] = 1 / (count + 1)

                    for char_position, char in enumerate(token[:char_in_word_size]):
                        char_index[position, char_position] = char_indices[char]

                example[key + '_index_sequence'] = index_sequence
                example[key + '_inverse_term_frequency'] = inverse_term_frequency
                example[key + '_char_index'] = char_index

    return indices_to_words, word_indices, char_indices, indices_to_char


In [1]:
import numpy as np
import re
import random
import json
import collections
import numpy as np
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet as wn 
import os
import pickle
import multiprocessing
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger

In [2]:
# Shared padding marker; it also occupies slot 0 of the POS tag inventory.
PADDING = "<PAD>"

# POS tag inventory; the position of a tag in this list is its integer id.
POS_Tagging = [
    PADDING,
    'WP$', 'RBS', 'SYM', 'WRB', 'IN', 'VB', 'POS', 'TO', ':', '-RRB-',
    '$', 'MD', 'JJ', '#', 'CD', '``', 'JJR', 'NNP', "''", 'LS',
    'VBP', 'VBD', 'FW', 'RBR', 'JJS', 'DT', 'VBG', 'RP', 'NNS', 'RB',
    'PDT', 'PRP$', '.', 'XX', 'NNPS', 'UH', 'EX', 'NN', 'WDT', 'VBN',
    'VBZ', 'CC', ',', '-LRB-', 'PRP', 'WP',
]
POS_dict = dict(zip(POS_Tagging, range(len(POS_Tagging))))

# English Snowball stemmer and Treebank tokenizer instances shared by the notebook.
stemmer = nltk.SnowballStemmer('english')
tt = nltk.tokenize.treebank.TreebankWordTokenizer()

# Stanford NER / POS taggers; model and jar paths are relative to the working directory.
nst = StanfordNERTagger(
    'stanford-ner-2020-11-17/classifiers/english.muc.7class.distsim.crf.ser.gz',
    'stanford-ner-2020-11-17/stanford-ner-4.2.0.jar',
    encoding='utf-8',
)
pst = StanfordPOSTagger(
    'stanford-postagger-full-2020-11-17/models/english-bidirectional-distsim.tagger',
    'stanford-postagger-full-2020-11-17/stanford-postagger.jar',
)


The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [3]:

def is_exact_match(token1, token2):
    """Return True when the two tokens match after normalisation.

    Both tokens are lower-cased. They match if they are equal, if the stem of
    ``token1`` equals the stem of any WordNet lemma name of ``token2``, if
    they form the pair "n't" / "not" (in either order), or if their stems are
    equal. Stemming uses the module-level ``stemmer`` and the WordNet lookup
    uses ``wn``.
    """
    first, second = token1.lower(), token2.lower()
    first_stem = stemmer.stem(first)

    if first == second:
        return True

    # Compare against every lemma name of every synset of the second token.
    if any(
        first_stem == stemmer.stem(lemma_name)
        for synset in wn.synsets(second)
        for lemma_name in synset.lemma_names()
    ):
        return True

    # Contracted and full negation are treated as the same word.
    if {first, second} == {"n't", "not"}:
        return True

    return first_stem == stemmer.stem(second)

In [4]:

def is_antonyms(token1, token2):
    """Return True when ``token1`` shares a stem with an antonym related to ``token2``.

    Both tokens are lower-cased. Antonym names are gathered by walking the
    WordNet synsets of ``token2``, the synsets of each of their lemma names,
    and the antonyms of every lemma in those synsets. The stem of ``token1``
    (module-level ``stemmer``) is then compared with the stem of each
    distinct antonym name.
    """
    first_stem = stemmer.stem(token1.lower())
    second = token2.lower()

    # A set keeps each antonym name once, so each is stemmed a single time.
    antonym_names = {
        antonym.name()
        for synset in wn.synsets(second)
        for lemma_name in synset.lemma_names()
        for related_synset in wn.synsets(lemma_name)
        for lemma in related_synset.lemmas()
        for antonym in lemma.antonyms()
    }
    return any(first_stem == stemmer.stem(name) for name in antonym_names)

In [3]:
from dataloaders import *

In [4]:
# Build the dataset for the 'train' argument; iterating it yields
# (sentence1, sentence2, label) tuples, as the next cell's output shows.
# NOTE(review): SNLIDataset is presumably supplied by the wildcard import
# from `dataloaders` -- verify.
data = SNLIDataset('train')

In [7]:
# Peek at the first example only: print its position, then the example itself.
for position, first_example in enumerate(data):
    print(position, first_example, sep="\n")
    break

0
('A person on a horse jumps over a broken down airplane.', 'A person is training his horse for a competition.', tensor(2))


In [10]:
# Extract vocabulary
def tokenize(string):
    """Remove every parenthesis from ``string`` and split the rest on whitespace."""
    without_parens = re.sub(r'\(|\)', '', string)
    tokens = without_parens.split()
    return tokens


# Count every word and every character over both sentences of each example.
word_counter = collections.Counter()
char_counter = collections.Counter()

for example in tqdm(data):
    for sentence_tokens in (tokenize(example[0]), tokenize(example[1])):
        word_counter.update(sentence_tokens)
        for token in sentence_tokens:
            # Updating a Counter with a string counts its characters.
            char_counter.update(token)

# Word vocabulary: the padding and unknown markers head the list.
vocabulary = [PADDING, "<UNK>"] + list(set(word_counter))

word_indices = {word: idx for idx, word in enumerate(vocabulary)}
indices_to_words = {idx: word for word, idx in word_indices.items()}

# Character vocabulary: the padding marker heads the list.
char_vocab = [PADDING] + list(set(char_counter))
char_indices = {char: idx for idx, char in enumerate(char_vocab)}
indices_to_char = {idx: char for char, idx in char_indices.items()}

100%|██████████| 549360/549360 [00:43<00:00, 12667.21it/s]


In [11]:
# Build padded word-index, inverse-frequency and character-index arrays for
# both sentences of every example. Nothing is stored: the arrays are
# overwritten on each iteration.
for example in tqdm(data):
    # Allocated once per example and refilled for each of its two sentences.
    indx_seq = np.zeros((12), dtype=np.int32)
    tfidf = np.zeros((12), dtype=np.float32)

    for sentence in (0, 1):
        token_sequence = tokenize(example[sentence])
        padding = 12 - len(token_sequence)  # not read again in this cell
        kept_tokens = token_sequence[:12]

        # Start from an all-padding row, then overwrite the real token slots.
        indx_seq[:] = word_indices[PADDING]
        tfidf[:] = 0
        for pos, token in enumerate(kept_tokens):
            count = word_counter[token]
            # Words counted fewer than 3 times map to the <UNK> index.
            indx_seq[pos] = word_indices[token] if count >= 3 else word_indices["<UNK>"]
            tfidf[pos] = 1 / (count + 1)

        # One row per token, at most 8 characters each; unused cells stay 0.
        char_index = np.zeros((12, 8), dtype=np.int32)
        for pos, token in enumerate(kept_tokens):
            for char_pos, char in enumerate(token[:8]):
                char_index[pos, char_pos] = char_indices[char]



100%|██████████| 549360/549360 [00:54<00:00, 9992.75it/s]


In [9]:
# POS-tag the first sentence of every example with NLTK; only the tags of the
# most recent example remain in `s1_pos`.
for example in tqdm(data):
    first_sentence_tokens = nltk.word_tokenize(example[0])
    s1_pos = nltk.pos_tag(first_sentence_tokens)
    



  0%|          | 0/549360 [00:00<?, ?it/s][A[A

  0%|          | 84/549360 [00:00<10:59, 832.51it/s][A[A

  0%|          | 193/549360 [00:00<10:13, 894.92it/s][A[A

  0%|          | 286/549360 [00:00<10:08, 902.97it/s][A[A

  0%|          | 375/549360 [00:00<10:11, 897.24it/s][A[A

  0%|          | 459/549360 [00:00<10:25, 877.99it/s][A[A

  0%|          | 566/549360 [00:00<09:51, 927.76it/s][A[A

  0%|          | 674/549360 [00:00<09:27, 967.13it/s][A[A

  0%|          | 774/549360 [00:00<09:23, 973.11it/s][A[A

  0%|          | 899/549360 [00:00<08:46, 1040.95it/s][A[A

  0%|          | 1002/549360 [00:01<09:07, 1000.87it/s][A[A

  0%|          | 1102/549360 [00:01<09:10, 996.15it/s] [A[A

  0%|          | 1202/549360 [00:01<09:52, 924.50it/s][A[A

  0%|          | 1301/549360 [00:01<09:41, 942.66it/s][A[A

  0%|          | 1401/549360 [00:01<09:32, 957.14it/s][A[A

  0%|          | 1512/549360 [00:01<09:09, 997.54it/s][A[A

  0%|          | 1616/549

KeyboardInterrupt: 

In [29]:
# Sanity check: tokenize a short sentence and display its (token, tag) pairs.
nltk.pos_tag(nltk.word_tokenize("Hello how are you"))

[('Hello', 'NNP'), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP')]

In [11]:
# Download the 'maxent_ne_chunker' NLTK package into the local nltk_data directory.
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [13]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, pinned to the version this notebook was last run with.
%pip install chars2vec==0.1.7

Collecting chars2vec
[?25l  Downloading https://files.pythonhosted.org/packages/04/0a/8c327aae23e0532d239ec7b30446aca765eb5d9547b4c4b09cdd82e49797/chars2vec-0.1.7.tar.gz (8.1MB)
[K     |████████████████████████████████| 8.1MB 5.3MB/s 
[?25hBuilding wheels for collected packages: chars2vec
  Building wheel for chars2vec (setup.py) ... [?25l[?25hdone
  Created wheel for chars2vec: filename=chars2vec-0.1.7-cp36-none-any.whl size=8111095 sha256=70ab86f76114af6952aba5fdc1f8ec59a9420683d6d57c98d49cf4d534feef89
  Stored in directory: /root/.cache/pip/wheels/97/b6/65/d7e778ef1213ec77d315aea0f536068b96e36cc94c02abbfde
Successfully built chars2vec
Installing collected packages: chars2vec
Successfully installed chars2vec-0.1.7


In [15]:
import chars2vec
# Load the chars2vec model registered under the name 'eng_50'.
# NOTE(review): presumably the pretrained English model producing
# 50-dimensional embeddings -- confirm against the chars2vec documentation.
c2v_model = chars2vec.load_model('eng_50')


In [18]:

# Character inventory, one entry per character, in the order spelled out below.
CHAR_LIST = list(
    "!\"#$%&'()*+,-./"             # leading punctuation
    "0123456789"                   # digits
    ":;<=>?@_"                     # remaining symbols
    "abcdefghijklmnopqrstuvwxyz"   # lowercase letters
)


In [22]:
# Two-way lookup between each character of CHAR_LIST and its list position.
char_to_ind = {char: position for position, char in enumerate(CHAR_LIST)}
ind_to_char = dict(enumerate(CHAR_LIST))

54