In [None]:
def sentences_to_padded_index_sequences(datasets):
    """
    Annotate datasets with feature vectors. Adding right-sided padding. 
    """
    # Extract vocabulary
    def tokenize(string):
        string = re.sub(r'\(|\)', '', string)
        return string.split()
    

    word_counter = collections.Counter()
    char_counter = collections.Counter()

    for example in tqdm(dataset):
        s1_tokenize = tokenize(example[0])
        s2_tokenize = tokenize(example[1])

        word_counter.update(s1_tokenize)
        word_counter.update(s2_tokenize)

        for i, word in enumerate(s1_tokenize):
            char_counter.update([c for c in word])
        for word in s2_tokenize:
            char_counter.update([c for c in word])

    vocabulary = set([word for word in word_counter])
    vocabulary = list(vocabulary)
    if config.embedding_replacing_rare_word_with_UNK: 
        vocabulary = [PADDING, "<UNK>"] + vocabulary
    else:
        vocabulary = [PADDING] + vocabulary
    
    word_indices = dict(zip(vocabulary, range(len(vocabulary))))
    indices_to_words = {v: k for k, v in word_indices.items()}
    char_vocab = set([char for char in char_counter])
    char_vocab = list(char_vocab)
    char_vocab = [PADDING] + char_vocab
    char_indices = dict(zip(char_vocab, range(len(char_vocab))))
    indices_to_char = {v: k for k, v in char_indices.items()}
    

    for i, dataset in enumerate(datasets):
        for example in tqdm(dataset):
            for sentence in [, 'sentence2_binary_parse']:
                example[sentence + '_index_sequence'] = np.zeros((FIXED_PARAMETERS["seq_length"]), dtype=np.int32)
                example[sentence + '_inverse_term_frequency'] = np.zeros((FIXED_PARAMETERS["seq_length"]), dtype=np.float32)

                token_sequence = tokenize(example[sentence])
                padding = FIXED_PARAMETERS["seq_length"] - len(token_sequence)
                      
                for i in range(FIXED_PARAMETERS["seq_length"]):
                    if i >= len(token_sequence):
                        index = word_indices[PADDING]
                        itf = 0
                    else:
                        if config.embedding_replacing_rare_word_with_UNK:
                            index = word_indices[token_sequence[i]] if word_counter[token_sequence[i]] >= config.UNK_threshold else word_indices["<UNK>"]
                        else:
                            index = word_indices[token_sequence[i]]
                        itf = 1 / (word_counter[token_sequence[i]] + 1)
                    example[sentence + '_index_sequence'][i] = index
                    
                    example[sentence + '_inverse_term_frequency'][i] = itf
                
                example[sentence + '_char_index'] = np.zeros((FIXED_PARAMETERS["seq_length"], config.char_in_word_size), dtype=np.int32)
                for i in range(FIXED_PARAMETERS["seq_length"]):
                    if i >= len(token_sequence):
                        continue
                    else:
                        chars = [c for c in token_sequence[i]]
                        for j in range(config.char_in_word_size):
                            if j >= (len(chars)):
                                break
                            else:
                                index = char_indices[chars[j]]
                            example[sentence + '_char_index'][i,j] = index 
    

    return indices_to_words, word_indices, char_indices, indices_to_char


In [7]:
import numpy as np
import re
import random
import json
import collections
import numpy as np
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet as wn 
import os
import pickle
import multiprocessing
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger

In [31]:
PADDING = "<PAD>"
POS_Tagging = [PADDING, 'WP$', 'RBS', 'SYM', 'WRB', 'IN', 'VB', 'POS', 'TO', ':', '-RRB-', '$', 'MD', 'JJ', '#', 'CD', '``', 'JJR', 'NNP', "''", 'LS', 'VBP', 'VBD', 'FW', 'RBR', 'JJS', 'DT', 'VBG', 'RP', 'NNS', 'RB', 'PDT', 'PRP$', '.', 'XX', 'NNPS', 'UH', 'EX', 'NN', 'WDT', 'VBN', 'VBZ', 'CC', ',', '-LRB-', 'PRP', 'WP']
POS_dict = {pos:i for i, pos in enumerate(POS_Tagging)}

stemmer = nltk.SnowballStemmer('english')

tt = nltk.tokenize.treebank.TreebankWordTokenizer()


In [5]:

def is_exact_match(token1, token2):
    token1 = token1.lower()
    token2 = token2.lower()
    
    token1_stem = stemmer.stem(token1)

    if token1 == token2:
        return True
    
    for synsets in wn.synsets(token2):
        for lemma in synsets.lemma_names():
            if token1_stem == stemmer.stem(lemma):
                return True
    
    if token1 == "n't" and token2 == "not":
        return True
    elif token1 == "not" and token2 == "n't":
        return True
    elif token1_stem == stemmer.stem(token2):
        return True
    return False

In [6]:

def is_antonyms(token1, token2):
    token1 = token1.lower()
    token2 = token2.lower()
    token1_stem = stemmer.stem(token1)
    antonym_lists_for_token2 = []
    for synsets in wn.synsets(token2):
        for lemma_synsets in [wn.synsets(l) for l in synsets.lemma_names()]:
            for lemma_syn in lemma_synsets:
                for lemma in lemma_syn.lemmas():
                    for antonym in lemma.antonyms():
                        antonym_lists_for_token2.append(antonym.name())
                        # if token1_stem == stemmer.stem(antonym.name()):
                        #     return True 
    antonym_lists_for_token2 = list(set(antonym_lists_for_token2))
    for atnm in antonym_lists_for_token2:
        if token1_stem == stemmer.stem(atnm):
            return True
    return False  

In [1]:
from dataloaders import *

In [4]:
data = SNLIDataset('train')

In [5]:
for i,e in enumerate(data):
    print(i)
    break

('A person on a horse jumps over a broken down airplane.', 'A person is training his horse for a competition.', tensor(2))


In [11]:
# Extract vocabulary
def tokenize(string):
    string = re.sub(r'\(|\)', '', string)
    return string.split()


word_counter = collections.Counter()
char_counter = collections.Counter()

for example in tqdm(data):
    s1_tokenize = tokenize(example[0])
    s2_tokenize = tokenize(example[1])

    word_counter.update(s1_tokenize)
    word_counter.update(s2_tokenize)

    for i, word in enumerate(s1_tokenize):
        char_counter.update([c for c in word])
    for word in s2_tokenize:
        char_counter.update([c for c in word])

vocabulary = set([word for word in word_counter])
vocabulary = list(vocabulary)
vocabulary = [PADDING, "<UNK>"] + vocabulary


word_indices = dict(zip(vocabulary, range(len(vocabulary))))
indices_to_words = {v: k for k, v in word_indices.items()}
char_vocab = set([char for char in char_counter])
char_vocab = list(char_vocab)
char_vocab = [PADDING] + char_vocab
char_indices = dict(zip(char_vocab, range(len(char_vocab))))
indices_to_char = {v: k for k, v in char_indices.items()}

100%|██████████| 549360/549360 [00:42<00:00, 13078.76it/s]


In [14]:
for example in tqdm(data):
    indx_seq = np.zeros((12), dtype=np.int32)
    tfidf = np.zeros((12), dtype=np.float32)

    for sentence in [0,1]:
        token_sequence = tokenize(example[sentence])
        padding = 12 - len(token_sequence)
        
        for i in range(12):
            if i >= len(token_sequence):
                index = word_indices[PADDING]
                itf = 0
            else:
                
                index = word_indices[token_sequence[i]] if word_counter[token_sequence[i]] >= 3 else word_indices["<UNK>"]
                itf = 1 / (word_counter[token_sequence[i]] + 1)
            indx_seq[i] = index
            
            tfidf[i] = itf
        
        char_index = np.zeros((12, 8), dtype=np.int32)
        for i in range(12):
            if i >= len(token_sequence):
                continue
            else:
                chars = [c for c in token_sequence[i]]
                for j in range(8):
                        if j >= (len(chars)):
                            break
                        else:
                            index = char_indices[chars[j]]
                        char_index[i,j] = index 



100%|██████████| 549360/549360 [00:52<00:00, 10524.93it/s]


In [15]:
char_index

array([[12, 19,  0,  0,  0,  0,  0,  0],
       [ 1, 73, 25,  0,  0,  0,  0,  0],
       [15, 25, 35, 43,  1,  2, 18, 43],
       [15, 13, 43, 25,  0,  0,  0,  0],
       [64, 35,  1, 25, 34,  0,  0,  0],
       [ 1, 73, 25, 34, 25,  0,  0,  0],
       [ 2, 28,  0,  0,  0,  0,  0,  0],
       [35,  0,  0,  0,  0,  0,  0,  0],
       [72, 35, 19,  0,  0,  0,  0,  0],
       [ 2, 19,  0,  0,  0,  0,  0,  0],
       [35,  0,  0,  0,  0,  0,  0,  0],
       [15,  5, 62, 46, 28, 43,  2,  1]], dtype=int32)

In [17]:
char_index[0]

array([12, 19,  0,  0,  0,  0,  0,  0], dtype=int32)

In [28]:
for i in tqdm(data):
    s1 = nltk.word_tokenize(i[0])
    s2 = nltk.word_tokenize(i[1])

    lens = []
    lens += [len(i) for i in s1]
    lens += [len(i) for i in s2]

100%|██████████| 549360/549360 [02:33<00:00, 3569.42it/s]


In [30]:
for i in data:
    s1_pos = pst.tag(i[0])

NameError: name 'pst' is not defined

In [None]:
nst = StanfordNERTagger('stanford-ner-2020-11-17/classifiers/english.muc.7class.distsim.crf.ser.gz', 'stanford-ner-2020-11-17/stanford-ner-4.2.0.jar',encoding='utf-8')
pst = StanfordPOSTagger('/home/users/yichen.gong/Stanford/stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger', \
                    '/home/users/yichen.gong/Stanford/stanford-postagger-2014-08-27/stanford-postagger.jar')
