In [140]:
import numpy as np
import pandas as pd
import spacy
import string
import pickle as pkl
from sklearn import preprocessing

In [141]:
#load in the data
train_df = pd.read_csv('snli_train.tsv', sep="\t")
val_df = pd.read_csv('snli_val.tsv',sep="\t")

#get data & convert sentences to lists
train_sentence1=train_df['sentence1'].values.tolist()
train_sentence2=train_df['sentence2'].values.tolist()
val_sentence1=val_df['sentence1'].values.tolist()
val_sentence2=val_df['sentence2'].values.tolist()

#convert the text labels to numeric
le = preprocessing.LabelEncoder()
le.fit(train_df['label'])
train_targets=le.transform(train_df['label']).tolist()
val_targets=le.transform(val_df['label']).tolist()

In [142]:
train_df

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction
5,A female laying on her stomach in the water ou...,There is a women outdoors,entailment
6,Children eat at a long table with black chairs .,Kids at a short table with red chairs .,contradiction
7,A person rides a motorcycle quickly .,The man is racing his motorcycle in a race .,neutral
8,Woman riding a red bicycle down a city street ...,person riding a bike,entailment
9,"Two young women dancers , one brunette and one...",Two women are wearing costumes,entailment


In [143]:
# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
  tokens = tokenizer(sent)
  return [token.text.lower() for token in tokens if (token.text not in punctuations)]

In [145]:
# This is the code cell that tokenizes train/val/test datasets
def tokenize_dataset(dataset):
    token_dataset = []
    # we are keeping track of all tokens in dataset 
    # in order to create vocabulary later
    all_tokens = []
    
    for sample in dataset:
        tokens = tokenize(sample)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

In [147]:
#train set tokens
print ("Tokenizing train data")
train_sentence1_tokens, train_all_sentence1_tokens = tokenize_dataset(train_sentence1)
train_sentence2_tokens, train_all_sentence2_tokens = tokenize_dataset(train_sentence2)
train_all_combined_sentence_tokens = train_all_sentence1_tokens + train_all_sentence2_tokens
pkl.dump(train_all_combined_sentence_tokens, open("train_all_combined_sentence_tokens.p", "wb"))

Tokenizing train data


In [None]:
#run this cell only once, otherwise leverage the existing saved files
#train set tokens
print ("Tokenizing train data")
train_sentence1_tokens, train_all_sentence1_tokens = tokenize_dataset(train_sentence1)
train_sentence2_tokens, train_all_sentence2_tokens = tokenize_dataset(train_sentence2)
pkl.dump(train_sentence1_tokens, open("train_sentence1_tokens.p", "wb"))
pkl.dump(train_sentence2_tokens, open("train_sentence2_tokens.p", "wb"))
#pkl.dump(train_all_sentence1_tokens, open("train_all_sentence1_tokens.p", "wb"))
#pkl.dump(train_all_sentence2_tokens, open("train_all_sentence2_tokens.p", "wb"))1

#combine tokens from both sentences to create a shared dictionary
train_all_combined_sentence_tokens = train_all_sentence1_tokens + train_all_sentence2_tokens
pkl.dump(train_all_combined_sentence_tokens, open("train_all_combined_sentence_tokens.p", "wb"))

#val set tokens
print ("Tokenizing val data")
val_sentence1_tokens, _ = tokenize_dataset(val_sentence1)
val_sentence2_tokens, _ = tokenize_dataset(val_sentence2)
pkl.dump(val_sentence1_tokens, open("val_sentence1_tokens.p", "wb"))
pkl.dump(val_sentence2_tokens, open("val_sentence2_tokens.p", "wb"))

In [148]:
#If you have previously run the previous cell, run this cell instead to load preprocessed datasets
train_sentence1_tokens = pkl.load(open("train_sentence1_tokens.p", "rb"))
train_sentence2_tokens = pkl.load(open("train_sentence2_tokens.p", "rb"))
train_all_combined_sentence_tokens = pkl.load(open("train_all_combined_sentence_tokens.p", "rb"))
#train_all_sentence1_tokens = pkl.load(open("train_all_sentence1_tokens.p", "rb"))
#train_all_sentence2_tokens = pkl.load(open("train_all_sentence2_tokens.p", "rb"))
val_sentence1_tokens = pkl.load(open("val_sentence1_tokens.p", "rb"))
val_sentence2_tokens = pkl.load(open("val_sentence2_tokens.p", "rb"))

In [151]:
#print information about the token datasets
# double checking
print ("Train sentence1 dataset size is {}".format(len(train_sentence1_tokens)))
print ("Train sentence2 dataset size is {}".format(len(train_sentence2_tokens)))
print ("Val sentence1 dataset size is {}".format(len(val_sentence1_tokens)))
print ("Val sentence2 dataset size is {}".format(len(val_sentence2_tokens)))

print ("\nTotal number of tokens in sentence1 train dataset is {}".format(len(train_all_sentence1_tokens)))
print ("Total number of tokens in sentence2 train dataset is {}".format(len(train_all_sentence2_tokens)))
print ("Total number of tokens in combined sent1 & sent2 train dataset is {}".format(len(train_all_combined_sentence_tokens)))

print ("\nTotal number of *unique* tokens in sentence1 train dataset is {}".format(len(set(train_all_sentence1_tokens))))
print ("Total number of *unique* tokens in sentence2 train dataset is {}".format(len(set(train_all_sentence2_tokens))))
print ("Total number of *unique* tokens in sent1 & sent2 train dataset is {}".format(len(set(train_all_combined_sentence_tokens))))


Train sentence1 dataset size is 100000
Train sentence2 dataset size is 100000
Val sentence1 dataset size is 1000
Val sentence2 dataset size is 1000

Total number of tokens in sentence1 train dataset is 1294135
Total number of tokens in sentence2 train dataset is 743372
Total number of tokens in combined sent1 & sent2 train dataset is 2037507

Total number of *unique* tokens in sentence1 train dataset is 14131
Total number of *unique* tokens in sentence2 train dataset is 15225
Total number of *unique* tokens in sent1 & sent2 train dataset is 19642


In [152]:
#build vocabularies for sentence1 and sentence2
from collections import Counter

max_vocab_size = 2000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id_combined_sent, id2token_combined_sent = build_vocab(train_all_combined_sentence_tokens)

In [154]:
# Lets check the dictionary by loading random token from it
import random 

random_token_id = random.randint(0, len(id2token_sentence1)-1)
random_token = id2token_sentence1[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token_combined_sent[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id_combined_sent[random_token]))

Token id 773 ; token giant
Token sticks; token id 879


In [155]:
# convert token to id in the dataset.  After running this cell we will have converted the word tokens to indices
def token2index_dataset(tokens_data,token2id):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

train_sentence1_data_indices = token2index_dataset(train_sentence1_tokens,token2id_combined_sent)
train_sentence2_data_indices = token2index_dataset(train_sentence2_tokens,token2id_combined_sent)
val_sentence1_data_indices = token2index_dataset(val_sentence1_tokens,token2id_combined_sent)
val_sentence2_data_indices = token2index_dataset(val_sentence2_tokens,token2id_combined_sent)

# double checking
print ("Train sentence1 dataset size is {}".format(len(train_sentence1_data_indices)))
print ("Train sentence2 dataset size is {}".format(len(train_sentence2_data_indices)))
print ("Val sentence1 dataset size is {}".format(len(val_sentence1_data_indices)))
print ("Val sentence2 dataset size is {}".format(len(val_sentence2_data_indices)))

Train sentence1 dataset size is 100000
Train sentence2 dataset size is 100000
Val sentence1 dataset size is 1000
Val sentence2 dataset size is 1000


In [156]:
#visualize a random sentence1 and sentence2 paired training example
rand_training_example = random.randint(0, len(train_sentence1) - 1)
print (train_sentence1_tokens[rand_training_example])
print(train_sentence1_data_indices[rand_training_example])

print (train_sentence2_tokens[rand_training_example])
print(train_sentence2_data_indices[rand_training_example])

['a', 'musician', 'guiding', 'his', 'fellows', 'outside', 'for', 'music']
[2, 712, 1, 19, 1, 38, 36, 244]
['the', 'musician', 'is', 'paid']
[3, 712, 5, 1]


In [195]:
#Check average, max, min sentence lengths to determine word padding
total_sent1_len=0
total_sent2_len=0
sent1_lens=[]
sent2_lens=[]
for i in range(0,len(train_sentence1_tokens)):
    total_sent1_len+=len(train_sentence1_tokens[i])
    total_sent2_len+=len(train_sentence2_tokens[i])
    sent1_lens.append(len(train_sentence1_tokens[i]))
    sent2_lens.append(len(train_sentence2_tokens[i]))

avg1=total_sent1_len/len(train_sentence1)
avg2=total_sent2_len/len(train_sentence2)
print("sentence1 average is: "+str(avg1)+", std dev is: "+str(np.std(sent1_lens))+", max is: "+str(max(sent1_lens))+", min is: "+str(min(sent1_lens)))
print("sentence2 average is: "+str(avg2)+", std dev is: "+str(np.std(sent2_lens))+", max is: "+str(max(sent2_lens))+", min is: "+str(min(sent2_lens)))


sentence1 average is: 12.94135, std dev is: 5.755700667816214, max is: 78, min is: 2
sentence2 average is: 7.43372, std dev is: 3.0907033118046123, max is: 38, min is: 1


In [196]:
import numpy as np
import torch
from torch.utils.data import Dataset

class NewsGroupDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list, MAX_SENTENCE_LENGTH):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.data_list = data_list
        self.MAX_SENTENCE_LENGTH=MAX_SENTENCE_LENGTH
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        token_idx = self.data_list[key][:self.MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]
    

In [197]:
train_dataset_sent1 = NewsGroupDataset(train_sentence1_data_indices,train_targets, 25)
train_dataset_sent2 = NewsGroupDataset(train_sentence2_data_indices,train_targets, 25)

In [198]:
print("sample sent1 lengths:")
for i in range(10):
    print(train_dataset_sent1[i][1])
print('\nsample sent2 lengths:')
for i in range(10):
    print(train_dataset_sent2[i][1])

sample sent1 lengths:
16
21
9
14
9
12
9
6
16
25

sample sent2 lengths:
9
7
9
9
9
5
8
9
4
5


In [199]:
## example output

print("sent1: x {};\ny {}".format(train_dataset_sent1[0][0], train_dataset_sent1[0][2]))
print("sent2: x {};\ny {}".format(train_dataset_sent2[0][0], train_dataset_sent2[0][2]))

sent1: x [2, 21, 23, 4, 2, 101, 20, 30, 7, 2, 720, 1934, 2, 351, 10, 50];
y 2
sent2: x [2, 21, 23, 114, 3, 807, 78, 3, 50];
y 2


In [200]:
MAX_SENTENCE_LENGTH=25

def newsgroup_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [205]:
BATCH_SIZE = 32

train_sent1_loader = torch.utils.data.DataLoader(dataset=train_dataset_sent1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

train_sent2_loader = torch.utils.data.DataLoader(dataset=train_dataset_sent2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_dataset_sent1 = NewsGroupDataset(val_sentence1_data_indices, val_targets, 25)
val_dataset_sent2 = NewsGroupDataset(val_sentence2_data_indices, val_targets, 25)

val_sent1_loader = torch.utils.data.DataLoader(dataset=val_dataset_sent1, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

val_sent2_loader = torch.utils.data.DataLoader(dataset=val_dataset_sent2, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=newsgroup_collate_func,
                                           shuffle=True)

In [207]:
for i, (data, lengths, labels) in enumerate(train_sent1_loader):
    print(data)
    print(data.shape)
    print(lengths)
    print(lengths.shape)
    print(labels)
    print(labels.shape)
    break

tensor([[   2,   12,  166,  153,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [  13,   81,    9,   64,   67,    3, 1277,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [   2,   41,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [ 704,    9,  707,   11,   14,  114,  887, 1451, 1332,   47,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [   2,   21,   26,  161,    2,   77,  786,   23,    7,   19,  840,   27,
            1,   33,    4,   19,   86,    0,    0,    0,    0,    0,    0,    0,
            0],
        [   2,   21,   25,  189,    4,    2,   28,    1,    8, 1407,    1,  283,
            7,    3,  418,  1