In [3]:
import pandas as pd
import pickle as pk
import torch

## Load fastText embeddings

In [18]:
ft_home = './'
words_to_load = 50000

import numpy as np

# Read up to `words_to_load` word vectors from the fastText .vec text file.
# Row/index i+2 is used for the i-th word so that indices 0 and 1 stay free
# (they are not assigned in this cell).
# If the file holds fewer than `words_to_load` vectors, the unused trailing
# rows of the matrix stay all-zero.
with open(ft_home + 'wiki-news-300d-1M.vec', encoding='utf-8') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300), dtype=np.float32)
    words_ft = {}            # word -> row index in loaded_embeddings_ft
    idx2words_ft = {}        # row index -> word
    ordered_words_ft = []    # words in file order
    i = 0                    # number of word vectors stored so far
    for line_no, line in enumerate(f):
        if i >= words_to_load:
            break
        # The .vec format separates fields with a single space; splitting on
        # ' ' (not arbitrary whitespace) keeps tokens that contain other
        # Unicode whitespace characters intact.
        s = line.rstrip().split(' ')
        # fastText .vec files start with a "<vocab_size> <dim>" header line.
        # It is not a word vector: skip it instead of storing it as a word
        # whose single value would be broadcast across the whole row.
        if line_no == 0 and len(s) == 2 and s[0].isdigit() and s[1].isdigit():
            continue
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:], dtype=np.float32)
        words_ft[s[0]] = i+2
        idx2words_ft[i+2] = s[0]
        ordered_words_ft.append(s[0])
        i += 1

In [19]:
# Add the padding and unknown tokens at indices 0 and 1.
words_ft['<PAD>'] = 0
words_ft['<UNK>'] = 1
idx2words_ft[0] = '<PAD>'
idx2words_ft[1] = '<UNK>'
# <PAD> gets an all-zero embedding; <UNK> gets a standard-normal vector.
# The <UNK> vector is drawn from a locally seeded generator so that re-running
# this cell reproduces the same matrix and the global NumPy RNG is untouched.
loaded_embeddings_ft[0,:] = np.zeros(loaded_embeddings_ft.shape[1])
loaded_embeddings_ft[1,:] = np.random.RandomState(0).normal(size = (loaded_embeddings_ft.shape[1],))

In [None]:
## save embedding matrix
# pk.dump(torch.from_numpy(loaded_embeddings_ft), open('./hw2_data/loaded_embeddings_ft.pk', 'wb'))

## Preprocessing function

In [20]:
def preprocess(dataset, label_dct, word2id, id2word):
    """Tokenize and index-encode a sentence-pair DataFrame.

    The input frame is left untouched; a copy is encoded and returned, so the
    function can safely be called again on the same raw frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have the columns 'sentence1', 'sentence2' (strings) and 'label'.
    label_dct : dict
        Maps each value of the 'label' column to its replacement value.
    word2id : dict
        Maps a token to its id. Must contain '<UNK>' if any token is missing
        from the mapping.
    id2word : dict
        Unused; kept so existing callers do not have to change.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataset` where 'sentence1' / 'sentence2' hold lists of ids
        (whitespace-split tokens looked up in `word2id`, unknown tokens mapped
        to word2id['<UNK>']) and 'label' holds label_dct[label].

    Raises
    ------
    KeyError
        If a label is not in `label_dct`, or a token is missing from
        `word2id` and `word2id` has no '<UNK>' entry.
    """
    def token2vocab(sentence):
        # '<UNK>' is looked up lazily, only when an unknown token shows up.
        return [word2id[tok] if tok in word2id else word2id['<UNK>']
                for tok in sentence.split()]

    # Work on a copy: assigning columns on the caller's frame would make a
    # second call fail (lists have no .split(), labels are already encoded).
    encoded = dataset.copy()
    encoded['sentence1'] = dataset['sentence1'].apply(token2vocab)
    encoded['sentence2'] = dataset['sentence2'].apply(token2vocab)
    encoded['label'] = dataset['label'].apply(lambda x: label_dct[x])
    return encoded

## define labels

In [None]:
# Class ids follow the position of each label name in this tuple.
label_dct = {
    label_name: class_id
    for class_id, label_name in enumerate(('neutral', 'entailment', 'contradiction'))
}

## SNLI dataset

In [None]:
# Read the tab-separated SNLI train and validation files.
snli_train, snli_val = [
    pd.read_csv(tsv_path, sep = '\t')
    for tsv_path in ('hw2_data/snli_train.tsv', 'hw2_data/snli_val.tsv')
]

In [26]:
# Encode both SNLI splits with the fastText vocabulary built above.
snli_train_id, snli_val_id = [
    preprocess(split_df, label_dct, words_ft, idx2words_ft)
    for split_df in (snli_train, snli_val)
]

In [205]:
# pk.dump(snli_train_id, open("./hw2_data/snli_train_id.pk", "wb"))
# pk.dump(snli_val_id, open("./hw2_data/snli_val_id.pk", "wb"))

## MNLI dataset

In [6]:
# Read the tab-separated MNLI train and validation files.
mnli_train, mnli_val = [
    pd.read_csv(tsv_path, sep = '\t')
    for tsv_path in ('hw2_data/mnli_train.tsv', 'hw2_data/mnli_val.tsv')
]

In [7]:
# Distinct values of the 'genre' column in the MNLI training frame.
Genres = mnli_train.loc[:, 'genre'].unique()

In [22]:
def _encode_genre(frame, genre):
    """Return the rows of `frame` for one genre, 'genre' column dropped, run through preprocess."""
    genre_rows = frame[frame['genre'] == genre].drop('genre', axis = 1)
    return preprocess(genre_rows, label_dct, words_ft, idx2words_ft)


# One encoded DataFrame per genre, keyed by genre, for each MNLI split.
mnli_train_dict, mnli_val_dict = {}, {}
for g in Genres:
    mnli_train_dict[g] = _encode_genre(mnli_train, g)
    mnli_val_dict[g] = _encode_genre(mnli_val, g)

In [28]:
# pk.dump(mnli_train_dict, open("./hw2_data/mnli_train_dict.pk", "wb"))
# pk.dump(mnli_val_dict, open("./hw2_data/mnli_val_dict.pk", "wb"))