<h1 id="tocheading">Reading Data</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import pickle
import random
import random
import spacy
import csv
import string
import io
import os
import re
import torch
import functools
import numpy as np
import pandas as pd
from collections import Counter
from collections import defaultdict
import spacy
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
langs = ["ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "th", "tr", "vi", "zh"]

In [4]:
language_dict = defaultdict(dict)

## Load Vectors

In [5]:
def load_vectors(fname):
    fin = io.open(fname, "r", encoding="utf-8", newline="\n", errors="ignore")
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(" ")
        data[tokens[0]] = [*map(float, tokens[1:])]
    return data

In [6]:
vector_path = "../../data/aligned_embeddings"
for x in langs:
    print ("loading vectors for", x)
    fname = "{}/wiki.{}.align.vec".format(vector_path, x)
    language_dict[x]["vectors"] = load_vectors(fname)

loading vectors for ar
loading vectors for bg
loading vectors for de
loading vectors for el
loading vectors for en
loading vectors for es
loading vectors for fr
loading vectors for hi
loading vectors for ru
loading vectors for th
loading vectors for tr
loading vectors for vi
loading vectors for zh


## Load XNLI

In [7]:
# load datasets
xnli_dev = pd.read_csv("../../data/XNLI/xnli.dev.tsv", sep="\t")
xnli_test = pd.read_csv("../../data/XNLI/xnli.test.tsv", sep="\t")
mnli_train = pd.read_json("../../data/MultiNLI/multinli_1.0_train.jsonl", lines=True)
mnli_dev = pd.read_json("../../data/MultiNLI/multinli_1.0_dev_matched.jsonl", lines=True)

In [8]:
language_dict.keys()

dict_keys(['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'ru', 'th', 'tr', 'vi', 'zh'])

In [9]:
for x in langs:
    language_dict[x]["xnli_dev"] = xnli_dev[xnli_dev["language"]==x]
    language_dict[x]["xnli_test"] = xnli_test[xnli_test["language"]==x]

## Define Language Classes

### Build Vocabulary

In [10]:
def tokenize_dataset(dataset, remove_punc=False):
    all_s1_tokens = []
    all_s2_tokens = []
    for s in ["sentence1", "sentence2"]:
        if remove_punc:
            punc = [*string.punctuation]
            dataset["{}_tokenized".format(s)] = dataset["{}_tokenized".format(s)].\
            apply(lambda x: "".join(c for c in x if c not in punc).lower().split(" "))
        else:
            dataset["{}_tokenized".format(s)] = dataset["{}_tokenized".format(s)].\
            apply(lambda x: x.lower().split(" "))
    dataset["sentence1_tokenized"].apply(lambda x: all_s1_tokens.extend(x))
    dataset["sentence2_tokenized"].apply(lambda x: all_s2_tokens.extend(x))
    all_tokens = all_s1_tokens + all_s2_tokens
    return dataset, all_tokens

In [11]:
reg = re.compile("[%s]" % re.escape(string.punctuation))

def tokenize_mnli(dataset, remove_punc=True):
    punc = string.punctuation
    all_s1_tokens = []
    all_s2_tokens = []
    for s in [1,2]:
        if remove_punc:
            dataset["sentence{}_tokenized".format(s)] = dataset["sentence{}".format(s)].\
            apply(lambda x: reg.sub("", x).lower().split(" "))
        else:
            dataset["sentence{}_tokenized".format(s)] = dataset["sentence{}".format(s)].\
            apply(lambda x: (reg.sub("", x) + " .").lower().split(" "))
    print ("Tokenizing done.")
    dataset["sentence1_tokenized"].apply(lambda x: all_s1_tokens.extend(x))
    dataset["sentence2_tokenized"].apply(lambda x: all_s2_tokens.extend(x))
    print ("Token collection done.")
    all_tokens = all_s1_tokens + all_s2_tokens
    print ("Concatenation done.")
    return dataset, all_tokens

In [12]:
mnli_train_tokenized, all_train_tokens = tokenize_mnli(mnli_train, remove_punc=False)

Tokenizing done.
Token collection done.
Concatenation done.


In [13]:
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size):
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = [*vocab]
    token2id = dict(zip(vocab, range(2,2+len(vocab))))
    id2token = ['<PAD>', '<UNK>'] + id2token
    token2id["<PAD>"] = PAD_IDX
    token2id["<UNK>"] = UNK_IDX
    return token2id, id2token

### Define Language Classes

In [14]:
class XNLILang:
    def __init__(self, name, max_vocab_size):
        self.name = name
        self.xnli_dev, self.xnli_test = language_dict[self.name]["xnli_dev"], language_dict[self.name]["xnli_test"] 
        self.tokenized_dev, self.all_dev_tokens = tokenize_dataset(self.xnli_dev, remove_punc=False)
        self.tokenized_test, _ = tokenize_dataset(self.xnli_test, remove_punc=False)
        self.token2id, self.id2token = build_vocab(self.all_dev_tokens, max_vocab_size)

In [16]:
class MNLILang:
    def __init__(self, name, max_vocab_size):
        self.name = name
        self.tokenized_train_data, self.all_train_tokens = tokenize_mnli(mnli_train)
        self.train_tokens = all_train_tokens
        self.xnli_dev, self.xnli_test = language_dict[self.name]["xnli_dev"], language_dict[self.name]["xnli_test"] 
        self.tokenized_dev, self.all_dev_tokens = tokenize_dataset(self.xnli_dev, remove_punc=False)
        self.tokenized_test, _ = tokenize_dataset(self.xnli_test, remove_punc=False)
        self.token2id, self.id2token = build_vocab(all_train_tokens, max_vocab_size)

In [17]:
tr = XNLILang("tr", max_vocab_size=100000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [25]:
tr.xnli_dev["sentence1_tokenized"].iloc[0]

['ve', 'anne', ',', 'evdeyim', 'dedi', '.']

## Define Loaders

In [28]:
MAX_SENTENCE_LENGTH = 100
BATCH_SIZE = 64

class MNLIDataset(Dataset):
    def __init__(self, lang, max_sentence_length=MAX_SENTENCE_LENGTH):
        self.sentence1, self.sentence2, self.labels = lang.tokenized_train_data["sentence1_tokenized"].values, \
                                                      lang.tokenized_train_data["sentence2_tokenized"].values, \
                                                      lang.tokenized_train_data["gold_label"].values
        self.max_sentence_length = max_sentence_length
        self.token2id, self.id2token = lang.token2id, lang.id2token
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, row):
        label = self.labels[row]
        sentence1_word_idx, sentence2_word_idx = [], []
        sentence1_mask, sentence2_mask = [], []
        for word in self.sentence1[row][:self.max_sentence_length]:
            if word in self.token2id.keys():
                sentence1_word_idx.append(self.token2id[word])
                sentence1_mask.append(0)
            else:
                sentence1_word_idx.append(UNK_IDX)
                sentence1_mask.append(1)
        for word in self.sentence2[row][:self.max_sentence_length]:
            if word in self.token2id.keys():
                sentence2_word_idx.append(self.token2id[word])
                sentence2_mask.append(0)
            else:
                sentence2_word_idx.append(UNK_IDX)
                sentence2_mask.append(1)
        sentence1_list = [sentence1_word_idx, sentence1_mask, len(sentence1_word_idx)]
        sentence2_list = [sentence2_word_idx, sentence2_mask, len(sentence2_word_idx)]
        
        return sentence1_list + sentence2_list + [label]

def mnli_func(batch, max_sent_length):
    s1_data, s2_data = [], []
    s1_mask, s2_mask = [], []
    s1_lengths, s2_lengths = [], []
    labels = []

    for datum in batch:
        s1_lengths.append(datum[2])
        s2_lengths.append(datum[5])
        labels.append(datum[6])

        sentence1_data_padded = np.pad(np.array(datum[0]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])), mode="constant", constant_values=0)
        sentence1_data.append(sentence1_data_padded)
        sentence1_mask_padded = np.pad(np.array(datum[1]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])), mode="constant", constant_values=0)
        sentence1_mask.append(sentence1_mask_padded)
        
        sentence2_data_padded = np.pad(np.array(datum[3]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])), mode="constant", constant_values=0)
        sentence2_data.append(sentence2_data_padded)
        sentence2_mask_padded = np.pad(np.array(datum[4]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])), mode="constant", constant_values=0)
        sentence2_mask.append(sentence2_mask_padded)
        
    ind_dec_order = np.argsort(s1_lengths)[::-1]
    s1_data = np.array(s1_data)[ind_dec_order]
    s2_data = np.array(s2_data)[ind_dec_order]
    s1_mask = np.array(s1_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    s2_mask = np.array(s2_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    s1_lengths = np.array(s1_lengths)[ind_dec_order]
    s2_lengths = np.array(s2_lengths)[ind_dec_order]
    labels = np.array(labels)[ind_dec_order]
    
    s1_list = [torch.from_numpy(s1_data), torch.from_numpy(s1_mask).float(), s1_lengths]
    s2_list = [torch.from_numpy(s2_data), torch.from_numpy(s2_mask).float(), s2_lengths]
        
    return s1_list + s2_list + [torch.from_numpy(labels)]