<h1 id="tocheading">Reading Data</h1>
<div id="toc"></div>

In [1]:
%%javascript
// Fetches and executes a remote script at run time; presumably it fills the
// #toc div with a table of contents built from the notebook headings — TODO confirm.
// NOTE(review): this runs third-party code from the network on every execution;
// consider vendoring or pinning the script.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [66]:
import pickle
import random
import random
import spacy
import csv
import string
import io
import os
import re
import torch
import functools
import numpy as np
import pandas as pd
from collections import Counter
from collections import defaultdict
import spacy
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Language codes handled by this notebook; they key `language_dict`, select the
# per-language embedding file, and filter the XNLI frames on their "language" column.
langs = ["ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "th", "tr", "vi", "zh"]

In [4]:
# Per-language store: language_dict[code] is created as an empty dict on first
# access and is later filled with "vectors", "xnli_dev" and "xnli_test" entries.
language_dict = defaultdict(dict)

## Load Vectors

In [5]:
def load_vectors(fname):
    """Load word vectors from a text file in the ``.vec`` layout.

    The first line is a header made of two integers; every following line is
    a token followed by its space-separated float components.

    Args:
        fname: Path to the vector file (read as UTF-8, undecodable bytes are
            ignored).

    Returns:
        dict mapping each token to its vector as a list of floats. If a token
        appears more than once, the last occurrence wins.

    Raises:
        ValueError: if the header is not exactly two integers, or a component
            cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the handle even when parsing fails part-way;
    # the vector files are large, so leaking one handle per language adds up.
    with io.open(fname, "r", encoding="utf-8",
                 newline="\n", errors="ignore") as fin:
        # Consume (and validate) the header so it is not read as a token row.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) would otherwise create
                # a bogus "" token with an empty vector.
                continue
            tokens = line.rstrip().split(" ")
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data

In [6]:
# Read the aligned embedding file of every language into language_dict[code]["vectors"].
vector_path = "../../data/aligned_embeddings"
vector_file_template = "{}/wiki.{}.align.vec"
for lang_code in langs:
    print("loading vectors for", lang_code)
    language_dict[lang_code]["vectors"] = load_vectors(
        vector_file_template.format(vector_path, lang_code)
    )

loading vectors for ar
loading vectors for bg
loading vectors for de
loading vectors for el
loading vectors for en
loading vectors for es
loading vectors for fr
loading vectors for hi
loading vectors for ru
loading vectors for th
loading vectors for tr
loading vectors for vi
loading vectors for zh


## Load XNLI

In [7]:
# Load the XNLI dev/test splits (tab-separated) and the MultiNLI training set (JSON lines).
xnli_dev = pd.read_csv("../../data/XNLI/xnli.dev.tsv", sep="\t")
xnli_test = pd.read_csv("../../data/XNLI/xnli.test.tsv", sep="\t")
mnli_train = pd.read_json("../../data/MultiNLI/multinli_1.0_train.jsonl", lines=True)
# TODO: load the MultiNLI dev and test splits. Until then these are explicit
# placeholders so the cell parses and runs (a bare `name = # comment` is a SyntaxError).
mnli_dev = None
mnli_test = None

In [8]:
xnli_dev.head(3)

Unnamed: 0,language,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,promptID,pairID,genre,label1,label2,label3,label4,label5,sentence1_tokenized,sentence2_tokenized,match
0,ar,neutral,,,,,وقال، ماما، لقد عدت للمنزل.,اتصل بأمه حالما أوصلته حافلة المدرسية.,1,1,facetoface,neutral,contradiction,neutral,neutral,neutral,وقال ، ماما ، لقد عدت للمنزل .,اتصل بأمه حالما أوصلته حافلة المدرسية .,True
1,ar,contradiction,,,,,وقال، ماما، لقد عدت للمنزل.,لم ينطق ببنت شفة.,1,2,facetoface,contradiction,contradiction,contradiction,contradiction,contradiction,وقال ، ماما ، لقد عدت للمنزل .,لم ينطق ببنت شفة .,True
2,ar,entailment,,,,,وقال، ماما، لقد عدت للمنزل.,أخبر أمه أنه قد عاد للمنزل.,1,3,facetoface,entailment,entailment,neutral,entailment,entailment,وقال ، ماما ، لقد عدت للمنزل .,أخبر أمه أنه قد عاد للمنزل .,True


In [9]:
language_dict.keys()

dict_keys(['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'ru', 'th', 'tr', 'vi', 'zh'])

In [10]:
# Store, for every language code, the rows of the XNLI dev/test frames whose
# "language" column equals that code.
for lang_code in langs:
    dev_rows = xnli_dev["language"] == lang_code
    test_rows = xnli_test["language"] == lang_code
    language_dict[lang_code].update(
        xnli_dev=xnli_dev[dev_rows],
        xnli_test=xnli_test[test_rows],
    )

## Define Language Classes

### Build Vocabulary

In [24]:
def tokenize_dataset(dataset, remove_punc=False):
    """Lower-case and split the pre-tokenized sentence columns of a frame.

    The ``sentence1_tokenized`` and ``sentence2_tokenized`` columns are
    expected to hold space-separated strings. Each is replaced **in place**
    (the passed frame is modified) by a list of lower-cased tokens, so the
    function cannot be applied twice to the same frame.

    Args:
        dataset: DataFrame with string columns ``sentence1_tokenized`` and
            ``sentence2_tokenized``.
        remove_punc: When True, every character in ``string.punctuation`` is
            deleted before splitting.

    Returns:
        (dataset, all_tokens): the same frame object, and a flat list with
        every token of ``sentence1_tokenized`` followed by every token of
        ``sentence2_tokenized``. ``all_tokens`` is ``[]`` for an empty frame.
    """
    # Build the deletion table once instead of re-creating a list per column
    # and scanning it for every character.
    punc_table = str.maketrans("", "", string.punctuation) if remove_punc else None

    def _tokenize(text):
        if punc_table is not None:
            text = text.translate(punc_table)
        # Drop the empty strings that split(" ") yields for doubled spaces or
        # for positions where a punctuation-only token was removed.
        return [tok for tok in text.lower().split(" ") if tok]

    for s in ["sentence1", "sentence2"]:
        column = "{}_tokenized".format(s)
        dataset[column] = dataset[column].apply(_tokenize)

    # Flatten in linear time; reduce(lambda x, y: x + y, ...) copies the
    # growing list at every step and raises on an empty sequence.
    all_s1_tokens = [tok for sent in dataset["sentence1_tokenized"] for tok in sent]
    all_s2_tokens = [tok for sent in dataset["sentence2_tokenized"] for tok in sent]
    all_tokens = all_s1_tokens + all_s2_tokens
    return dataset, all_tokens

In [82]:
# Matches any single character from string.punctuation.
reg = re.compile("[%s]" % re.escape(string.punctuation))

def tokenize_mnli(dataset, remove_punc=True):
    """Tokenize the raw ``sentence1`` / ``sentence2`` columns of a frame.

    Adds ``sentence1_tokenized`` and ``sentence2_tokenized`` columns to the
    passed frame (it is modified in place). Each holds a list of lower-cased
    tokens obtained by deleting punctuation and splitting on spaces.

    Args:
        dataset: DataFrame with string columns ``sentence1`` and ``sentence2``.
        remove_punc: When True the tokens contain no punctuation. When False,
            punctuation is still deleted from the sentence, and a single
            ``"."`` token is appended at the end.
            NOTE(review): presumably this imitates a sentence-final period
            token; confirm that dropping inner punctuation is intended.

    Returns:
        (dataset, all_tokens): the same frame object, and a flat list with
        every token of ``sentence1_tokenized`` followed by every token of
        ``sentence2_tokenized``. ``all_tokens`` is ``[]`` for an empty frame.
    """
    def _tokenize(sentence):
        stripped = reg.sub("", sentence)
        if not remove_punc:
            stripped += " ."
        # Deleting punctuation can leave doubled spaces ("a - b" -> "a  b");
        # filter out the empty strings split(" ") produces for them.
        return [tok for tok in stripped.lower().split(" ") if tok]

    for s in [1, 2]:
        dataset["sentence{}_tokenized".format(s)] = \
            dataset["sentence{}".format(s)].apply(_tokenize)
    print ("Tokenizing done.")

    # Flatten in linear time. The previous reduce(lambda x, y: x + y, ...)
    # copied the accumulated list for every row, which is quadratic in the
    # number of tokens and impractical for a large training set.
    all_s1_tokens = [tok for sent in dataset["sentence1_tokenized"] for tok in sent]
    all_s2_tokens = [tok for sent in dataset["sentence2_tokenized"] for tok in sent]
    print ("Token collection done.")
    all_tokens = all_s1_tokens + all_s2_tokens
    return dataset, all_tokens

In [None]:
# Tokenize the MultiNLI training frame. remove_punc=False selects the
# tokenize_mnli branch that appends a final " ." to every sentence.
# all_train_tokens is the flat token list returned alongside the frame.
mnli_train_tokenized, all_train_tokens = tokenize_mnli(mnli_train, remove_punc=False)

Tokenizing done.


In [None]:
# Persist the tokenized frame and the flat token list so the tokenization step
# does not have to be repeated. `with` closes (and flushes) each file
# deterministically instead of leaving the handle open until garbage collection.
with open("mnli_train_tokenized.pickle", "wb") as frame_file:
    pickle.dump(mnli_train_tokenized, frame_file)
with open("all_train_tokens_mnli.pickle", "wb") as tokens_file:
    pickle.dump(all_train_tokens, tokens_file)

In [None]:
all_train_tokens[0]

In [None]:
# NOTE(review): `all_tokens` is not assigned at notebook scope in the cells
# visible here (it is only a local inside the tokenize functions), so this
# would raise NameError on a fresh kernel — confirm whether it is a stale
# scratch cell.
all_tokens[0]

In [21]:
# Reserved ids for the padding and unknown-word entries of every vocabulary.
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size):
    """Build token<->id lookup tables from a flat list of tokens.

    Args:
        all_tokens: Iterable of tokens (hashable), duplicates included.
        max_vocab_size: Maximum number of distinct tokens to keep, chosen by
            descending frequency. ``None`` keeps every token.

    Returns:
        (token2id, id2token): ``id2token`` is a list whose first two entries
        are ``'<PAD>'`` and ``'<UNK>'`` followed by the kept tokens, most
        frequent first. ``token2id`` maps each of those strings to its
        position in ``id2token`` (kept tokens start at id 2). With no tokens,
        only the two special entries are returned.
    """
    token_counter = Counter(all_tokens)
    # Take the tokens straight from most_common(). The earlier
    # `vocab, count = zip(...)` lacked the `*` unpack, referenced a misspelled
    # counter name, and would also fail when there are no tokens at all.
    vocab = [token for token, _count in token_counter.most_common(max_vocab_size)]
    id2token = ['<PAD>', '<UNK>'] + vocab
    token2id = {token: idx for idx, token in enumerate(vocab, start=2)}
    token2id["<PAD>"] = PAD_IDX
    token2id["<UNK>"] = UNK_IDX
    return token2id, id2token

### Define Language Classes

In [25]:
class XNLILang:
    """Vectors, tokenized XNLI splits and vocabulary for one language.

    Intended for all languages besides ``en``.

    Attributes set by ``__init__``:
        name: language code used to look the data up in ``language_dict``.
        vectors: ``language_dict[name]["vectors"]``.
        xnli_dev / xnli_test: private copies of the language's XNLI frames,
            tokenized in place by ``tokenize_dataset``.
        tokenized_dev / tokenized_test: the frames returned by
            ``tokenize_dataset`` (the same objects as ``xnli_dev`` / ``xnli_test``).
        all_dev_tokens: flat token list of the dev split.
        token2id / id2token: vocabulary built from ``all_dev_tokens``.
    """
    def __init__(self, name, max_vocab_size):
        self.name = name
        lang_entry = language_dict[self.name]
        self.vectors = lang_entry["vectors"]
        # tokenize_dataset overwrites the *_tokenized columns of the frame it
        # is given. Work on copies so the frames stored in language_dict keep
        # their string columns: constructing the same language twice then
        # still works, and pandas does not warn about assigning to a slice.
        self.xnli_dev = lang_entry["xnli_dev"].copy()
        self.xnli_test = lang_entry["xnli_test"].copy()
        self.tokenized_dev, self.all_dev_tokens = tokenize_dataset(self.xnli_dev)
        self.tokenized_test, _ = tokenize_dataset(self.xnli_test)
        # The vocabulary is built from dev-split tokens only.
        self.token2id, self.id2token = build_vocab(self.all_dev_tokens, max_vocab_size)

In [None]:
class MNLILang:
    """Vectors, MultiNLI training data, XNLI splits and vocabulary (en-only).

    Attributes set by ``__init__``:
        name: language code used to look the data up in ``language_dict``.
        vectors: ``language_dict[name]["vectors"]``.
        train: ``language_dict[name]["mnli_train_tokenized"]``.
        train_tokens: the notebook-level ``all_train_tokens`` list.
        xnli_dev / xnli_test: private copies of the language's XNLI frames,
            tokenized in place by ``tokenize_dataset``.
        tokenized_dev / tokenized_test: the frames returned by
            ``tokenize_dataset`` (the same objects as ``xnli_dev`` / ``xnli_test``).
        all_dev_tokens: flat token list of the dev split.
        token2id / id2token: vocabulary built from ``train_tokens``.
    """
    def __init__(self, name, max_vocab_size):
        self.name = name
        lang_entry = language_dict[self.name]
        self.vectors = lang_entry["vectors"]
        # NOTE(review): no visible cell stores "mnli_train_tokenized" in
        # language_dict; confirm it is populated before instantiating,
        # otherwise this lookup raises KeyError.
        self.train = lang_entry["mnli_train_tokenized"]
        self.train_tokens = all_train_tokens
        # tokenize_dataset overwrites the *_tokenized columns of the frame it
        # is given. Work on copies so the frames stored in language_dict keep
        # their string columns and can be tokenized again by another instance.
        self.xnli_dev = lang_entry["xnli_dev"].copy()
        self.xnli_test = lang_entry["xnli_test"].copy()
        self.tokenized_dev, self.all_dev_tokens = tokenize_dataset(self.xnli_dev)
        self.tokenized_test, _ = tokenize_dataset(self.xnli_test)
        # The vocabulary comes from the training tokens, not the XNLI dev tokens.
        self.token2id, self.id2token = build_vocab(self.train_tokens, max_vocab_size)