In [None]:
import pandas as pd
import os
from tqdm.notebook import tqdm
import json
import pickle
from utils import *

In [None]:
# Load the notebook settings from config.json in the working directory.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's locale encoding (which can mis-decode non-ASCII paths or labels).
with open("./config.json", "r", encoding="utf-8") as fp:
    config = json.load(fp)

In [None]:
# Directory that holds the input files and receives the generated artifacts.
DATA_DIR_PATH = config["data_dir_path"]
# Pretrained word vectors are read from a GloVe text file inside the data directory.
WORD_EMBEDDINGS_PATH = os.path.join(DATA_DIR_PATH, "glove.840B.300d.txt")
# Mapping that is indexed by each sample's gold label when datasets are built.
LABEL_DICT = config["label_dict"]

# Generate Word Index Map and Word Embedding Matrix from UD Tree

In [None]:
# Read the tab-separated SNLI training split; the first column becomes the index.
snli_train_path = os.path.join(DATA_DIR_PATH, "snli_train.tsv")
df_snli_train = pd.read_csv(snli_train_path, delimiter="\t", index_col=0)
df_snli_train.head()

In [None]:
# Collect every token from both UD trees (udtree1, then udtree2) of each training row.
# Rows are streamed straight into tqdm with an explicit total instead of first copying
# all row tuples into a list, so the progress bar keeps its length without the extra memory.
# NOTE(review): `ET` and `tree2tokenlist` are not defined in this notebook; presumably
# they are provided by `from utils import *` -- confirm.
word_list = []
for sample in tqdm(df_snli_train.itertuples(), total=len(df_snli_train)):
    for udtree_xml in (sample.udtree1, sample.udtree2):
        word_list.extend(tree2tokenlist(ET.fromstring(udtree_xml)))
len(word_list)

In [None]:
from collections import Counter

# Frequency of every token collected from the training trees.
counts = Counter(word_list)

# Indices 0 and 1 are reserved for the padding and out-of-vocabulary markers.
word2index = {"_PAD_": 0, "_OOV_": 1}

# First index handed out to a real vocabulary entry.
offset = 2

# Number the tokens in descending frequency order, right after the reserved slots.
for position, (token, _freq) in enumerate(counts.most_common(), start=offset):
    word2index[token] = position
len(word2index)

In [None]:
import numpy as np

# Pretrained vectors for the words that occur in the vocabulary (word -> 1-D float array).
embeddings = {}

# Each line of the vector file is "<token> <v1> <v2> ... <vN>" separated by single spaces.
# - The file is decoded as UTF-8 explicitly rather than with the locale default.
# - Splitting on " " (not on arbitrary whitespace) keeps tokens that contain other
#   whitespace-like characters intact, so their columns are not shifted.
# - Lines whose remaining fields are not all numeric (e.g. tokens that themselves contain
#   a space) are skipped on purpose, as are blank or single-field lines.
with open(WORD_EMBEDDINGS_PATH, "r", encoding="utf-8") as fp:
    for raw_line in fp:
        fields = raw_line.rstrip().split(" ")
        if len(fields) < 2:
            # Blank or truncated line: nothing to parse.
            continue

        word = fields[0]
        if word not in word2index:
            # Only vocabulary words are kept; skip before paying for float parsing.
            continue

        try:
            embeddings[word] = np.array(fields[1:], dtype=float)
        except ValueError:
            # Malformed vector: best-effort loading, so just move on.
            continue
len(embeddings)

In [None]:
# Seed for the random rows below, fixed so that re-running the notebook regenerates
# the same embedding matrix.
EMBEDDING_INIT_SEED = 0

vocab_size = len(word2index)  # number of words in the vocabulary (matrix rows)

if not embeddings:
    raise ValueError(
        "No pretrained vectors matched the vocabulary; cannot infer the embedding size."
    )
# Dimensionality of a word vector, read off one loaded vector without copying them all.
embedding_size = len(next(iter(embeddings.values())))

# Embedding matrix (vocabulary size x embedding dimension), initially all zeros.
embedding_matrix = np.zeros((vocab_size, embedding_size))

rng = np.random.default_rng(EMBEDDING_INIT_SEED)

# Vocabulary words (other than the two special markers) that have no pretrained vector.
missing_words = []
for word, index in word2index.items():
    if word in embeddings:
        embedding_matrix[index] = embeddings[word]
    elif word != "_PAD_":
        # _PAD_ keeps its all-zero row; _OOV_ and every uncovered word receive a
        # standard-normal random vector. _OOV_ is not reported as missing.
        if word != "_OOV_":
            missing_words.append(word)
        embedding_matrix[index] = rng.normal(size=embedding_size)

print("The number of missing words:", len(missing_words))
embedding_matrix.shape

In [None]:
# Persist the word -> index map as JSON in the data directory.
# ensure_ascii=False writes non-ASCII tokens verbatim, so the file encoding is pinned to
# UTF-8; with the locale default the write could fail or produce a non-portable file.
with open(os.path.join(DATA_DIR_PATH, "word_index_map.json"), "w", encoding="utf-8") as fp:
    json.dump(word2index, fp, ensure_ascii=False)

In [None]:
# Serialize the embedding matrix with pickle into the data directory.
embedding_matrix_path = os.path.join(DATA_DIR_PATH, "word_embedding_matrix.pkl")
with open(embedding_matrix_path, "wb") as pkl_file:
    pickle.dump(embedding_matrix, pkl_file)

# Generate Train, Dev, and Test Datasets

In [None]:
# Read the tab-separated SNLI dev split; the first column becomes the index.
snli_dev_path = os.path.join(DATA_DIR_PATH, "snli_dev.tsv")
df_snli_dev = pd.read_csv(snli_dev_path, delimiter="\t", index_col=0)
df_snli_dev.head()

In [None]:
def df_to_dataset(df):
    """Convert a dataframe of samples into parallel lists of ids, inputs and labels.

    Relies on the module-level `word2index` and `LABEL_DICT` defined earlier in the
    notebook.

    Args:
        df: DataFrame whose rows expose `udtree1`, `udtree2` and `gold_label`; the
            row index is used as the sample id.

    Returns:
        dict with the keys "ids", "premises", "hypotheses" and "labels". Each value
        is a list with one entry per row of `df`, in row order. "premises" is built
        from `udtree1` and "hypotheses" from `udtree2`.

    Raises:
        KeyError: if a row's `gold_label` is not a key of `LABEL_DICT`.
    """
    id_list = []
    premise_list = []
    hypothesis_list = []
    label_list = []

    # Stream the rows into tqdm with an explicit total rather than copying every row
    # tuple into a list first; the progress bar still knows its length.
    for sample in tqdm(df.itertuples(), total=len(df)):
        # NOTE(review): tokenlist2indexlist / tree2tokenlist are not defined in this
        # notebook; presumably they turn a parsed tree into tokens and then into
        # vocabulary indices -- confirm against utils.
        premise_tokens = tree2tokenlist(ET.fromstring(sample.udtree1))
        hypothesis_tokens = tree2tokenlist(ET.fromstring(sample.udtree2))

        id_list.append(sample.Index)
        premise_list.append(tokenlist2indexlist(premise_tokens, word2index))
        hypothesis_list.append(tokenlist2indexlist(hypothesis_tokens, word2index))
        label_list.append(LABEL_DICT[sample.gold_label])

    return {
        "ids": id_list,
        "premises": premise_list,
        "hypotheses": hypothesis_list,
        "labels": label_list
    }

In [None]:
# Convert the training split and pickle the result into the data directory.
train_data = df_to_dataset(df_snli_train)
train_data_path = os.path.join(DATA_DIR_PATH, "train_data.pkl")
with open(train_data_path, "wb") as pkl_file:
    pickle.dump(train_data, pkl_file)

In [None]:
# Convert the dev split and pickle the result into the data directory.
dev_data = df_to_dataset(df_snli_dev)
dev_data_path = os.path.join(DATA_DIR_PATH, "dev_data.pkl")
with open(dev_data_path, "wb") as pkl_file:
    pickle.dump(dev_data, pkl_file)