# Extract Dataset from WordNet
---

In [1]:
from nltk.corpus import wordnet as wn
import numpy  as np
import pandas as pd
import random
import pandas as pd

## WordNetに利用されているLemmaSet(用語の集合)を作成
---

In [2]:
# Maximum number of underscore-separated words a lemma may contain to be kept
# in lemma_set; also passed to vectorize_lemma as the padded sequence length.
max_seq_len = 3

In [3]:
# Collect every noun lemma name that WordNet provides.
lemma_set = set(wn.all_lemma_names(pos='n'))
print('Num of entire lemma   :', len(lemma_set))

# Keep only the lemmas suitable for training: those made of at most
# max_seq_len underscore-separated words (k underscores means k + 1 words).
lemma_set = {name for name in lemma_set if name.count('_') < max_seq_len}
print('Num of filtered lemma :', len(lemma_set))

Num of entire lemma   : 117798
Num of filtered lemma : 116135


## LemmaSet内の全てのLemmaをベクトルに変換する辞書の作成
---

In [4]:
def vectorize_lemma(w2v, lemma, max_seq_len, dim=300):
    """Convert a WordNet lemma name into a zero-padded array of word vectors.

    The lemma is split on underscores; each word is looked up in ``w2v`` and
    words missing from it are represented by an all-zero vector. The sequence
    is then right-padded with zero vectors up to ``max_seq_len`` entries.

    Args:
        w2v: Lookup that supports ``word in w2v`` and ``w2v[word]``.
            Vectors are assumed to have length ``dim`` -- TODO confirm
            against the embedding actually loaded.
        lemma: Lemma name whose words are joined by underscores.
        max_seq_len: Number of rows to pad the result to. A lemma with more
            words than this is NOT truncated; it simply receives no padding.
        dim: Length of the zero vector used for unknown words and padding.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        ``np.ndarray`` with one row per word followed by the padding rows.
    """
    words = lemma.split('_')
    # One shared zero vector is enough: np.array() copies the rows below.
    zero = np.zeros(dim)

    vecs = [w2v[w] if w in w2v else zero for w in words]
    # A negative count yields an empty list, so long lemmas get no padding.
    vecs.extend([zero] * (max_seq_len - len(words)))

    return np.array(vecs)

In [5]:
# Load the pickled word2vec lookup (presumably a word -> vector mapping;
# verify against the file's producer).
# NOTE: unpickling can execute arbitrary code, so only load a trusted file.
word2vec = pd.read_pickle('../vectorizer/word2vec.pkl')

# Precompute the padded vector sequence for every lemma kept in lemma_set.
vectorizer_w2v = {
    name: vectorize_lemma(word2vec, name, max_seq_len)
    for name in lemma_set
}

# Save the lemma -> array dictionary to disk.
pd.to_pickle(vectorizer_w2v, 'vectorizer_w2v.pkl')

## WordNetからデータを抽出
---

In [6]:
# Number of randomly sampled "unrelated" lemma pairs to generate.
n_unrelated = 500_000

# Fraction of the shuffled dataset assigned to the training split.
train_rate = 0.8

# Base names (without extension) of the dataset files.
fname_full  = 'wordnet_full'
fname_train = 'wordnet_train'
fname_valid = 'wordnet_valid'

In [7]:
# Synonym pairs: every ordered pair of lemmas (a lemma paired with itself
# included) that share a noun synset, restricted to lemmas in lemma_set.
synonyms = []
for s in wn.all_synsets(pos='n'):
    members = [name for name in s.lemma_names() if name in lemma_set]
    synonyms.extend((a, b) for a in members for b in members)

# Hypernym -> hyponym pairs and their reversed hyponym -> hypernym pairs.
sup_subs = []
sub_sups = []
for s in wn.all_synsets(pos='n'):
    uppers = [name for name in s.lemma_names() if name in lemma_set]
    for h in s.hyponyms():
        lowers = [name for name in h.lemma_names() if name in lemma_set]
        for a in uppers:
            for b in lowers:
                sup_subs.append((a, b))
                sub_sups.append((b, a))

# Unrelated pairs: random lemma pairs that appear in none of the lists above.
# Sampling is made reproducible by (1) sorting the lemmas, because set
# iteration order of strings varies between interpreter runs, and (2) drawing
# from a dedicated generator with a fixed seed instead of the unseeded global
# one.
unrelated = []
lemma_list = sorted(lemma_set)
related_set = set(synonyms + sup_subs + sub_sups)
rng = random.Random(1000)
while len(unrelated) < n_unrelated:
    a = rng.choice(lemma_list)
    b = rng.choice(lemma_list)
    if (a, b) not in related_set:
        unrelated.append((a, b))

# Attach the class label to every extracted pair:
# 0 = synonym, 1 = hypernym->hyponym, 2 = hyponym->hypernym, 3 = unrelated.
synonyms  = [(a, b, 0) for a, b in synonyms ]
sup_subs  = [(a, b, 1) for a, b in sup_subs ]
sub_sups  = [(a, b, 2) for a, b in sub_sups ]
unrelated = [(a, b, 3) for a, b in unrelated]

In [8]:
# Merge the four labelled groups and shuffle them with a fixed seed.
dataset = synonyms + sup_subs + sub_sups + unrelated
random.seed(1000)
random.shuffle(dataset)

# Split sizes: the first train_rate share is for training, the rest for
# validation.
n_total = len(dataset)
n_train = round(n_total * train_rate)
n_valid = n_total - n_train

print('num of total data :', n_total)
print('num of train data :', n_train)
print('num of valid data :', n_valid)

train_dataset = dataset[:n_train]
valid_dataset = dataset[n_train:]

# Save both splits, first as pickles and then as CSV files.
splits = ((train_dataset, fname_train), (valid_dataset, fname_valid))
for rows, stem in splits:
    pd.to_pickle(rows, stem + '.pkl')

columns = ('Lemma A', 'Lemma B', 'Label')
for rows, stem in splits:
    pd.DataFrame(rows, columns=columns).to_csv(stem + '.csv', index=None)

num of total data : 1166215
num of train data : 932972
num of valid data : 233243
