In [1]:
from nltk.corpus import wordnet as wn
import numpy  as np
import pandas as pd
import random
import pandas as pd

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

from pytorch_transformers import *

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained BERT tokenizer and encoder. The encoder is moved to `device`
# and switched to evaluation mode, since it is only used for inference here.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)
model = model.eval()

In [3]:
# Upper bound on the tokenized length of a lemma, counting the [CLS] and
# [SEP] markers; lemmas whose tokenization is longer are filtered out.
max_seq_len = 8

In [4]:
# Fetch every noun lemma name from WordNet.
lemma_set = set(wn.all_lemma_names(pos='n'))
print('Num of entire lemma   :', len(lemma_set))

# Filter the lemma set down to the lemmas suitable for training: those whose
# tokenization (with [CLS]/[SEP] added) is at most max_seq_len tokens long.
tkn_lemmas = [
    tokenizer.tokenize('[CLS] ' + lemma.replace('_', ' ') + ' [SEP]')
    for lemma in lemma_set
]
# The set is not modified between the two passes, so it iterates in the same
# order and each lemma lines up with its own token list.
lemma_set = {
    lemma
    for lemma, tokens in zip(lemma_set, tkn_lemmas)
    if len(tokens) <= max_seq_len
}
print('Num of filtered lemma :', len(lemma_set))

Num of entire lemma   : 117798
Num of filtered lemma : 113427


In [5]:
def create_vectorizer(lemmas, batch_size=128, verbose=True):
    """Map each lemma to the BERT features of its tokenized form.

    Every lemma has its underscores replaced by spaces, is wrapped in
    ``[CLS] ... [SEP]``, tokenized with the notebook-level ``tokenizer`` and
    right-padded to the length of the longest tokenized lemma in ``lemmas``.
    The padded batches are then run through the notebook-level ``model`` on
    ``device``.

    Args:
        lemmas: Iterable of lemma strings. Any iterable is accepted; it is
            materialized into a list internally.
        batch_size: Number of lemmas per forward pass. Must be >= 1.
        verbose: When true, print the padded length once and the start
            offset of every batch (the original behavior).

    Returns:
        dict mapping each lemma to a numpy array taken from the first output
        of ``model`` for that lemma (presumably shaped
        ``(padded_len, hidden_size)`` -- TODO confirm against the model).
        Empty input yields an empty dict.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got {}'.format(batch_size))

    # Materialize so sets/generators can be sliced and traversed twice.
    lemmas = list(lemmas)
    if not lemmas:
        # max() below would raise on an empty sequence; nothing to encode.
        return {}

    tkn_lemmas = [
        tokenizer.tokenize('[CLS] ' + lemma.replace('_', ' ') + ' [SEP]')
        for lemma in lemmas
    ]

    # Named pad_len (not max_seq_len) to avoid shadowing the notebook-level
    # constant of that name.
    pad_len = max(len(tokens) for tokens in tkn_lemmas)
    if verbose:
        print(pad_len)

    vectorizer = dict()
    # Inference only: no_grad avoids building an autograd graph per batch,
    # which otherwise costs memory and time for nothing.
    with torch.no_grad():
        for start in range(0, len(lemmas), batch_size):
            if verbose:
                print(start)
            batch_lemmas = lemmas[start:start + batch_size]
            batch_tokens = tkn_lemmas[start:start + batch_size]

            tidxs = []
            masks = []
            for tokens in batch_tokens:
                n_pad = pad_len - len(tokens)
                # Padding positions get token id 0 and are hidden from
                # attention through a 0 in the mask.
                tidxs.append(tokenizer.convert_tokens_to_ids(tokens) + [0] * n_pad)
                masks.append([1] * len(tokens) + [0] * n_pad)
            tidxs = torch.tensor(tidxs).to(device)
            masks = torch.tensor(masks).to(device)

            # Keyword arguments guard against positional-order differences
            # between library versions; only the first output is needed.
            features = model(tidxs, token_type_ids=None, attention_mask=masks)[0]
            features = features.detach().cpu().numpy()

            for lemma, feature in zip(batch_lemmas, features):
                vectorizer[lemma] = feature

    return vectorizer

In [6]:
# Disabled word2vec variant, kept for reference. It relies on a pickled
# word2vec object and a `vectorize_lemma` helper that are not defined in the
# cells shown here -- TODO confirm they still exist before re-enabling.
# word2vec = pd.read_pickle('../vectorizer/word2vec.pkl')

# vectorizer_w2v = {l: vectorize_lemma(word2vec, l, max_seq_len) for l in lemma_set}

# pd.to_pickle(vectorizer_w2v, 'vectorizer_w2v.pkl')

# Build the lemma -> BERT feature lookup for every filtered lemma. The set is
# passed as a list so it can be sliced into batches.
vectorizer_bert = create_vectorizer(list(lemma_set))

8
0
128
256
384
512
640
768
896
1024
1152
1280
1408
1536
1664
1792
1920
2048
2176
2304
2432
2560
2688
2816
2944
3072
3200
3328
3456
3584
3712
3840
3968
4096
4224
4352
4480
4608
4736
4864
4992
5120
5248
5376
5504
5632
5760
5888
6016
6144
6272
6400
6528
6656
6784
6912
7040
7168
7296
7424
7552
7680
7808
7936
8064
8192
8320
8448
8576
8704
8832
8960
9088
9216
9344
9472
9600
9728
9856
9984
10112
10240
10368
10496
10624
10752
10880
11008
11136
11264
11392
11520
11648
11776
11904
12032
12160
12288
12416
12544
12672
12800
12928
13056
13184
13312
13440
13568
13696
13824
13952
14080
14208
14336
14464
14592
14720
14848
14976
15104
15232
15360
15488
15616
15744
15872
16000
16128
16256
16384
16512
16640
16768
16896
17024
17152
17280
17408
17536
17664
17792
17920
18048
18176
18304
18432
18560
18688
18816
18944
19072
19200
19328
19456
19584
19712
19840
19968
20096
20224
20352
20480
20608
20736
20864
20992
21120
21248
21376
21504
21632
21760
21888
22016
22144
22272
22400
22528
22656
22784
22912
23040
2