# 2. word2vec モデル学習

In [1]:
import sys
import pickle
from nltk import word_tokenize
from janome.tokenizer import Tokenizer
from gensim.models.word2vec import Word2Vec

# Base directory for the pickles and models used below. Paths are built by
# plain string concatenation (f_dir + 'pickles/...'), so keep the trailing slash.
f_dir = '/root/userspace/dl4us_final_report/'

## 2.1 データ読み込み

In [2]:
# Load the documents and their labels from the pickle files.
# NOTE: unpickling can execute arbitrary code; only load files written by
# this project itself.
# `with` closes each file handle as soon as loading finishes instead of
# leaving it open until garbage collection.
with open(f_dir + 'pickles/shf_docs.pickle', 'rb') as fin:
    shf_docs = pickle.load(fin)
with open(f_dir + 'pickles/shf_labels.pickle', 'rb') as fin:
    shf_labels = pickle.load(fin)

## 2.2 tokenize

In [3]:
tokenized_docs = []  # saved per document: document x line x word
sentences = []  # word2vec is fed line-level data: all lines x word

# The janome tokenizer does not depend on the document, so build it once
# instead of re-creating it for every Japanese document.
t = Tokenizer()

for i, ((cat, lang), doc) in enumerate(zip(shf_labels, shf_docs)):
    # English is tokenized with nltk.word_tokenize,
    # Japanese with janome.tokenizer.Tokenizer.
    if lang == 'en':
        tkzd_doc = [word_tokenize(s) for s in doc.split('\n')]
    elif lang == 'ja':
        # list(): newer janome releases return a generator from tokenize();
        # a generator cannot be pickled and can only be iterated once, which
        # would break both the save step and Word2Vec training.
        tkzd_doc = [list(t.tokenize(s, wakati=True)) for s in doc.split('\n')]
    else:
        # Unknown language label: report it and stop with a failure status.
        print('lang:', lang, file=sys.stderr)
        sys.exit(1)

    tokenized_docs.append(tkzd_doc)
    sentences.extend(tkzd_doc)

    # Progress report
    if i % 2000 == 0:
        print(i, 'docs done')

print(len(tokenized_docs), ': total docs')
print(len(sentences), ': total sentences')

0 docs done
2000 docs done
4000 docs done
6000 docs done
8000 docs done
10000 docs done
12000 docs done
14000 docs done
16000 docs done
18000 docs done
20000 docs done
22000 docs done
24000 docs done
26000 docs done
28000 docs done
28222 : total docs
975010 : total sentences


## 2.3 tokenize データ保存

In [4]:
# Persist the tokenized data: the per-document structure and the flat list
# of tokenized lines that is fed to word2vec.
# `with` guarantees each file is flushed and closed once the dump finishes,
# rather than relying on garbage collection of the anonymous handle.
with open(f_dir + 'pickles/shf_tkn_docs.pickle', 'wb') as fout:
    pickle.dump(tokenized_docs, fout)
with open(f_dir + 'pickles/sentences_for_w2v.pickle', 'wb') as fout:
    pickle.dump(sentences, fout)

## 2.4 word2vec 学習

In [5]:
# Train a skip-gram (sg=1) word2vec model on the tokenized lines, keeping
# every token (min_count=1), then save it under a name that records the
# vector size.
size = 400
model = Word2Vec(
    sentences,
    size=size,
    sg=1,
    min_count=1,
)
model_path = f_dir + 'models/w2v_' + str(size) + '.model'
model.save(model_path)

In [6]:
# Sanity-check the trained word2vec model by printing the nearest
# neighbours of a few Japanese and English probe words.
words = ('街道', '近い', 'castle', 'long')
for word in words:
    print(word, ':')
    # Query through model.wv: calling most_similar directly on the model is
    # deprecated, while the keyed-vectors object exposes the same method.
    out = model.wv.most_similar(positive=[word])
    for x in out:
        print(x)
    print()

街道 :
('京街道', 0.7725939154624939)
('中山道', 0.7632403373718262)
('東高野', 0.7380308508872986)
('宿場', 0.7368693351745605)
('難所', 0.731268584728241)
('畿七道', 0.7284597158432007)
('旧道', 0.7209132313728333)
('堤上', 0.7207267880439758)
('沿い', 0.7183176279067993)
('左岸', 0.7145272493362427)

近い :
('とどまる', 0.7481799125671387)
('近かっ', 0.7400604486465454)
('つなげる', 0.7357017993927002)
('横ばい', 0.7324842214584351)
('劣る', 0.7293784618377686)
('あわせる', 0.7240008115768433)
('青みがかり', 0.722811222076416)
('遠い', 0.7224317193031311)
('接する', 0.7222763299942017)
('ないしは', 0.7221167683601379)

castle :
('osaka-jo', 0.6921572685241699)
('castles', 0.674949586391449)
('castellan', 0.6721668243408203)
('nagoya-jo', 0.6708797216415405)
('edo-jo', 0.6590505838394165)
('fushimi-jo', 0.6575532555580139)
('himeji-jo', 0.6512061953544617)
('sawayama-jo', 0.6481110453605652)
('kumamoto-jo', 0.6436932682991028)
('kannonji-jo', 0.6383404731750488)

long :
('short', 0.6913972496986389)
('longest', 0.6104753613471985)
('shorter', 0