In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.manifold import TSNE

### 60. 単語ベクトルの読み込みと表示

In [None]:
import gensim

# Read the word2vec-format vectors from the gzip-compressed binary file into
# `model`, then display the vector stored under the key 'United_States'.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/chap7/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)
model['United_States']

### 61. 単語の類似度

In [None]:
# Similarity score the model assigns to the keys 'United_States' and 'U.S.'.
model.similarity(w1='United_States', w2='U.S.')

### 62. 類似度の高い単語10件

In [None]:
# The 10 entries the model ranks as most similar to 'United_States'.
model.most_similar(positive=['United_States'], topn=10)

### 63. 加法構成性によるアナロジー

In [None]:
# Analogy query: 'Spain' and 'Athens' contribute positively, 'Madrid'
# negatively; show the 10 best-ranked candidates.
model.most_similar(
    positive=['Spain', 'Athens'],
    negative=['Madrid'],
    topn=10,
)

### 64. アナロジーデータでの実験

In [None]:
# Annotate the analogy file with the model's top prediction.
#
# Each 4-word line "a b c d" is extended to
# "a b c d <predicted word> <similarity score>", where the prediction is the
# top-1 result of most_similar(positive=[b, c], negative=[a]).
# Lines with any other word count (presumably the section header lines --
# verify against the data file) are copied through unchanged.
#
# The previous version also parsed the whole file into a DataFrame that was
# never used; that dead load has been dropped.
with open('data/chap7/questions-words.txt') as f:
    questions = f.readlines()

with open('data/chap7/questions-words-new.txt', 'w') as f:
    for question in questions:
        words = question.split()
        if len(words) == 4:
            # most_similar returns (word, score) pairs; keep only the best one.
            ans = model.most_similar(positive=[words[1], words[2]], negative=[words[0]], topn=1)[0]
            words += [ans[0], str(ans[1])]
            output = ' '.join(words) + '\n'
        else:
            output = question
        f.write(output)

### 65. アナロジータスクでの正解率

In [None]:
# Accuracy on the analogy task: the fraction of annotated lines whose
# predicted word equals the expected word.
with open('data/chap7/questions-words-new.txt') as f:
    questions = f.readlines()

correct = 0
count = 0
for q in questions:
    words = q.split()
    # Only lines with 6 fields carry a prediction:
    # [a, b, c, expected word, predicted word, score].
    if len(words) == 6:
        count += 1
        # Compare whole words. Indexing the raw line (`q[3]`, `q[4]`) would
        # compare two single characters instead of the two words.
        if words[3] == words[4]:
            correct += 1

if count:
    print(correct / count)
else:
    # Avoid a ZeroDivisionError when the file holds no annotated lines.
    print('No 6-field analogy lines found; accuracy is undefined.')

### 66. WordSimilarity-353での評価

In [None]:
# Score every (Word 1, Word 2) pair with the model, store the scores in a new
# 'w2v' column, and display the Spearman correlation matrix between the
# 'Human (mean)' column and the model scores.
df = pd.read_csv('data/chap7/combined.csv')
sim = [
    model.similarity(first_word, second_word)
    for first_word, second_word in zip(df['Word 1'], df['Word 2'])
]
df['w2v'] = sim
df[['Human (mean)', 'w2v']].corr(method='spearman')

### 67. k-meansクラスタリング

In [None]:
# Cluster country word vectors into 5 groups with k-means.
#
# Builds two parallel lists -- `vec` (vectors) and `countries` (the keys they
# were looked up under) -- which the later clustering/plotting cells reuse.
with open('data/chap7/country.txt') as f:
    lines = f.readlines()

# Manual overrides: name as listed in the file -> key looked up in the model
# (presumably chosen to match the model's vocabulary -- verify).
dic = {'United States of America':'United_States',
       'Russian Federation':'Russia',
       'Micronesia (Federated States of)': 'Palau_Micronesia',
       'Democratic Republic of the Congo': 'DR_Congo',
       "Democratic People's Republic of Korea": 'Korea'}
vec = []
countries = []
for line in lines:
    # The country name is the last field when splitting on two full-width spaces.
    country = line.split('　　')[-1].strip()
    country = dic.get(country, country)
    country = country.replace(' ', '_').replace('-', '_')

    # Skip names the model has no vector for. Catch only KeyError: a bare
    # `except` previously hid a NameError caused by a misspelled list name,
    # leaving `countries` undefined while `vec` kept growing.
    try:
        vector = model[country]
    except KeyError:
        continue
    # Append to both lists together so they stay index-aligned.
    vec.append(vector)
    countries.append(country)

kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(vec)
for c, l in zip(countries, kmeans.labels_):
    print(c, l)

### 68. Ward法によるクラスタリング

In [None]:
# Hierarchical clustering of the country vectors using Ward linkage, drawn as
# a dendrogram whose leaves are labelled with the country keys.
ward_linkage = linkage(vec, method='ward')

plt.figure(figsize=(32.0, 24.0))
dendrogram(
    ward_linkage,
    labels=countries,
    leaf_rotation=90,
    leaf_font_size=10,
)
plt.show()

### 69. t-SNEによる可視化

In [None]:
# Project the country vectors to 2-D with t-SNE and draw a labelled scatter plot.
# t-SNE is stochastic: fix the seed (same value as the k-means cell) so the
# figure is reproducible across kernel restarts.
vec_embedded = TSNE(n_components=2, random_state=0).fit_transform(vec)

fig, ax = plt.subplots(figsize=(16, 12))
# Column 0 / column 1 of the embedding are the x / y coordinates.
ax.scatter(vec_embedded[:, 0], vec_embedded[:, 1])
# `countries` is index-aligned with `vec`, hence with the embedded points.
for (x, y), c in zip(vec_embedded, countries):
    ax.annotate(c, (x, y))
plt.show()