In [1]:
LOOKUP_TABLE = "../data/wiki.zh.vec"
import gensim
from gensim.models import KeyedVectors

In [2]:
w2v_model = KeyedVectors.load_word2vec_format(LOOKUP_TABLE)
words = []
for word in w2v_model.vocab:
    words.append(word)
print("Number of tokens: {}".format(len(words)))
print("Dimensions of a word vector: {}".format(len(w2v_model[words[0]])))

Number of tokens: 332647
Dimensions of a word vector: 300


In [3]:
words[2]

'的'

In [4]:
w2v_model[words[0]].shape

(300,)

In [7]:
DATA_PICKLE = "../pickle/sentence_dict.pickle"

In [8]:
import pickle
sentence_dict = {}
with open(DATA_PICKLE, 'rb') as f:
    sentence_dict = pickle.load(f)

In [5]:
from ckip import CkipSegmenter
segmenter = CkipSegmenter()

In [6]:
import numpy as np
import string
exclude = set(string.punctuation+'，'+'。'+'、'+'「'+'」'+'？'+'！')
def sentence2vec(sentence):
    vector = np.zeros((w2v_model[words[0]].shape))
    oov_num = 0
    token_sentence = segmenter.seg(sentence)
    token_sentence = token_sentence.tok
    token_sentence = [t for t in token_sentence if not t in exclude]
    for token in token_sentence:
        if token in w2v_model.vocab:
            vector += w2v_model[token]
        else:
            oov_num += 1
    vector /= len(token_sentence)
    return vector, oov_num, token_sentence

In [9]:
test_sv, test_sv_oov, test_sentence = sentence2vec(sentence_dict['100'])

In [10]:
# Print sample result of sentence vector
print("Segmeantation of sentence: {}".format(test_sentence))
print("Sentence vector: {0}, \nlenght={1}".format(test_sv[:10], w2v_model[words[0]].shape))
print("Vocabulary not in Lookup table: {}".format(test_sv_oov))

Segmeantation of sentence: ['有', '天', '當', '媽媽', '正在', '洗碗', '時', '水龍頭', '竟', '突然', '地', '不', '受', '控制', '地', '嘩啦啦', '的', '流', '著', '水', '但', '媽媽', '卻', '像是', '沒', '看見', '似的', '依然故我', '的', '洗', '著', '碗盤', '轉頭', '一', '看', '弟弟', '踮', '著', '腳', '想', '從', '碗櫥', '拿', '些', '東西', '一', '個', '不', '小心', '便', '差點', '掉下來', '希望', '姐姐', '能', '接住', '他']
Sentence vector: [-0.05014453  1.00165772 -0.61897596  0.4185977   0.81403333 -0.43266385
 -0.62356421  0.03707956  0.29840268 -0.1195136 ], 
lenght=(300,)
Vocabulary not in Lookup table: 1


In [11]:
sv_dict = {}
oov_dict = {}
sentence_token_dict = {}   

In [12]:
for k, s in sentence_dict.items():
    sv, oov, sentence_token = sentence2vec(s)
    print(k)
    sv_dict[k] = sv
    oov_dict[k] = oov
    sentence_token_dict[k] = sentence_token

002.
003.
004.
005.
006. 
007.
008.
009.
010.
011.
012.
013.
015.
016.
018.
019.
020.
021.
023.
024.
025.
026.
027.
029.
030.
032.
034.
035.
037.
038.
039.
041.
044.
045.
046.
048.
049.
050.
052.
054.
055.
056.
057.
058.
059.
061.
061_.
063.
064.
065.
066.
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118


In [13]:
from sklearn.cluster import KMeans

In [14]:
sv_dict_array = np.asarray([i for i in sv_dict.values()])
sv_dict_array.shape

(102, 300)

In [15]:
model = KMeans(n_clusters=2).fit(sv_dict_array)

In [16]:
dementia = model.labels_[:52]
control = model.labels_[52:]

In [17]:
dementia

array([0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

In [18]:
control

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1], dtype=int32)

In [19]:
sv_dict_array[0][:10]

array([-0.09134774,  0.97664812, -0.6161853 ,  0.37809061,  0.80142591,
       -0.44241801, -0.67143148,  0.01950247,  0.26056783, -0.11032331])

In [20]:
sv_dict_array[:10][:10]

array([[-0.09134774,  0.97664812, -0.6161853 , ...,  0.30925858,
         1.02234691,  0.13697879],
       [-0.08000795,  1.013031  , -0.652757  , ...,  0.3421018 ,
         1.06676798,  0.1675923 ],
       [-0.09044532,  0.97061959, -0.61444208, ...,  0.29606421,
         1.06004   ,  0.14580231],
       ...,
       [-0.09525553,  0.95463   , -0.57256167, ...,  0.34828403,
         1.02017887,  0.13167095],
       [-0.08510944,  0.89666519, -0.59682907, ...,  0.25907822,
         0.98176092,  0.12500127],
       [-0.10120685,  0.98504443, -0.63363148, ...,  0.33146789,
         1.01251593,  0.06325852]])

In [21]:
from sklearn import metrics
dementia = np.asarray(dementia)
control = np.asarray(control)

fp_sklearn = np.where(dementia==0)
fn_sklearn = np.where(control==1)
print("FP: " + str(fp_sklearn))
print("FN: " + str(fn_sklearn))
print("sklearn kmeans score: {}".format(model.score(sv_dict_array)))
silhouette_score = metrics.silhouette_score(sv_dict_array, model.labels_, 
                                           metric='euclidean')
print("Silhouette score: {}".format(silhouette_score))

FP: (array([ 0,  1,  2,  6,  7,  9, 11, 12, 13, 16, 17, 19, 20, 21, 23, 26, 28,
       31, 32, 34, 35, 36, 37, 38, 39, 43, 44, 45, 46, 47, 48, 49, 50]),)
FN: (array([ 0,  1,  2,  3,  5,  6,  7,  8, 10, 12, 15, 17, 19, 20, 21, 22, 23,
       24, 26, 29, 30, 32, 33, 35, 37, 38, 39, 41, 42, 44, 45, 46, 47, 49]),)
sklearn kmeans score: -44.25144683115823
Silhouette score: 0.29422869677739705


In [22]:
from nltk.cluster import KMeansClusterer
import nltk

In [23]:
kclusterer = KMeansClusterer(2, distance=nltk.cluster.util.cosine_distance,
                            repeats=1000)
assigned_clusters = kclusterer.cluster(sv_dict_array, assign_clusters=True)
print(assigned_clusters)

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [25]:
dement = np.asarray(assigned_clusters[:52])
contr = np.asarray(assigned_clusters[52:])
# dement = np.asarray(dement)

fp = np.where(dement==1)
fn = np.where(contr==0)
print("FP: " + str(fp))
print("FN: " + str(fn))


FP: (array([ 7,  8,  9, 10, 15, 16, 18, 24, 27, 28, 30, 32, 33, 39, 42, 51]),)
FN: (array([ 3, 40]),)
