In [53]:
# import modules
import pandas as pd
import pickle
import gensim
import numpy as np
import string
from opencc import OpenCC
import ckip
import jieba
# Path of files
# Pickle file opened by read_sentence_file().
SENTENCE_DICT = "../../pickle/sentence_dict.pickle"
# Directory prefix that load_wordvec_model() prepends to the model file name.
WORDVEC_MODEL = '../../wordvec_model/'
# Variables
# Number of leading rows treated as the dementia group when the k-means
# labels are split later in the notebook.
DEMENTIA_NUM = 51
# NOTE(review): CONTROL_NUM is not referenced anywhere in the visible notebook.
CONTROL_NUM = 51
# Feature dimension used for the LSTM input shape.
# NOTE(review): "DIIM" looks like a typo of "DIM"; VOCAB_DIM (defined in a
# later cell) holds the same value — consider unifying the two names.
WV_DIIM = 500

def read_sentence_file(file_name=None):
    """Load the pickled sentence data.

    Args:
        file_name: Path of the pickle file to read. When ``None`` (the
            default) the module-level ``SENTENCE_DICT`` path is used.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # Honour the caller-supplied path; previously the argument was ignored.
    path = SENTENCE_DICT if file_name is None else file_name
    # NOTE: pickle.load can execute arbitrary code — only load trusted files.
    with open(path, 'rb') as f:
        sentence_dict = pickle.load(f)
    print("Load sentence text data ...")
    return sentence_dict

def load_wordvec_model(file_name):
    """Load a saved gensim Word2Vec model and print basic statistics.

    Args:
        file_name: Name of the model file, appended to the ``WORDVEC_MODEL``
            directory prefix.

    Returns:
        The loaded ``gensim.models.Word2Vec`` model.
    """
    w2v_model = gensim.models.Word2Vec.load(WORDVEC_MODEL+file_name)
    words = list(w2v_model.wv.vocab)
    print('Load word2vec model sucess ...')
    print('Number of token: {}'.format(len(words)))
    # Look vectors up through ``.wv`` (as the rest of the notebook does)
    # instead of the deprecated ``model[word]`` indexing, and skip the
    # dimension report when the vocabulary is empty.
    if words:
        print('Dimensions of word vector: {}'.format(len(w2v_model.wv[words[0]])))
    return w2v_model

In [54]:
# Load the pickled sentence data from the default SENTENCE_DICT path.
sentence_dict = read_sentence_file()

Load sentence text data ...


In [55]:
# Load the saved word2vec model located under the WORDVEC_MODEL directory.
w2v_model = load_wordvec_model('500features_20context_20mincount')

Load word2vec model sucess ...
Number of token: 259638
Dimensions of word vector: 500




In [56]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Dense, LSTM, Embedding, Input

In [57]:
# weights = np.asarray(w2v_model.wv.syn0)
# layer = Embedding(input_dim=weights.shape[0], 
#                  output_dim=weights.shape[1], 
#                  weights = [weights])

In [130]:
def split_punctuation(sentence):
    """Split a sentence into clauses at punctuation marks.

    Delimiters are every character of ``string.punctuation``, the ASCII
    space, and the full-width marks ``，、」「。！``.

    Args:
        sentence: Text to split.

    Returns:
        A list of non-empty clauses in their original order, with surrounding
        whitespace (e.g. a trailing newline from ``readlines``) stripped.
    """
    delimiters = set(string.punctuation+"，"+"、"+"」"+"「"+"。"+" "+"！")
    clauses = []
    current = []
    for char in sentence:
        if char in delimiters:
            clauses.append(''.join(current))
            current = []
        else:
            current.append(char)
    # Flush the text after the last delimiter; it used to be dropped whenever
    # the sentence did not end with punctuation.
    clauses.append(''.join(current))
    # Drop empty / whitespace-only pieces produced by consecutive delimiters
    # or by a bare trailing newline.
    stripped = (clause.strip() for clause in clauses)
    return [clause for clause in stripped if clause]
# Quick manual check of split_punctuation on one sample sentence.
sentence = '3個人，一個媽媽兩個小孩，小孩站在椅子上要拿西點，椅子都快倒下來了，在拿這個西點餅乾要吃，手下還拿著一塊，'
print(split_punctuation(sentence))

['3個人', '一個媽媽兩個小孩', '小孩站在椅子上要拿西點', '椅子都快倒下來了', '在拿這個西點餅乾要吃', '手下還拿著一塊']


In [131]:
def _sentences_from_transcript(lines):
    """Split every second line (indices 1, 3, 5, ...) into clause sentences.

    The files are consumed as line pairs of which only the second line is used.
    # presumably the first line of each pair is a subject header — TODO confirm
    Slicing ignores a trailing unpaired line instead of raising IndexError.
    """
    sentences = []
    for text_line in lines[1::2]:
        sentences.extend(split_punctuation(text_line))
    return sentences


with open('../../data/dementia.txt', encoding='utf8') as f:
    dementia_txt = f.readlines()
with open('../../data/control_51.txt', encoding='utf8') as f:
    control_txt = f.readlines()

dementia_sentences = _sentences_from_transcript(dementia_txt)
control_sentences = _sentences_from_transcript(control_txt)

# Dementia sentences come first, control sentences follow.
sentence = dementia_sentences + control_sentences
dementia_num = len(dementia_sentences)
control_num = len(control_sentences)
############
# train set#
############
train_data = np.array(sentence)
# Labels: 0.0 = dementia sentence, 1.0 = control sentence.
train_y = np.zeros((train_data.shape[0]))
train_y[dementia_num:] = 1.0
print('total number of train set: {}'.format(train_data.shape[0]))
print('sentence number of dementia subject: {}'.format(len(train_y[train_y==0])))
print('sentence number of control normal subject: {}'.format(len(train_y[train_y==1])))

total number of train set: 873
sentence number of dementia subject: 442
sentence number of control normal subject: 431


In [132]:
# Point jieba at the dictionary file below, then tokenize every training
# sentence; train_data_seg[k] is the token list of train_data[k].
JIEBA_DICT = '../../data/dict.txt.big'
jieba.set_dictionary(JIEBA_DICT)
train_data_seg = [jieba.lcut(text) for text in train_data]
print(train_data_seg[5])

Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...
Loading model from cache /tmp/jieba.u74f96b08eeb68fe4b0ac4c13a6f276ed.cache
Loading model cost 1.162 seconds.
Prefix dict has been built succesfully.


['手下', '還拿著', '一塊']


In [137]:
# Fixed number of tokens per sentence and the word-vector dimension used when
# the sentences are turned into padded vector sequences.
SEQUENCE_LENGTH = 20
VOCAB_DIM = 500
# The token lists have different lengths; dtype=object keeps the array
# construction valid on NumPy versions that reject ragged input.
train_data_seg_array = np.array(train_data_seg, dtype=object)
# Number of tokens in each segmented sentence.
token_counts = [len(tokens) for tokens in train_data_seg]
print('Max token number of sentence: {}'.format(np.max(token_counts)))
print('Min token number of sentence: {}'.format(np.min(token_counts)))
print('Mean token number of sentence: {}'.format(np.mean(token_counts)))

Max token number of sentence: 17
Min token number of sentence: 1
Mean token number of sentence: 5.747995418098511


In [139]:
# Turn each segmented sentence into exactly SEQUENCE_LENGTH word vectors:
# tokens missing from the word2vec vocabulary are skipped, short sentences are
# padded with zero vectors, long ones are truncated.
train_vec = []
for tokens in train_data_seg:
    vectors = [np.asarray(w2v_model.wv[tok])
               for tok in tokens
               if tok in w2v_model.wv.vocab]
    missing = SEQUENCE_LENGTH - len(vectors)
    if missing > 0:
        vectors.extend(np.zeros(shape=VOCAB_DIM) for _ in range(missing))
    # Each entry is wrapped in a one-element list; the next cell reshapes it.
    train_vec.append([vectors[:SEQUENCE_LENGTH]])
# seg_sentence_vec = []
# for key, s in seg_sentence.items():
#     token_list = []
#     for token in s:
#         if token in w2v_model.wv.vocab:
#             token_list.append(np.asarray(w2v_model.wv[token]))
#     if len(token_list) < SEQUENCE_LENGTH:
#         for i in range(SEQUENCE_LENGTH - len(token_list)):
#             token_list.append(np.zeros(shape=VOCAB_DIM))
# #             token_list.append(np.zeros(shape=(VOCAB_DIM, 1)).tolist())
#     seg_sentence_vec.append([token_list[0:SEQUENCE_LENGTH]])

In [145]:
# Stack the per-sentence vectors into a (num_sentences, SEQUENCE_LENGTH,
# VOCAB_DIM) array. Using the named constants instead of -1 / 500 makes a
# shape mismatch fail loudly rather than silently producing a wrong layout.
train_vec = np.asarray(train_vec).reshape(len(train_data_seg), SEQUENCE_LENGTH, VOCAB_DIM)
print(train_vec.shape)
print(train_vec[2])

(873, 20, 500)
[[-7.92692825e-02  1.57746319e-02  5.27736768e-02 ...  3.76927592e-02
   7.03343600e-02  3.61793354e-04]
 [ 1.03810102e-01 -3.23520899e-02  1.29130320e-03 ... -2.70457361e-02
  -3.82427163e-02  1.65345892e-02]
 [-1.31187662e-02 -1.90271542e-03 -6.02233633e-02 ...  2.55492123e-05
   2.02863179e-02  1.29380105e-02]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [146]:
import tensorflow as tf

In [147]:
# def length(sequence):
#   used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
#   length = tf.reduce_sum(used, 1)
#   length = tf.cast(length, tf.int32)
#   return length

In [148]:
# # encoder
# output, state = tf.nn.dynamic_rnn(
#     tf.contrib.rnn.GRUCell(200),
#     tf_x,
#     dtype=tf.float32,
#     sequence_length=length(tf_x)
# )

In [None]:
# Training settings read by the autoencoder function defined in this cell.
# Mini-batch size passed to autoencoder.fit().
BATCH_SIZE = 32
# Number of training epochs passed to autoencoder.fit().
EPOCH = 10
# When set to 1 the encoder output is written to a CSV file.
SAVE = 0
# Number of sentences, used to reshape the input tensor.
TOTAL_NUM = train_vec.shape[0]
# NOTE(review): mid-notebook import; consider moving it to the import cell.
from tensorflow.python.keras import optimizers
def cnn(text_vec, layer1=20):
    """Train an LSTM sequence autoencoder and return sentence encodings.

    NOTE(review): despite its name this builds stacked LSTM layers, the same
    architecture as ``get_sentence_vec`` — consider removing one of the two.

    Args:
        text_vec: Array-like reshapeable to
            ``(n_sentences, SEQUENCE_LENGTH, WV_DIIM)``.
        layer1: Number of units of the encoder LSTM and the first decoder
            LSTM. It was previously an undefined name inside this function
            (NameError); it is now a parameter with a default.

    Returns:
        Array of shape ``(n_sentences, layer1)``: the encoder output at the
        last time step, i.e. the same values written to CSV when ``SAVE == 1``.
    """
    # Infer the sentence count from the data instead of the TOTAL_NUM global,
    # so the function works for inputs of any size.
    data = np.asarray(text_vec).reshape(-1, SEQUENCE_LENGTH, WV_DIIM)
    print('Data shape: ', len(data))
    print(data.shape)
    seq_input = Input(shape=(SEQUENCE_LENGTH, WV_DIIM))
    encoded = LSTM(layer1, input_shape=(SEQUENCE_LENGTH, WV_DIIM),
                   return_sequences=True)(seq_input)
    decoded = LSTM(layer1, return_sequences=True)(encoded)
    decoded2 = LSTM(WV_DIIM, return_sequences=True)(decoded)
    autoencoder = Model(seq_input, decoded2)
    encoder = Model(seq_input, encoded)
    # NOTE(review): 'cosine' and the `lr` keyword are accepted by the Keras
    # version this notebook was run with; newer releases may need other names.
    rmsprop = optimizers.RMSprop(lr=1e-5)
    autoencoder.compile(loss='cosine', optimizer=rmsprop)
    # The autoencoder is trained to reconstruct its own input.
    autoencoder.fit(data, data, batch_size=BATCH_SIZE, epochs=EPOCH)
    encoder_op = encoder.predict(data)
    # summary() prints by itself and returns None; wrapping it in print()
    # emitted a stray "None".
    autoencoder.summary()
    sentence_vec = encoder_op[:, -1]
    if SAVE==1:
        np.savetxt('encoder_dim500_'+str(layer1)+'.csv', sentence_vec, delimiter=',')
    return sentence_vec

In [153]:
# Training settings read by get_sentence_vec().
# NOTE(review): these repeat the assignments of the previous cell.
# Mini-batch size passed to autoencoder.fit().
BATCH_SIZE = 32
# Number of training epochs passed to autoencoder.fit().
EPOCH = 10
# When set to 1 the encoder output is written to a CSV file.
SAVE = 0
# Number of sentences, used to reshape the input tensor.
TOTAL_NUM = train_vec.shape[0]
# NOTE(review): mid-notebook import; consider moving it to the import cell.
from tensorflow.python.keras import optimizers
def get_sentence_vec(text_vec, layer1):
    """Train an LSTM sequence autoencoder and return sentence encodings.

    Args:
        text_vec: Array-like reshapeable to
            ``(n_sentences, SEQUENCE_LENGTH, WV_DIIM)``.
        layer1: Number of units of the encoder LSTM and the first decoder LSTM.

    Returns:
        Array of shape ``(n_sentences, layer1)``: the encoder output at the
        last time step, i.e. the same values written to CSV when ``SAVE == 1``.
        (The function previously returned ``None`` and discarded the result
        unless ``SAVE`` was set.)
    """
    # Infer the sentence count from the data instead of the TOTAL_NUM global,
    # so the function works for inputs of any size.
    data = np.asarray(text_vec).reshape(-1, SEQUENCE_LENGTH, WV_DIIM)
    print('Data shape: ', len(data))
    print(data.shape)
    seq_input = Input(shape=(SEQUENCE_LENGTH, WV_DIIM))
    encoded = LSTM(layer1, input_shape=(SEQUENCE_LENGTH, WV_DIIM),
                   return_sequences=True)(seq_input)
    decoded = LSTM(layer1, return_sequences=True)(encoded)
    decoded2 = LSTM(WV_DIIM, return_sequences=True)(decoded)
    autoencoder = Model(seq_input, decoded2)
    encoder = Model(seq_input, encoded)
    # NOTE(review): 'cosine' and the `lr` keyword are accepted by the Keras
    # version this notebook was run with; newer releases may need other names.
    rmsprop = optimizers.RMSprop(lr=1e-5)
    autoencoder.compile(loss='cosine', optimizer=rmsprop)
    # The autoencoder is trained to reconstruct its own input.
    autoencoder.fit(data, data, batch_size=BATCH_SIZE, epochs=EPOCH)
    encoder_op = encoder.predict(data)
    # summary() prints by itself and returns None; wrapping it in print()
    # emitted a stray "None".
    autoencoder.summary()
    sentence_vec = encoder_op[:, -1]
    if SAVE==1:
        np.savetxt('encoder_dim500_'+str(layer1)+'.csv', sentence_vec, delimiter=',')
    return sentence_vec

In [154]:
# Train the autoencoder on the padded sentence tensor with layer1 = 20.
get_sentence_vec(train_vec, 20)

Data shape:  873
(873, 20, 500)
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
dim is deprecated, use axis instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 20, 500)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 20)          41680     
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 20)          3280      
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 500)         1042000   
Total params: 1,086,960
Trainable params: 1,

In [16]:
def print_sentence_distance(layer1):
    """Load saved sentence vectors and report distances between neighbours.

    Reads ``encoder_dim500_<layer1>.csv`` from the working directory, prints
    the Euclidean distance between every pair of consecutive rows and, when
    at least one distance exists, its mean / std / max / min.

    Args:
        layer1: Encoder size used in the CSV file name.

    Returns:
        2-D array of shape ``(n_sentences, n_features)`` with the loaded
        vectors.
    """
    # ndmin=2 keeps the (rows, columns) layout even for a single-row or
    # single-column file, which would otherwise collapse to 1-D and make the
    # loop below compare scalars instead of sentence vectors.
    sentence_vec = np.loadtxt('encoder_dim500_'+str(layer1)+'.csv',
                              delimiter=',', ndmin=2)
    print(sentence_vec.shape)
    distance = [np.linalg.norm(current - following)
                for current, following in zip(sentence_vec[:-1], sentence_vec[1:])]
    print(distance)
    print(len(distance))
    if distance:
        print('Mean: ', np.mean(distance))
        print('Std: ', np.std(distance))
        print('Max: ', np.max(distance))
        print('Min: ', np.min(distance))
    else:
        # np.max / np.min raise on an empty sequence.
        print('Fewer than two sentence vectors; no distances to summarize.')
    return sentence_vec

In [18]:
# Load the vectors saved for layer1 = 20 and print the consecutive-row
# distance statistics.
sentence_vec_lstm = print_sentence_distance(20)

(102, 20)
[0.0004196578998904263, 1.8948946152676097e-10, 5.042602148925244e-09, 4.841067297379638e-09, 2.7801395007616955e-10, 9.23215523685905e-11, 3.8221787179297974e-05, 3.734022007900015e-05, 1.491073903668597e-06, 2.921503111853068e-10, 7.332580706405269e-11, 6.355123107876756e-10, 5.93693174271185e-10, 2.3655256092037394e-10, 2.794384544492862e-10, 1.4169448764114675e-10, 1.6248303243287842e-08, 1.626332293167452e-08, 3.8911070317032684e-10, 3.114245855108703e-10, 3.812074107534059e-10, 4.256435204297304e-10, 1.5379584988387959e-09, 1.7228288089068883e-09, 4.689607273756498e-10, 1.411854533117837e-09, 1.5114083546534478e-09, 2.5900476613639328e-08, 2.604353106505707e-08, 1.5744506533070236e-10, 4.826872694137851e-10, 3.0939935024097383e-10, 3.1751219461794665e-09, 3.227877548095271e-09, 2.2885980793722103e-09, 2.3794940083810614e-09, 3.1856731626597516e-10, 1.459278509728871e-10, 2.5094411795704767e-08, 2.4981518366557376e-08, 3.798193178297237e-09, 3.6364910242861593e-09, 3.005

In [19]:
# Inspect the second loaded vector.
sentence_vec_lstm[1]

array([ 4.45270387e-04, -3.21196130e-04,  2.73606274e-04, -4.51140630e-04,
        7.26244587e-04, -3.58796999e-04,  6.66977197e-04, -7.66090889e-05,
        6.96329516e-04,  5.22243208e-04, -1.99893893e-05, -7.37876864e-04,
       -2.89513904e-04, -6.82613172e-04,  5.55726234e-04, -4.92067193e-04,
        3.89824912e-04, -5.84699563e-04,  2.70588091e-04,  1.15227493e-04])

In [20]:
from sklearn.cluster import KMeans

In [21]:
# Cluster the vectors into two groups. A fixed random_state makes the
# centroid initialisation, and therefore the labels, reproducible across runs.
kmean = KMeans(n_clusters=2, random_state=0).fit(sentence_vec_lstm)

In [22]:
# Split the cluster labels: the first DEMENTIA_NUM entries versus the rest.
# NOTE(review): assumes the rows of sentence_vec_lstm are ordered with the
# dementia group first — confirm against how the CSV was produced.
dementia = kmean.labels_[:DEMENTIA_NUM]
control = kmean.labels_[DEMENTIA_NUM:]

In [23]:
# Cluster labels of the first DEMENTIA_NUM rows.
print(dementia)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [24]:
# Cluster labels of the remaining rows.
print(control)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [25]:
import pickle
# Persist the loaded sentence vectors for later use.
sentence2vec_array = sentence_vec_lstm
# The context manager closes the file even if pickling raises.
with open('s2v_lstm_array_zht_500dim.pickle', 'wb') as file:
    pickle.dump(sentence2vec_array, file)