In [1]:
# import modules
import pandas as pd
import pickle
import gensim
import numpy as np
import string
from opencc import OpenCC
import ckip
import jieba
# Path of files
# Pickle file opened by read_sentence_file().
SENTENCE_DICT = "../../pickle/sentence_dict.pickle"
# Directory prefix that load_wordvec_model() prepends to the model file name.
WORDVEC_MODEL = '../../wordvec_model/'
# Variables
# Presumably the number of dementia / control samples in the data set (TODO confirm).
# DEMENTIA_NUM is also the index at which the cluster labels are split later on.
DEMENTIA_NUM = 51
CONTROL_NUM = 51
# Word-vector dimensionality used when shaping the model input
# (the identifier is spelled "DIIM" consistently throughout the notebook).
WV_DIIM = 500

def read_sentence_file(file_name=None):
    """Load the pickled sentence data.

    Args:
        file_name: Path of the pickle file to read. When ``None`` (the
            default) the module-level ``SENTENCE_DICT`` path is used.

    Returns:
        The unpickled object; later cells iterate it with ``.items()`` as a
        mapping of key -> sentence text.
    """
    # Honour the caller-supplied path; fall back to the configured default.
    path = SENTENCE_DICT if file_name is None else file_name
    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    with open(path, 'rb') as f:
        sentence_dict = pickle.load(f)
    print("Load sentence text data ...")
    return sentence_dict

def load_wordvec_model(file_name):
    """Load a saved gensim Word2Vec model and print a short summary.

    Args:
        file_name: Name of the model file; it is appended to the
            module-level ``WORDVEC_MODEL`` directory prefix.

    Returns:
        The loaded ``gensim.models.Word2Vec`` model.
    """
    w2v_model = gensim.models.Word2Vec.load(WORDVEC_MODEL+file_name)
    print('Load word2vec model sucess ...')
    # len() on the vocabulary avoids copying every token into a list.
    print('Number of token: {}'.format(len(w2v_model.wv.vocab)))
    # Read the dimensionality from the model itself rather than indexing the
    # first vocabulary entry: that lookup relies on the deprecated
    # model[word] access and fails on a model with an empty vocabulary.
    print('Dimensions of word vector: {}'.format(w2v_model.vector_size))
    return w2v_model

In [2]:
# Load the pickled sentence data from the default SENTENCE_DICT path.
sentence_dict = read_sentence_file()

Load sentence text data ...


In [3]:
# Load the saved Word2Vec model file named below from the WORDVEC_MODEL directory.
w2v_model = load_wordvec_model('500features_20context_20mincount')

Load word2vec model sucess ...
Number of token: 259638
Dimensions of word vector: 500




In [4]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Dense, LSTM, Embedding, Input


In [5]:
# Build a Keras Embedding layer initialised with the trained word vectors.
# `wv.vectors` is the current name of the embedding matrix; the `wv.syn0`
# alias is deprecated and triggers a warning on access.
# NOTE: `layer` is not referenced by any later cell visible in this notebook.
weights = np.asarray(w2v_model.wv.vectors)
layer = Embedding(input_dim=weights.shape[0],
                  output_dim=weights.shape[1],
                  weights=[weights])

  """Entry point for launching an IPython kernel.


In [5]:
# Point jieba at the dictionary file, then split every sentence into a list
# of tokens, keeping the keys of sentence_dict.
JIEBA_DICT = '../../data/dict.txt.big'
jieba.set_dictionary(JIEBA_DICT)
seg_sentence = {key: jieba.lcut(s) for key, s in sentence_dict.items()}

Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...
Dumping model to file cache /tmp/jieba.u74f96b08eeb68fe4b0ac4c13a6f276ed.cache
Loading model cost 1.373 seconds.
Prefix dict has been built succesfully.


In [6]:
# Number of word vectors kept per sentence: shorter sentences are padded
# with zero vectors, longer ones are truncated to this length.
SEQUENCE_LENGTH = 120
# Length of the zero vectors used for padding.
# NOTE(review): the model code shapes its input with WV_DIIM, so the two
# values presumably have to stay equal -- confirm.
VOCAB_DIM = 500

In [7]:
# Turn each segmented sentence into exactly SEQUENCE_LENGTH word vectors:
# tokens missing from the Word2Vec vocabulary are dropped, short sentences
# are right-padded with zero vectors and long ones are truncated.
seg_sentence_vec = []
for tokens in seg_sentence.values():
    vectors = [np.asarray(w2v_model.wv[tok])
               for tok in tokens
               if tok in w2v_model.wv.vocab]
    # A non-positive count yields an empty range, so no padding is added.
    missing = SEQUENCE_LENGTH - len(vectors)
    vectors.extend(np.zeros(shape=VOCAB_DIM) for _ in range(missing))
    # Each sample is stored as a one-element list holding the vector sequence.
    seg_sentence_vec.append([vectors[:SEQUENCE_LENGTH]])

In [8]:
# Inspect the vector sequence built for the first sentence.
seg_sentence_vec[0]

[[array([-7.92692825e-02,  1.57746319e-02,  5.27736768e-02, -3.59880365e-02,
          1.34952022e-02, -4.33791848e-03, -3.78158465e-02,  5.95633360e-03,
          5.08587062e-02, -2.69752797e-02,  2.68270951e-02, -2.46875081e-02,
          1.10765304e-02, -1.45359645e-02,  1.95320398e-02,  5.44004366e-02,
         -8.38275254e-03,  1.92029154e-04, -1.07350368e-02,  2.39980202e-02,
         -1.37013374e-02,  4.03336622e-02,  8.05993751e-02, -5.47601804e-02,
          3.82103436e-02,  2.19692774e-02, -1.46730868e-02, -1.16942059e-02,
         -1.94687955e-02, -4.07967670e-03, -2.79163700e-02, -2.38141958e-02,
          2.63037961e-02,  8.86173826e-03, -5.29346876e-02,  2.10139379e-02,
          4.29366343e-03,  4.99295443e-02,  4.49647978e-02,  9.89161525e-03,
         -3.76676843e-02,  1.09264985e-01,  8.24200660e-02,  6.43022880e-02,
          4.46678624e-02, -5.72757795e-02, -6.61939159e-02, -2.39992179e-02,
         -1.43400570e-02,  2.57018246e-02, -1.43648386e-02,  3.11755333e-02,

In [9]:
import tensorflow as tf

In [10]:
# def length(sequence):
#   used = tf.sign(tf.reduce_max(tf.abs(sequence), 2))
#   length = tf.reduce_sum(used, 1)
#   length = tf.cast(length, tf.int32)
#   return length

In [11]:
# # encoder
# output, state = tf.nn.dynamic_rnn(
#     tf.contrib.rnn.GRUCell(200),
#     tf_x,
#     dtype=tf.float32,
#     sequence_length=length(tf_x)
# )

In [14]:
# Mini-batch size passed to autoencoder.fit() in get_sentence_vec().
BATCH_SIZE = 32
# Number of training epochs passed to autoencoder.fit() in get_sentence_vec().
EPOCH = 10
# When equal to 1, get_sentence_vec() writes the encoder output to a CSV file.
SAVE = 1
from tensorflow.python.keras import optimizers
def get_sentence_vec(text_vec, layer1):
    """Train an LSTM autoencoder on the sentence matrices and export encodings.

    The network is trained to reconstruct its own input. The first LSTM layer
    serves as the encoder; its output at the last time step is written out as
    the sentence vector.

    Args:
        text_vec: Nested sequence of word vectors that can be reshaped to
            ``(n_samples, SEQUENCE_LENGTH, WV_DIIM)``.
        layer1: Number of units of the encoder LSTM and of the first
            decoder LSTM.

    Returns:
        None. When the module-level ``SAVE`` flag equals 1, the sentence
        vectors are saved to ``encoder_dim500_<layer1>.csv``.
    """
    # Let numpy infer the sample count so the function is not tied to a
    # fixed DEMENTIA_NUM + CONTROL_NUM number of rows.
    data = np.asarray(text_vec).reshape(-1, SEQUENCE_LENGTH, WV_DIIM)
    print('Data shape: ', len(data))
    print(data.shape)

    # The Input tensor already fixes the shape, so the LSTM layers need no
    # separate input_shape argument.
    seq_input = Input(shape=(SEQUENCE_LENGTH, WV_DIIM))
    encoded = LSTM(layer1, return_sequences=True)(seq_input)
    decoded = LSTM(layer1, return_sequences=True)(encoded)
    decoded2 = LSTM(WV_DIIM, return_sequences=True)(decoded)

    autoencoder = Model(seq_input, decoded2)
    # Shares the trained encoder layer with the autoencoder.
    encoder = Model(seq_input, encoded)

    rmsprop = optimizers.RMSprop(lr=1e-5)
    autoencoder.compile(loss='cosine', optimizer=rmsprop)
    autoencoder.fit(data, data, batch_size=BATCH_SIZE, epochs=EPOCH)

    encoder_op = encoder.predict(data)
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that appends a stray "None" line to the output).
    autoencoder.summary()
    if SAVE == 1:
        # Keep only the encoder output at the final time step of each sample.
        np.savetxt('encoder_dim500_'+str(layer1)+'.csv', encoder_op[:,-1], delimiter=',')

In [15]:
# Train the autoencoder with 20 units in the encoder LSTM layer.
get_sentence_vec(seg_sentence_vec, 20)

Data shape:  102
(102, 120, 500)
Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 120, 500)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 20)          41680     
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 20)          3280      
_________________________________________________________________
lstm_6 (LSTM)                (None, None, 500)         1042000   
Total params: 1,086,960
Trainable params: 1,086,960
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
def print_sentence_distance(layer1):
    """Report Euclidean distances between consecutive saved sentence vectors.

    Reads ``encoder_dim500_<layer1>.csv`` and prints the array shape, the
    list of distances between each row and the one after it, the number of
    distances, and their mean, standard deviation, maximum and minimum.

    Args:
        layer1: Value used to build the CSV file name.

    Returns:
        The array loaded from the CSV file.
    """
    csv_name = 'encoder_dim500_'+str(layer1)+'.csv'
    sentence_vec = np.genfromtxt(csv_name, delimiter=',')
    print(sentence_vec.shape)

    # Pair every row with its successor and measure the gap between them.
    distance = [
        np.linalg.norm(current - following)
        for current, following in zip(sentence_vec[:-1], sentence_vec[1:])
    ]
    print(distance)
    print(len(distance))

    summary_stats = (
        ('Mean: ', np.mean),
        ('Std: ', np.std),
        ('Max: ', np.max),
        ('Min: ', np.min),
    )
    for label, stat in summary_stats:
        print(label, stat(distance))
    return sentence_vec

In [18]:
# Reload the CSV written for layer1=20 and print distance statistics between consecutive rows.
sentence_vec_lstm = print_sentence_distance(20)

(102, 20)
[0.0004196578998904263, 1.8948946152676097e-10, 5.042602148925244e-09, 4.841067297379638e-09, 2.7801395007616955e-10, 9.23215523685905e-11, 3.8221787179297974e-05, 3.734022007900015e-05, 1.491073903668597e-06, 2.921503111853068e-10, 7.332580706405269e-11, 6.355123107876756e-10, 5.93693174271185e-10, 2.3655256092037394e-10, 2.794384544492862e-10, 1.4169448764114675e-10, 1.6248303243287842e-08, 1.626332293167452e-08, 3.8911070317032684e-10, 3.114245855108703e-10, 3.812074107534059e-10, 4.256435204297304e-10, 1.5379584988387959e-09, 1.7228288089068883e-09, 4.689607273756498e-10, 1.411854533117837e-09, 1.5114083546534478e-09, 2.5900476613639328e-08, 2.604353106505707e-08, 1.5744506533070236e-10, 4.826872694137851e-10, 3.0939935024097383e-10, 3.1751219461794665e-09, 3.227877548095271e-09, 2.2885980793722103e-09, 2.3794940083810614e-09, 3.1856731626597516e-10, 1.459278509728871e-10, 2.5094411795704767e-08, 2.4981518366557376e-08, 3.798193178297237e-09, 3.6364910242861593e-09, 3.005

In [19]:
# Inspect the encoded vector stored in the second row.
sentence_vec_lstm[1]

array([ 4.45270387e-04, -3.21196130e-04,  2.73606274e-04, -4.51140630e-04,
        7.26244587e-04, -3.58796999e-04,  6.66977197e-04, -7.66090889e-05,
        6.96329516e-04,  5.22243208e-04, -1.99893893e-05, -7.37876864e-04,
       -2.89513904e-04, -6.82613172e-04,  5.55726234e-04, -4.92067193e-04,
        3.89824912e-04, -5.84699563e-04,  2.70588091e-04,  1.15227493e-04])

In [20]:
from sklearn.cluster import KMeans

In [21]:
# Cluster the sentence vectors into two groups. random_state pins the
# centroid initialisation so that re-running the notebook gives the same labels.
kmean = KMeans(n_clusters=2, random_state=0).fit(sentence_vec_lstm)

In [22]:
# Split the cluster labels at index DEMENTIA_NUM.
# NOTE(review): this assumes the first DEMENTIA_NUM rows are the dementia
# samples and the remainder are controls; the row order follows the
# iteration order of the sentence dictionary -- confirm.
dementia = kmean.labels_[:DEMENTIA_NUM]
control = kmean.labels_[DEMENTIA_NUM:]

In [23]:
# Cluster labels assigned to the first DEMENTIA_NUM rows.
print(dementia)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [24]:
# Cluster labels assigned to the rows after the first DEMENTIA_NUM.
print(control)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [25]:
import pickle
# Persist the sentence vectors to disk. The context manager guarantees the
# file is closed even if pickle.dump raises.
sentence2vec_array = sentence_vec_lstm
with open('s2v_lstm_array_zht_500dim.pickle', 'wb') as file:
    pickle.dump(sentence2vec_array, file)