In [1]:
import data_preprocess
import tokenize_data_helper
import nengo
from nengo.exceptions import SpaParseError, ValidationError
import sys
import numpy as np
from nengo.spa import pointer
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


  from ._conv import register_converters as _register_converters


In [5]:
# Vector dimensionality used for the SPA vocabulary and the sentence matrix.
DIM = 100
# Name of the pretrained word2vec model handed to the loader.
W2V_MODEL = "100features_20context_20mincount_zht"

# Sentence files for the two subject groups, split into train and test.
CONTROL_TRAIN, DEMENTIA_TRAIN = "control.txt", "dementia.txt"
CONTROL_TEST, DEMENTIA_TEST = "control_test.txt", "dementia_test.txt"

In [6]:
# Load the pretrained word2vec model. Of the three returned values, only
# `w2v_dict` (indexed by token to obtain a vector) is used by the cells
# visible in this notebook.
w2v_model, _, w2v_dict = data_preprocess.load_wordvec_model(W2V_MODEL)

# Seed the vocabulary's random generator so that every randomly generated
# pointer (out-of-vocabulary words, POS flags) is the same on each
# Restart & Run All; without it the features change from run to run.
VOCAB_SEED = 0
vocab = nengo.spa.Vocabulary(
    DIM,
    max_similarity=0.3,  # alternative setting noted by the author: 0.1
    rng=np.random.RandomState(VOCAB_SEED),
)

Load word2vec model sucess ...
Number of token: 259425
Dimensions of word vector: 100


In [None]:
# Passed to the tokenizer helper; presumably the vocabulary-size cap — TODO confirm.
num_words = 1000
# Read the raw training sentences (dementia file first, control file second)
# together with `y_train` (presumably the class labels — verify in data_preprocess).
x_train, y_train = data_preprocess.read_sentence(DEMENTIA_TRAIN, CONTROL_TRAIN)
# Word-segment the raw training sentences.
x_train_seg = data_preprocess.segmentation(x_train)
# Same two steps for the held-out test files.
x_test, y_test = data_preprocess.read_sentence(DEMENTIA_TEST, CONTROL_TEST)
x_test_seg = data_preprocess.segmentation(x_test)
# The helper is constructed from the training sentences only, then used to
# tokenize and pad both splits.
data_helper = tokenize_data_helper.tokenize_data_helper(x_train_seg, num_words)
x_train_tokens, x_test_tokens = data_helper.tokenize_data(x_train_seg, x_test_seg)
x_train_pad, x_test_pad = data_helper.pad_tokenize(x_train_tokens, x_test_tokens)


Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...


total number of train set: 784
sentence number of dementia subject: 394
sentence number of control normal subject: 390


In [5]:
# Token ids produced by the tokenizer helper for the first training sentence.
x_train_tokens[0]

[150, 115]

In [6]:
# Decode the first sentence's token ids back to text as a sanity check of the
# tokenizer round trip.
data_helper.tokens_to_string(x_train_tokens[0])

'三個 人'

In [7]:
# Register one semantic pointer per tokenizer word under the key 'V' + word.
# Words found in the word2vec dictionary reuse their embedding; the others are
# recorded in `oov` and receive a pointer generated by the vocabulary itself.
oov = []
for token in data_helper.tokenizer.word_index:
    key = 'V' + token
    try:
        vocab.add(key, w2v_dict[token])
    except KeyError:
        oov.append(token)
        vocab.add(key, vocab.create_pointer(attempts=100))

# All-zero pointer used as the starting value when sentence vectors are summed.
vocab.add('Start', np.zeros(DIM))

In [8]:
# Number of tokenizer words that were missing from the word2vec dictionary.
print(len(oov))

100


In [12]:
# Every tokenizer word must have exactly one vocabulary key; the vocabulary is
# expected to hold one additional key on top of those.
word_index = data_helper.tokenizer.word_index
vocab_size = len(vocab.keys)
assert vocab_size - 1 == len(word_index)
print(vocab_size)
print(len(word_index))

901
900


In [13]:
# Show the keys from fifth-from-last up to, but not including, the last one
# (the slice end of -1 leaves the final key out).
print(vocab.keys[-5:-1])

['V椅去', 'V分給', 'V一片', 'V心事重重']


In [14]:
# Chinese PCFG model passed as `model_path` to the Stanford parsers
# (presumably resolved inside the Stanford models jar — TODO confirm).
model_path = 'edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz'

In [15]:
# Pre-segmented example sentence; words are separated by single spaces.
s = u"媽媽 在 洗 盤子"

# Dependency parsing
from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(model_path=model_path)
result = list(parser.parse(s.split()))
# Each triple printed is ((head, tag), relation, (dependent, tag)).
for row in result[0].triples():
    print(row)

# Constituency (phrase-structure) parsing; note that `parser` is rebound here.
from nltk.parse.stanford import StanfordParser
parser = StanfordParser(model_path=model_path)
result_2 = list(parser.parse(s.split()))
for r in result_2:
    print(r)
#     print(r.draw())


(('洗', 'VV'), 'nsubj', ('媽媽', 'NR'))
(('洗', 'VV'), 'advmod', ('在', 'AD'))
(('洗', 'VV'), 'dobj', ('盤子', 'NN'))
(ROOT (IP (NP (NR 媽媽)) (VP (ADVP (AD 在)) (VP (VV 洗) (NP (NN 盤子))))))


In [16]:
# Print every subtree of the first constituency parse together with its
# position in the traversal order.
for position, subtree in enumerate(result_2[0].subtrees()):
    print(subtree, position)

(ROOT (IP (NP (NR 媽媽)) (VP (ADVP (AD 在)) (VP (VV 洗) (NP (NN 盤子)))))) 0
(IP (NP (NR 媽媽)) (VP (ADVP (AD 在)) (VP (VV 洗) (NP (NN 盤子))))) 1
(NP (NR 媽媽)) 2
(NR 媽媽) 3
(VP (ADVP (AD 在)) (VP (VV 洗) (NP (NN 盤子)))) 4
(ADVP (AD 在)) 5
(AD 在) 6
(VP (VV 洗) (NP (NN 盤子))) 7
(VV 洗) 8
(NP (NN 盤子)) 9
(NN 盤子) 10


In [17]:
# First constituency parse of the example sentence.
tree = result_2[0]
# (word, POS tag) pairs for the leaves, in sentence order.
print(tree.pos())

[('媽媽', 'NR'), ('在', 'AD'), ('洗', 'VV'), ('盤子', 'NN')]


In [18]:
def _is_preterminal(node):
    """Return True for subtrees of height 2 (a POS tag directly above a word)."""
    return node.height() == 2


# Print the production of every tag-over-word subtree in the parse.
for preterminal in tree.subtrees(_is_preterminal):
    print(preterminal.productions())

[NR -> '媽媽']
[AD -> '在']
[VV -> '洗']
[NN -> '盤子']


In [20]:
# parser = StanfordParser(model_path=model_path)
# def parser_tree(sentence):
#     result = list(parser.parse(sentence.split()))
#     return result

In [21]:
# x_train_tree = []
# for s in x_train_seg:
#     tree_tmp = list(parser.parse(s.split()))
#     x_train_tree.append(tree_tmp)

In [22]:
import jieba.posseg as pseg

In [23]:
# POS-tag every raw training sentence with jieba; each entry is the list of
# (word, flag) pairs returned for one sentence.
x_train_postag = [pseg.lcut(sentence) for sentence in x_train]

In [24]:
# Collect every distinct POS flag in first-seen order; the value 1 is only a
# placeholder, the keys are what matters.
flag_dict = {
    flag: 1
    for tagged_sentence in x_train_postag
    for _word, flag in tagged_sentence
}

In [25]:
# Number of distinct POS flags found in the training sentences.
print(len(flag_dict))

39


In [26]:
# Look up the vocabulary pointer of each word in the first tagged sentence,
# echoing the word as it is processed.
sp = []
for token, _pos_flag in x_train_postag[0]:
    print(token)
    sp.append(vocab['V' + token])

三個
人


In [27]:
# Tagged (word, flag) pairs of the first training sentence.
x_train_postag[0]

[pair('三個', 'm'), pair('人', 'n')]

In [28]:
# Parse each upper-cased POS flag so that the vocabulary gains one pointer per
# flag (the key count shown in the next cell grows by len(flag_dict)).
for pos_flag in flag_dict:
    vocab.parse(pos_flag.upper())

In [29]:
# Vocabulary size after the POS-flag pointers were parsed in.
len(vocab.keys)

940

In [30]:
# Encode every tagged training sentence as one semantic pointer:
#     Start + sum over words of (word pointer * POS-flag pointer)
sentence_bind_pos = []
for tagged_sentence in x_train_postag:
    # Begin from a private copy of the start vector. nengo's SemanticPointer
    # implements `+=` in place, so accumulating directly into vocab['Start']
    # would overwrite the vocabulary entry and leave every list element
    # aliasing one shared, ever-growing pointer.
    sentence_sp = pointer.SemanticPointer(vocab['Start'].v.copy())
    for word, flag in tagged_sentence:
        # NOTE(review): vocab[...] may silently create a fresh random pointer
        # for a word that was never added from the tokenizer's word index —
        # confirm jieba's segmentation here matches the tokenizer's.
        bound_token = vocab['V' + str(word)] * vocab[flag.upper()]
        # Non-in-place addition always yields a new pointer object.
        sentence_sp = sentence_sp + bound_token
    sentence_bind_pos.append(sentence_sp)

In [32]:
# Feature matrix: one row per training sentence, DIM columns.
x_train_sp = np.zeros(shape=(len(x_train), DIM))

In [33]:
# Copy each sentence pointer's vector into its row of the feature matrix.
for row, sentence_pointer in enumerate(sentence_bind_pos):
    x_train_sp[row] = sentence_pointer.v