In [1]:
import data_preprocess
import nengo
from nengo.exceptions import SpaParseError, ValidationError
import sys
import numpy as np
from nengo.spa import pointer
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


In [2]:
class Data_helper():
    """Tokenization helper built around a Keras ``Tokenizer``.

    The tokenizer is fitted once in the constructor. The remaining methods
    convert texts to token ids, pad them, map ids back to words and build an
    embedding matrix from a pretrained word -> vector mapping.
    """

    def __init__(self, fit_data, num_words):
        """Fit a tokenizer on ``fit_data`` keeping at most ``num_words`` words."""
        self.tokenizer = self.tokenize(fit_data, num_words)
        self.idx = self.tokenizer.word_index
        # Reverse lookup (token id -> word) used by tokens_to_string().
        self.inverse_map = dict(zip(self.idx.values(), self.idx.keys()))

    def tokenize(self, fit_data, num_words=1000):
        """Create and fit a new tokenizer, store it on the instance, return it."""
        # Remember the cap so embedding_matrix() does not depend on a
        # notebook-level global of the same name.
        self.num_words = num_words
        self.tokenizer = Tokenizer(num_words=num_words)
        self.tokenizer.fit_on_texts(fit_data)
        return self.tokenizer

    def tokenize_data(self, x_train, x_test):
        """Return ``(train_sequences, test_sequences)`` of token ids."""
        return self.tokenizer.texts_to_sequences(x_train), self.tokenizer.texts_to_sequences(x_test)

    def pad_tokenize(self, x_train_tokens, x_test_tokens, pad='post'):
        """Pad both token lists to the length of the longest sequence.

        ``pad`` is used for both the ``padding`` and ``truncating`` arguments
        of ``pad_sequences``. Returns ``(x_train_pad, x_test_pad)``.
        """
        num_tokens = [len(tokens) for tokens in x_train_tokens+x_test_tokens]
        # The module-level ``max_tokens`` is kept for backward compatibility
        # with code that reads it; prefer ``self.max_tokens`` in new code.
        global max_tokens
        max_tokens = np.max(num_tokens)
        self.max_tokens = max_tokens
        x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
        x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
        return x_train_pad, x_test_pad

    def tokens_to_string(self, tokens):
        """Join the words for ``tokens`` with spaces, skipping id 0.

        Raises ``KeyError`` for an id that is not in ``self.inverse_map``.
        """
        words = [self.inverse_map[token] for token in tokens if token != 0]
        text = ' '.join(words)
        return text

    def embedding_matrix(self, pretrained_dict, embedding_dim=None):
        """Build a ``(num_words, embedding_dim)`` matrix of pretrained vectors.

        Row ``i`` holds the vector of the word whose tokenizer index is ``i``;
        rows for words missing from ``pretrained_dict`` stay all-zero.

        Parameters
        ----------
        pretrained_dict : mapping with ``.get(word)``
            Returns the vector for a word, or ``None`` when unknown.
        embedding_dim : int, optional
            Vector length. When omitted it is inferred from the first word
            of the tokenizer vocabulary found in ``pretrained_dict``.

        Raises
        ------
        ValueError
            If ``embedding_dim`` is omitted and cannot be inferred.
        """
        word_index = self.tokenizer.word_index

        # Without a cap, reserve one row per word plus row 0, which no word
        # uses (tokens_to_string() skips id 0 for the same reason).
        num_rows = getattr(self, 'num_words', None)
        if num_rows is None:
            num_rows = len(word_index) + 1

        if embedding_dim is None:
            for word in word_index:
                sample_vector = pretrained_dict.get(word)
                if sample_vector is not None:
                    embedding_dim = np.asarray(sample_vector).shape[-1]
                    break
            else:
                raise ValueError(
                    'embedding_dim was not given and no tokenizer word was '
                    'found in pretrained_dict to infer it from'
                )

        word_embedding = np.zeros((num_rows, embedding_dim))
        for word, i in word_index.items():
            # word_index lists every fitted word, so indices can exceed the
            # cap; those rows do not exist in the matrix.
            if i >= num_rows:
                continue
            embedding_vector = pretrained_dict.get(word)
            if embedding_vector is not None:
                word_embedding[i] = embedding_vector
        return word_embedding

In [3]:
# Intended word-vector dimensionality; the word2vec loader reports 100 for W2V_MODEL.
DIM = 100
# Name of the pretrained word2vec model handed to data_preprocess.load_wordvec_model.
W2V_MODEL = '100features_20context_20mincount_zht'
# Sentence files handed to data_preprocess.read_sentence (dementia file first, control file second).
CONTROL_TRAIN = 'control.txt'
DEMENTIA_TRAIN = 'dementia.txt'
CONTROL_TEST = 'control_test.txt'
DEMENTIA_TEST = 'dementia_test.txt'

In [4]:
# Load the pretrained word2vec model; w2v_dict is the word -> vector lookup used later.
w2v_model, _, w2v_dict = data_preprocess.load_wordvec_model(W2V_MODEL)
# Size the semantic-pointer vocabulary from DIM rather than a second hard-coded 100,
# so the vocabulary cannot drift out of sync with the word-vector dimensionality.
vocab = nengo.spa.Vocabulary(DIM, max_similarity=0.3) # optional max_similarity: 0.1

Load word2vec model sucess ...
Number of token: 259425
Dimensions of word vector: 100


In [5]:
# Vocabulary cap handed to the Keras Tokenizer inside Data_helper.
num_words = 1000
# Read the raw sentences and labels, then segment them into tokens
# (segmentation presumably relies on jieba, judging by the cell output - confirm in data_preprocess).
x_train, y_train = data_preprocess.read_sentence(DEMENTIA_TRAIN, CONTROL_TRAIN)
x_train_seg = data_preprocess.segmentation(x_train)
x_test, y_test = data_preprocess.read_sentence(DEMENTIA_TEST, CONTROL_TEST)
x_test_seg = data_preprocess.segmentation(x_test)
# The tokenizer is fitted on the training sentences only; the test set is merely encoded with it.
data_helper = Data_helper(x_train_seg, num_words)
x_train_tokens, x_test_tokens = data_helper.tokenize_data(x_train_seg, x_test_seg)
# Pad both splits to the longest sequence found across train and test.
x_train_pad, x_test_pad = data_helper.pad_tokenize(x_train_tokens, x_test_tokens)


Building prefix dict from /home/yyliu/code/NLP/data/dict.txt.big ...
Loading model from cache /tmp/jieba.u74f96b08eeb68fe4b0ac4c13a6f276ed.cache


total number of train set: 784
sentence number of dementia subject: 394
sentence number of control normal subject: 390


Loading model cost 1.319 seconds.
Prefix dict has been built succesfully.


total number of train set: 89
sentence number of dementia subject: 48
sentence number of control normal subject: 41


In [6]:
# Token ids of the first training sentence.
x_train_tokens[0]

[151, 115]

In [7]:
# Decode the first training sentence back to words. tokens_to_string takes the
# whole id sequence (it iterates over its argument), not one id at a time.
data_helper.tokens_to_string(x_train_tokens[0])

'三個 人'

In [8]:
# Register one semantic pointer per tokenizer word. Words with a pretrained
# word2vec vector use it; out-of-vocabulary words get a randomly generated
# pointer and are recorded in `oov`.
oov = []
for token in data_helper.tokenizer.word_index:
    # Only the lookup sits inside the try, so a KeyError raised while adding
    # to the vocabulary is never mistaken for a missing word vector.
    try:
        vector = w2v_dict[token]
    except KeyError:
        oov.append(token)
        vector = vocab.create_pointer(attempts=100)
    # Every key is prefixed with 'V' (nengo.spa keys must start with a capital letter).
    vocab.add(str('V'+token), vector)

In [9]:
# Words that had no entry in w2v_dict and were given a generated pointer instead.
oov

['水溢',
 '水滿',
 '洗碗盤',
 '站不穩',
 '漏出來',
 '流理台',
 '點心吃',
 '流理',
 '流下來',
 '爬高',
 '櫥子',
 '沒關',
 '椅凳',
 '水都溢',
 '沒站',
 '穩快',
 '裡拿',
 '嘩啦啦',
 '椅腳',
 '餅乾盒',
 '踮',
 '還拿著',
 '有草',
 '水噴',
 '他採',
 '洗出來',
 '歪歪斜斜',
 '這開',
 '淋濕',
 '沒拿到',
 '水太多',
 '翻掉',
 '滴下來',
 '擦乾淨',
 '布要',
 '跌跤',
 '在搖',
 '關了',
 '有樹',
 '裡流',
 '關家裡',
 '擦洗',
 '這想個',
 '講些',
 '太超',
 '滿到',
 '水倒',
 '水都流',
 '淹到',
 '洗盤',
 '水漫',
 '洗手台',
 '洗手盆',
 '中溢',
 '洗到',
 '槽裡',
 '關緊',
 '關窗',
 '貪嘴',
 '則看',
 '神情自若',
 '矮房',
 '著連',
 '搶著',
 '上樑不正下樑歪',
 '淹過',
 '則站',
 '餐碗',
 '他失',
 '水瀉',
 '出流理',
 '2',
 '不太穩',
 '溢水',
 '已滿出',
 '溼答答',
 '並遞',
 '給站',
 '台放',
 '沒洗',
 '水漏',
 '穩要',
 '兩人齊',
 '水開',
 '台流',
 '罐裡',
 '連水',
 '地不受',
 '流著水',
 '洗著',
 '著腳',
 '碗櫥',
 '拿些',
 '能接',
 '還在流',
 '邊玩',
 '水都濺',
 '不館',
 '瓶瓶罐罐',
 '正忙',
 '椅去']

In [20]:
# Sanity check: the vocabulary holds exactly as many keys as the tokenizer has words.
assert len(vocab.keys) == len(data_helper.tokenizer.word_index.keys())

In [21]:
# print(vocab.keys[0], vocab[vocab.keys[0]])