### Imports

In [1]:
import sys
sys.path.append("../")
from IPython.display import display, Markdown, HTML
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt

from preprocessing.document_manager import DocumentManager

### Load Dataset

In [2]:
# Seed forwarded to DocumentManager.
# NOTE(review): presumably makes any randomized ordering/splitting reproducible — confirm in DocumentManager.
seed_constant = 1338

doc_manager = DocumentManager(seed_constant=seed_constant)
# NOTE(review): presumably loads/caches the corpus so later get_* calls are served from it — confirm.
doc_manager.cache_documents()

### Get Sentences Data

In [3]:
# Pull all sentence data from the document manager and print the first entry as a sanity check.
# NOTE(review): the recorded output suggests each entry is a plain sentence string — confirm.
sentences_texts = doc_manager.get_all_sentence_data()
print("Example Sentence:\n%s" % sentences_texts[0])

Example Sentence:
她们都睡了，我蹑手蹑脚摸黑上了床，凑上去想亲嫣一下，她突然一个转身，小手“啪”地搭在了我的脸颊上，我便被施了魔法似地定住了，每次抱着嫣的时候总想让她的小手搂着我的脖子，可她总是不肯，她的两只小手要指挥着我的方向，要指着她感兴趣的东西，一刻也不肯停闲。


### Create Tokenizer from Vocabulary

In [4]:
# Fixed sequence length (in tokens); passed as `maxlen` to pad_sequences later in this cell.
max_length = 200

def cut_text(text):
    """Segment ``text`` with jieba and return the pieces joined by single spaces.

    Args:
        text: The raw string to segment.

    Returns:
        One string containing every segment produced by
        ``jieba.cut(text, cut_all=False)``, in order, separated by ``" "``.
    """
    return " ".join(jieba.cut(text, cut_all=False))

# Segment every sentence; each result is one space-delimited string
tokenized_raw_texts = list(map(cut_text, sentences_texts))

# Build a Tokenizer whose vocabulary is learned from the given texts
def create_tokenizer(tokenized_raw_texts):
    """Return a default-configured ``Tokenizer`` fitted on ``tokenized_raw_texts``.

    Args:
        tokenized_raw_texts: Texts handed unchanged to ``Tokenizer.fit_on_texts``
            (in this notebook: sentences already segmented and space-joined).

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tokenized_raw_texts)
    return tokenizer

# Fit the tokenizer on the segmented corpus.
input_tokenizer = create_tokenizer(tokenized_raw_texts)
# +1 because Keras word indices start at 1; index 0 is reserved (used for padding).
input_vocab_size = len(input_tokenizer.word_index) + 1

# Map each sentence to its word indices, then force a fixed width of `max_length`.
tokenized_input = input_tokenizer.texts_to_sequences(tokenized_raw_texts)
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
# NOTE(review): with the Keras defaults, padding and truncation both happen at the
# front ("pre"), so sentences longer than max_length lose their leading tokens —
# confirm this is intended.
padded_input = pad_sequences(tokenized_input, maxlen=max_length)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.442 seconds.
Prefix dict has been built successfully.


### Inspect Vocabulary

In [5]:
# word_index holds one entry per distinct token, so this is the vocabulary size
# (number of unique tokens), not the number of token occurrences in the corpus.
print("Total token count: %s" % len(input_tokenizer.word_index))

Total token count: 53489


In [8]:
# Rank (token, count) pairs from most to least frequent; the sort is stable, so
# tokens with equal counts keep the order in which the tokenizer recorded them.
word_counts = sorted(input_tokenizer.word_counts.items(), key=lambda pair: -pair[1])

# Print a 50-entry window at several depths of the frequency ranking.
for bucket_label, bucket_start in (
    ("Most common words:", 0),
    ("Somewhat common words:", 800),
    ("Less common words:", 2000),
    ("Uncommon words:", 25000),
):
    print(bucket_label, word_counts[bucket_start:bucket_start + 50], "\n")

Most common words: [('，', 80380), ('的', 54837), ('。', 28038), ('我', 16515), ('了', 14869), ('是', 11138), ('在', 9895), ('你', 6147), ('也', 5545), ('都', 4502), ('有', 4412), ('就', 4354), ('人', 4240), ('他', 4153), ('“', 4141), ('”', 4129), ('不', 3929), ('一个', 3919), ('我们', 3763), ('和', 3692), ('自己', 3465), ('着', 3368), ('、', 3334), ('？', 3130), ('说', 3084), ('她', 2828), ('！', 2757), ('没有', 2735), ('会', 2580), ('让', 2453), ('上', 2423), ('很', 2391), ('这', 2301), ('那', 2149), ('中', 2027), ('去', 2019), ('到', 1952), ('对', 1946), ('要', 1876), ('：', 1803), ('但', 1773), ('又', 1760), ('还', 1755), ('而', 1727), ('…', 1642), ('时候', 1589), ('他们', 1578), ('却', 1536), ('０', 1523), ('这样', 1484)] 

Somewhat common words: [('第一', 95), ('打电话', 95), ('寻找', 95), ('上班', 95), ('深深', 95), ('比赛', 95), ('忽然', 95), ('夜晚', 95), ('全部', 94), ('满足', 94), ('方面', 94), ('之外', 94), ('利益', 94), ('家长', 94), ('等等', 94), ('正常', 94), ('一颗', 94), ('挺', 94), ('哪怕', 94), ('穿', 94), ('总会', 93), ('考试', 93), ('小时候', 93), ('很大', 93), ('笑

### Create and Save Embedding Matrix for Vocabulary

In [9]:
from os.path import join
from tqdm import tqdm

from settings.settings import (
    EMBEDDING_DATA_ROOT,
    RAW_WORD_EMBEDDING_PATH,
    EMBEDDING_MATRIX_PATH,
    KERAS_TOKENIZER_PATH,
)
from preprocessing.word_embeddings import save_word_embeddings_and_tokenizer


# Bind the project-configured paths (from settings.settings) to local names for the call below.
embedding_input_path = RAW_WORD_EMBEDDING_PATH
embedding_output_path = EMBEDDING_MATRIX_PATH
tokenizer_output_path = KERAS_TOKENIZER_PATH

# NOTE(review): judging by its name and the recorded output ("Found ... words" /
# "Did not find ... words"), this looks up each vocabulary word in the raw embedding
# file and writes an embedding matrix plus the tokenizer to the output paths —
# confirm in preprocessing.word_embeddings.
save_word_embeddings_and_tokenizer(
    tokenizer=input_tokenizer,
    embedding_input_path=embedding_input_path,
    embedding_output_path=embedding_output_path,
    tokenizer_output_path=tokenizer_output_path,
)


100%|██████████| 8824330/8824330 [01:29<00:00, 98083.15it/s] 


Found 51497 words
Did not find 1992 words: {'哺成鹅苗', '换过去', '称不上', '搀草', '火面合', '但肤质', '这一别', '可寄去', '从狗妈', '制卷费', '终之美', '挽里', '看不上', '有名气', '千千万万个', '诞欲滴', '问西西', '解决不了', '吃镜', '没干过', '由上向下', '闪若亮', '水起浪', '一肚血', '憧地', '清清的', '放上去', '回和程', '来得快', '暂不多言', '冷冷的', '黑咕龙咚', '两耳不闻', '用之于', '老姥爱', '自己自足', '三百多个', '网渔说', '或者说', '打牙往', '烹苦', '奶农倒奶', '说好听', '用火针', '心盛满', '交叉着', '冰噶哒', '捐什款', '老超变', '抄来抄去', '愧间', '挥不掉', '说话声', '手合于', '拉腔拉调', '帐算到', '衷畅', '就会徒', '大斗彩瓶', '著有义', '掉下去', '有害无益', '尚知用', '有求于人', '串不起', '往水里', '所能学', '初道者', '小平带', '如梦似', '地拿起', '折截', '张华舅', '用太多', '省州县', '背诵着', '课不太多', '书架上', '巴单', '得太大', '真至善', '事袭', '手牵着', '医百痛', '之乐而乐', '却没能', '窃非', '人人平等', '对小宋', '加花虹', '剑勒', '一阡一', '见不着', '水击浪', '无数个', '伴花魂', '它会用', '再就是', '睡过去', '于敬庆', '可总为', '被选为', '比不上', '着眼于', '几千个', '愿佛组', '拾垢', '人之广', '上不出', '飞过去', '聚拢在', '有吃有喝', '礁堤', '宽宽的', '债无旁', '因艺考', '一下一下', '火四地', '义童', '谈论着', '有话要说', '一歪说', '大大的', '地震局', '这么一来', '凉量', '或事物', '彻姆', '灿若星眸', '在世界上', '想秀想', '发给你', '范美忠别', '目的旨在', '肥肥的', '