### Imports

In [1]:
import sys
sys.path.append("../")
from IPython.display import display, Markdown, HTML
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt

from preprocessing.document_manager import DocumentManager

### Load Dataset

In [2]:
# Seed handed to DocumentManager.
# NOTE(review): presumably makes any shuffling/splitting done by the manager
# repeatable -- confirm in preprocessing.document_manager.
seed_constant = 1338

doc_manager = DocumentManager(seed_constant=seed_constant)
# NOTE(review): presumably loads the corpus into the manager so the getters
# used in later cells have data to return -- confirm.
doc_manager.cache_documents()

### Get Sentences Data

In [3]:
# Pull every sentence from the document manager. The saved output of this cell
# shows that element 0 is a raw (unsegmented) Chinese sentence string.
sentences_texts = doc_manager.get_all_sentence_data()
# Show the first sentence as a sanity check of what the raw data looks like.
print("Example Sentence:\n%s" % sentences_texts[0])

Example Sentence:
她们都睡了，我蹑手蹑脚摸黑上了床，凑上去想亲嫣一下，她突然一个转身，小手“啪”地搭在了我的脸颊上，我便被施了魔法似地定住了，每次抱着嫣的时候总想让她的小手搂着我的脖子，可她总是不肯，她的两只小手要指挥着我的方向，要指着她感兴趣的东西，一刻也不肯停闲。


### Create Tokenizer from Vocabulary

In [4]:
# Fixed sequence length: passed as `maxlen` to pad_sequences at the end of this cell.
max_length = 200

def cut_text(text):
    """Segment `text` with jieba and return the pieces joined by single spaces.

    Segmentation is done with `cut_all=False`. The space-delimited result is
    the form that the Keras `Tokenizer` in this notebook is fitted on.
    """
    return " ".join(jieba.cut(text, cut_all=False))

# Segment every sentence into a space-delimited token string
tokenized_raw_texts = list(map(cut_text, sentences_texts))

# Create and fit Tokenizer
def create_tokenizer(tokenized_raw_texts):
    """Build a Keras `Tokenizer` with default settings and fit it on the texts.

    Args:
        tokenized_raw_texts: iterable of strings whose tokens are already
            separated by spaces (e.g. the output of `cut_text`).

    Returns:
        The fitted `Tokenizer` instance.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tokenized_raw_texts)
    return tokenizer

# Fit the vocabulary on the segmented corpus.
input_tokenizer = create_tokenizer(tokenized_raw_texts)
# Keras word indices start at 1 (0 is reserved, and is the default padding
# value), so the vocabulary size is one more than the number of indexed words.
input_vocab_size = len(input_tokenizer.word_index) + 1

# Convert each sentence to a list of word indices.
tokenized_input = input_tokenizer.texts_to_sequences(tokenized_raw_texts)
# Pad/truncate every sequence to exactly `max_length` entries. With the default
# arguments both padding and truncation happen at the front of the sequence, so
# sentences longer than `max_length` tokens silently lose their leading tokens.
# pad_sequences already returns a NumPy array; wrapping it in np.array() only
# made an extra copy.
padded_input = pad_sequences(tokenized_input, maxlen=max_length)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.363 seconds.
Prefix dict has been built successfully.


### Inspect Vocabulary

In [38]:
# Number of distinct tokens indexed by the fitted tokenizer.
print("Total token count: %s" % len(input_tokenizer.word_index))

Total token count: 53489


In [39]:
# No earlier cell defines `word_counts`, so build it here from the fitted
# tokenizer; otherwise this cell raises NameError on a fresh kernel.
# Result: list of (token, count) pairs, most frequent first. `sorted` is
# stable, so tokens with equal counts keep their first-seen order.
word_counts = sorted(
    input_tokenizer.word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Sample the frequency-ranked list at a few depths to get a feel for the
# vocabulary; a slice past the end of the list simply prints [].
print("Most common words:", word_counts[:50], "\n")
print("Somewhat common words:", word_counts[800:850], "\n")
print("Less common words:", word_counts[2000:2050], "\n")
print("Uncommon words:", word_counts[25000:25050], "\n")

Most common words: [('，', 80380), ('的', 54837), ('。', 28038), ('我', 16515), ('了', 14869), ('是', 11138), ('在', 9895), ('你', 6147), ('也', 5545), ('都', 4502), ('有', 4412), ('就', 4354), ('人', 4240), ('他', 4153), ('“', 4141), ('”', 4129), ('不', 3929), ('一个', 3919), ('我们', 3763), ('和', 3692), ('自己', 3465), ('着', 3368), ('、', 3334), ('？', 3130), ('说', 3084), ('她', 2828), ('！', 2757), ('没有', 2735), ('会', 2580), ('让', 2453), ('上', 2423), ('很', 2391), ('这', 2301), ('那', 2149), ('中', 2027), ('去', 2019), ('到', 1952), ('对', 1946), ('要', 1876), ('：', 1803), ('但', 1773), ('又', 1760), ('还', 1755), ('而', 1727), ('…', 1642), ('时候', 1589), ('他们', 1578), ('却', 1536), ('０', 1523), ('这样', 1484)] 

Somewhat common words: [('第一', 95), ('打电话', 95), ('寻找', 95), ('上班', 95), ('深深', 95), ('比赛', 95), ('忽然', 95), ('夜晚', 95), ('全部', 94), ('满足', 94), ('方面', 94), ('之外', 94), ('利益', 94), ('家长', 94), ('等等', 94), ('正常', 94), ('一颗', 94), ('挺', 94), ('哪怕', 94), ('穿', 94), ('总会', 93), ('考试', 93), ('小时候', 93), ('很大', 93), ('笑

### Create and Save Embedding Matrix for Vocabulary

In [6]:
from os.path import join
from tqdm import tqdm

from settings.settings import (
    EMBEDDING_DATA_ROOT,
    RAW_WORD_EMBEDDING_PATH,
)
from preprocessing.word_embeddings_txt_to_keras import save_word_embeddings_and_tokenizer


# Destination files: both pickles are written under EMBEDDING_DATA_ROOT.
tokenizer_output_path = join(EMBEDDING_DATA_ROOT, "tencent_keras_tokenizer.pkl")
embedding_output_path = join(EMBEDDING_DATA_ROOT, "tencent_keras_embedding.pkl")

# Source file: the raw word-embedding file configured in settings.
embedding_input_path = RAW_WORD_EMBEDDING_PATH

# Hand the fitted tokenizer and the three paths to the project helper.
# The saved output of this cell shows a progress bar over the embedding file
# and a report of which vocabulary words were / were not found.
save_word_embeddings_and_tokenizer(
    tokenizer=input_tokenizer,
    embedding_input_path=embedding_input_path,
    embedding_output_path=embedding_output_path,
    tokenizer_output_path=tokenizer_output_path,
)


100%|██████████| 8824330/8824330 [01:25<00:00, 103769.67it/s]


Found 51497 words
Did not find 1992 words: {'再说下去', '那家店', '一个角', '抢走了', '加花虹', '继续下去', '打监球', '责系', '客不多', '过干礼', '小五养', '没到院', '斩六将', '这雪下', '那清碎', '钻一到', '说时迟那时快', '黑咕龙咚', '能买个', '一个双', '成整额', '死死的', '这一看', '先拉回', '受恩萧', '绽静', '这头长', '比如说', '缠绕着', '谈过去', '不离手', '红红绿绿的', '王将瓜', '发表演说', '闻香润齿', '稍大点', '哥炖', '如虞姬', '窄窄的', '没人会', '拉进去', '多凉牙', '我买花', '红红的', '不爱读', '在兮然', '地飞到', '华老去', '考虑一下', '寻睨', '说者无意', '熬过去', '晕过去', '仇玲磊', '不多怪', '屏泪', '地捏动', '般满树', '送伟明', '杨希雪', '着兰儿', '这么些', '说话声', '簇拥着', '管柳笛', '那半高', '水起浪', '有吃有喝', '如梦八', '不识秋', '没人烧', '当藏独', '氛都', '一声令下', '一个终生', '嗜而敬', '厉智昏', '游美东', '千河结', '反过来说', '次梦中', '七八个', '不爱爱', '老拿着', '这几年来', '发如命', '回津头', '误以为', '人不爱', '需有球', '有想会', '照理说', '这一剂', '二十几个', '很扎脚', '任如刀', '如画般', '篡本', '以暖昧', '无穷的', '赠送给', '说说话', '喻而明', '扁兄', '碎碎的', '据港媒', '将梦儿', '染一地', '地拿起', '渔博友', '手合于', '围过去', '医百痛', '有别于', '不产瓜', '我瞪大', '三过晌', '没等我', '之真趣', '有趣儿', '人之广', '为五斗米折腰', '沈公听', '有情人终', '哺鹅苗', '讲信用', '最多时', '是否是', '噪中', '考记员', '长不出', '个别人', '车几拐', '望出去', '我班好'