### Imports

In [None]:
import sys
sys.path.append("../")
from IPython.display import display, Markdown, HTML
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt

from preprocessing.document_manager import DocumentManager
from preprocessing.utils import cut_text
from settings.settings import PROJECT_SETTINGS

### Load Dataset

In [None]:
# Instantiate the project's DocumentManager and call cache_documents() on it.
# `doc_manager` is reused by the next cell to fetch the sentence data.
# NOTE(review): presumably cache_documents() loads/caches the corpus — confirm
# in preprocessing.document_manager.
doc_manager = DocumentManager()
doc_manager.cache_documents()

### Get Sentence Data

In [None]:
# Fetch all sentence data from the document manager. Each element is later
# passed to cut_text(), so entries are presumably plain strings — TODO confirm.
sentences_texts = doc_manager.get_all_sentence_data()
# Sanity check: show the first entry (indexing [0] fails if nothing was returned).
print("Example Sentence:\n%s" % sentences_texts[0])

### Create Tokenizer from Vocabulary

In [None]:
# Sequence length passed as `maxlen` when the tokenized sentences are padded.
max_length = 200

# Run every sentence through cut_text (tokenize and join with spaces).
tokenized_raw_texts = list(map(cut_text, sentences_texts))

# Peek at the first ten processed sentences as a sanity check.
print(tokenized_raw_texts[:10])

# Create and fit Tokenizer
def create_tokenizer(tokenized_raw_texts):
    """Return a Keras ``Tokenizer`` fitted on the given texts.

    Args:
        tokenized_raw_texts: texts handed to ``Tokenizer.fit_on_texts``; in
            this notebook these are the outputs of ``cut_text``.

    Returns:
        The fitted ``Tokenizer``, built with its default constructor arguments.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(tokenized_raw_texts)
    return fitted

# Fit the tokenizer on the processed sentences.
input_tokenizer = create_tokenizer(tokenized_raw_texts)
# + 1 because Keras assigns word indices starting at 1; index 0 is reserved
# (it is the value pad_sequences fills with by default).
input_vocab_size = len(input_tokenizer.word_index) + 1

# Map every sentence to its sequence of integer word indices.
tokenized_input = input_tokenizer.texts_to_sequences(tokenized_raw_texts)
# pad_sequences already returns a NumPy array of shape (n_sentences, max_length),
# so wrapping it in np.array() would only make a redundant copy.
# With the default arguments both padding and truncation happen at the START
# of a sequence, i.e. sentences longer than max_length lose their first tokens.
padded_input = pad_sequences(tokenized_input, maxlen=max_length)

### Inspect Vocabulary

In [None]:
# word_index has one entry per distinct token, so this is the number of unique
# tokens (input_vocab_size defined above is this value + 1), not the number of
# token occurrences in the corpus.
print("Total token count: %s" % len(input_tokenizer.word_index))

In [None]:
# Rank the tokenizer's (word, count) pairs from most to least frequent.
word_counts = sorted(
    input_tokenizer.word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print a 50-entry window of the ranking at several depths. A window that
# starts past the end of the ranking simply prints an empty list.
for rank_label, rank_start in (
    ("Most common words:", 0),
    ("Somewhat common words:", 800),
    ("Less common words:", 2000),
    ("Uncommon words:", 25000),
):
    print(rank_label, word_counts[rank_start:rank_start + 50], "\n")

### Create and Save Embedding Matrix for Vocabulary

In [7]:
from os.path import join

from settings.settings import (
    EMBEDDING_DATA_ROOT,
    RAW_WORD_EMBEDDING_PATH,
)
from preprocessing.word_embeddings import save_word_embeddings_and_tokenizer


# Input: the raw word-embedding file. Outputs: the project-configured paths
# for the embedding matrix and for the tokenizer.
embedding_input_path = RAW_WORD_EMBEDDING_PATH
embedding_output_path = PROJECT_SETTINGS.EMBEDDING_MATRIX_PATH
tokenizer_output_path = PROJECT_SETTINGS.TOKENIZER_PATH

# NOTE(review): presumably this reads the raw embeddings, keeps the vectors for
# words in the tokenizer's vocabulary, and writes both the resulting matrix and
# the tokenizer to the output paths — confirm in preprocessing.word_embeddings.
# The recorded run took about 1.5 minutes, so avoid re-running it needlessly.
save_word_embeddings_and_tokenizer(
    tokenizer=input_tokenizer,
    embedding_input_path=embedding_input_path,
    embedding_output_path=embedding_output_path,
    tokenizer_output_path=tokenizer_output_path,
)


100%|██████████| 8824330/8824330 [01:24<00:00, 103933.84it/s]


Found 51497 words
Did not find 1992 words: {'抵福', '令太多', '谈不上', '随冬而来', '人之为', '布吉再', '一般来说', '他会象', '结框', '王蒙对', '提不动', '爱作章', '它会漏', '没人烧', '凑上去', '确切的说', '说兰惠', '就此结束', '唯恐天下', '谈过去', '课牌', '先拉来', '没人屑', '收不抵', '送书顶', '对爱过', '吸进去', '托费差', '当藏独', '三十多个', '轻松自在', '有大有小', '吴师志诚', '杨佩宜', '用火先', '汤之山', '课不太多', '我太想', '刮来刮去', '美不染', '我常为', '老姨告', '有种乎', '这一劫', '照不亮', '赵多为', '手牵着', '成励', '租出去', '试比低', '宽宽的', '无一不是', '斤余重', '几十个', '地吐出', '自印供', '照理说', '户玉', '方夜潭', '不亦乐呼', '当暖昧', '屏泪', '脸朝天', '地飘着', '一帘水幔', '天高雁', '卓国', '手舞纸', '有多深', '录和声', '我拉到', '他太忙', '难红上', '绽静', '这么久', '夹杂着', '假虎照', '更不怪', '如花时', '昨天晚上', '相对来说', '这雪下', '没有经费', '深而幽', '华老去', '接不上', '用太多', '掩盖着', '终无蕾', '这假花', '歌党伟', '交还给', '从技术上', '茅以轼', '爱谈书', '睡过去', '愿佛组', '頓时', '一泓水', '来代司', '象漏', '退一步说', '反过来说', '北伸延', '窗台上', '这光茫', '尽之时', '手术台上', '吃不上', '老不牛', '笼育', '个性化', '气死我了', '弯弯的', '我主说', '闪烁着', '卓国珍', '可兮然', '博协中', '周忆清', '说得好', '一个班', '略大点', '紧跟着', '好攒足', '埋长', '晕过去', '尝到了', '零四个', '水击浪', '冷冷的', '总在射', '别埠', '一下一下', '风太大', '