### Imports

In [None]:
import sys
sys.path.append("../")
from IPython.display import display, Markdown, HTML
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt

from preprocessing.document_manager import DocumentManager
from preprocessing.utils import cut_text
from settings.settings import PROJECT_SETTINGS

### Load Dataset

In [None]:
# Instantiate the project's document manager with its default arguments.
doc_manager = DocumentManager()
# NOTE(review): presumably loads/caches the corpus so that the sentence query
# in the following cell has data to return — confirm against
# DocumentManager.cache_documents.
doc_manager.cache_documents()

### Get Sentence Data

In [None]:
# Fetch all sentence-level records from the document manager.
# NOTE(review): the element type is not visible in this notebook — confirm
# against DocumentManager.get_all_sentence_data.
sentences_texts = doc_manager.get_all_sentence_data()
# Preview the first entry as a sanity check.
# NOTE(review): if entries turn out to be tuples, ``%`` would unpack them as
# format arguments instead of printing the tuple; indexing [0] also assumes the
# result is non-empty.
print("Example Sentence:\n%s" % sentences_texts[0])

### Create Tokenizer from Vocabulary

In [None]:
# Fixed sequence length used as ``maxlen`` when the encoded input is padded.
max_length = 200

# Run every sentence through the project's cut_text helper.
# NOTE(review): presumably word-segments the text and re-joins the pieces with
# spaces so the Keras Tokenizer can split on whitespace — confirm in
# preprocessing.utils.cut_text.
tokenized_raw_texts = list(map(cut_text, sentences_texts))

# Peek at the first ten results as a sanity check.
print(tokenized_raw_texts[:10])

# Create and fit Tokenizer
def create_tokenizer(tokenized_raw_texts):
    """Build a Keras ``Tokenizer`` and fit its vocabulary on the given texts.

    Args:
        tokenized_raw_texts: Texts handed unchanged to
            ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer``, constructed with default settings.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(tokenized_raw_texts)
    return fitted

# Fit the vocabulary on the texts prepared above.
input_tokenizer = create_tokenizer(tokenized_raw_texts)
# Keras word indices start at 1 (index 0 is reserved, e.g. for padding), so the
# embedding/input dimension needs one more slot than there are words.
input_vocab_size = len(input_tokenizer.word_index) + 1

# Encode each text as a list of integer word indices.
tokenized_input = input_tokenizer.texts_to_sequences(tokenized_raw_texts)
# pad_sequences already returns a 2-D NumPy array of shape
# (number of texts, max_length), so wrapping it in np.array() only made a
# redundant full copy. With the Keras defaults, shorter sequences are
# zero-padded at the front and longer ones are truncated from the front.
padded_input = pad_sequences(tokenized_input, maxlen=max_length)

### Inspect Vocabulary

In [None]:
# word_index holds one entry per distinct token, so despite the label this is
# the vocabulary size, not the number of token occurrences in the corpus.
print("Total token count: %s" % len(input_tokenizer.word_index))

In [None]:
# Rank every (word, count) pair by its count, most frequent first.
word_counts = sorted(
    input_tokenizer.word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print 50-entry windows at increasing frequency ranks. A window that starts
# beyond the end of the vocabulary simply prints an empty list.
for _label, _start in (
    ("Most common words:", 0),
    ("Somewhat common words:", 800),
    ("Less common words:", 2000),
    ("Uncommon words:", 25000),
):
    print(_label, word_counts[_start:_start + 50], "\n")

### Create and Save Embedding Matrix for Vocabulary

In [None]:
from os.path import join
from tqdm import tqdm

from settings.settings import (
    EMBEDDING_DATA_ROOT,
    RAW_WORD_EMBEDDING_PATH,
)
from preprocessing.word_embeddings import save_word_embeddings_and_tokenizer


# Input path for the raw word embeddings, taken from the project settings
# module.
embedding_input_path = RAW_WORD_EMBEDDING_PATH
# Output locations for the embedding matrix and the tokenizer, taken from
# PROJECT_SETTINGS.
embedding_output_path = PROJECT_SETTINGS.EMBEDDING_MATRIX_PATH
tokenizer_output_path = PROJECT_SETTINGS.TOKENIZER_PATH

# Hand the fitted tokenizer and the three paths to the project helper.
# NOTE(review): presumably reads the raw vectors from embedding_input_path,
# builds a matrix aligned with the tokenizer's word indices, and writes both
# the matrix and the tokenizer to the output paths — confirm in
# preprocessing.word_embeddings.
save_word_embeddings_and_tokenizer(
    tokenizer=input_tokenizer,
    embedding_input_path=embedding_input_path,
    embedding_output_path=embedding_output_path,
    tokenizer_output_path=tokenizer_output_path,
)
