In [None]:
%load_ext autoreload
%autoreload 2

from bailarn.utils import utils

# Define tokenizer and word_embedder

In [None]:
from bailarn.tokenizer import constant as tokenizer_constant
from bailarn.tokenizer.tokenizer import Tokenizer

# Build the character and tag lookup indices that the tokenizer is constructed with.
char_index = utils.build_tag_index(
    tokenizer_constant.CHARACTER_LIST,
    tokenizer_constant.CHAR_START_INDEX,
)
tag_index = utils.build_tag_index(
    tokenizer_constant.TAG_LIST,
    tokenizer_constant.TAG_START_INDEX,
)

tokenizer_model = Tokenizer(char_index, tag_index)

def tokenize_func(sentence):
    """Tokenize ``sentence`` by delegating to ``tokenizer_model.predict``.

    This is the callable handed to ``utils.TextCollection`` as its
    ``tokenize_function``.

    Args:
        sentence: the text to tokenize, forwarded unchanged to the tokenizer.

    Returns:
        Whatever ``tokenizer_model.predict`` returns for ``sentence``.
    """
    tokens = tokenizer_model.predict(sentence)
    return tokens

In [None]:
from bailarn.word_embedder.word2vec import Word2Vec

# Instantiate Word2Vec with no arguments (presumably loads the package's default embedding model — confirm).
w2v_model = Word2Vec()

# Load text collection

In [None]:
# Load the sample corpus; tokenize_func is supplied as the collection's tokenize_function.
texts = utils.TextCollection(
    corpus_directory="./data/sample_Pantip_mbk_room",
    tokenize_function=tokenize_func,
)

# Report the corpus size, then preview the first 50 items of the first document's content.
print("Corpus size : {}\n".format(texts.count))
print("Example corpus text : {}\n".format(texts.get_content(0)[:50]))

# Create word_index

In [None]:
# # Build word_index
# word_index = utils.build_word_index(texts, word2vec_vocab=w2v_model.model.wv.vocab)

# Load the saved word index from its JSON file.
import json

# Read explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open('./bailarn/categorization/categorization_word_index.json', 'r', encoding='utf-8') as f:
    word_index = json.load(f)

In [None]:
# Show the 10 (word, index) pairs with the largest index values, in ascending order.
sorted(word_index.items(), key=lambda item: item[1])[-10:]

In [None]:
# with open("./bailarn/categorization/categorization_word_index.json", "w") as write_file:
#     json.dump(word_index, write_file)

# Create embedding_matrix

In [None]:
# Build embedding_matrix from the Word2Vec model and the loaded word index.
embedding_matrix = utils.get_embedding_matrix(
    word2vec_model=w2v_model,
    word_index=word_index,
    fasttext=False,
)

# Create tag_index

In [None]:
from bailarn.categorization import constant as categorization_constant
from bailarn.categorization.categorization import Categorization

# Tag index for the categorization labels, built from the categorization constants.
categorization_tag_index = utils.build_tag_index(
    categorization_constant.TAG_LIST,
    categorization_constant.TAG_START_INDEX,
)

# Transform text into input

In [None]:
# Convert the text collection into model inputs for the 'categorization' target.
vs = utils.build_input(
    texts,
    word_index,
    categorization_tag_index,
    categorization_constant.SEQUENCE_LENGTH,
    target='categorization',
)

In [None]:
# First 10 items of the first entry in vs.x (vs.x is the training input used below).
print(vs.x[0][:10])

In [None]:
# Same slice of vs.readable_x — presumably the human-readable form of vs.x (confirm).
print(vs.readable_x[0][:10])

In [None]:
# First 10 items of the first entry in vs.y (vs.y is the training target used below).
print(vs.y[0][:10])

In [None]:
# Same slice of vs.readable_y — presumably the human-readable form of vs.y (confirm).
print(vs.readable_y[0][:10])

# Train new model

In [None]:
# Instantiate with new_model=True to start a new model (a saved model is instead
# loaded by passing model_path together with new_model=False).
new_categorization_model = Categorization(new_model=True)

# # To train a new model with a pre-trained word embedding, pass `embedding_matrix` instead.
# # The name matches the one used by the following cells.
# new_categorization_model = Categorization(embedding_matrix=embedding_matrix, new_model=True)

In [None]:
# Train on vs.x / vs.y for a single epoch with a batch size of 64.
new_categorization_model.train(
    X_train=vs.x,
    y_train=vs.y,
    epochs=1,
    batch_size=64,
    validate_ratio=0.1,
    sensitive_learning=False,
    learning_rate=0.001,
)

In [None]:
# Predict for the first sample only, with decode_tag=False (presumably raw, undecoded output — confirm).
new_categorization_model.predict(vs.x[:1], decode_tag=False)

In [None]:
# Same first-sample prediction with decode_tag=True (presumably mapped back to tag names — confirm).
new_categorization_model.predict(vs.x[:1], decode_tag=True)

In [None]:
# Evaluate on vs.x / vs.y — the same data the model was trained on above.
new_categorization_model.evaluate(vs.x, vs.y)

In [None]:
# Save the model to the given .h5 path; the next section loads it back from there.
new_categorization_model.save("./bailarn/categorization/models/abc.h5")

# Load trained model

In [None]:
# Load the previously saved model from its .h5 path (new_model=False).
loaded_categorization = Categorization(model_path="./bailarn/categorization/models/abc.h5", new_model=False)

In [None]:
# Predict for every entry in vs.x with the loaded model, with decode_tag=True.
loaded_categorization.predict(vs.x, decode_tag=True)

In [None]:
# Evaluate the loaded model on vs.x / vs.y.
loaded_categorization.evaluate(vs.x, vs.y)

# Use default multi-label text categorization model

In [None]:
# No arguments: per this section's heading, this is the default multi-label categorization model.
categorization = Categorization()

In [None]:
# Predict using a constant sigmoid threshold (0.1) for each label
categorization.predict(vs.x[:1], threshold_selection=0.1, decode_tag=True)

In [None]:
# Predict using the best (from json) sigmoid threshold for each label.
# threshold_selection is deliberately omitted, mirroring the evaluate() call for
# the "best threshold" case; passing threshold_selection=0.1 here would only
# repeat the constant-threshold prediction.
# NOTE(review): assumes predict() falls back to the per-label thresholds when
# threshold_selection is not given, as evaluate() is used — confirm.
categorization.predict(vs.x[:1], decode_tag=True)

In [None]:
# Evaluate using a constant sigmoid threshold (0.1) for each label
categorization.evaluate(vs.x, vs.y, threshold_selection=0.1)

In [None]:
# Evaluate using the best (from json) sigmoid threshold for each label
# (no threshold_selection is passed here).
categorization.evaluate(vs.x, vs.y)