In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to library code take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from bailarn.sentiment.analyzer import SentimentAnalyzer
from bailarn.sentiment import constant
from bailarn.utils import utils
import numpy as np

# Read Text File

In [None]:
from bailarn.tokenizer import constant as tokenizer_constant
from bailarn.tokenizer.tokenizer import Tokenizer

# Character and tag lookup tables required by the tokenizer.
# NOTE: `tag_index` here is built from the tokenizer's TAG_LIST; the same name
# is rebound further down to the sentiment tag index, after the tokenizer has
# already been constructed.
char_index = utils.build_tag_index(
    tokenizer_constant.CHARACTER_LIST,
    tokenizer_constant.CHAR_START_INDEX,
)
tag_index = utils.build_tag_index(
    tokenizer_constant.TAG_LIST,
    tokenizer_constant.TAG_START_INDEX,
)

tokenizer_model = Tokenizer(char_index, tag_index)

def tokenize_func(sentence):
    """Run ``sentence`` through the module-level ``tokenizer_model``.

    Thin adapter so the tokenizer can be passed around as a plain callable.

    Args:
        sentence: Input forwarded unchanged to ``tokenizer_model.predict``.

    Returns:
        Whatever ``tokenizer_model.predict`` returns for ``sentence``.
    """
    tokens = tokenizer_model.predict(sentence)
    return tokens

In [None]:
# Load the sample review corpus, supplying tokenize_func as the tokenizer callback.
texts = utils.TextCollection(
    corpus_directory="./data/sample_Wongnai_review",
    tokenize_function=tokenize_func,
)

# Sanity check: report the corpus size and show the content at index 0.
print("Corpus size : {}\n".format(texts.count))
print("Example corpus text : {}\n".format(texts.get_content(0)))

# Create word and tag indices

In [None]:
# Load the saved word index from its JSON file.

import json

# JSON text is UTF-8 by specification. Pass the encoding explicitly so the
# load does not depend on the platform's default locale encoding (which is
# not UTF-8 on every system and would break on non-ASCII vocabulary).
with open('./bailarn/sentiment/sentiment_word_index.json', 'r', encoding='utf-8') as f:
    word_index = json.load(f)

In [None]:
# Disabled snippet for writing `word_index` back to disk as JSON.
# NOTE(review): the target path is the NER word index, while this notebook
# loads the sentiment word index -- confirm the destination before re-enabling.
# with open("./bailarn/ner/ner_word_index.json", "w") as write_file:
#     json.dump(word_index, write_file)

In [None]:
# Show the ten (key, value) pairs with the largest values, in ascending order.
sorted(word_index.items(), key=lambda entry: entry[1])[-10:]

In [None]:
# Sentiment tag index, built with start_index=0. This rebinds `tag_index`,
# which earlier held the index built from the tokenizer's tag list.
tag_index = utils.build_tag_index(
    constant.TAG_LIST,
    start_index=0,
)

# Transform text into input

In [None]:
# Convert the text collection into model input for the 'sentiment' target.
vs = utils.build_input(
    text_collection=texts,
    word_index=word_index,
    tag_index=tag_index,
    sequence_length=constant.SEQUENCE_LENGTH,  # padding size
    target='sentiment',
)

In [None]:
# Peek at the first ten elements of the first entry in vs.x.
print(vs.x[0][:10])

In [None]:
# Same slice from vs.readable_x (presumably the human-readable form of vs.x -- confirm).
print(vs.readable_x[0][:10])

In [None]:
# Peek at the first ten elements of the first entry in vs.y.
print(vs.y[0][:10])

In [None]:
# Same slice from vs.readable_y (presumably the human-readable form of vs.y -- confirm).
print(vs.readable_y[0][:10])

# Use pre-trained model

In [None]:
# Construct SentimentAnalyzer with its default arguments
# (presumably this loads the bundled pre-trained model -- confirm).
default_model = SentimentAnalyzer()

In [None]:
# Predict for every entry in vs.x; the result is displayed as the cell output.
# NOTE(review): decode_tag=True presumably maps predictions back to tag names -- confirm.
default_model.predict(vs.x, decode_tag=True)

In [None]:
# Evaluate the default model on vs.x against the targets in vs.y.
default_model.evaluate(vs.x, vs.y)

# Train new model

In [None]:
# Instantiate the word2vec model and derive an embedding matrix for word_index.

from bailarn.word_embedder.word2vec import Word2Vec

w2v_model = Word2Vec()
embedding_matrix = utils.get_embedding_matrix(
    word2vec_model=w2v_model,
    word_index=word_index,
)

In [None]:
# Create a fresh (new_model=True) analyzer that uses the embedding matrix built above.
model = SentimentAnalyzer(
    new_model=True,
    embedding_matrix=embedding_matrix,
)

In [None]:
# Train on the full encoded corpus for one epoch, with validation_split=0.1.
model.train(
    vs.x,
    vs.y,
    validation_split=0.1,
    epochs=1,
    batch_size=64,
)

In [None]:
# Predict with the newly trained model on vs.x -- the same data passed to model.train.
model.predict(vs.x, decode_tag=True)

In [None]:
# NOTE(review): this evaluates on the same vs.x / vs.y that were passed to
# model.train, so the result is not a held-out estimate.
model.evaluate(vs.x, vs.y)