<a href="https://colab.research.google.com/github/athensclub/Thai-Word-Cutter/blob/master/test_model_with_splitted_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing and Housekeeping**

In [13]:
from google.colab import files
from tensorflow.keras import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Flatten,LSTM
import tensorflow as tf
import os
import numpy as np

# **Create a mapping from a character to an integer**

In [14]:
# Every character the model knows about. Anything outside this string has no
# integer code and cannot be encoded.
characters = 'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙abcdefghijklmnopqrstuvwxyz"\'0123456789,.!?/\\:;%()[]{}+_-*@#><=^$& \t\n'

# Codes are assigned by position in `characters`, starting at 1, so the value 0
# is never mapped to a character.
char_encode = {ch: code for code, ch in enumerate(characters, start=1)}  # character -> integer
char_decode = {code: ch for code, ch in enumerate(characters, start=1)}  # integer -> character

# **Function: `encode(data)`**

> Accepts: (data)
*   data: the string to be converted to list of integers.


> Returns: (encoded)
*   encoded: the list of integers encoded from the given data string







In [15]:
def encode(data):
  """Convert a string into a list of integer character codes.

  The string is lower-cased first, then every character is looked up in
  `char_encode`.

  Args:
    data: the string to encode.

  Returns:
    A list with one integer per character of the lower-cased string.

  Raises:
    KeyError: if a character has no entry in `char_encode`.
  """
  return [char_encode[ch] for ch in data.lower()]

# **Function: `decode(data)`**

> Accepts: (data)
*   data: the list of integers to be converted to string

> Returns: (decoded)
*   decoded: the string that is decoded from list of integers given from data





In [16]:
def decode(data):
  """Convert a sequence of integer character codes back into a string.

  Codes equal to 0 are skipped; every other code is looked up in
  `char_decode`.

  Args:
    data: iterable of integer codes.

  Returns:
    The decoded string.

  Raises:
    KeyError: if a non-zero code has no entry in `char_decode`.
  """
  return ''.join(char_decode[code] for code in data if code != 0)

# **Function: `convert_data(data)`**
> Accepts: (data)
*   data: raw string read from news file

> Returns: (encoded,ans)
*  encoded: The string that is created from removing separator '|' from original raw string, which is then encoded into integers by function ```encode(data)```
*   ans: An array with the same length as the encoded string whose values are either 0 or 1. A 1 means the text is cut right after the character at that index (it is the last character of a word); a 0 means there is no cut after that character.







 

In [17]:
def convert_data(data):
  """Turn '|'-separated text into encoded characters plus cut labels.

  Args:
    data: raw string in which words are separated by '|'.

  Returns:
    A tuple (encoded, ans):
      encoded: `encode()` applied to the text with every '|' removed.
      ans: numpy array of zeros with the same length as `encoded`, set to 1
        at the index of the last character of each non-empty word.
  """
  encoded = encode(data.replace('|', ''))
  ans = np.zeros(len(encoded))
  word_end = 0  # number of characters consumed so far
  for word in data.split('|'):
    if len(word) == 0:
      # Consecutive separators produce empty pieces; they mark nothing.
      continue
    word_end += len(word)
    ans[word_end - 1] = 1
  return encoded, ans

# **Function: `split_data(encoded,ans,length=256)`**
Splits the encoded text at spaces and then packs the resulting pieces, in order, into chunks whose size stays below `length`. The answer array is split in exactly the same way, so every chunk in the first returned list lines up, index by index, with its chunk of answers in the second returned list.

> Accepts: (encoded,ans,length=256)
*   encoded: the encoded data that is going to be splitted
*   ans: the array of the answer of the data
*   length: the maximum size of each splitted data (default value is 256)

> Returns: (splitted,splitted_ans)
*   splitted: the list of the splitted data
*   splitted_ans: the list of the answer to the splitted data, mapped by index








In [18]:
def split_data(encoded,ans,length=256):
  """Split encoded text and its labels into space-aligned chunks.

  The text is cut into pieces that each end with a space (the final piece
  ends at the end of the data). Pieces are packed greedily, in order, into
  chunks: a piece joins the current chunk while the combined size stays
  below `length`; otherwise the current chunk is emitted and the piece
  starts a new one. A single piece that is itself `length` or longer is
  kept whole as its own chunk.

  Args:
    encoded: sequence of integer character codes.
    ans: sequence of labels, indexed in step with `encoded`.
    length: size limit for a chunk (default 256).

  Returns:
    A tuple (splitted, splitted_ans) of two lists of lists; the i-th label
    chunk holds the labels of the i-th text chunk.
  """
  space = char_encode[' ']
  splitted = []
  splitted_ans = []
  chunk = []
  ans_chunk = []
  piece = []
  ans_piece = []
  last = len(encoded) - 1
  for i, code in enumerate(encoded):
    piece.append(code)
    ans_piece.append(ans[i])
    # A piece is complete at a space or at the very end of the data.
    if code != space and i != last:
      continue
    # Emit the current chunk first when the piece does not fit. An empty
    # chunk is never emitted, and a fresh list is always started so the
    # emitted chunk is not modified afterwards.
    if chunk and len(piece) + len(chunk) >= length:
      splitted.append(chunk)
      splitted_ans.append(ans_chunk)
      chunk = []
      ans_chunk = []
    chunk.extend(piece)
    ans_chunk.extend(ans_piece)
    piece = []
    ans_piece = []
  if chunk:
    splitted.append(chunk)
    splitted_ans.append(ans_chunk)
  return splitted,splitted_ans

# **Function: `split_text_data(encoded,length=256)`**
Similar to ```split_data(encoded,ans,length=256)``` but does not split the answer data

> Accepts: (encoded,length=256)
*   encoded: the encoded data that is going to be splitted
*   length: the maximum size of each splitted data (default value is 256)

> Returns: (splitted)
*   splitted: the list of the splitted data








In [19]:
def split_text_data(encoded,length=256):
  """Split encoded text into space-aligned chunks.

  The text is cut into pieces that each end with a space (the final piece
  ends at the end of the data). Pieces are packed greedily, in order, into
  chunks: a piece joins the current chunk while the combined size stays
  below `length`; otherwise the current chunk is emitted and the piece
  starts a new one. A single piece that is itself `length` or longer is
  kept whole as its own chunk.

  Args:
    encoded: sequence of integer character codes.
    length: size limit for a chunk (default 256).

  Returns:
    A list of chunks, each a list of integer codes.
  """
  space = char_encode[' ']
  splitted = []
  chunk = []
  piece = []
  last = len(encoded) - 1
  for i, code in enumerate(encoded):
    piece.append(code)
    # A piece is complete at a space or at the very end of the data.
    if code != space and i != last:
      continue
    # Emit the current chunk first when the piece does not fit. An empty
    # chunk is never emitted, and a fresh list is always started so the
    # emitted chunk is not modified afterwards.
    if chunk and len(piece) + len(chunk) >= length:
      splitted.append(chunk)
      chunk = []
    chunk.extend(piece)
    piece = []
  if chunk:
    splitted.append(chunk)
  return splitted

# **Function: ```create_training_data(splitted,splitted_ans)```**
Takes a list of text chunks together with their cut positions and turns them into training data for the model.

> Accepts: (splitted,splitted_ans)
*   splitted: The list of chunk of the text
*   splitted_ans: the list of chunk of location to cut the text,mapped to the splitted chunk by index

> Returns: (before,current,after,ans) Note that every data returned is mapped to one another by index
*   before: the training data for model in 'before' input layer
*   current: the training data for model in 'current' input layer
*   after: the training data for model in 'after' input layer
*   ans: the training data that contains answer for the model








In [20]:
def create_training_data(splitted,splitted_ans):
  """Build per-character training samples from chunks and their cut labels.

  For every position in every chunk one sample is produced:
    before:  the characters of the chunk that precede the word in progress,
    current: the characters of the word in progress up to this position,
    after:   the characters of the chunk that follow this position,
    ans:     the label at this position.
  A label equal to 1 ends the word in progress.

  NOTE: when a chunk does not end on a label of 1, one extra sample is added
  after the loop. It has the same content as the sample already produced for
  the chunk's last position, so that sample appears twice.

  Args:
    splitted: list of chunks, each a list of integer character codes.
    splitted_ans: list of label sequences, matched to `splitted` by index.

  Returns:
    A tuple (before, current, after, ans). The first three are the results
    of `sequence.pad_sequences(..., 256)`; `ans` is a numpy array. All four
    are matched by index.
  """
  before = []
  current = []
  after = []
  ans = []
  for idx, chunk in enumerate(splitted):
    chunk_ans = splitted_ans[idx]
    word_start = 0  # index in `chunk` where the word in progress begins
    for pos in range(len(chunk)):
      before.append(chunk[:word_start])
      current.append(chunk[word_start:pos + 1])
      after.append(chunk[pos + 1:])
      label = chunk_ans[pos]
      ans.append(label)
      if label == 1:
        word_start = pos + 1
    if word_start < len(chunk):
      # The chunk ended in the middle of a word: emit the unfinished word
      # once more with nothing after it.
      before.append(chunk[:word_start])
      current.append(chunk[word_start:])
      after.append([])
      ans.append(chunk_ans[len(chunk_ans)-1])
  before = sequence.pad_sequences(before,256)
  current = sequence.pad_sequences(current,256)
  after = sequence.pad_sequences(after,256)
  return before,current,after,np.asarray(ans)

# **Function: ```train(_model,text)```**
Train the given model using the input text that is tokenized, splitting each word by the character '|'

> Accepts (_model,text)
*   _model: the model that is going to be trained
*   text: the raw text data that is tokenized, splitting each word by the character '|'

> Returns: None









In [None]:
def train(_model,text):
  """Fit the model on one piece of '|'-tokenized text.

  The text goes through `convert_data`, `split_data` and
  `create_training_data`, and the result is passed to `_model.fit` in a
  single call.

  Args:
    _model: the model to train; it receives the three inputs
      [before, current, after] and the label array.
    text: raw text in which words are separated by '|'.

  Returns:
    None.
  """
  encoded, labels = convert_data(text)
  chunks, chunk_labels = split_data(encoded, labels)
  before, current, after, targets = create_training_data(chunks, chunk_labels)
  _model.fit([before, current, after], targets)

# **Function: ```evaluate(_model,text)```**
Similar to ```train(_model,text)```, but instead of using the given text data to train the model, the data is used to evaluate it.



> Accepts (_model,text)
*   _model: the model that is going to be evaluated
*   text: the raw text data that is tokenized, splitting each word by the character '|'

> Returns: (result)
*   result: Result of the evaluation. A list of 2 elements, the first element is loss, and the second element is accuracy









In [8]:
@timed
def evaluate(_model,text):
  """Evaluate the model on one piece of '|'-tokenized text.

  The text goes through `convert_data`, `split_data` and
  `create_training_data`, and the result is passed to `_model.evaluate`.

  Args:
    _model: the model to evaluate; it receives the three inputs
      [before, current, after] and the label array.
    text: raw text in which words are separated by '|'.

  Returns:
    Whatever `_model.evaluate` returns. This function is wrapped by `timed`,
    so what the caller actually receives depends on that decorator.
  """
  encoded, labels = convert_data(text)
  chunks, chunk_labels = split_data(encoded, labels)
  before, current, after, targets = create_training_data(chunks, chunk_labels)
  return _model.evaluate([before, current, after], targets)

# **Load the saved model**

In [None]:
# Load a saved Keras model from the file 'model.h5' (path is relative to the
# current working directory). `tokenize` reads this module-level `model`.
model = tf.keras.models.load_model('model.h5')

# **Train the model**

In [None]:
# Train on the files news_00001.txt .. news_00080.txt, one `train` call per file.
for i in range(80):
  # `with` closes the file even if reading raises.
  with open('news_{:05d}.txt'.format(i+1)) as file:
    train_text = file.read()
  train(model,train_text)

In [11]:
import time

# **Evaluate the model**

In [22]:
raw_data = ""
for i in range(5):
  # NOTE(review): the file index is the constant 445+1 and does not use `i`,
  # so the same file is read and appended five times. Presumably `445+i+1`
  # was intended -- confirm which files exist before changing it.
  # `with` closes the file even if reading raises.
  with open('train_{:05d}.txt'.format(445+1),'r') as target_file:
    raw_data += target_file.read()
evaluate(model,raw_data)

evaluate took 37.352768898010254 seconds to complete its execution.
[0.04034298658370972, 0.9855148792266846]


# **Function: ```tokenize(text)```**
Takes a normal string and tokenizes it. Each sentence must be shorter than 256 characters, and sentences must be separated by spaces.

> Accepts: (text)
*   text: the raw text that is going to be tokenized

> Returns: (tokenized)
*   tokenized: the result of tokenization, in form of list of strings, where each string is the word tokenized






In [None]:
def tokenize(text):
  """Split a plain string into words using the module-level `model`.

  The text is encoded and chunked with `split_text_data`. Each chunk is then
  walked one character at a time: the model is asked for a prediction on
  (words already cut, word in progress, rest of the chunk), and a prediction
  above 0.5 ends the word in progress.

  Args:
    text: the raw text to tokenize.

  Returns:
    A list of strings, one per word, in order.
  """
  tokenized = []
  for chunk in split_text_data(encode(text)):
    word_start = 0  # index in `chunk` where the word in progress begins
    for pos in range(len(chunk)):
      before = chunk[:word_start]
      current = chunk[word_start:pos + 1]
      after = chunk[pos + 1:]
      pred = model.predict([sequence.pad_sequences([before],256),sequence.pad_sequences([current],256),sequence.pad_sequences([after],256)])[0]
      if pred > 0.5:
        tokenized.append(decode(current))
        word_start = pos + 1
    # Whatever is left at the end of the chunk becomes a final word.
    if word_start < len(chunk):
      tokenized.append(decode(chunk[word_start:]))
  return tokenized

In [7]:
def timed(func):
  """Decorator that reports how long each call to `func` takes.

  After every call the wrapper prints the elapsed time in seconds and the
  returned value, then hands that value back to the caller.

  Args:
    func: the function to wrap.

  Returns:
    A wrapper with the same name and docstring as `func` that forwards all
    positional and keyword arguments and returns what `func` returns.
  """
  import functools  # local import: only this decorator needs it

  @functools.wraps(func)
  def function_timer(*args, **kwargs):
    # perf_counter is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    value = func(*args, **kwargs)
    runtime = time.perf_counter() - start
    msg = "{func} took {time} seconds to complete its execution."
    print(msg.format(func = func.__name__,time = runtime))
    print(value)
    # Return the result so decorated functions keep their return value.
    return value
  return function_timer