<a href="https://colab.research.google.com/github/athensclub/Thai-Word-Cutter/blob/master/model_with_fixed_vision_file_181_to_190.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#imports
from google.colab import files
from tensorflow.keras import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Flatten,LSTM
import tensorflow as tf
import os
import numpy as np

#Create a mapping from a character to an integer (and back).
#Codes start at 1; 0 is left free so it can stand for "unknown character".
characters = 'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙abcdefghijklmnopqrstuvwxyz"\'0123456789,.!?/\\:;%()[]{}+_-*@#><=^$& \t\n'
#character -> code
char_encode = {char: code for code, char in enumerate(characters, start=1)}
#code -> character
char_decode = dict(enumerate(characters, start=1))

def encode(data):
  """Turn a string into a list of integer character codes.

  The text is lower-cased first. Each character is then looked up in
  char_encode; a character that is not in that table becomes 0.
  """
  return [char_encode.get(ch, 0) for ch in data.lower()]

def decode(data):
  """Turn an iterable of character codes back into a string.

  Codes equal to 0 are skipped; every other code is looked up in
  char_decode and the resulting characters are concatenated in order.
  """
  return ''.join(char_decode[code] for code in data if code != 0)

#convert raw data (text whose words are separated by '|') into model labels
def convert_data(data):
  """Split '|'-delimited text into character codes and word-end labels.

  Returns a tuple (encoded, ans):
    encoded -- list of codes produced by encode() for the text with every
               '|' removed.
    ans     -- NumPy array with one entry per code; it holds 1 at the last
               character of each non-empty segment and 0 everywhere else.
  """
  # Empty segments (leading, trailing or doubled '|') contribute nothing.
  words = [w for w in data.split('|') if len(w) > 0]
  encoded = encode(''.join(words))
  ans = np.zeros(len(encoded))
  # The running total of segment lengths, minus one, is the index of the
  # final character of each segment.
  word_ends = np.cumsum([len(w) for w in words], dtype=int) - 1
  ans[word_ends] = 1
  return encoded, ans

#create the three model inputs for a vision (window size) of `length`
def create_model_data(encoded,ans,length):
  """Build the 'before', 'current' and 'after' inputs for every position.

  For each index i of `encoded`:
    before  -- up to `length` codes immediately preceding i
    current -- the codes from the start of the current word through i,
               where a word ends at every index whose `ans` value is 1
    after   -- up to `length` codes immediately following i

  Each of the three lists is passed through sequence.pad_sequences with
  maxlen=`length` (Keras defaults: zeros are added, and surplus items are
  dropped, at the front), and the three resulting arrays are returned as a
  tuple (before, current, after).
  """
  before = []
  current = []
  after = []
  word_so_far = []
  for pos, code in enumerate(encoded):
    word_so_far.append(code)
    before.append(encoded[max(0, pos - length):pos])
    # Snapshot, because word_so_far keeps growing until the word ends.
    current.append(list(word_so_far))
    after.append(encoded[pos + 1:pos + 1 + length])
    if ans[pos] == 1:
      # This position closes a word; the next one starts a fresh word.
      word_so_far = []
  padded_before = sequence.pad_sequences(before, length)
  padded_current = sequence.pad_sequences(current, length)
  padded_after = sequence.pad_sequences(after, length)
  return padded_before, padded_current, padded_after

#create a model with the vision of given length
def create_model(length):
  """Build and compile the three-input word-boundary classifier.

  The model takes three integer sequences of `length` codes each, named
  'before', 'current' and 'after'. Each goes through its own
  Embedding(64) and LSTM(128); the three LSTM outputs are concatenated,
  passed through Dense(128, relu) and a single sigmoid unit named
  'output'. The model is compiled with binary cross-entropy, the Adam
  optimizer and the 'acc' metric, then returned.
  """
  # encode() assigns codes 1..len(characters) and uses 0 for unknown
  # characters (pad_sequences pads with 0 as well), so the largest index fed
  # to the embeddings is len(characters). Embedding's input_dim must be the
  # largest index + 1; using len(characters) left the last character ('\n')
  # out of range.
  vocab_size = len(characters) + 1

  before_input = Input(shape=(length,), name='before')
  current_input = Input(shape=(length,), name='current')
  after_input = Input(shape=(length,), name='after')

  before_features = Embedding(vocab_size, 64)(before_input)
  current_features = Embedding(vocab_size, 64)(current_input)
  after_features = Embedding(vocab_size, 64)(after_input)

  before_features = LSTM(128)(before_features)
  current_features = LSTM(128)(current_features)
  after_features = LSTM(128)(after_features)

  x = Concatenate()([before_features, current_features, after_features])
  x = Dense(128,activation='relu')(x)
  out = Dense(1,activation='sigmoid',name='output')(x)

  model = Model(inputs=[before_input, current_input, after_input],
                outputs=[out])

  model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['acc'])

  return model

#fit the given model on one raw '|'-delimited text, using the given vision length
def train(model,data,length):
  """Run one model.fit call on the examples derived from `data`.

  `data` is converted with convert_data and create_model_data; the three
  resulting input arrays and the label array are handed to model.fit with
  its default settings. Nothing is returned.
  """
  encoded, labels = convert_data(data)
  model_inputs = list(create_model_data(encoded, labels, length))
  model.fit(model_inputs, np.asarray(labels))

#evaluate the given model with the given vision length with the given raw data
def evaluate(model,data,length):
  """Evaluate `model` on the examples derived from `data`.

  `data` is converted with convert_data and create_model_data, and the
  resulting inputs and labels are passed to model.evaluate.

  Returns whatever model.evaluate returns (the loss and metric values), so
  callers can compare models programmatically instead of relying on the
  printed progress output. Earlier versions returned None.
  """
  (encoded,ans) = convert_data(data)
  (before,current,after) = create_model_data(encoded,ans,length)
  return model.evaluate([before,current,after],np.asarray(ans))

#one model per vision (window) length; models[k] was built with visions[k]
visions = [10, 20, 30, 50, 100, 200]
models = [create_model(vision) for vision in visions]


In [3]:
#train every model on train_00181.txt .. train_00190.txt, one file at a time
for file_number in range(181, 191):
  # `with` closes the file even if reading fails. The encoding is given
  # explicitly so the Thai text is decoded the same way on every platform.
  # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
  with open('train_{:05d}.txt'.format(file_number), 'r', encoding='utf-8') as target_file:
    raw_data = target_file.read()
  for model, vision in zip(models, visions):
    train(model, raw_data, vision)




In [0]:
#save every model first, then hand each saved file to the Colab download helper
saved_paths = []
for model, vision in zip(models, visions):
  path = 'model_vision_' + str(vision) + '.h5'
  model.save(path)
  saved_paths.append(path)

for path in saved_paths:
  files.download(path)
