# **Chatbot with Javanese-script (Aksara Jawa) output using Seq2Seq LSTM models**

This project creates a conversational chatbot using sequence-to-sequence (Seq2Seq) LSTM models.
Sequence-to-sequence learning is about training models to convert sequences from one domain into sequences in another domain.

# Import all the packages 

In [None]:
#Preprocessing Data
import pandas as pd
import numpy as np

import re
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

#Extract Csv
import os
import csv

#NLP
import tensorflow as tf
import pickle
from tensorflow.keras import layers, activations, models, utils, preprocessing

from gensim.models import Word2Vec
import re

# Preprocessing the data

Create List for Question and Answers

In [None]:
# Load the question/answer corpus; the relative path is resolved against the
# current working directory.
dataframe = pd.read_csv("Corpus 2 Terbaru Gabungan - Sheet1.csv")

In [None]:
def text_clean(corpus):
    '''
    Purpose : Lower-case every row and replace each character that is not an ASCII letter
              (digits, punctuation, question marks, ...) with a single space.

    Input : 'corpus' - iterable of strings (for example a pandas Series), one document per row

    Output : pandas Series with one cleaned string per input row, indexed 0..n-1

    Note : Rows are split on whitespace and re-joined with single spaces, so a word such as
           "b2" becomes "b " and the joined row may contain double spaces. Callers that
           split the result on whitespace again are not affected.
    '''
    non_letter = re.compile('[^a-zA-Z]')
    cleaned_rows = [
        ' '.join(non_letter.sub(' ', word).lower() for word in row.split())
        for row in corpus
    ]
    # Build the Series once: Series.append was removed in pandas 2.0 and appending
    # row by row copied the whole Series on every iteration.
    return pd.Series(cleaned_rows, dtype=object)

In [None]:
def stopwords_removal(corpus):
    '''
    Purpose : Split every sentence on whitespace and drop NLTK English stop words,
              while keeping the question words (who, what, when, ...).

    Input : 'corpus' - iterable of sentence strings

    Output : list of token lists, one per sentence
    '''
    question_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop_set = set(stopwords.words('english'))
    # Question words carry meaning for a chatbot, so they must survive filtering.
    for keep in question_words:
        stop_set.remove(keep)

    tokenised = []
    for sentence in corpus:
        tokenised.append([token for token in sentence.split() if token not in stop_set])
    return tokenised

In [None]:
def lemmatize(corpus):
    '''
    Purpose : Lemmatize every token as a verb (pos='v') with the WordNet lemmatizer.

    Input : 'corpus' - iterable of token lists

    Output : list of token lists with each token replaced by its lemma
    '''
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for tokens in corpus:
        lemmatized.append([lemmatizer.lemmatize(token, pos='v') for token in tokens])
    return lemmatized

In [None]:
def stem(corpus, stem_type = None):
    '''
    Purpose : Stem every token of every document.

    Input :
    'corpus' - iterable of token lists
    'stem_type' - 'snowball' selects the English Snowball stemmer; any other value
                  (including the default None) selects the Porter stemmer

    Output : list of token lists with each token replaced by its stem
    '''
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
    else:
        stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

In [None]:
def preprocess(corpus, cleaning = False, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = False):

    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Text corpus (iterable of strings) on which pre-processing tasks will be performed

    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Flags indicating whether a particular task should
                                                                  be performed or not (any truthy value enables it)
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus as a list of strings; tokens of each row are
             re-joined with single spaces

    '''
    if cleaning:
        corpus = text_clean(corpus)

    # From here on every document is a list of tokens.
    if remove_stopwords:
        documents = stopwords_removal(corpus)
    else:
        documents = [sentence.split() for sentence in corpus]

    if lemmatization:
        documents = lemmatize(documents)

    if stemming:
        documents = stem(documents, stem_type)

    return [' '.join(tokens) for tokens in documents]

In [None]:
# Row positions to keep: every row except positions 0, 1001 and 1002.
# (Use e.g. set(range(500, 750)) instead to work on a specific window of rows.)
# NOTE(review): `slice` shadows the Python builtin of the same name; the name is
# kept unchanged here because other cells may refer to it.
slice = set(range(len(dataframe))).difference({0, 1001, 1002})

# Columns to DROP for the question frame: everything except column 1 (Indonesian question).
DcolQ = [0, 2, 3, 4, 5, 6, 7, 8]
dfQ = dataframe.drop(columns=dataframe.columns[DcolQ]).take(list(slice))

# Columns to DROP for the answer frame: everything except column 5 (Javanese answer).
DcolA = [0, 1, 2, 3, 4, 6, 7, 8]
dfA = dataframe.drop(columns=dataframe.columns[DcolA]).take(list(slice))

In [None]:
# The answer column has no header in the CSV, so pandas names it 'Unnamed: 5';
# give it a usable name.
dfC = dfA.rename(columns={'Unnamed: 5': 'Answer'})

# Re-wrap the values in fresh Series (default 0..n-1 index) and force string dtype.
corpusQ = pd.Series(dfQ['INDONESIA'].tolist()).astype(str)
corpusA = pd.Series(dfC['Answer'].tolist()).astype(str)

In [None]:
# Questions and answers go through the same pipeline: stop-word removal plus
# verb lemmatization, without cleaning or stemming.
# NOTE(review): the stop-word list and lemmatizer are English resources while the
# corpus columns are Indonesian/Javanese - confirm this is intended.
Quest = preprocess(
    corpusQ,
    cleaning=False,
    remove_stopwords=True,
    lemmatization=True,
    stemming=False,
)
Answ = preprocess(
    corpusA,
    cleaning=False,
    remove_stopwords=True,
    lemmatization=True,
    stemming=False,
)

Start Here

In [None]:
# Keep only the (question, answer) pairs whose answer is a string.
# Filtering the pairs together keeps both lists aligned; the previous version
# popped from `questions` by index while iterating, which shifts every later
# index, and it mutated `Quest` through an alias.
valid_pairs = [
    (question, answer)
    for question, answer in zip(Quest, Answ)
    if isinstance(answer, str)
]
questions = [question for question, _ in valid_pairs]
answers_with_tags = [answer for _, answer in valid_pairs]

# Wrap every answer in start/end markers for the decoder. The Keras Tokenizer
# lower-cases text and strips '<' and '>', so the markers end up in the
# vocabulary as the plain words 'start' and 'end'.
answers = ['<START> ' + answer + ' <END>' for answer in answers_with_tags]

# One shared vocabulary for questions and answers; index 0 is reserved for padding,
# hence the +1.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

### b) Preparing data for Seq2Seq model

This model requires 3 arrays encoder_input_data, decoder_input_data and decoder_output_data.

For encoder_input_data:
Tokenize the questions and pad them to their maximum length.

For decoder_input_data:
Tokenize the answers and pad them to their maximum length.

For decoder_output_data:
Tokenize the answers and remove the first element from every tokenized answer. This is the <START> element which was added earlier.

In [None]:
# Every word known to the tokenizer, in word_index order.
vocab = list(tokenizer.word_index)

def tokenize(sentences):
  """Lower-case each sentence, blank out non-letters and split on whitespace.

  Returns a pair (tokens_list, vocabulary): tokens_list holds one token list per
  sentence, vocabulary is the flat concatenation of all tokens (duplicates kept,
  in order of appearance).
  """
  tokens_list = [
    re.sub('[^a-zA-Z]', ' ', sentence.lower()).split()
    for sentence in sentences
  ]
  vocabulary = [token for tokens in tokens_list for token in tokens]
  return tokens_list, vocabulary

In [None]:
# encoder_input_data: questions as integer ids, zero-padded on the right up to
# the length of the longest question.
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(map(len, tokenized_questions))
padded_questions = preprocessing.sequence.pad_sequences(
    tokenized_questions,
    maxlen=maxlen_questions,
    padding='post',
)
encoder_input_data = np.array(padded_questions)
print(encoder_input_data.shape, maxlen_questions)

In [None]:
# decoder_input_data: tagged answers as integer ids, zero-padded on the right up
# to the length of the longest answer.
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(map(len, tokenized_answers))
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)
decoder_input_data = np.array(padded_answers)
print(decoder_input_data.shape, maxlen_answers)

In [None]:
# decoder_output_data: the answers shifted left by one step (leading start marker
# dropped), padded to maxlen_answers and one-hot encoded over the vocabulary.
tokenized_answers = [sequence[1:] for sequence in tokenizer.texts_to_sequences(answers)]
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)
onehot_answers = utils.to_categorical(padded_answers, VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)
print(decoder_output_data.shape)

# Defining Encoder Decoder Model





In [None]:
# ---- Encoder: embed the question and keep only the final LSTM states ----
encoder_inputs = layers.Input(shape=(maxlen_questions,))
encoder_embedding = layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# ---- Decoder: starts from the encoder states and emits one distribution per step ----
decoder_inputs = layers.Input(shape=(maxlen_answers,))
decoder_embedding = layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(decoder_inputs)
# The LSTM and Dense layer objects are kept in variables because the inference
# models reuse them.
decoder_lstm = layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(VOCAB_SIZE, activation=activations.softmax)
output = decoder_dense(decoder_outputs)

# ---- Training model: (question ids, answer ids) -> next-word distributions ----
model = models.Model([encoder_inputs, decoder_inputs], output)
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss='categorical_crossentropy',
)

model.summary()

# Training the Model

We train the model for a number of epochs with RMSprop optimizer and categorical_crossentropy loss function.

In [None]:
# Teacher forcing: the decoder is fed the tagged answer and trained to predict
# the same answer shifted by one step (decoder_output_data).
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=300 ) #epoch 200/loss 0.03

# Defining Inference Models

Encoder Inference Model: Takes questions as input and outputs LSTM states (h and c)

Decoder Inference Model: Takes two inputs: the LSTM states and the answer input sequence. It outputs the answer for the question that was fed to the encoder model, together with the updated state values.

In [None]:
def make_inference_models():
    """Build the encoder and decoder models used for step-by-step decoding.

    Both models reuse the layers/tensors of the training graph defined at module
    level (encoder_inputs, encoder_states, decoder_inputs, decoder_embedding,
    decoder_lstm, decoder_dense).

    Returns:
        (encoder_model, decoder_model) where
        - encoder_model maps question ids to the LSTM states [h, c];
        - decoder_model maps [answer ids, h, c] to [word probabilities, new h, new c].
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Explicit inputs for the decoder state (h first, then c); 200 is the LSTM
    # size used when the decoder was defined.
    state_inputs = [tf.keras.layers.Input(shape=( 200 ,)) for _ in range(2)]

    sequence_out, next_h, next_c = decoder_lstm(
        decoder_embedding, initial_state=state_inputs)
    word_probabilities = decoder_dense(sequence_out)

    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + state_inputs,
        [word_probabilities, next_h, next_c])

    return encoder_model, decoder_model

# Talking with the Chatbot

We define a method `str_to_tokens` which converts a question string to integer tokens with padding.

1. First, we take a question as input and predict the state values using enc_model.
2. We set the state values in the decoder's LSTM.
3. Then, we generate a sequence which contains the <start> element.
4. We input this sequence in the dec_model.
5. We replace the <start> element with the element which was predicted by the dec_model and update the state values.
6. We carry out the above steps iteratively till we hit the <end> tag or the maximum answer length.



In [None]:
def str_to_tokens( sentence : str ):
    """Convert a question string into a padded array of token ids for the encoder.

    The fitted tokenizer does the conversion, so the sentence gets the same
    lower-casing and punctuation filtering as the training data, and words that
    are not in the vocabulary are skipped instead of raising KeyError.

    Returns an integer array of shape (1, maxlen_questions), zero-padded on the right.
    """
    token_ids = tokenizer.texts_to_sequences( [sentence] )
    return preprocessing.sequence.pad_sequences( token_ids , maxlen=maxlen_questions , padding='post')


In [None]:
#Java Writing function
#!/usr/bin/env python
#-*- coding: utf-8 -*-

"""
__version__     =   "0.0.1"
__author__      =   "@lantip"
__date__        =   "2019/04/02"
__description__ =   "Latin to Javanese Transliterator"
""" 

import sys

# Base letters (aksara nglegena) keyed by their Latin spelling, plus the two
# punctuation marks the transliterator emits.
HURUF = {
    'h': 'ꦲ',
    'n': 'ꦤ',
    'c': 'ꦕ',
    'r': 'ꦫ',
    'k': 'ꦏ',
    'd': 'ꦢ',
    't': 'ꦠ',
    's': 'ꦱ',
    'w': 'ꦮ',
    'l': 'ꦭ',
    'p': 'ꦥ',
    'dh': 'ꦝ',
    'j': 'ꦗ',
    'y': 'ꦪ',
    'ny': 'ꦚ',
    'm': 'ꦩ',
    'g': 'ꦒ',
    'b': 'ꦧ',
    'th': 'ꦛ',
    'ng': 'ꦔ',
    ',': '꧈',
    '.': '꧉'
}

# Vowel signs and other diacritics.
SANDHANGAN = {
    'wulu': 'ꦶ',
    'suku': 'ꦸ',
    'pepet': 'ꦼ',
    'taling': 'ꦺ',
    'taling-tarung': 'ꦺꦴ',
    'cecak': 'ꦁ',
    'wignyan': 'ꦃ',
    'layar': 'ꦂ',
    'cakra': 'ꦿ',
    'keret': 'ꦽ',
    'pengkal': 'ꦾ',
    'pangkon': '꧀'
}

# Conjunct (pasangan) forms: pangkon followed by the base letter. The table is
# derived from HURUF so the two can never disagree (the hand-written table had a
# different glyph for 'dh' than HURUF['dh']).
PASANGAN = {
    latin: SANDHANGAN['pangkon'] + glyph
    for latin, glyph in HURUF.items()
    if latin not in (',', '.')
}

# After these single-letter tokens (written as signs, or a bare 'y') the next
# consonant keeps its base form.
_NO_PASANGAN_AFTER = ('h', 'r', 'y')


def _base_glyph(key, prv, no_pasangan_after=()):
    """Return the pasangan form of *key* after a bare consonant, else the base letter."""
    if prv and len(prv) == 1 and prv not in no_pasangan_after:
        return PASANGAN[key]
    return HURUF[key]


def _cluster_marks(hrf):
    """Return (marks, is_keret) for the consonant that follows a two-letter digraph.

    Only tokens longer than three characters can carry such a consonant
    (digraph + l/y/r + vowel).
    """
    if len(hrf) <= 3:
        return '', False
    third = hrf[2]
    if third == 'l':
        return PASANGAN['l'], False
    if third == 'y':
        return SANDHANGAN['pengkal'], False
    if third == 'r':
        # 'r' + 'e' is written with the single keret sign, which already
        # carries the vowel.
        if hrf[3] == 'e':
            return SANDHANGAN['keret'], True
        return SANDHANGAN['cakra'], False
    return '', False


def transliterate(hrf, isend, prv, nxt):
    """Render one Latin syllable token as Javanese script.

    Args:
        hrf: the token, e.g. 'ka', 'ng', 'tra', 'nya'.
        isend: True when the token is the last one of the phrase; a bare final
            consonant then gets a pangkon.
        prv: the previous token, or None for the first one. A one-letter
            previous token (a bare consonant) switches this token's consonant
            to its pasangan form.
        nxt: the next token, or None for the last one.

    Returns:
        The Javanese string for the token ('' for punctuation tokens).

    Raises:
        KeyError: if the token starts with a letter that has no entry in HURUF.
    """
    ltr = ''
    iskeret = False
    digraph = hrf[:2]

    # --- consonant part -------------------------------------------------
    # One chain: a token is either a digraph token or an ordinary one.
    # Previously the length checks started a second `if` chain, so 'ng' got
    # cecak twice and two-letter 'ny'/'th'/'dh' tokens got an extra letter.
    if digraph == 'ng':
        # A bare 'ng' closes the previous syllable and is written as cecak.
        ltr += SANDHANGAN['cecak'] if len(hrf) == 2 else HURUF['ng']
        marks, iskeret = _cluster_marks(hrf)
        ltr += marks
    elif digraph in ('ny', 'th', 'dh'):
        ltr += _base_glyph(digraph, prv)
        # All digraphs share the same cluster rule; the 'dh' branch used to read
        # hrf[4] instead of hrf[3] and raised IndexError on tokens like 'dhre'.
        marks, iskeret = _cluster_marks(hrf)
        ltr += marks
    elif len(hrf) == 2:
        ltr += _base_glyph(hrf[0], prv, _NO_PASANGAN_AFTER)
    elif len(hrf) == 1:
        if hrf == 'r':
            ltr += SANDHANGAN['layar']
        elif hrf == 'h':
            ltr += SANDHANGAN['wignyan']
        elif hrf not in (',', '.'):
            ltr += HURUF[hrf]
            if isend:
                ltr += SANDHANGAN['pangkon']
    elif len(hrf) > 2:
        if hrf[1] == 'l':
            ltr += HURUF[hrf[0]] + PASANGAN['l']
        elif hrf[1] == 'y' and hrf[0] != 'n':
            ltr += HURUF[hrf[0]] + SANDHANGAN['pengkal']
        elif hrf[1] == 'r':
            ltr += _base_glyph(hrf[0], prv, _NO_PASANGAN_AFTER) + SANDHANGAN['cakra']

    # --- vowel signs ------------------------------------------------------
    # 'a' is the inherent vowel and needs no sign.
    if hrf.find('u') == (len(hrf) - 1):
        ltr += SANDHANGAN['suku']
    if 'é' in hrf or 'è' in hrf:
        ltr += SANDHANGAN['taling']
    if hrf.find('e') == (len(hrf) - 1) and not iskeret:
        ltr += SANDHANGAN['pepet']
    if hrf.find('i') == (len(hrf) - 1):
        ltr += SANDHANGAN['wulu']
    if 'o' in hrf:
        ltr += SANDHANGAN['taling-tarung']

    if nxt == '.':
        ltr += HURUF[nxt]
    return ltr


def translate(word):
    """Split a lower-case Latin word into the syllable tokens used by transliterate().

    Args:
        word: a single word without spaces, e.g. 'bawang'.

    Returns:
        A list of tokens such as ['ba', 'wa', 'ng']. A word starting with a vowel
        gets an 'h' carrier ('aku' -> ['ha', 'ku']). An empty word yields [].

    Notes:
        * The `i = i + n` statements inside the loop do NOT skip characters: the
          `for` statement rebinds `i` on every iteration. Characters that were
          already consumed are filtered out by the individual branches instead
          (vowels match no branch, 'g' after 'ng' is ignored, ...).
        * Letters that appear in none of the tables (e.g. 'f', 'v', 'z') are
          silently dropped.
    """
    ltr = []
    # Nothing to tokenise; word[0] below would raise IndexError.
    if not word:
        return ltr
    start = 0
    consonant = ['c','k','s','w','l','p','j','m','b']
    specials = ['t','d']
    dobel = ['th', 'dh', 'ny', 'ng']
    insrt = [ 'h','y','g','n']
    vowels = "AaEeÈèÉéIiOoUuÊêĚěĔĕṚṛXxôâāīūō"
    # Word-initial digraph: digraph + vowel, or digraph + 'r' + vowel.
    # The second case used to hang off `elif len(word) >= 4` after
    # `if len(word) >= 3` and could therefore never run.
    for dob in dobel:
        if word.find(dob) == 0:
            if len(word) >= 3 and word[2] in vowels:
                ltr.append(dob+word[2])
                start = 3
            elif len(word) >= 4 and word[2] == 'r' and word[3] in vowels:
                ltr.append(dob+'r'+word[3])
                start = 4
    # Word-initial h/y/g/n followed by a vowel or by l/r/y + vowel.
    for ins in insrt:
        if word.find(ins) == 0:
            if len(word) >=2:
                if word[1] in vowels:
                    ltr.append(ins+word[1])
                    start = 2
                elif word[1] in ['l', 'r', 'y']:
                    # Length guard: two-letter words such as 'ny' have no word[2].
                    if len(word) > 2 and word[2] in vowels:
                        ltr.append(ins+word[1]+word[2])
                        start = 3
    # A word-initial vowel is carried by 'h'.
    if word[0] in vowels:
        ltr.append('h'+word[0])
        start = 1
    for i in range(start,len(word)):
        if word[i] in consonant:
            try:
                if len(word[i:]) > 1:
                    if word[i+1] in vowels and word[i] != 'l':
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
                    else:
                        if word[i+1] in ['l', 'r','y']:
                            if len(word[i:]) > 2:
                                if word[i+2] in vowels:
                                    ltr.append(word[i]+word[i+1]+word[i+2])
                                    i = i + 3
                                else:
                                    ltr.append(word[i]+word[i+1])
                                    i = i + 2
                            else:
                                if (i-2) >= 0:
                                    if len(word[i:]) > 1:
                                        if word[i] not in word[i-2]+word[i-1]:
                                            ltr.append(word[i]+word[i+1])
                                            i = i + 2
                        else:
                            if word[i] != 'l':
                                ltr.append(word[i])
                                i = i + 1
                            else:
                                if len(word[i:]) > 1:
                                    if word[i+1] in vowels:
                                        if len(ltr) > 0:
                                            # Skip an 'l' + vowel that an earlier cluster token already contains.
                                            if not word[i]+word[i+1] in ltr[len(ltr)-1]:
                                                ltr.append(word[i]+word[i+1])
                                                i = i + 2
                                        else:
                                            ltr.append(word[i]+word[i+1])
                                            i = i + 2
                else:
                    ltr.append(word[i])
                    i = i + 1
            except IndexError:
                ltr.append(word[i])
                i = i + 1
        elif word[i] in specials:
            try:
                if len(word[i:]) >=2:
                    if word[i+1] == 'h' and word[i+2] in vowels:
                        ltr.append(word[i]+word[i+1]+word[i+2])
                        i = i + 3
                    elif word[i+1] in ['l', 'r']:
                        if len(word[i:]) > 2:
                            if word[i+2] in vowels:
                                ltr.append(word[i]+word[i+1]+word[i+2])
                                i = i + 3
                            else:
                                ltr.append(word[i]+word[i+1])
                                i = i + 2
                        else:
                            ltr.append(word[i]+word[i+1])
                            i = i + 2
                    elif word[i+1] in vowels:
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
                elif len(word[i:]) == 1:
                    # Last character of the word: word[i+1] raises IndexError and
                    # the handler below appends the bare consonant.
                    if word[i+1] == 'h':
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
                    elif word[i+1] in vowels:
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
            except IndexError:
                ltr.append(word[i])
                i = i + 1
        elif word[i] == 'n':
            if len(word[i:]) > 2:
                if word[i+1] in ['g','y'] and word[i+2] in vowels:
                    ltr.append(word[i]+word[i+1]+word[i+2])
                    i = i + 3
                elif word[i+1] in ['g','y'] and word[i+2] not in vowels:
                    ltr.append(word[i]+word[i+1])
                    i = i + 2
                else:
                    if word[i+1] in vowels:
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
                    else:
                        ltr.append(word[i])
                        i = i + 1
            else:
                try:
                    nxt = word[i+1]
                except IndexError:
                    nxt = None
                if nxt:
                    if nxt in vowels:
                        ltr.append(word[i]+nxt)
                        i = i + 2
                    elif nxt == 'g':
                        ltr.append(word[i]+nxt)
                        i = i + 2
                    else:
                        ltr.append(word[i])
                        i = i + 1
                else:
                    ltr.append(word[i])
                    i = i + 1
        elif word[i] in ['r','y']:
            if i == 0:
                if len(word[i:]) > 1:
                    if word[i+1] in vowels:
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
            else:
                if len(word[i:]) > 1:
                    if word[i+1] in vowels:
                        if word[i-1] not in vowels:
                            if (i-2) >=0:
                                if (word[i-2]+word[i-1]) in dobel:
                                    ltr.append(word[i-2]+word[i-1]+word[i]+word[i+1])
                                    i = i + 2
                                else:
                                    if not ltr:
                                        # No earlier token to attach to: start a new syllable
                                        # instead of indexing an empty list.
                                        ltr.append(word[i]+word[i+1])
                                    elif not (word[i]+word[i+1]) in ltr[len(ltr)-1]:
                                        ltr[len(ltr)-1] = ltr[len(ltr)-1] + word[i] + word[i+1]
                                        i = i + 1
                        else:
                            ltr.append(word[i]+word[i+1])
                            i = i + 2
                    else:
                        ltr.append(word[i])
                        i = i + 1
                else:
                    ltr.append(word[i])
                    i = i + 1
        elif word[i] == 'g':
            # '' stands in for "no previous token" so an empty list is never indexed.
            prev_tok = ltr[len(ltr)-1] if ltr else ''
            if 'g' in prev_tok and len(prev_tok) >= 2:
                pass
            else:
                if len(word[i:]) > 1:
                    if word[i+1] in vowels:
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
                    else:
                        if (i-2) > 0:
                            if (word[i-2]+word[i-1]) == 'ng':
                                pass
                        else:
                            if prev_tok != 'ng':
                                ltr.append(word[i])
                                i = i + 1
                            else:
                                i = i + 1
                else:
                    if (i-2) > 0:
                        if (word[i-2] + word[i-1]) == 'ng':
                            pass
                        elif (word[i-1] + word[i]) == 'ng':
                            pass
                        else:
                            ltr.append(word[i])
                            i = i + 1
                    else:
                        ltr.append(word[i])
                        i = i + 1
        elif word[i] == 'h':
            prev_tok = ltr[len(ltr)-1] if ltr else ''
            if 'h' in prev_tok and len(prev_tok) > 2:
                pass
            else:
                if len(word[i:]) > 1:
                    if word[i+1] in vowels:
                        ltr.append(word[i]+word[i+1])
                        i = i + 2
                    else:
                        ltr.append(word[i])
                        i = i + 1
                else:
                    ltr.append(word[i])
                    i = i + 1
    return ltr


def translatethis(text):
    """Tokenise one word, keeping a comma or period as a trailing token.

    The punctuation characters are removed before the word is handed to
    translate(); if the word contained a comma the result ends with ',',
    otherwise, if it contained a period, with '.'. A word that is only
    punctuation yields just the punctuation token.
    """
    if ',' in text:
        mark = [',']
    elif '.' in text:
        mark = ['.']
    else:
        mark = []
    # The period branch used to strip ',' instead of '.', so the period stayed
    # inside the word that was tokenised.
    bare = text.replace(',', '').replace('.', '')
    # translate() cannot handle an empty word, so skip it when only punctuation was given.
    syllables = translate(bare) if bare else []
    return syllables + mark

def dotranslate(word):
    """Tokenise a phrase: split on whitespace and hyphens, lower-case each piece
    and concatenate the tokens returned by translatethis() for every piece."""
    trslt = []
    for chunk in word.split():
        # str.split('-') returns the chunk itself when it contains no hyphen, so
        # hyphenated and plain words share one code path.
        for piece in chunk.split('-'):
            trslt.extend(translatethis(piece.lower()))
    return trslt


def _transliterate_tokens(tokens):
    """Render a token list, giving each token its neighbours and end-of-phrase flag."""
    rendered = ''
    last = len(tokens) - 1
    for index, token in enumerate(tokens):
        prv = tokens[index - 1] if index >= 1 else None
        nxt = tokens[index + 1] if index < last else None
        rendered += transliterate(token, index == last, prv, nxt)
    return rendered


def dotransliterate(word):
    """Transliterate Latin-script Javanese text into Javanese script.

    The text is split into sentences on '.' and into clauses on ','; every
    clause is tokenised with dotranslate() and rendered on its own, so the last
    token of each clause is treated as phrase-final. Clauses are joined with the
    Javanese comma and sentences with the Javanese period.

    Text without a period is handled the same way. Previously such text was not
    split on commas at all, which dropped the comma and made the consonant after
    it use its pasangan form.
    """
    sentences = []
    for sentence in word.split('.'):
        clauses = [
            _transliterate_tokens(dotranslate(clause))
            for clause in sentence.split(',')
        ]
        sentences.append(HURUF[','].join(clauses))
    return HURUF['.'].join(sentences)

In [None]:
def text_to_javanese(text):
    """Transliterate *text* to Javanese script.

    Returns a one-element list holding the lower-cased transliteration, or the
    empty string '' when *text* is empty (both kept for backward compatibility).
    """
    if not text:
        return ''
    # One call is enough: the old per-character loop recomputed the same
    # transliteration len(text) times.
    return [dotransliterate(text).lower()]

In [None]:
enc_model , dec_model = make_inference_models()

# Interactive session: ten questions, each answered by greedy decoding.
for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )

    # Decoding starts from the start marker ('<START>' is stored as 'start').
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']

    decoded_words = []
    # Bounded loop: at most maxlen_answers + 1 words are produced, and decoding
    # also ends if the model keeps predicting the padding id 0 (which maps to no word).
    for _step in range( maxlen_answers + 1 ):
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = int( np.argmax( dec_outputs[0, -1, :] ) )
        # index_word is the inverse of word_index; a direct lookup replaces the
        # scan over the whole vocabulary.
        sampled_word = tokenizer.index_word.get( sampled_word_index )

        # The end marker terminates the reply and is not part of it, so it must
        # not be printed or transliterated.
        if sampled_word == 'end':
            break
        if sampled_word is not None:
            decoded_words.append( sampled_word )

        # Feed the predicted word and the new states back into the decoder.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    decoded_translation = ' '.join( decoded_words )
    output_aksara_jawa = text_to_javanese(decoded_translation)

    print(output_aksara_jawa)
    print(decoded_translation)
    

"""
if __name__ == '__main__':
    jawa = input('> ')
    while (jawa != 'quit'):
        try:
            print (dotransliterate(jawa).lower())
            question = input('> ')
        except:
            sys.exit(1)
"""