In [1]:
from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Layer, Add, GRU, \
                                    Activation, Softmax, Concatenate, TimeDistributed
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import time
import pandas as pd
import numpy as np
import unicodedata
import re
import json

In [2]:
df = pd.read_excel('../data.xlsx')
df = df.dropna()
print(df.head(5))
print(df.__len__)

                  Tolaki                              Ina
0        aria a mbotingu             dari dalam keranjang
1               aa laika                      dalam rumah
2            aa no monga             pinggangnya langsing
3  monaa wata pe'a pe'aa  menyimpan batang yang berlubang
4                aa enge                    lubang hidung
<bound method DataFrame.__len__ of                                                  Tolaki  \
0                                       aria a mbotingu   
1                                              aa laika   
2                                           aa no monga   
3                                 monaa wata pe'a pe'aa   
4                                               aa enge   
...                                                 ...   
4511                   poworea o hule ariito pinokolako   
4512  ie banggonahakono tai-tai rota ona suui-tudui ...   
4513  ano'ene alei humnggai'ipalako tonggoitongano a...   
4514  tolea laa lako mesamb

In [3]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())  
    w = re.sub(r"([?.!,¿])", r" \1 ", w) 
    w = re.sub(r'[" "]+', " ", w) 
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip() 
    if not w.startswith('<start>'):
        w = '<start> ' + w
    if not w.endswith('<end>'):
        w = w + ' <end>'
    return w

In [4]:
df['Ina'] = df['Ina'].apply(preprocess_sentence)
df['Tolaki'] = df['Tolaki'].apply(preprocess_sentence)
df.drop([2], axis=0, inplace=True)
print(df.isnull().sum())


Tolaki    0
Ina       0
dtype: int64


In [5]:
limiter = 50000
df = df.iloc[:limiter, :]
df.reset_index(drop=True, inplace=True)

In [6]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# Buat tokenizer dari data Anda
tokenizer_input = create_tokenizer(df['Ina'])
tokenizer_output = create_tokenizer(df['Tolaki'])


In [7]:
def max_length(lines):
    return max([len(s.split()) for s in lines])

length_input = max_length(df['Ina'])
length_output = max_length(df['Tolaki'])


print(length_output)
print(length_input)

20
19


In [8]:
# For Tolaki
vocab_size_input = len(tokenizer_input.word_index) + 1

# For Ina
vocab_size_output = len(tokenizer_output.word_index) + 1


In [9]:
def encode_text(tokenizer, lines, length):
    encoded = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

lang_input = encode_text(tokenizer_input, df['Ina'], length_input)
lang_output = encode_text(tokenizer_output, df['Tolaki'], length_output)


In [10]:
print(type(tokenizer_input))  # Harusnya <class 'tensorflow.keras.preprocessing.text.Tokenizer'>
print(type(tokenizer_output))

<class 'keras.src.legacy.preprocessing.text.Tokenizer'>
<class 'keras.src.legacy.preprocessing.text.Tokenizer'>


In [30]:
with open('ina_tolaki_tokenizer_input.json', 'w') as f:
    json.dump(tokenizer_input.to_json(), f)

with open('ina_tolaki_tokenizer_output.json', 'w') as f:
    json.dump(tokenizer_output.to_json(), f)

In [31]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Muat tokenizer dari JSON
with open('ina_tolaki_tokenizer_input.json', 'r') as f:
    tokenizer_input_json = json.load(f)
    tokenizer_input = tokenizer_from_json(tokenizer_input_json)

with open('ina_tolaki_tokenizer_output.json', 'r') as f:
    tokenizer_output_json = json.load(f)
    tokenizer_output = tokenizer_from_json(tokenizer_output_json)

# Verifikasi tipe
print(type(tokenizer_input)) 
print(type(tokenizer_output))

<class 'keras.src.legacy.preprocessing.text.Tokenizer'>
<class 'keras.src.legacy.preprocessing.text.Tokenizer'>


In [13]:
batch_size = 32
inp_embed_size = 128
inp_lstm_cells = 256
tar_embed_size = 128
tar_lstm_cells = 256
attention_units = 256
hidden_state = [tf.zeros([batch_size, inp_lstm_cells])]

In [14]:
class Encoder(Model):
    def __init__(self, inp_vocab_size, inp_embed_size, inp_lstm_cells, batch_size, inp_len):
        super(Encoder, self).__init__()
        self.inp_embed_size = inp_embed_size 
        self.inp_vocab_size = inp_vocab_size 
        self.inp_lstm_cells = inp_lstm_cells 
        self.batch_size = batch_size         
        self.inp_len = inp_len               
        self.enc_embedding = Embedding(self.inp_vocab_size, self.inp_embed_size, trainable=True)
        self.lstm = LSTM(self.inp_lstm_cells, return_sequences=True, return_state=True)

    def call(self, inp_sequence, hidden_sequence):
        emb_output = self.enc_embedding(inp_sequence)
        inp_lstm_output, state_h, state_c = self.lstm(emb_output, initial_state = hidden_sequence)
        return inp_lstm_output, state_h

    def initialize_hidden_states(self):
        return [tf.zeros([self.batch_size, self.inp_lstm_cells]), tf.zeros([self.batch_size, self.inp_lstm_cells])]

In [15]:
enc_model = Encoder(inp_vocab_size=vocab_size_input, inp_embed_size=inp_embed_size, inp_lstm_cells=inp_lstm_cells,
                    batch_size=batch_size, inp_len=length_input)
initialized_hidden_states = enc_model.initialize_hidden_states()

enc_out, enc_state = enc_model(lang_input[:batch_size], initialized_hidden_states)

In [16]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, attention_units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(attention_units)
        self.W2 = tf.keras.layers.Dense(attention_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, hidden, output):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(hidden_with_time_axis) + self.W2(output)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [17]:
class Decoder(Model):
    def __init__(self, tar_embed_size, tar_vocab_size, tar_lstm_cells, attention_units,
                 batch_size=batch_size, tar_len=length_output):
        super(Decoder, self).__init__()
        self.tar_embed_size = tar_embed_size 
        self.tar_vocab_size = tar_vocab_size 
        self.tar_lstm_cells = tar_lstm_cells 
        self.batch_size = batch_size         
        self.tar_len = tar_len              
        self.attention_units = attention_units  
        self.dec_embedding = Embedding(self.tar_vocab_size, self.tar_embed_size, trainable=True)
        self.lstm = LSTM(self.tar_lstm_cells, return_sequences=True, return_state=True)
        self.attention = Attention(self.attention_units)
        self.final_layer = Dense(self.tar_vocab_size)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        emb_output = self.dec_embedding(x)
        x_context = tf.concat([tf.expand_dims(context_vector, axis=1), emb_output], axis=-1)
        tar_lstm_output, tar_state_h, tar_state_c = self.lstm(x_context)
        tar_lstm_output_reshaped = tf.reshape(tar_lstm_output, shape=(-1, tar_lstm_output.shape[2]))
        word_prob = self.final_layer(tar_lstm_output_reshaped)
        return word_prob, tar_state_h, attention_weights

In [18]:
dec_model = Decoder(tar_embed_size=tar_embed_size, tar_vocab_size=vocab_size_output, tar_lstm_cells=tar_lstm_cells,
                    attention_units=attention_units)
dec_out, dec_state, atn_w = dec_model(tf.random.uniform((batch_size, 1)), enc_state, enc_out)

In [19]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [20]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [21]:
def shuffler(lang_inp, lang_out):
    n_elem = lang_inp.shape[0]
    indices = np.random.choice(n_elem, size=n_elem, replace=False)
    return lang_inp[indices], lang_out[indices]

def generator(batch_number, lang_input, lang_output):
    if len(lang_input) <= batch_number*batch_size+batch_size:
        return (lang_input[batch_number*batch_size:],
            lang_output[batch_number*batch_size:])
    return (lang_input[batch_number*batch_size: batch_number*batch_size+batch_size],
            lang_output[batch_number*batch_size: batch_number*batch_size+batch_size])

In [22]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = enc_model(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([tokenizer_output.word_index['start']] * batch_size, 1)
        for t in range(1, targ.shape[1]):
          predictions, dec_hidden, _ = dec_model(dec_input, dec_hidden, enc_output)
          loss += loss_function(targ[:, t], predictions)
          dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = enc_model.trainable_variables + dec_model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [23]:
EPOCHS = 150

loss_history = []

number_of_batches = lang_input.shape[0]//batch_size

lang_input_split = lang_input[:number_of_batches*batch_size]
lang_output_split = lang_output[:number_of_batches*batch_size]


for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = enc_model.initialize_hidden_states()
    total_loss = 0
    lang_inp, lang_out = shuffler(lang_input_split, lang_output_split)
    for batch_number in range(number_of_batches):
        inp, targ = generator(batch_number, lang_inp, lang_out)
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch_number % 200 == 0:
              print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch_number,
                                                       batch_loss.numpy()))
    loss_history.append(total_loss / number_of_batches)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / number_of_batches))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.6761
Epoch 1 Loss 1.4292
Time taken for 1 epoch 95.58161401748657 sec

Epoch 2 Batch 0 Loss 1.5810
Epoch 2 Loss 1.2877
Time taken for 1 epoch 63.9338903427124 sec

Epoch 3 Batch 0 Loss 1.3390
Epoch 3 Loss 1.2461
Time taken for 1 epoch 64.5710723400116 sec

Epoch 4 Batch 0 Loss 1.2586
Epoch 4 Loss 1.2111
Time taken for 1 epoch 72.28937792778015 sec

Epoch 5 Batch 0 Loss 1.3103
Epoch 5 Loss 1.1716
Time taken for 1 epoch 70.24094796180725 sec

Epoch 6 Batch 0 Loss 0.9230
Epoch 6 Loss 1.1250
Time taken for 1 epoch 74.649982213974 sec

Epoch 7 Batch 0 Loss 0.9480
Epoch 7 Loss 1.0737
Time taken for 1 epoch 72.11609315872192 sec

Epoch 8 Batch 0 Loss 1.0760
Epoch 8 Loss 1.0288
Time taken for 1 epoch 67.8747787475586 sec

Epoch 9 Batch 0 Loss 0.8710
Epoch 9 Loss 0.9799
Time taken for 1 epoch 64.5176432132721 sec

Epoch 10 Batch 0 Loss 1.0823
Epoch 10 Loss 0.9351
Time taken for 1 epoch 63.676095485687256 sec

Epoch 11 Batch 0 Loss 0.9044
Epoch 11 Loss 0.8909
Time taken fo

In [24]:
enc_model.save_weights('ina_tolaki_encoder.weights.h5')
dec_model.save_weights('ina_tolaki_decoder.weights.h5')

In [25]:
def preprocessing_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()
    w = 'start ' + w + ' end'
    return w

In [26]:
def evaluate(sentence):
    sentence = preprocessing_sentence(sentence)
    inputs = [tokenizer_input.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                             maxlen=length_input,
                                                             padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros([1, inp_lstm_cells]), tf.zeros([1, inp_lstm_cells])]
    enc_out, enc_hidden = enc_model(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tokenizer_output.word_index['start']], 0)

    for t in range(length_output):
        predictions, dec_hidden, attention_weights = dec_model(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        if tokenizer_output.index_word[predicted_id] == 'end':
            return result, sentence

        result += tokenizer_output.index_word[predicted_id] + ' '
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

In [27]:
def load_models():
    inp_vocab_size = len(tokenizer_input.word_index) + 1
    tar_vocab_size = len(tokenizer_output.word_index) + 1

    # Buat instance model
    enc_model = Encoder(inp_vocab_size=inp_vocab_size, inp_embed_size=inp_embed_size, 
                        inp_lstm_cells=inp_lstm_cells, batch_size=batch_size, inp_len=length_input)
    dec_model = Decoder(tar_embed_size=tar_embed_size, tar_vocab_size=tar_vocab_size, 
                        tar_lstm_cells=tar_lstm_cells, attention_units=attention_units, 
                        batch_size=batch_size, tar_len=length_output)

    # Panggil model untuk inisialisasi variabel
    dummy_input_enc = tf.zeros((batch_size, length_input), dtype=tf.int32)
    hidden_states_enc = enc_model.initialize_hidden_states()
    enc_model(dummy_input_enc, hidden_states_enc)

    dummy_input_dec = tf.zeros((batch_size, 1), dtype=tf.int32)
    hidden_state_enc = hidden_states_enc[0]
    dummy_enc_output = tf.zeros((batch_size, length_input, inp_lstm_cells))
    dec_model(dummy_input_dec, hidden_state_enc, dummy_enc_output)

    # Muat bobot
    try:
        enc_model.load_weights('ina_tolaki_encoder.weights.h5')
        dec_model.load_weights('ina_tolaki_decoder.weights.h5')
    except Exception as e:
        print(f"Error loading weights: {e}")
        raise

    # Verifikasi struktur model
    print("Encoder model summary:")
    enc_model.summary()
    print("Decoder model summary:")
    dec_model.summary()

    return enc_model, dec_model

In [32]:
# Muat tokenizer dari JSON
with open('ina_tolaki_tokenizer_input.json', 'r') as f:
    tokenizer_input_json = json.load(f)
    tokenizer_input = tokenizer_from_json(tokenizer_input_json)

with open('ina_tolaki_tokenizer_output.json', 'r') as f:
    tokenizer_output_json = json.load(f)
    tokenizer_output = tokenizer_from_json(tokenizer_output_json)

In [33]:
# # Memuat model
enc_model, dec_model = load_models()

Encoder model summary:


Decoder model summary:


In [34]:
# Contoh penggunaan untuk menerjemahkan kalimat
translated_sentence, original_sentence = evaluate("dari dalam keranjang")
print(f"Kalimat Bahasa Indonesia: {original_sentence}")
print(f"Hasil Terjemahan: {translated_sentence}")

Kalimat Bahasa Indonesia: start dari dalam keranjang end
Hasil Terjemahan: aria a mbotingu 
