In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from tensorflow.keras.utils import to_categorical




In [2]:
# Load dataset
file_path = 'dataset.csv'
df = pd.read_csv(file_path, encoding='latin-1', sep=';')

In [18]:
# Extract columns
kuno_texts = df['Kuno'].astype(str).values
ngoko_texts = df['Ngoko'].astype(str).values

In [19]:
# Gabungkan semua teks untuk tokenisasi
all_texts = np.concatenate([kuno_texts, ngoko_texts])

In [20]:

# Gunakan satu tokenizer untuk semua teks
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)

In [21]:
# Ukuran vocabulary
vocab_size = len(tokenizer.word_index) + 1

In [22]:
# Konversi teks ke sequence
kuno_sequences = tokenizer.texts_to_sequences(kuno_texts)
ngoko_sequences = tokenizer.texts_to_sequences(ngoko_texts)

In [24]:
# Padding
kuno_padded = pad_sequences(kuno_sequences, maxlen=max_sequence_length, padding='post')
ngoko_padded = pad_sequences(ngoko_sequences, maxlen=max_sequence_length, padding='post')

In [25]:

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    kuno_padded, ngoko_padded, test_size=0.2, random_state=42
)

In [26]:
# Konfigurasi model
embedding_dim = 128
latent_dim = 256

In [27]:
# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

In [28]:
# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True)(
    decoder_embedding, initial_state=[state_h, state_c]
)

In [29]:
# Output layer
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

In [30]:
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [31]:
# Persiapan label
y_train = np.expand_dims(y_train, -1)
y_test = np.expand_dims(y_test, -1)

In [None]:
# Training
model.fit(
    [X_train, X_train], y_train, 
    epochs=50, 
    batch_size=32, 
    validation_split=0.2
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x21db7fd8b10>

In [None]:
# Fungsi untuk menterjemahkan
def translate_text(input_text):
    # Konversi input ke sequence
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=max_sequence_length, padding='post')
    
    # Prediksi
    predicted_output = input_padded.copy()
    
    for i in range(max_sequence_length):
        predictions = model.predict([input_padded, predicted_output])
        predicted_idx = np.argmax(predictions[0, i, :])
        predicted_output[0, i] = predicted_idx
    
    # Konversi kembali ke teks
    output_words = [tokenizer.index_word.get(idx, '') for idx in predicted_output[0] if idx != 0]
    return ' '.join(output_words)



In [None]:
# Evaluasi model
test_accuracy = model.evaluate([X_test, X_test], y_test)[1]
print(f'Akurasi model: {test_accuracy:.2f}')



Akurasi model: 0.24


In [37]:
# Interaksi pengguna
while True:
    user_input = input("Masukkan teks Kuno untuk diterjemahkan ke Ngoko (atau ketik 'exit' untuk keluar): ")
    if user_input.lower() == 'exit':
        break
    
    translation = translate_text(user_input)
    print(f"Hasil terjemahan: {translation}")

Hasil terjemahan: dharma tama anane bener anane anane anane
