In [1]:
import string
import re
import random
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2025-11-02 10:10:14.579173: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-02 10:10:14.579298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-02 10:10:14.694732: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
text_path = "/kaggle/input/englishrussian-dictionary-for-machine-translate/rus.txt"
with open(text_path) as file:
    lines = file.read().split("\n")[:-1]  

In [3]:
pairs = []
for line in lines:
    parts = line.split("\t")
    if len(parts) < 2:
        continue  # Skip malformed lines
    english, russian = parts[0], parts[1]
    russian = "[start] " + russian.strip() + " [end]"
    pairs.append((english.strip(), russian))

print(f"Total sentence pairs: {len(pairs)}")
print("Sample pair:", random.choice(pairs))


Total sentence pairs: 363386
Sample pair: ('Things got messy.', '[start] Всё пошло наперекосяк. [end]')


In [4]:
random.seed(42)
random.shuffle(pairs)
total = len(pairs)
num_val = num_test = total // 10
num_train = total - num_val - num_test

train_pairs = pairs[:num_train]
val_pairs = pairs[num_train:num_train + num_val]
test_pairs = pairs[num_train + num_val:]

print(f"Train: {len(train_pairs)}, Val: {len(val_pairs)}, Test: {len(test_pairs)}")

Train: 290710, Val: 36338, Test: 36338


Data dibagi secara acak (dengan seed tetap untuk reprodusibilitas) menjadi 80% pelatihan, 10% validasi, dan 10% pengujian. Sebelum vektorisasi, teks dibersihkan dari tanda baca (kecuali kurung siku) dan diubah ke huruf kecil melalui fungsi custom_standardization.

In [5]:

strip_chars = string.punctuation.replace("[", "").replace("]", "")  
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    # Escape special regex chars in punctuation
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")

# Vectorization layers
vocab_size = 15000  # Lebih realistis untuk dataset kecil → hindari overfit
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,  # +1 for teacher forcing
    standardize=custom_standardization,
)

# Adapt vocabulary only on training data
train_eng_texts = [pair[0] for pair in train_pairs]
train_rus_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_eng_texts)
target_vectorization.adapt(train_rus_texts)

Dua lapisan TextVectorization digunakan: satu untuk sumber (Inggris) dan satu untuk target (Rusia). Keduanya dibatasi pada vocabulari 15.000 token dan panjang urutan tetap (20 untuk sumber, 21 untuk target guna mendukung teacher forcing). Pentingnya, adaptasi vocabulari dilakukan hanya pada data pelatihan untuk mencegah kebocoran data (data leakage) ke set validasi dan uji.

In [6]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        return self.token_embeddings(inputs) + self.position_embeddings(positions)

    def get_config(self):
        return {
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        }

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]  # Expand for attention
        attn_out = self.attention(inputs, inputs, attention_mask=mask)
        x = self.layernorm_1(inputs + attn_out)
        proj_out = self.dense_proj(x)
        return self.layernorm_2(x + proj_out)

    def get_config(self):
        return {"embed_dim": self.embed_dim, "dense_dim": self.dense_dim, "num_heads": self.num_heads}

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_causal_attention_mask(self, inputs):
        seq_len = tf.shape(inputs)[1]
        i = tf.range(seq_len)[:, None]
        j = tf.range(seq_len)
        mask = tf.cast(i >= j, dtype="int32")
        return mask[None, :, :]  # Shape: (1, T, T)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = mask[:, None, :]  # (B, 1, T)
            combined_mask = tf.minimum(padding_mask, causal_mask)
        else:
            combined_mask = causal_mask

        attn1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out1 = self.layernorm_1(inputs + attn1)

        attn2 = self.attention_2(
            query=out1, value=encoder_outputs, key=encoder_outputs, attention_mask=combined_mask
        )
        out2 = self.layernorm_2(out1 + attn2)

        proj = self.dense_proj(out2)
        return self.layernorm_3(out2 + proj)

    def get_config(self):
        return {"embed_dim": self.embed_dim, "dense_dim": self.dense_dim, "num_heads": self.num_heads}

- PositionalEmbedding: Mengkompensasi ketiadaan informasi urutan dalam mekanisme self-attention dengan menambahkan embedding posisi ke embedding token.
- TransformerEncoder: Menerapkan multi-head self-attention diikuti jaringan feed-forward dua lapis, dengan residual connection dan LayerNormalization untuk stabilitas pelatihan.
- TransformerDecoder: Memiliki dua lapisan multi-head attention: (1) masked self-attention (menggunakan causal mask agar prediksi token ke-t hanya bergantung pada token sebelumnya), dan (2) encoder-decoder attention yang menghubungkan representasi sumber dan target.

In [7]:
# Hyperparameters
embed_dim = 256
dense_dim = 512      # Kurangi dari 2048 → lebih cepat & stabil untuk 1 epoch
num_heads = 8
batch_size = 64      # Efisien di Kaggle GPU/TPU

# Build model
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="russian")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.1)(x)  # Kurangi dropout agar tidak terlalu agresif di 1 epoch
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer=keras.optimizers.RMSprop(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Dataset pipeline
def format_dataset(eng, rus):
    eng = source_vectorization(eng)
    rus = target_vectorization(rus)
    return ({"english": eng, "russian": rus[:, :-1]}, rus[:, 1:])

def make_dataset(pairs):
    eng_texts, rus_texts = zip(*pairs)
    ds = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(rus_texts)))
    ds = ds.batch(batch_size)
    ds = ds.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return ds.shuffle(1000).prefetch(tf.data.AUTOTUNE)

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

# Callback untuk log loss & acc per batch (opsional, tapi sesuai permintaan)
class BatchLogger(keras.callbacks.Callback):
    def on_batch_end(self, batch, logs=None):
        if batch % 50 == 0:  # Print setiap 50 batch
            print(f"Batch {batch} - Loss: {logs['loss']:.4f}, Acc: {logs['accuracy']:.4f}")

# Train hanya 1 epoch
history = model.fit(
    train_ds,
    epochs=1,
    validation_data=val_ds,
    callbacks=[BatchLogger()]
)

Batch 0 - Loss: 9.7006, Acc: 0.0000
[1m   2/4543[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:57[0m 66ms/step - accuracy: 0.1717 - loss: 9.2590       

I0000 00:00:1762078245.915613      71 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1762078245.942974      71 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1762078245.975219      71 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1762078245.984250      71 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m  50/4543[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:59[0m 40ms/step - accuracy: 0.6498 - loss: 5.0218Batch 50 - Loss: 3.2169, Acc: 0.7219
[1m 100/4543[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:54[0m 39ms/step - accuracy: 0.6922 - loss: 3.9095Batch 100 - Loss: 2.5071, Acc: 0.7444
[1m 150/4543[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:52[0m 39ms/step - accuracy: 0.7117 - loss: 3.3907Batch 150 - Loss: 2.2237, Acc: 0.7564
[1m 200/4543[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:50[0m 39ms/step - accuracy: 0.7239 - loss: 3.0776Batch 200 - Loss: 2.0628, Acc: 0.7646
[1m 250/4543[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:48[0m 39ms/step - accuracy: 0.7327 - loss: 2.8632Batch 250 - Loss: 1.9530, Acc: 0.7708
[1m 300/4543[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:46[0m 39ms/step - accuracy: 0.7395 - loss: 2.7045Batch 300 - Loss: 1.8692, Acc: 0.7760
[1m 350/4543[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:44[0m 39ms/step - accuracy: 0.7450 - loss: 2.5807Batch 350 - Loss: 

W0000 00:00:1762078435.972297      69 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1762078435.988883      69 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m4543/4543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8157 - loss: 1.3496

W0000 00:00:1762078439.259833      71 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1762078447.024542      71 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m4543/4543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 44ms/step - accuracy: 0.8157 - loss: 1.3495 - val_accuracy: 0.8641 - val_loss: 0.7600


Model memakai optimizer RMSprop (learning rate = 0.001) dan fungsi kerugian sparse categorical crossentropy, fungsi kerugian sparse categorical crossentropy, sesuai untuk prediksi indeks token diskrit. Pipeline data dibangun menggunakan tf.data untuk efisiensi: batching, transformasi input/output melalui format_dataset (yang menerapkan teacher forcing dengan menggeser target satu langkah), serta prefetching untuk paralelisasi.

Model dilatih dengan callback BatchLogger yang mencatat training loss dan accuracy setiap 50 batch. Hasil akhir training 81,6 %


In [8]:
# Prepare lookup for decoding
target_vocab = target_vectorization.get_vocabulary()
index_to_word = dict(enumerate(target_vocab))

def decode_sequence(input_sentence, max_len=20):
    # Tokenize input
    tokenized_input = source_vectorization([input_sentence])
    decoded = "[start]"

    for i in range(max_len):
        # Tokenize current output (teacher forcing tidak dipakai saat inferensi)
        target_seq = target_vectorization([decoded])[:, :-1]
        # Predict next token
        preds = model([tokenized_input, target_seq])
        next_token_idx = tf.argmax(preds[0, i, :]).numpy()
        next_word = index_to_word.get(next_token_idx, "[UNK]")
        decoded += " " + next_word
        if next_word == "[end]" or next_word == "":
            break
    # Clean output
    return decoded.replace("[start]", "").replace("[end]", "").strip()

# Test translation
test_sentences = [
    "tom knows what we need",
    "how is it going",
    "how are you",
    "it was nice seeing you",
    "till next time",
    "go"
]

print("\n=== TRANSLATION RESULTS ===")
for eng in test_sentences:
    rus_pred = decode_sequence(eng)
    print(f"EN: {eng}")
    print(f"RU: {rus_pred}\n")


=== TRANSLATION RESULTS ===
EN: tom knows what we need
RU: Том знает что нам нужно

EN: how is it going
RU: Как это

EN: how are you
RU: Как ты [UNK]

EN: it was nice seeing you
RU: Это было тебя с тобой

EN: till next time
RU: [UNK] до следующей неделе

EN: go
RU: Идите

