In [1]:
pip install tensorflow



In [2]:
import re
import string
from pickle import dump, load
from unicodedata import normalize

import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, RepeatVector, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

def load_doc(filename):
    """Return the full contents of the UTF-8 text file at ``filename``.

    Args:
        filename: Path of the file to read.

    Returns:
        The whole file as a single string.
    """
    with open(filename, mode='rt', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def to_pairs(doc):
    """Split a tab-separated corpus into two-element sentence pairs.

    Every line of ``doc`` is split on tabs and only the first two fields are
    kept; any further fields are dropped. Lines with fewer than two
    tab-separated fields (blank lines, malformed rows) are skipped so that
    every returned entry can safely be unpacked as a pair.

    NOTE(review): the original comment labelled the columns "French, English",
    but the manythings.org Anki exports are usually laid out as
    English<TAB>French<TAB>attribution -- confirm which language actually ends
    up in column 0 before relying on the source/target naming downstream.

    Args:
        doc: The raw corpus text, one sentence pair per line.

    Returns:
        A list of ``[first_field, second_field]`` lists, in file order.
    """
    pairs = []
    for line in doc.strip().split('\n'):
        fields = line.split('\t')
        # A line without a tab yields a single field and cannot form a pair.
        if len(fields) >= 2:
            pairs.append(fields[:2])
    return pairs

def clean_text(text):
    """Normalise a sentence to lowercase ASCII letters and spaces.

    The text is NFD-decomposed and every non-ASCII code point (including the
    detached accents) is discarded, then it is lowercased, every character
    other than a letter or a space is deleted, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    ascii_only = normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    letters_and_spaces = re.sub(r"[^a-zA-Z ]+", "", ascii_only.lower())
    return letters_and_spaces.strip()

def clean_pairs(pairs):
    """Apply ``clean_text`` to both sentences of every pair.

    Args:
        pairs: Iterable of two-element ``(src, tgt)`` items.

    Returns:
        A NumPy array built from the list of cleaned ``[src, tgt]`` rows.
    """
    rows = []
    for src, tgt in pairs:
        rows.append([clean_text(src), clean_text(tgt)])
    return np.array(rows)

# Download dataset manually from http://www.manythings.org/anki/
filename = "/content/fra.txt"
# Read the raw corpus, then split it into pairs and clean both columns.
doc = load_doc(filename)
pairs = clean_pairs(to_pairs(doc))



In [3]:
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: The texts used to fit the tokenizer.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def encode_sequences(tokenizer, length, lines):
    """Turn texts into integer sequences and pad them to a common length.

    Args:
        tokenizer: A fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: The texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``, i.e. padding
        is appended after each sequence rather than prepended.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

def encode_output(sequences, vocab_size):
    """One-hot encode each integer sequence and collect the results in an array.

    Args:
        sequences: Iterable of integer id sequences.
        vocab_size: Number of classes passed to ``to_categorical``.

    Returns:
        A NumPy array built from the per-sequence ``to_categorical`` outputs.
    """
    # Encode one sequence at a time, then gather everything into one array.
    one_hot = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    return np.array(one_hot)

In [4]:
# Hold out 20% of the cleaned pairs as the test set, using a fixed random_state.
train, test = train_test_split(
    pairs,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Fit one tokenizer per column of the training pairs (column 0 = source,
# column 1 = target); the test split is not used for fitting.
src_tokenizer, tar_tokenizer = (create_tokenizer(train[:, col]) for col in (0, 1))

In [6]:
# Vocabulary sizes: one more than the number of words in each tokenizer's index.
src_vocab_size = 1 + len(src_tokenizer.word_index)
tar_vocab_size = 1 + len(tar_tokenizer.word_index)
# Sequence lengths: the largest whitespace-separated word count in each
# training column.
src_length = max(len(sentence.split()) for sentence in train[:, 0])
tar_length = max(len(sentence.split()) for sentence in train[:, 1])

In [7]:
# encode train data
# Take the first 100 source sentences *before* encoding so that only the rows
# actually kept are tokenised and padded; the resulting array is the same as
# encoding the whole split and slicing afterwards.
trainX = encode_sequences(src_tokenizer, src_length, train[:100, 0])

In [8]:
# Take the first 100 target sentences before encoding, so only the rows that
# are kept get tokenised and padded (same result as slicing afterwards).
trainY = encode_sequences(tar_tokenizer, tar_length, train[:100, 1])

In [9]:
# Take the first 20 test source sentences before encoding, so only the rows
# that are kept get tokenised and padded (same result as slicing afterwards).
testX = encode_sequences(src_tokenizer, src_length, test[:20, 0])

In [10]:
# Take the first 20 test target sentences before encoding, so only the rows
# that are kept get tokenised and padded (same result as slicing afterwards).
testY = encode_sequences(tar_tokenizer, tar_length, test[:20, 1])

In [11]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units=256):
    """Build and compile the encoder-decoder LSTM translation model.

    Architecture: embedding -> encoder LSTM (final state only) ->
    ``RepeatVector`` (one copy of the encoding per output step) -> decoder LSTM
    -> per-timestep softmax over the target vocabulary.

    Args:
        src_vocab: Size of the source vocabulary (embedding input dimension).
        tar_vocab: Size of the target vocabulary (softmax width).
        src_timesteps: Length of each padded source sequence.
        tar_timesteps: Number of timesteps the decoder emits.
        n_units: Embedding dimension and hidden size of both LSTMs.

    Returns:
        A ``Sequential`` model compiled with Adam and categorical
        cross-entropy, so its targets must be one-hot encoded.

    NOTE(review): ``RepeatVector`` presumably does not forward the embedding
    mask, so padded target steps likely still count toward loss and accuracy
    -- confirm before reading much into the accuracy metric.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input layer instead of the
    # deprecated Embedding(input_length=...) argument, so the model is built
    # as soon as it is defined.
    model.add(Input(shape=(src_timesteps,)))
    # mask_zero=True makes the embedding treat id 0 as padding.
    model.add(Embedding(src_vocab, n_units, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [12]:
# Build the model from the vocabulary sizes and sequence lengths computed
# above, then print its layer summary.
model = define_model(
    src_vocab=src_vocab_size,
    tar_vocab=tar_vocab_size,
    src_timesteps=src_length,
    tar_timesteps=tar_length,
)
model.summary()



In [13]:
# One-hot encode the padded target ids, as categorical_crossentropy expects.
# trainY is rebound to its own encoding, so guard on rank: re-running this
# cell must not one-hot encode an array that is already one-hot.
if np.ndim(trainY) == 2:
    trainY = encode_output(trainY, tar_vocab_size)

In [14]:
# One-hot encode the padded validation targets. testY is rebound to its own
# encoding, so guard on rank: re-running this cell must not one-hot encode an
# array that is already one-hot.
if np.ndim(testY) == 2:
    testY = encode_output(testY, tar_vocab_size)

In [15]:
# Train for 30 epochs in batches of 64, validating on the encoded test subset
# after every epoch.
model.fit(
    trainX,
    trainY,
    epochs=30,
    batch_size=64,
    validation_data=(testX, testY),
)

Epoch 1/30
2/2 ━━━━━━━━━━━━━━━━━━━━ 9s 2s/step - accuracy: 0.2074 - loss: 10.2924 - val_accuracy: 0.8732 - val_loss: 10.2816
Epoch 2/30
2/2 ━━━━━━━━━━━━━━━━━━━━ 2s 998ms/step - accuracy: 0.8929 - loss: 10.2740 - val_accuracy: 0.8732 - val_loss: 10.2343
Epoch 3/30
2/2 ━━━━━━━━━━━━━━━━━━━━ 2s 1s/step - accuracy: 0.8917 - loss: 10.1879 - val_accuracy: 0.8732 - val_loss: 9.8924
Epoch 4/30
2/2 ━━━━━━━━━━━━━━━━━━━━ 2s 978ms/step - accuracy: 0.8923 - loss: 9.7028 - val_accuracy: 0.8732 - val_loss: 8.9764
Epoch 5/30
2/2 ━━━━━━━━━━━━━━━━━━━━ 2s 1s/step - accuracy: 0.8912 - loss: 8.7712 - val_accuracy: 0.8732 - val_loss: 8.0309
Epoch 6/30
2/2 ━━━━━━━━━━━━━━━━━━━━ 2s 980ms/step - accuracy: 0.8914 - loss: 7.7986 - val_accuracy: 0.8732 - val_loss: 6.9533
Epoch 7/30
2/2 ━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f3c16548830>