In [None]:
import re

In [None]:
#load dataset
with open("/content/fra.txt", "r", encoding="utf-8") as file:
  lines = file.readlines()

In [None]:
#store sentences pairs
pairs = []
for line in lines:
  words = line.strip().split("\t")[:2]
  if len(words) == 2:
    french = words[0].lower()
    english = words[1].lower()

    #remove special characters and numbers
    french = re.sub(r"[^a-zA-Z]+", " ", french)
    english = re.sub(r"[^a-zA-Z]+", " ", english)

    pairs.append((french, english))


In [None]:
pairs

[('go ', 'va '),
 ('go ', 'marche '),
 ('go ', 'en route '),
 ('go ', 'bouge '),
 ('hi ', 'salut '),
 ('hi ', 'salut '),
 ('run ', 'cours '),
 ('run ', 'courez '),
 ('run ', 'prenez vos jambes vos cous '),
 ('run ', 'file '),
 ('run ', 'filez '),
 ('run ', 'cours '),
 ('run ', 'fuyez '),
 ('run ', 'fuyons '),
 ('run ', 'cours '),
 ('run ', 'courez '),
 ('run ', 'prenez vos jambes vos cous '),
 ('run ', 'file '),
 ('run ', 'filez '),
 ('run ', 'cours '),
 ('run ', 'fuyez '),
 ('run ', 'fuyons '),
 ('who ', 'qui '),
 ('wow ', ' a alors '),
 ('wow ', 'waouh '),
 ('wow ', 'wah '),
 ('duck ', ' terre '),
 ('duck ', 'baisse toi '),
 ('duck ', 'baissez vous '),
 ('fire ', 'au feu '),
 ('help ', ' l aide '),
 ('hide ', 'cache toi '),
 ('hide ', 'cachez vous '),
 ('jump ', 'saute '),
 ('jump ', 'saute '),
 ('stop ', ' a suffit '),
 ('stop ', 'stop '),
 ('stop ', 'arr te toi '),
 ('wait ', 'attends '),
 ('wait ', 'attendez '),
 ('wait ', 'attendez '),
 ('wait ', 'attends '),
 ('wait ', 'attendez

In [None]:
#import library
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
#split data into training and testing sets
train_pairs, test_pairs = train_test_split(pairs, test_size=0.1, random_state=42)

In [None]:
#Tokenize french sentences
french_texts = [pair[0] for pair in train_pairs]
french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_texts)
french_sentences  = french_tokenizer.texts_to_sequences(french_texts)
french_maxlen = max(len(sentence) for sentence in french_sentences)
french_sentences = pad_sequences(french_sentences, maxlen=french_maxlen, padding="post")

In [None]:
#Tokenize english sentences
english_texts = [pair[1] for pair in train_pairs]
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_texts)
english_sentences  = english_tokenizer.texts_to_sequences(english_texts)
french_maxlen = max(len(sentence) for sentence in french_sentences)
french_sentences = pad_sequences(french_sentences, maxlen=french_maxlen, padding="post")

In [None]:
# convert english sentences to one_hot_encoding
englsih_vocab_size = len(english_tokenizer.word_index) + 1
english_maxlen = max(len(sentence) for sentence in english_sentences) # Correctly use english_maxlen for shape
english_one_hot = np.zeros((len(english_sentences), english_maxlen, englsih_vocab_size), dtype="float32") # Corrected the shape
for i in range(len(english_sentences)):
    for j in range(len(english_sentences[i])):
        word_id = english_sentences[i][j]
        if word_id != 0:
            english_one_hot[i, j, word_id] = 1.0 # Now this should be within bounds
# print(english_one_hot)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, TimeDistributed

In [None]:
#set model parameters
embedding_dim = 300
latent_dim = 512

In [None]:
#define encoder
encoder_inputs = Input(shape=(french_maxlen,))
encoder_embedding = Embedding(input_dim=len(french_tokenizer.word_index) + 1, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=False)(encoder_embedding)
encoder_output = RepeatVector(english_maxlen)(encoder_lstm)


In [None]:
#define decoder
decoder_lstm = LSTM(latent_dim, return_sequences=True)(encoder_output)
decoder_dense = TimeDistributed(Dense(englsih_vocab_size, activation="softmax"))(decoder_lstm)


In [None]:
#compile model
model = Model(inputs=encoder_inputs, outputs=decoder_dense)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.summary()

In [None]:
#train model
model.fit(french_sentences, english_one_hot, epochs=40, validation_split=0.2)

Epoch 1/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 27ms/step - accuracy: 0.0213 - loss: 1.7379 - val_accuracy: 0.0368 - val_loss: 1.4235
Epoch 2/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.0444 - loss: 1.3508 - val_accuracy: 0.0477 - val_loss: 1.3404
Epoch 3/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.0594 - loss: 1.2227 - val_accuracy: 0.0643 - val_loss: 1.2755
Epoch 4/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.0708 - loss: 1.1111 - val_accuracy: 0.0686 - val_loss: 1.2351
Epoch 5/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.0805 - loss: 1.0384 - val_accuracy: 0.0749 - val_loss: 1.2060
Epoch 6/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.0851 - loss: 0.9679 - val_accuracy: 0.0813 - val_loss: 1.1745
Epoch 7/40
[1m207/20

<keras.src.callbacks.history.History at 0x7ee1626b79d0>

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
references = []
candidates = []

In [None]:

for i in range(100):  # First 100 test sentences
    test_sentence = test_pairs[i][0]
    true_translation = test_pairs[i][1].split()

    # Convert test sentence to sequence
    test_sequence = french_tokenizer.texts_to_sequences([test_sentence])
    test_padded = pad_sequences(test_sequence, maxlen=french_maxlen, padding='post')

    # Make prediction
    prediction = model.predict(test_padded)
    predicted_sequence = np.argmax(prediction, axis=-1)[0]

    # Convert numbers back to words
    output_words = []
    for word_id in predicted_sequence:
        if word_id > 0 and word_id in english_tokenizer.index_word:
            output_words.append(english_tokenizer.index_word[word_id])

    references.append([true_translation])
    candidates.append(output_words)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3

In [None]:
# Compute BLEU Score
bleu_scores = [sentence_bleu(ref, cand) for ref, cand in zip(references, candidates)]
print("Average BLEU Score:", np.mean(bleu_scores))

Average BLEU Score: 0.008693344347761088


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

def translate_input(input_sentence):
    # Preprocess the input sentence
    input_sentence = input_sentence.lower().strip()
    input_sentence = re.sub(r"[^a-zA-Z ]+", " ", input_sentence)

    # Convert sentence to sequence using the French tokenizer
    input_sequence = french_tokenizer.texts_to_sequences([input_sentence])

    # Pad the sequence to match model input length
    input_padded = pad_sequences(input_sequence, maxlen=french_maxlen, padding='post')

    # Make prediction using the model
    prediction = model.predict(input_padded)

    # Convert predicted indices to words (Greedy decoding)
    predicted_sequence = np.argmax(prediction, axis=-1)[0]

    # Convert tokenized output to English words
    output_words = []
    for word_id in predicted_sequence:
        if word_id == 0:  # Ignore padding tokens
            continue
        if word_id in english_tokenizer.index_word:
            word = english_tokenizer.index_word[word_id]
            if word not in output_words:  # Avoid repetition of words
                output_words.append(word)
        else:
            output_words.append("[UNK]")  # Handle unknown words gracefully

    # Join words to form a complete English sentence
    translated_sentence = " ".join(output_words)

    return translated_sentence.capitalize()

# Get input from the user
input_sentence = input("Enter a French sentence to translate: ")

# Translate the input sentence
translated_sentence = translate_input(input_sentence)

# Print the translated sentence
print("Translated sentence:", translated_sentence)


Enter a French sentence to translate: passe devant
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Translated sentence: C est heure nial pas
