# Importing Libraries and Loading the Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import zipfile
import tensorflow as tf
from tensorflow import keras
import pickle
from keras.models import load_model
from tensorflow.keras.layers import LSTM
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Extract next to the archive so the CSV lands at the absolute path that the
# following read_csv cell uses, regardless of the current working directory.
with zipfile.ZipFile("/content/eng_-french.csv.zip", "r") as zip_ref:
    zip_ref.extractall("/content")

In [4]:
# Load the extracted English/French sentence pairs into a DataFrame.
dt = pd.read_csv("/content/eng_-french.csv")

# Data Processing

In [5]:
# Preview the first rows of the raw data.
dt.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [6]:
# Preview the last rows of the raw data (the longest sentences appear here).
dt.tail()

Unnamed: 0,English words/sentences,French words/sentences
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
175620,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [7]:
# Column labels; these exact strings are used to index the frame in later cells.
dt.columns

Index(['English words/sentences', 'French words/sentences'], dtype='object')

In [8]:
# (rows, columns) of the loaded frame.
dt.shape

(175621, 2)

In [9]:
# Fetch the NLTK punkt tokenizer models and the stopword corpus.
# NOTE(review): no cell visible in this notebook calls an NLTK tokenizer or
# uses the stopword list -- confirm these downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
import re

def clean_text(text):
    """Return `text` lowercased with whitespace normalized.

    Accented characters are left untouched. Every run of whitespace is
    collapsed to a single space and leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r"\s+", " ", text.lower())
    return collapsed.strip()

In [11]:
# Normalize both language columns with clean_text (lowercase, collapsed whitespace).
for column_name in ('English words/sentences', 'French words/sentences'):
    dt[column_name] = dt[column_name].apply(clean_text)

In [12]:
# Re-check the first rows after clean_text has been applied to both columns.
dt.head()

Unnamed: 0,English words/sentences,French words/sentences
0,hi.,salut!
1,run!,cours !
2,run!,courez !
3,who?,qui ?
4,wow!,ça alors !


In [13]:
# Re-check the last rows after clean_text has been applied to both columns.
dt.tail()

Unnamed: 0,English words/sentences,French words/sentences
175616,"top-down economics never works, said obama. ""t...","« l'économie en partant du haut vers le bas, ç..."
175617,a carbon footprint is the amount of carbon dio...,une empreinte carbone est la somme de pollutio...
175618,death is something that we're often discourage...,la mort est une chose qu'on nous décourage sou...
175619,since there are usually multiple websites on a...,puisqu'il y a de multiples sites web sur chaqu...
175620,if someone who doesn't know your background sa...,si quelqu'un qui ne connaît pas vos antécédent...


In [14]:
def _add_sequence_markers(sentence):
    """Wrap a target sentence in "<start> " / " <end>" markers exactly once.

    A sentence that already carries both markers is returned unchanged, so
    re-running this cell does not stack markers on the French column.
    """
    if sentence.startswith("<start> ") and sentence.endswith(" <end>"):
        return sentence
    return "<start> " + sentence + " <end>"


# The decoder is trained on targets delimited by explicit begin/end markers.
dt['French words/sentences'] = dt['French words/sentences'].apply(_add_sequence_markers)

In [15]:
# Confirm the French column now carries the <start>/<end> markers.
dt.head()

Unnamed: 0,English words/sentences,French words/sentences
0,hi.,<start> salut! <end>
1,run!,<start> cours ! <end>
2,run!,<start> courez ! <end>
3,who?,<start> qui ? <end>
4,wow!,<start> ça alors ! <end>


In [16]:
# Source (English) and target (French) sentence columns.
eng_texts, fr_texts = (
    dt['English words/sentences'],
    dt['French words/sentences'],
)

In [17]:
def _fit_tokenizer(texts):
    """Fit a default-configured Tokenizer on `texts`.

    Returns the fitted tokenizer together with `texts` encoded as lists of
    integer token ids.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer, tokenizer.texts_to_sequences(texts)


# NOTE(review): Tokenizer is used with its defaults. Per the Keras docs the
# default `filters` strip punctuation including "<", ">" and "-", so the
# "<start>"/"<end>" markers are presumably stored as "start"/"end", and a
# hyphenated word such as "week-end" would also produce an "end" token --
# confirm before relying on marker words or their indices.
eng_tokenizer, eng_sequences = _fit_tokenizer(eng_texts)
fr_tokenizer, fr_sequences = _fit_tokenizer(fr_texts)

In [18]:
# Longest tokenized sentence per language; used as the padded sequence length.
max_eng_len = max(map(len, eng_sequences))
max_fr_len = max(map(len, fr_sequences))

In [33]:
# Show the padded lengths (English, French).
max_eng_len, max_fr_len

(44, 57)

In [19]:
# Pad every sequence on the right up to the longest sentence of its language.
post_padding = dict(padding='post')
encode = pad_sequences(eng_sequences, maxlen=max_eng_len, **post_padding)
decode = pad_sequences(fr_sequences, maxlen=max_fr_len, **post_padding)

In [20]:
# Shapes of the padded encoder and decoder arrays.
encode.shape, decode.shape

((175621, 44), (175621, 57))

In [21]:
# Teacher-forcing target: the decoder input shifted one step to the left,
# with a zero filling the final position of every row.
decoder_target = np.concatenate(
    [decode[:, 1:], np.zeros_like(decode[:, :1])],
    axis=1,
)

In [22]:
# Aliases naming the three arrays by their role in the seq2seq model.
encoder_input, decoder_input, decoder_output = encode, decode, decoder_target

In [23]:
# Vocabulary sizes: one more than the number of known words, because
# index 0 is not assigned to any word in word_index.
eng_vocab = 1 + len(eng_tokenizer.word_index)
fr_vocab = 1 + len(fr_tokenizer.word_index)

for label, size in (("English Vocab Size:", eng_vocab), ("French Vocab Size:", fr_vocab)):
    print(label, size)

English Vocab Size: 14515
French Vocab Size: 27307


In [24]:
# Hold out 20% of the sentence pairs; the fixed seed keeps the split reproducible.
# The three arrays are split together, so rows stay aligned.
split_arrays = train_test_split(
    encoder_input,
    decoder_input,
    decoder_output,
    test_size=0.2,
    random_state=42,
)
X_train_enc, X_test_enc, X_train_dec, X_test_dec, Y_train, Y_test = split_arrays

# Model

In [25]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Encoder
# Input: one padded row of English token ids, max_eng_len long.
# NOTE(review): the layers below are created encoder-first; the later inference
# cell picks layers out of model.layers by position, so this order presumably
# must be kept -- confirm before reordering.
encoder_inputs = Input(shape=(max_eng_len,))
# No mask_zero is passed, so padded positions are embedded like any other id.
enc_emb = Embedding(input_dim=eng_vocab, output_dim=256)(encoder_inputs)

# With return_state=True the LSTM call yields (output, hidden state, cell state).
# Only the two states are used below; `encoder_lstm` holds the output tensor,
# not the layer object.
encoder_lstm, state_h, state_c = LSTM(256, return_state=True)(enc_emb)

# Decoder
# Input: the full padded French sequence (teacher forcing), max_fr_len long.
decoder_inputs = Input(shape=(max_fr_len,))
dec_emb = Embedding(input_dim=fr_vocab, output_dim=256)(decoder_inputs)

# Decoder LSTM uses encoder states as initial states.
# return_sequences=True gives one output per target position; the decoder's
# own final states are discarded here.
decoder_lstm, _, _ = LSTM(256, return_sequences=True, return_state=True)(dec_emb, initial_state=[state_h, state_c])

# Final output layer (predict each French word): a softmax over the French
# vocabulary at every position.
# NOTE(review): nothing here masks padding, so loss and accuracy are presumably
# computed over padded target positions as well -- confirm when reading metrics.
decoder_dense = Dense(fr_vocab, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

# Build Model
# Two inputs (encoder ids, decoder ids) -> per-position word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [26]:
# Compile with the accuracy metric here as well, so this cell alone leaves the
# model in the configuration that training and the accuracy plot rely on.
# Targets are integer ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [27]:
# Compile with accuracy tracking so history contains "accuracy"/"val_accuracy".
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics=['accuracy']  )

In [28]:
# Train with teacher forcing: the model receives the English ids plus the
# French decoder input and is fit against the shifted French target.
train_inputs = [X_train_enc, X_train_dec]
holdout_inputs = [X_test_enc, X_test_dec]

history = model.fit(
    train_inputs,
    Y_train,
    validation_data=(holdout_inputs, Y_test),
    batch_size=64,
    epochs=2,
)

Epoch 1/2
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m514s[0m 231ms/step - accuracy: 0.8823 - loss: 1.1045 - val_accuracy: 0.9117 - val_loss: 0.5402
Epoch 2/2
[1m2196/2196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m509s[0m 232ms/step - accuracy: 0.9167 - loss: 0.4908 - val_accuracy: 0.9258 - val_loss: 0.4154


In [32]:
# Score the trained model on the held-out split.
holdout_inputs = [X_test_enc, X_test_dec]
loss, accuracy = model.evaluate(holdout_inputs, Y_test)

[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 93ms/step - accuracy: 0.9259 - loss: 0.4149


In [None]:
# Accuracy per epoch on the training data and on the held-out split.
fig, ax = plt.subplots()
ax.set_title("accuracy")
for history_key, curve_label in (("accuracy", "train"), ("val_accuracy", "test")):
    ax.plot(history.history[history_key], label=curve_label)
ax.legend()
plt.show()

In [None]:
# Loss per epoch on the training data and on the held-out split.
fig, ax = plt.subplots()
ax.set_title('loss')
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    ax.plot(history.history[history_key], label=curve_label)
ax.legend()
plt.show()

In [None]:
# Persist the trained model; the inference cell reloads it from this file name.
# NOTE(review): only the model is saved -- the tokenizers and max lengths that
# inference also needs are not written anywhere visible here.
model.save("eng_2_french.h5")

In [42]:
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1️⃣ Load the trained model
model = load_model("eng_2_french.h5")

# --- Identify layers automatically ---
# Layers are selected by type and position in model.layers: the first two
# Embedding layers and the first two LSTM layers are taken as encoder then
# decoder, and the last Dense layer as the output projection.
embedding_layers = [layer for layer in model.layers if isinstance(layer, Embedding)]
lstm_layers = [layer for layer in model.layers if isinstance(layer, LSTM)]
dense_layers = [layer for layer in model.layers if isinstance(layer, Dense)]

# Fail with a clear message instead of an AttributeError on `None.name` below.
if len(embedding_layers) < 2 or len(lstm_layers) < 2 or not dense_layers:
    raise ValueError(
        "Expected at least 2 Embedding, 2 LSTM and 1 Dense layer in the loaded model, "
        f"found {len(embedding_layers)} Embedding, {len(lstm_layers)} LSTM "
        f"and {len(dense_layers)} Dense."
    )

encoder_emb_layer, decoder_emb_layer = embedding_layers[:2]
encoder_lstm_layer, decoder_lstm_layer = lstm_layers[:2]
decoder_dense_layer = dense_layers[-1]

print("✅ Layers identified successfully:")
print("Encoder Embedding:", encoder_emb_layer.name)
print("Encoder LSTM:", encoder_lstm_layer.name)
print("Decoder Embedding:", decoder_emb_layer.name)
print("Decoder LSTM:", decoder_lstm_layer.name)
print("Dense Output:", decoder_dense_layer.name)

# --- Rebuild Encoder ---
# Layer sizes are read from the loaded layers rather than repeated as literals,
# so the copied weights always match the rebuilt layers' shapes.
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(
    input_dim=encoder_emb_layer.input_dim,
    output_dim=encoder_emb_layer.output_dim,
)
encoder_lstm = LSTM(encoder_lstm_layer.units, return_state=True)
enc_out, state_h, state_c = encoder_lstm(enc_emb(encoder_inputs))
# The inference encoder only exposes the final hidden and cell states.
encoder_model = Model(encoder_inputs, [state_h, state_c])

# ✅ Copy weights directly using variable names
# (the layers were called above, so their weights exist before set_weights).
enc_emb.set_weights(encoder_emb_layer.get_weights())
encoder_lstm.set_weights(encoder_lstm_layer.get_weights())

# --- Rebuild Decoder ---
# The inference decoder consumes one token per call plus the previous states.
decoder_inputs = Input(shape=(1,))
dec_emb = Embedding(
    input_dim=decoder_emb_layer.input_dim,
    output_dim=decoder_emb_layer.output_dim,
)
dec_emb_out = dec_emb(decoder_inputs)

decoder_state_input_h = Input(shape=(decoder_lstm_layer.units,))
decoder_state_input_c = Input(shape=(decoder_lstm_layer.units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm = LSTM(decoder_lstm_layer.units, return_sequences=True, return_state=True)
dec_out, dec_h, dec_c = decoder_lstm(dec_emb_out, initial_state=decoder_states_inputs)
decoder_dense = Dense(decoder_dense_layer.units, activation='softmax')
final_outputs = decoder_dense(dec_out)

# Outputs: word distribution for this step plus the updated states to feed back.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [final_outputs] + [dec_h, dec_c]
)

# ✅ Copy weights using variables
dec_emb.set_weights(decoder_emb_layer.get_weights())
decoder_lstm.set_weights(decoder_lstm_layer.get_weights())
decoder_dense.set_weights(decoder_dense_layer.get_weights())

# --- Translation Function ---
def translate_sentence(sentence):
    """Greedily decode a French translation of an English `sentence`.

    The sentence is tokenized and padded like the training data, encoded once
    into the LSTM states, and then decoded one token at a time (argmax at each
    step) until the end marker or padding is predicted, or `max_fr_len` tokens
    have been produced.

    Args:
        sentence: English text to translate.

    Returns:
        The decoded French words joined by single spaces (may be empty).

    Raises:
        ValueError: if the French tokenizer has no start or end marker token.
    """
    fr_word_index = fr_tokenizer.word_index

    # Look the marker ids up instead of assuming the start marker is index 1:
    # word_index is ordered by frequency, so the start marker is not guaranteed
    # to rank first. Both spellings are accepted because the tokenizer may have
    # stripped the angle brackets from "<start>"/"<end>".
    start_index = fr_word_index.get("<start>", fr_word_index.get("start"))
    end_index = fr_word_index.get("<end>", fr_word_index.get("end"))
    if start_index is None or end_index is None:
        raise ValueError("French tokenizer has no start/end marker tokens")
    # NOTE(review): if the tokenizer split hyphenated words such as "week-end",
    # an ordinary "end" token shares the marker's id and will also stop
    # decoding here -- confirm against the fitted vocabulary.

    seq = eng_tokenizer.texts_to_sequences([sentence.lower()])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')

    # Encode the input sentence once to get the decoder's initial states.
    # verbose=0 keeps the per-call progress bars out of the cell output.
    states_value = list(encoder_model.predict(seq, verbose=0))

    # Seed the decoder with the start marker.
    target_seq = np.full((1, 1), start_index, dtype="int32")

    decoded_words = []

    for _ in range(max_fr_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_index = int(np.argmax(output_tokens[0, -1, :]))

        # Stop on the end marker or on padding (index 0), compared by id so
        # that real French words such as "stop" do not truncate the output.
        if sampled_index == 0 or sampled_index == end_index:
            break

        sampled_word = fr_tokenizer.index_word.get(sampled_index, "")
        if not sampled_word:
            break

        decoded_words.append(sampled_word)

        # Feed the predicted token and the updated states into the next step.
        target_seq = np.full((1, 1), sampled_index, dtype="int32")
        states_value = [h, c]

    return " ".join(decoded_words)

# --- Test Translation ---
# Smoke test: translate one short sentence and print source and result.
print("🔤 English: how are you")
print("🇫🇷 French:", translate_sentence("how are you"))




✅ Layers identified successfully:
Encoder Embedding: embedding
Encoder LSTM: lstm
Decoder Embedding: embedding_1
Decoder LSTM: lstm_1
Dense Output: dense
🔤 English: how are you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step
🇫🇷 French: 
