In [1]:
# Install Kaggle package
!pip install -q kaggle

# Upload kaggle.json manually
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [2]:
# Create the Kaggle config directory and move the uploaded kaggle.json
# (expected in the current working directory) into it
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Restrict the credentials file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
# Download the English-French translation dataset archive (a .zip) with the Kaggle CLI
!kaggle datasets download -d devicharith/language-translation-englishfrench


Dataset URL: https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench
License(s): CC0-1.0
Downloading language-translation-englishfrench.zip to /content
100% 3.51M/3.51M [00:01<00:00, 3.89MB/s]
100% 3.51M/3.51M [00:01<00:00, 2.88MB/s]


In [4]:
# Unzip the dataset
!unzip language-translation-englishfrench.zip -d ./data


Archive:  language-translation-englishfrench.zip
  inflating: ./data/eng_-french.csv  


In [5]:
import numpy as np
import pandas as pd
import string
from string import digits
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model


In [6]:
# Read the sentence pairs extracted by the unzip cell, drop exact duplicate
# rows, and keep a fixed random subset of 25,000 pairs.
lines = (
    pd.read_csv("/content/data/eng_-french.csv", encoding='utf-8')
    .drop_duplicates()
    .sample(n=25000, random_state=42)
)


In [7]:
# Preview the sampled sentence pairs (the rendered table is truncated by pandas)
lines

Unnamed: 0,English words/sentences,French words/sentences
2785,Take a seat.,Prends place !
29880,I wish Tom was here.,J'aimerais que Tom soit là.
53776,How did the audition go?,Comment s'est passée l'audition ?
154386,I've no friend to talk to about my problems.,Je n'ai pas d'ami avec lequel je puisse m'entr...
149823,I really like this skirt. Can I try it on?,"J'aime beaucoup cette jupe, puis-je l'essayer ?"
...,...,...
14525,I feel powerless.,Je me sens impuissante.
149622,I don't know the reason why he was absent.,Je ne connais pas la raison de son absence.
17481,You're a problem.,Tu constitues un problème.
1779,We're done.,Nous en avons fini.


In [8]:
# Replace the verbose CSV headers with short language names.
lines = lines.rename(
    columns={
        'English words/sentences': 'English',
        'French words/sentences': 'French',
    }
)


In [9]:
# Lowercase every sentence in both language columns.
for column in ('English', 'French'):
    lines[column] = lines[column].map(str.lower)


In [10]:
# Characters removed from both languages: ASCII punctuation (which already
# contains the apostrophe) and the digits 0-9.
exclude = set(string.punctuation)
remove_digits = str.maketrans('', '', digits)

# A single translation table deletes everything in one pass per sentence.
_strip_table = str.maketrans('', '', string.punctuation + digits)
for column in ('English', 'French'):
    lines[column] = lines[column].map(lambda text: text.translate(_strip_table))


In [11]:
# Trim surrounding whitespace and collapse runs of spaces in both columns.
for column in ('English', 'French'):
    lines[column] = lines[column].map(lambda text: re.sub(" +", " ", text.strip()))

# Wrap every French (target) sentence in explicit start/end markers.
lines['French'] = 'START_ ' + lines['French'] + ' _END'


In [12]:
# Vocabulary of each language: every distinct whitespace-separated token.
all_eng_words = {word for sentence in lines['English'] for word in sentence.split()}

# The French set also contains the START_ / _END markers added earlier.
all_fr_words = {word for sentence in lines['French'] for word in sentence.split()}


In [13]:
# Number of space-separated tokens per sentence (French counts include the
# START_ and _END markers).
lines['length_eng_sentence'] = lines['English'].str.split(" ").str.len()
lines['length_fr_sentence'] = lines['French'].str.split(" ").str.len()


In [14]:
# Keep only pairs where both sentences are at most 20 tokens long.
_short_enough = (lines['length_eng_sentence'] <= 20) & (lines['length_fr_sentence'] <= 20)
lines = lines[_short_enough]


In [15]:
# Longest source and target sentence, in tokens.
# English is the source (it feeds the encoder) and French the target in this
# notebook, so "src" is measured on the English column and "tar" on the French
# one; the two columns were previously swapped.
max_length_src = max(lines['length_eng_sentence'])
max_length_tar = max(lines['length_fr_sentence'])


In [16]:
# Sorted vocabularies give every word a stable position.
input_words = sorted(all_eng_words)
target_words = sorted(all_fr_words)

# One slot per word plus index 0, which is left for padding.
num_encoder_tokens = 1 + len(input_words)
num_decoder_tokens = 1 + len(target_words)

# word -> id, with ids starting at 1.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# id -> word, the inverse of the two mappings above.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}


In [17]:
# Seed the shuffle as well as the split: with an unseeded shuffle the row order
# going into train_test_split changes on every run, so the "fixed" split (and
# the pickled train/test sets written from it) would not be reproducible.
lines = shuffle(lines, random_state=42)
X, y = lines['English'], lines['French']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)


In [18]:
# Persist the English side of each split to disk.
for _path, _split in (('X_train_fr.pkl', X_train), ('X_test_fr.pkl', X_test)):
    _split.to_pickle(_path)


In [19]:
# Size of the LSTM hidden/cell state used by the encoder and decoder layers,
# and of the state inputs of the inference decoder.
latent_dim = 256  # 300 is an alternative that has also been tried

In [20]:
def generate_batch(X=X_train, y=y_train, batch_size=128):
    """Yield training batches forever, cycling over the data in order.

    Args:
        X: source sentences (whitespace-tokenised strings); sliced positionally.
        y: target sentences aligned with ``X``.
        batch_size: maximum number of sentence pairs per batch; the final
            batch of each pass may be smaller.

    Yields:
        A tuple ``(inputs, targets)`` where ``inputs`` is a dict with the keys
        ``"encoder_input"`` and ``"decoder_input"`` (int32 token-id arrays,
        zero-padded on the right) and ``targets`` is a float32 one-hot array of
        the decoder tokens shifted one step ahead of ``decoder_input``.
    """
    while True:
        for start in range(0, len(X), batch_size):
            stop = start + batch_size
            rows = min(batch_size, len(X) - start)

            # Zero-initialised, so unused positions keep id 0 / an all-zero row.
            encoder_input_data = np.zeros((rows, max_encoder_seq_length), dtype='int32')
            decoder_input_data = np.zeros((rows, max_decoder_seq_length), dtype='int32')
            decoder_target_data = np.zeros(
                (rows, max_decoder_seq_length, num_decoder_tokens), dtype='float32'
            )

            for row, (source_text, target_text) in enumerate(zip(X[start:stop], y[start:stop])):
                # Unknown words map to 0; anything past the maximum length is dropped.
                source_ids = [input_token_index.get(token, 0) for token in source_text.split()]
                source_ids = source_ids[:max_encoder_seq_length]
                encoder_input_data[row, :len(source_ids)] = source_ids

                target_ids = [target_token_index.get(token, 0) for token in target_text.split()]
                target_ids = target_ids[:max_decoder_seq_length]
                decoder_input_data[row, :len(target_ids)] = target_ids

                # The target at step s is the decoder input token at step s + 1.
                for step, token_id in enumerate(target_ids[1:]):
                    decoder_target_data[row, step, token_id] = 1.0

            batch_inputs = {
                "encoder_input": encoder_input_data,
                "decoder_input": decoder_input_data,
            }
            yield batch_inputs, decoder_target_data


In [24]:
# Longest English (encoder) and French (decoder) sentence, in tokens.
max_encoder_seq_length = int(lines['length_eng_sentence'].max())
max_decoder_seq_length = int(lines['length_fr_sentence'].max())


In [25]:
# Fixed-length integer inputs for the encoder and decoder.
# NOTE(review): both names are re-assigned to fresh Input tensors at the start
# of the next cell, before the model is assembled, so these two are not the
# tensors the model is built from.
encoder_inputs = Input(shape=(max_encoder_seq_length,), name='encoder_input')
decoder_inputs = Input(shape=(max_decoder_seq_length,), name='decoder_input')


In [26]:
embedding_dim = 300  # width of each word-embedding vector (256 or 128 are alternatives)

# Encoder: token ids -> embeddings -> LSTM; only the final states are kept.
encoder_inputs = Input(shape=(max_encoder_seq_length,), name='encoder_input')
# mask_zero=True: id 0 is the padding value left in the zero-initialised batch arrays.
enc_emb = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Final hidden and cell state; used below as the decoder's initial state.
encoder_states = [state_h, state_c]

# Decoder: token ids -> embeddings -> LSTM (seeded with the encoder states)
# -> softmax over the target vocabulary at every time step.
decoder_inputs = Input(shape=(max_decoder_seq_length,), name='decoder_input')
# The embedding, LSTM and dense layers are kept in variables because the
# inference decoder built later reuses them.
dec_emb_layer = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: [encoder tokens, decoder tokens] -> per-step word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [28]:
# Vocabulary sizes: one slot per distinct word plus index 0 for padding.
num_encoder_tokens = 1 + len(all_eng_words)
num_decoder_tokens = 1 + len(all_fr_words)


In [29]:
# Longest sentence on each side, in tokens: English feeds the encoder,
# French the decoder.
max_encoder_seq_length = int(lines['length_eng_sentence'].max())
max_decoder_seq_length = int(lines['length_fr_sentence'].max())

# Vocabulary sizes: one slot per indexed word plus index 0 for padding.
num_encoder_tokens = 1 + len(input_token_index)
num_decoder_tokens = 1 + len(target_token_index)

# Training hyper-parameters and the number of sentence pairs in each split.
batch_size, epochs = 128, 50
train_samples, val_samples = len(X_train), len(X_test)


In [30]:
# categorical_crossentropy pairs with the one-hot target arrays that
# generate_batch yields.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


In [31]:
# generate_batch walks the data in steps of batch_size and also emits the final
# partial batch, i.e. ceil(n / batch_size) batches per pass. Rounding the step
# counts up keeps one epoch equal to exactly one pass; with floor division each
# epoch starts at a shifting offset and the validation subset changes per epoch.
train_steps = -(-train_samples // batch_size)
val_steps = -(-val_samples // batch_size)

model.fit(
    generate_batch(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=train_steps,
    epochs=epochs,  # use the configured value instead of a second hard-coded 50
    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps=val_steps
)


Epoch 1/50
[1m 18/155[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 182ms/step - accuracy: 0.0394 - loss: 8.2726

KeyboardInterrupt: 

In [None]:
# Encoder model for inference: source token ids -> final LSTM states [h, c]
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup: the previous step's states are fed back in as explicit inputs
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Inference feeds the decoder one token at a time (decode_sequence passes a
# (1, 1) array). The training `decoder_inputs` tensor is declared with a fixed
# length of max_decoder_seq_length, so the step-wise decoder gets its own
# variable-length input instead of reusing it.
decoder_inference_inputs = Input(shape=(None,), name='decoder_inference_input')

dec_emb2 = dec_emb_layer(decoder_inference_inputs)  # Reuse the same embedding layer from training
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# [token ids, h, c] -> [word distribution per step, new h, new c]
decoder_model = Model(
    [decoder_inference_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


In [None]:
# word -> id lookups; ids start at 1 so that 0 stays free for padding.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# id -> word lookups, the inverse of the two mappings above.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}


In [None]:
# Aliases for the target-side lookups. decode_sequence reads
# reverse_target_word_index (id -> word); target_word_index (word -> id) is not
# referenced by the other cells shown in this notebook.
reverse_target_word_index = reverse_target_char_index
target_word_index = target_token_index


In [None]:
def decode_sequence(input_seq):
    """Greedily decode a translation for one encoded source sentence.

    Args:
        input_seq: array of source token ids passed straight to
            ``encoder_model.predict`` (presumably the padded single-sentence
            array returned by ``convert_input_to_seq``).

    Returns:
        The predicted target words joined by single spaces, without the
        ``START_`` / ``_END`` markers.
    """
    # Encode the input into the decoder's initial states [h, c]
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Decoding starts from the 'START_' token
    next_token_index = target_token_index['START_']
    decoded_words = []

    # Bound the loop by decoding steps rather than by emitted words: an id with
    # no word attached (e.g. padding id 0) adds nothing to the sentence, so a
    # word-count check alone would never trip and the loop would not end.
    # The cap allows the same maximum number of words as before.
    max_steps = max_decoder_seq_length + 1
    for _ in range(max_steps):
        # Length-1 target sequence holding the previously chosen token
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = next_token_index

        # verbose=0 keeps Keras from printing a progress bar per decoded token
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: most probable next word
        next_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_word_index.get(next_token_index, '')

        if sampled_word == '_END':
            break
        if sampled_word:
            decoded_words.append(sampled_word)

        # Carry the updated states into the next step
        states_value = [h, c]

    return ' '.join(decoded_words)


In [None]:
from keras.preprocessing.sequence import pad_sequences


In [None]:
def convert_input_to_seq(input_sentence):
    """Turn a raw English sentence into a padded array of encoder token ids.

    The text is normalised the same way as the training data: lowercased,
    ASCII punctuation (including apostrophes) and digits removed, whitespace
    trimmed and runs of spaces collapsed. Words missing from
    ``input_token_index`` map to 0.

    Args:
        input_sentence: the sentence to encode.

    Returns:
        The result of ``pad_sequences`` for this single sentence, padded or
        truncated at the end to ``max_encoder_seq_length``.
    """
    # Digits are stripped here too; the training cells remove them, so leaving
    # them in would produce tokens the model never saw at that position.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = input_sentence.lower().translate(strip_table)
    cleaned = re.sub(" +", " ", cleaned.strip())

    seq = [input_token_index.get(word, 0) for word in cleaned.split()]

    # truncating='post' keeps the first words of an over-long sentence, matching
    # generate_batch, which drops tokens past the maximum length.
    return pad_sequences(
        [seq], maxlen=max_encoder_seq_length, padding='post', truncating='post'
    )

# Translate one sample sentence and show it next to the model output.
test_input = "I am very happy today"
input_seq = convert_input_to_seq(test_input)
translated_sentence = decode_sequence(input_seq)
print(f"English: {test_input}", f"French: {translated_sentence}", sep="\n")


In [None]:
# Second sample sentence, run through the same encode -> decode steps.
test_input = "i love my parents"
input_seq = convert_input_to_seq(test_input)
translated_sentence = decode_sequence(input_seq)
print(f"English: {test_input}", f"French: {translated_sentence}", sep="\n")

In [None]:
# Write the training model and both inference models to disk.
for _net, _path in (
    (model, 'eng_french_seq2seq.h5'),
    (encoder_model, 'encoder_model.h5'),
    (decoder_model, 'decoder_model.h5'),
):
    _net.save(_path)


In [None]:
!pip install nltk
import nltk
nltk.download('punkt')


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def evaluate_bleu_score(input_texts, target_texts):
    """Average sentence-level BLEU of the model's translations.

    Each input text is encoded with ``convert_input_to_seq``, translated with
    ``decode_sequence`` and scored against its reference using unigram and
    bigram precision (weights 0.5 / 0.5) with NLTK smoothing method 4.

    Args:
        input_texts: iterable of source sentences.
        target_texts: iterable of reference translations aligned with
            ``input_texts``; ``START_ `` / `` _END`` markers are stripped
            before scoring.

    Returns:
        The mean BLEU score over all pairs, or 0.0 when there are no pairs.
    """
    smoothie = SmoothingFunction().method4  # Handles 0 counts better
    scores = []

    for input_text, target_text in zip(input_texts, target_texts):
        input_seq = convert_input_to_seq(input_text)
        predicted_sentence = decode_sequence(input_seq)

        # Tokenize both predicted and reference sentences
        reference = [word_tokenize(target_text.replace("START_ ", "").replace(" _END", ""))]
        hypothesis = word_tokenize(predicted_sentence)

        scores.append(
            sentence_bleu(reference, hypothesis, smoothing_function=smoothie, weights=(0.5, 0.5))
        )

    # Nothing to score: return 0.0 rather than dividing by zero
    if not scores:
        return 0.0

    return sum(scores) / len(scores)
