# Developing a sequence-to-sequence transformer model for language translation

In [12]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/language-translation-englishfrench/eng_-french.csv


In [None]:
!pip install transformers datasets

Loading the necessary libraries

In [34]:
import os
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration
import re
from tqdm import tqdm

In [13]:
model_name = "t5-base" 
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Loading the dataset

In [49]:
df=pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv')
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [50]:
df.shape

(175621, 2)

In [51]:
df.dtypes

English words/sentences    object
French words/sentences     object
dtype: object

Taking 500 rows

In [52]:
df=df[:500]

In [53]:
df.shape

(500, 2)

In [54]:
df.isnull().sum()

English words/sentences    0
French words/sentences     0
dtype: int64

In [55]:
english_text = df['English words/sentences']
french_text = df['French words/sentences']

In [56]:
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


Encoding

In [57]:
# Encoding using 'START' and 'END' keywords

english = []
french = []
for i in range(len(english_text)):
    text = english_text[i].lower()
    text = re.sub('[^a-zA-Z]',' ',text)
    english.append(text)
    

for i in range(len(french_text)):
    ftext = french_text[i].lower()
    ftext = (re.sub("[^a-zA-Z' àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]",' ',ftext))
    french.append("START_ " + ftext + " _END")

In [58]:
french[:3]

['START_ salut  _END', 'START_ cours   _END', 'START_ courez   _END']

In [59]:
english[:3]

['hi ', 'run ', 'run ']

In [60]:
df['source']=english
df['target']=french
df.head()

Unnamed: 0,English words/sentences,French words/sentences,source,target
0,Hi.,Salut!,hi,START_ salut _END
1,Run!,Cours !,run,START_ cours _END
2,Run!,Courez !,run,START_ courez _END
3,Who?,Qui ?,who,START_ qui _END
4,Wow!,Ça alors !,wow,START_ ça alors _END


In [61]:
df = df[['source','target']]
df.head()

Unnamed: 0,source,target
0,hi,START_ salut _END
1,run,START_ cours _END
2,run,START_ courez _END
3,who,START_ qui _END
4,wow,START_ ça alors _END


Preprocessing the Dataset

In [62]:
class TranslationDataset(Dataset):
    def __init__(self, df, tokenizer, source_max_len=50, target_max_len=50):
        self.tokenizer = tokenizer
        self.df = df
        self.source_max_len = source_max_len
        self.target_max_len = target_max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        source_text = self.df.iloc[index]['source']
        target_text = self.df.iloc[index]['target']
        
        source_encodings = self.tokenizer(source_text, truncation=True, padding='max_length', max_length=self.source_max_len, return_tensors="pt")
        target_encodings = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.target_max_len, return_tensors="pt")
        
        # Returning input_ids and labels
        return {
            'input_ids': source_encodings['input_ids'].flatten(),
            'attention_mask': source_encodings['attention_mask'].flatten(),
            'labels': target_encodings['input_ids'].flatten()
        }

In [63]:
dataset = TranslationDataset(df, tokenizer)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [64]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Model Training

In [65]:
# Training Loop
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(data_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

Epoch 1: 100%|██████████| 250/250 [12:34<00:00,  3.02s/it]

Epoch 1, Loss: 1.595836112201214





In [66]:
def translate_sentence(sentence):
    model.eval()
    inputs = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(inputs, max_length=50)
    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

for idx, row in df.tail(10).iterrows():
    print(f"Source: {row['source']}")
    print(f"Target: {row['target']}")
    print(f"Prediction: {translate_sentence(row['source'])}")
    print("")

Source: be seated 
Target: START_ assieds toi   _END
Prediction: - Sit down and be seated.

Source: be seated 
Target: START_ asseyez vous   _END
Prediction: - Sit down and be seated.

Source: birds fly 
Target: START_ les oiseaux volent  _END
Prediction: Birds fly flies - birds fly - birds fly - birds fly - birds fly - birds fly - birds fly - birds fly - birds fly - birds fly - birds fly - birds

Source: bless you 
Target: START_ à tes souhaits   _END
Prediction: Vielen Dank, d'avoir eu l'occasion de m'exprimer et de me bénéficier de votre aide.

Source: call home 
Target: START_ appelle à la maison   _END
Prediction: 

Source: calm down 
Target: START_ calmez vous   _END
Prediction: 

Source: calm down 
Target: START_ calme toi  _END
Prediction: 

Source: can we go 
Target: START_ pouvons nous partir   _END
Prediction: 

Source: can we go 
Target: START_ pouvons nous nous en aller   _END
Prediction: 

Source: can we go 
Target: START_ pouvons nous y aller   _END
Prediction: 



Metrics

In [67]:
def clean_text(text):
    return text.replace('START_ ', '').replace(' _END', '').strip()

bleu = load_metric("bleu")

predictions = []
references = []

for idx, row in df.tail(10).iterrows():
    pred = translate_sentence(row['source'])
    
    # Clean prediction and reference
    cleaned_pred = clean_text(pred.lower()).split()
    cleaned_ref = clean_text(row['target'].lower()).split()

    # Append cleaned and tokenized versions to the lists
    predictions.append(cleaned_pred)
    references.append([cleaned_ref]) 

# Calculating BLEU score
result = bleu.compute(predictions=predictions, references=references)
print(f"BLEU Score: {result['bleu']}")

  bleu = load_metric("bleu")


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

The repository for bleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


BLEU Score: 0.0
