# English to French Translation

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from torch.utils.data import DataLoader, Dataset
import json
from tqdm import tqdm
import pandas as pd

In [2]:
# Check if GPU is available and move the model accordingly
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [3]:
# Load the custom dataset
dataset_train = pd.read_csv("Dataset/nlp_intel_train.csv")
dataset_test = pd.read_csv("Dataset/nlp_intel_test.csv")

In [4]:
dataset_train.head()

Unnamed: 0.1,Unnamed: 0,en,fr
0,1000,"In 1981, he founded the Astronomy Club of Rimo...","En 1981, il fonde le Club d'Astronomie de Rimo..."
1,1001,The club was very active and they twice organi...,Le club est très actif et organise à deux occa...
2,1002,"In 1983, Lemay initiated the first joint meeti...","En 1983, il est l'instigateur à Québec du cong..."
3,1003,"The conference took place in Quebec City, and ...",Le congrès est un franc succès et regroupe pas...
4,1004,"From 1990 to 1992, he was the National Preside...","De 1990 à 1992, il est président national de l..."


## Get data

In [5]:
# Aranging source and target sentences into lists
def read_dataset(dataset):
    source = [str(elem) for elem in list(dataset["en"])]
    target = [str(elem) for elem in list(dataset["fr"])]
    return source, target

In [6]:
train_source, train_target = read_dataset(dataset_train)
test_source, test_target = read_dataset(dataset_test)

In [7]:
train_source[0]

'In 1981, he founded the Astronomy Club of Rimouski in Quebec.'

## Tokenization

In [8]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
print(tokenizer)
print(tokenizer("Hello world"))

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-fr', vocab_size=59514, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59513: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
{'input_ids': [10537, 522, 0], 'attention_mask': [1, 1, 1]}


## Creating a Class for Datasets

In [9]:
# Define the dataset class
class TranslationDataset(Dataset):
    def __init__(self, encodings, tokenizer):
        self.encoded_inputs = encodings

    def __len__(self):
        return len(self.encoded_inputs.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encoded_inputs.items()}

In [15]:
# Define the dataset class
class TranslationDataset(Dataset):
    def __init__(self, source_text, target_text, tokenizer):
        self.source = source_text
        self.target = target_text
        self.tokenizer = tokenizer
        self.encodings = self.tokenizer.prepare_seq2seq_batch(
            src_texts=self.source,
            tgt_texts=self.target,
            return_tensors="pt",
        )

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.encodings["labels"][idx],
        }

In [16]:
train_dataset = TranslationDataset(train_source, train_target, tokenizer)
valid_dataset = TranslationDataset(test_source, test_target, tokenizer)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [17]:
train_dataset[0]["input_ids"]

tensor([   97,  8133,     2,   173,  9981,     4,   433,  2451, 19472,  5729,
            7,  5382,   122,  2130,  5049,    18,  2656,     3,     0, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
        59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 

## Loading the Model

In [18]:
# Load the pre-trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

In [19]:
# Create the data loader
batch_size = 1 # out of memory error for high values
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

## Fine Tuning

In [20]:
num_layers = model.config.num_hidden_layers
print(f"Number of layers: {num_layers}")

Number of layers: 6


In [21]:
# Tuning only the last 3 layers of the model
num_encoder_layers_to_freeze = 5
num_decoder_layers_to_freeze = 5

# Fix embedding parameters
for param in model.model.shared.parameters():
    param.requires_grad = False

# Fix all but the last encoder and decoder layer
for i, layer in enumerate(model.model.encoder.layers):
    if i < len(model.model.encoder.layers) - num_encoder_layers_to_freeze:
        for param in layer.parameters():
            param.requires_grad = False

for i, layer in enumerate(model.model.decoder.layers):
    if i < len(model.model.decoder.layers) - num_decoder_layers_to_freeze:
        for param in layer.parameters():
            param.requires_grad = False

In [22]:
epochs = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

## Training Loop

In [None]:
# Training loop
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', dynamic_ncols=True):
        inputs = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss}")

In [None]:
# Save the fine-tuned model
model.save_pretrained("model_params")
tokenizer.save_pretrained("model_params")

## Inference

In [None]:
def get_prediction(text_data):
    with torch.no_grad():
        inputs = tokenizer(text_data, return_tensors='pt').to(device)
        outputs = model.generate(**inputs)
        answer = tokenizer.batch_decode(outputs)
        final_ans = answer[0][6:-4] # omitting some tokens
    return final_ans

In [None]:
def display_results(num, dataset):
    #avg_bleu_score = 0
    for i in range(len(dataset)):
        pred = get_prediction(dataset["en"][i])
        #bleu = calculate_bleu_score(dataset["fr"][i], pred)
        avg_bleu_score += bleu
        if i<num:
            print(f'English Sentence: {dataset["en"][i]}')
            print(f'French Ground-Truth: {dataset["fr"][i]}')
            print(f'French Prediction: {pred}')
           # print(f'BLEU Score: {bleu}\n')
    #print(f'Avg BLEU Score: {avg_bleu_score/len(dataset)}')
    return

In [None]:
display_results(5, dataset_test)