In [1]:
# Standard library imports
import pandas as pd
import numpy as np
import torch

# Third-party library imports
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from evaluate import load as load_metric
from matplotlib import pyplot as plt

# Transformers and related libraries
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
)

# LoRA (optional, if you still want to use it)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

In [2]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [3]:
# Load datasets
dataset_english_to_hindi = load_dataset(
    "csv", data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-hi.csv"},
    split="train[:10]"  # Use a subset for faster debugging
)

dataset_english_to_greek = load_dataset(
    "csv", data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-el.csv"},
    split="train[:10]"  # Use a subset for faster debugging
)

In [4]:
def preprocess_datasets(dataset1, dataset2, lang1_token, lang2_token, col_mapping1, col_mapping2):
    """
    Preprocess two datasets to rename columns, add language tokens, and combine them.

    Args:
        dataset1: First dataset.
        dataset2: Second dataset.
        lang1_token: Language token for dataset1 (e.g., "hi" for Hindi).
        lang2_token: Language token for dataset2 (e.g., "el" for Greek).
        col_mapping1: Dictionary mapping for dataset1 column renaming (e.g., {"English": "source", "Hindi": "target"}).
        col_mapping2: Dictionary mapping for dataset2 column renaming (e.g., {"English": "source", "Greek": "target"}).

    Returns:
        Combined dataset with consistent formatting.
    """
    # Rename columns for the first dataset
    dataset1 = dataset1.rename_columns(col_mapping1)

    # Add language token to the source column of dataset1
    dataset1 = dataset1.map(lambda x: {"source": f"<{lang1_token}> " + x["source"], "tgt_lang": lang1_token})

    # Rename columns for the second dataset
    dataset2 = dataset2.rename_columns(col_mapping2)

    # Add language token to the source column of dataset2
    dataset2 = dataset2.map(lambda x: {"source": f"<{lang2_token}> " + x["source"], "tgt_lang": lang2_token})

    # Combine both datasets
    combined_dataset = concatenate_datasets([dataset1, dataset2])

    return combined_dataset

# Preprocess and combine datasets
combined_dataset = preprocess_datasets(
    dataset_english_to_hindi,
    dataset_english_to_greek,
    lang1_token="hi",
    lang2_token="el",
    col_mapping1={"English": "source", "Hindi": "target"},
    col_mapping2={"English": "source", "Greek": "target"}
)

# Shuffle the combined dataset
combined_dataset = combined_dataset.shuffle(seed=42)

# Verify the result
print(combined_dataset[0])  # Should show a sample from the combined dataset with <hi> token
print(combined_dataset[-1]) # Should show a sample from the combined dataset with <el> token

{'source': '<el> they ask him to come in for questioning.', 'target': 'Ήρθαν λοιπόν για να το ζητήσουν.', 'tgt_lang': 'el'}
{'source': '<hi> have we not made for him two eyes?', 'target': 'क्या हमने उसके लिए दो आंखें नहीं बनाई हैं?', 'tgt_lang': 'hi'}


In [5]:
# Split the combined dataset into train, validation, and test sets
train_test_split = combined_dataset.train_test_split(test_size=0.2, seed=42)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Create a DatasetDict
final_dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
})

# Verify the splits
print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'tgt_lang'],
        num_rows: 16
    })
    validation: Dataset({
        features: ['source', 'target', 'tgt_lang'],
        num_rows: 2
    })
    test: Dataset({
        features: ['source', 'target', 'tgt_lang'],
        num_rows: 2
    })
})


In [6]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable()
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using a model of type t5 to instantiate a model of type m2m_100. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of M2M100ForConditionalGeneration were not initialized from the model checkpoint at t5-base and are newly initialized: ['decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.encoder_attn.k_proj.bias', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_proj.bias', 'decoder.layers.0.self_attn.k_proj.weight', 'decoder.layers.0.self_attn.out_proj.bias', 'de

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'M2M100Tokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [7]:
def preprocess_function(examples):
    # Set the source language
    tokenizer.src_lang = "en"
    tokenizer.tgt_lang = examples["tgt_lang"][0]  # Dynamically set the target language based on the dataset

    # Tokenize source and target texts
    model_inputs = tokenizer(examples["source"], max_length=128, truncation=True)
    labels = tokenizer(examples["target"], max_length=128, truncation=True)
    
    # Add labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
tokenized_dataset = final_dataset.map(preprocess_function, batched=True, remove_columns=["source", "target", "tgt_lang"])


In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [10]:
import evaluate

metric = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Replace ignored index (-100) in labels with pad_token_id
    decoded_labels = [
        label.replace(tokenizer.pad_token, "").strip() for label in decoded_labels
    ]
    result = metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return {"bleu": result["score"]}


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",               # Directory to save model and results
    do_train=True,                        # Enable training
    do_eval=True,                         # Enable evaluation
    evaluation_strategy="epoch",          # Evaluate after each epoch
    save_strategy="epoch",                # Save model after each epoch
    num_train_epochs=4,                  # Number of training epochs
    learning_rate=2e-5,                   # Learning rate
    warmup_steps=500,                     # Number of warmup steps for learning rate scheduler
    per_device_train_batch_size=4,        # Batch size for training
    per_device_eval_batch_size=4,         # Batch size for evaluation
    weight_decay=0.1,                     # Weight decay for regularization
    gradient_accumulation_steps=8,        # Accumulate gradients across 4 steps
    fp16=True,                            # Enable mixed precision (FP16) training
    logging_steps=10,                     # Log training metrics every 10 steps
    lr_scheduler_type="linear",           # Linear learning rate decay after warmup
    metric_for_best_model="eval_loss",    # Use validation loss to select the best model
    report_to=None,                       # Disable reporting to third-party tools (e.g., "wandb")
    save_total_limit=2,                   # Save only the last 2 checkpoints
    logging_dir="./logs",                 # Directory for logs
    load_best_model_at_end=True           # Load the best model at the end of training
)


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,                          # The model to fine-tune
    args=training_args,                   # Training configuration
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["validation"],  # Validation dataset
    tokenizer=tokenizer,                  # Tokenizer
    data_collator=data_collator,          # Data collator
    compute_metrics=compute_metrics,      # Metric for evaluation
)


In [None]:
trainer.train()
