In [1]:
import torch

# Clear CUDA memory
torch.cuda.empty_cache()

# Reset CUDA
from numba import cuda
cuda.select_device(0)
cuda.close()
cuda.select_device(0)

print("GPU memory cleared!")

GPU memory cleared!


# Import Libraries 

In [2]:
import warnings
warnings.filterwarnings('ignore')
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments

# Load Dataset

In [3]:
# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

In [5]:

# Split into training and validation sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

# Data Preprocessing

In [6]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [7]:
# Tokenization function
def preprocess_data(example):
    source = tokenizer(example['bn'], padding="max_length", truncation=True, max_length=128)
    target = tokenizer(example['rm'], padding="max_length", truncation=True, max_length=128)
    source['labels'] = target['input_ids']
    return source

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Set the datasets to PyTorch format
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

# Model Selection

In [8]:
# Load the pre-trained model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [9]:
# Define training arguments with memory optimization
training_args = TrainingArguments(
    output_dir="./results",                # Directory to save checkpoints
    evaluation_strategy="epoch",           # Evaluate at the end of each epoch
    learning_rate=5e-5,                    # Learning rate
    per_device_train_batch_size=8,         # Reduced batch size
    per_device_eval_batch_size=8,          # Reduced evaluation batch size         # Simulate larger batch size
    num_train_epochs=3,                    # Number of epochs for testing
    fp16=True,                             # Enable mixed precision training
    logging_steps=10,                      # Log progress every 10 steps
    save_steps=500,                        # Save model every 500 steps
    save_total_limit=2,                    # Limit saved checkpoints to avoid storage issues
    weight_decay=0.01,                     # Regularization to prevent overfitting
    logging_dir='./logs',                  # Directory for logging
    disable_tqdm=False, 
    report_to="none",                      # Disable logging to external services (like WandB)
)


In [10]:
!pip install --quiet evaluate

In [11]:
!pip install --quiet sacrebleu

In [12]:
# import evaluate

# # Load the metric
# metric = evaluate.load("sacrebleu")

# # Evaluation function
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     # Decode the predictions and labels
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
#     # SacreBLEU expects a list of references for each prediction
#     decoded_labels = [[label] for label in decoded_labels]
    
#     # Compute BLEU score
#     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
#     return {"bleu": result["score"]}

# Define the Trainer
trainer = Trainer(
    model=model,                           # Model to train
    args=training_args,                    # Training arguments
    train_dataset=train_dataset,           # Training dataset
    eval_dataset=val_dataset,              # Validation dataset
    tokenizer=tokenizer,                   # Tokenizer
    # compute_metrics=compute_metrics        # Custom metrics function
)


# Train Model

In [13]:
# Train the model
trainer.train()

# # Evaluate the model on the validation dataset
# evaluation_results = trainer.evaluate()

# # Print BLEU score
# print(f"BLEU score: {evaluation_results['eval_bleu']}")

Epoch,Training Loss,Validation Loss
1,0.6048,0.567085
2,0.5876,0.534203
3,0.5464,0.525056


TrainOutput(global_step=1503, training_loss=0.8876930682245129, metrics={'train_runtime': 179.6154, 'train_samples_per_second': 66.876, 'train_steps_per_second': 8.368, 'total_flos': 406431429820416.0, 'train_loss': 0.8876930682245129, 'epoch': 3.0})