In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import re
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
import torch
import matplotlib.pyplot as plt

# File paths for Telugu and English data
telugu_file_path = "/content/drive/MyDrive/NLP -1/English-Telugu-20241116T070209Z-001 (3)/English-Telugu-20241116T070209Z-001/Telugu.txt"
english_file_path = "/content/drive/MyDrive/NLP -1/English-Telugu-20241116T070209Z-001 (3)/English-Telugu-20241116T070209Z-001/English.txt"

# Load Telugu and English data
with open(telugu_file_path, 'r', encoding='utf-8') as te_file, open(english_file_path, 'r', encoding='utf-8') as en_file:
    telugu_sentences = te_file.readlines()
    english_sentences = en_file.readlines()

# Ensure the datasets have the same number of sentences
assert len(telugu_sentences) == len(english_sentences), "Mismatch in number of Telugu and English sentences."

# Combine Telugu and English sentences into a single dataset
data = list(zip(telugu_sentences, english_sentences))

# Split into train (80%) and test (20%) sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Separate Telugu and English sentences for each split
train_telugu_sentences, train_english_sentences = zip(*train_data)
test_telugu_sentences, test_english_sentences = zip(*test_data)

# Preprocessing functions
def clean_telugu_sentence(sentence):
    sentence = re.sub(r"[^\u0C00-\u0C7F\s]", "", sentence)  # Keep only Telugu characters
    sentence = re.sub(r"\s+", " ", sentence)  # Remove extra spaces
    return sentence.strip()

def clean_english_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r"[^\w\s]", "", sentence)  # Remove special characters
    sentence = re.sub(r"\d+", "", sentence)  # Remove digits
    sentence = re.sub(r"\s+", " ", sentence)  # Remove extra spaces
    return sentence.strip()

# Clean sentences
train_telugu_sentences = [clean_telugu_sentence(sent) for sent in train_telugu_sentences]
train_english_sentences = [clean_english_sentence(sent) for sent in train_english_sentences]
test_telugu_sentences = [clean_telugu_sentence(sent) for sent in test_telugu_sentences]
test_english_sentences = [clean_english_sentence(sent) for sent in test_english_sentences]

# Load tokenizer
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Set source and target languages for Telugu-to-English translation
tokenizer.src_lang = "te_IN"  # Source: Telugu
tokenizer.tgt_lang = "en_XX"  # Target: English

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, tokenizer, max_length=128):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source = self.source_sentences[idx]
        target = self.target_sentences[idx]

        # Tokenize source and target
        source_encodings = self.tokenizer(
            source, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length
        )
        target_encodings = self.tokenizer(
            target, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length
        )

        input_ids = source_encodings["input_ids"].squeeze(0)
        attention_mask = source_encodings["attention_mask"].squeeze(0)
        labels = target_encodings["input_ids"].squeeze(0)

        # Replace padding token ID with -100 for loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Create datasets
train_dataset = TranslationDataset(train_telugu_sentences, train_english_sentences, tokenizer)
test_dataset = TranslationDataset(test_telugu_sentences, test_english_sentences, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model
model_name = "facebook/mbart-large-50-one-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# Modify dropout rates for regularization
model.config.decoder_dropout = 0.4
model.config.attention_dropout = 0.4

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last 4 layers of the decoder
for layer in model.model.decoder.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# Verify trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {trainable_params:,}")

# Define the optimizer with weight decay
optimizer = AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-5,
    weight_decay=0.05  # Apply weight decay for regularization
)

# Training loop with loss tracking
model.train()
epochs = 7
training_losses = []

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = 0.0

    for batch in train_loader:
        # Move batch to device
        batch = {key: value.to(device) for key, value in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    training_losses.append(avg_train_loss)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

# Save the fine-tuned model
output_dir = "fine_tuned_mbart_tel_to_eng"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Using device: cuda
Trainable Parameters: 67,186,688
Epoch 1/7
Average Training Loss: 2.7294
Epoch 2/7
Average Training Loss: 2.0109
Epoch 3/7
Average Training Loss: 1.7329
Epoch 4/7
Average Training Loss: 1.5503
Epoch 5/7
Average Training Loss: 1.4167
Epoch 6/7
Average Training Loss: 1.3108
Epoch 7/7
Average Training Loss: 1.2225




Model and tokenizer saved to fine_tuned_mbart_tel_to_eng


In [7]:
fine_tuned_model = MBartForConditionalGeneration.from_pretrained(output_dir).to(device)
fine_tuned_tokenizer = MBart50TokenizerFast.from_pretrained(output_dir)

# Translate a custom Telugu sentence
fine_tuned_tokenizer.src_lang = "te_IN"
fine_tuned_tokenizer.tgt_lang = "en_XX"

input_sentence = "అతని కాళ్ళు పొడవుగా ఉన్నాయి"
input_ids = fine_tuned_tokenizer(input_sentence, return_tensors="pt", truncation=True, max_length=128).input_ids.to(device)

generated_tokens = fine_tuned_model.generate(
    input_ids=input_ids,
    max_length=128,
    num_beams=5,
    forced_bos_token_id=fine_tuned_tokenizer.lang_code_to_id["en_XX"]
)

translated_text = fine_tuned_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(f"Input: {input_sentence}")
print(f"Translated Text: {translated_text}")

Input: అతని కాళ్ళు పొడవుగా ఉన్నాయి
Translated Text: ['his legs are longs']


In [8]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [16]:
from bert_score import score
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from torch.utils.data import DataLoader
import torch

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the fine-tuned model and tokenizer
fine_tuned_model = MBartForConditionalGeneration.from_pretrained(output_dir).to(device)
fine_tuned_tokenizer = MBart50TokenizerFast.from_pretrained(output_dir)
fine_tuned_tokenizer.src_lang = "te_IN"  # Source language: Telugu
fine_tuned_tokenizer.tgt_lang = "en_XX"  # Target language: English

# Test dataset (use your actual test data here)
test_telugu_sentences = test_telugu_sentences  # Replace with Telugu test sentences
test_english_references = test_english_sentences  # Replace with English reference translations

# Ensure the number of sentences matches
assert len(test_telugu_sentences) == len(test_english_references), "Mismatch in test data sizes."

# Create a DataLoader for the test dataset
test_dataset = list(zip(test_telugu_sentences, test_english_references))
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Lists to store candidate and reference translations
candidate_translations = []
reference_translations = []

# Set model to evaluation mode
fine_tuned_model.eval()

with torch.no_grad():
    for batch in test_loader:
        # Unzip batch into input sentences and references
        input_sentences, reference_sentences = batch[0],batch[1]

        # Tokenize input sentences
        input_ids = fine_tuned_tokenizer(
            list(input_sentences),
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        ).input_ids.to(device)

        # Generate translations
        generated_tokens = fine_tuned_model.generate(
            input_ids=input_ids,
            max_length=128,
            num_beams=5,
            forced_bos_token_id=fine_tuned_tokenizer.lang_code_to_id["en_XX"]
        )

        # Decode translations and collect results
        batch_translations = fine_tuned_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        candidate_translations.extend(batch_translations)
        reference_translations.extend(list(reference_sentences))

# Calculate BERTScore for the entire test dataset
P, R, F1 = score(candidate_translations, reference_translations, lang="en", verbose=True)

# Print BERTScore metrics
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1 Score: {F1.mean().item():.4f}")


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/862 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/464 [00:00<?, ?it/s]



done in 41.99 seconds, 706.97 sentences/sec
Precision: 0.9043
Recall: 0.8987
F1 Score: 0.9014


