In [6]:
import re
from transformers import MBart50TokenizerFast
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# File paths for English and Telugu data
english_file_path = "/content/drive/MyDrive/NLP -1/English-Telugu-20241116T070209Z-001 (3)/English-Telugu-20241116T070209Z-001/English.txt"
telugu_file_path = "/content/drive/MyDrive/NLP -1/English-Telugu-20241116T070209Z-001 (3)/English-Telugu-20241116T070209Z-001/Telugu.txt"

# Load English and Telugu data
with open(english_file_path, 'r', encoding='utf-8') as en_file, open(telugu_file_path, 'r', encoding='utf-8') as te_file:
    english_sentences = en_file.readlines()
    telugu_sentences = te_file.readlines()

# Ensure the datasets have the same number of sentences
assert len(english_sentences) == len(telugu_sentences), "Mismatch in number of English and Telugu sentences."

# Combine English and Telugu sentences into a single dataset
data = list(zip(english_sentences, telugu_sentences))

# Split into train (80%) and test (20%) sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Separate English and Telugu sentences for each split
train_english_sentences, train_telugu_sentences = zip(*train_data)
test_english_sentences, test_telugu_sentences = zip(*test_data)

# Preprocessing functions
def clean_english_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r"[^\w\s]", "", sentence)  # Remove special characters
    sentence = re.sub(r"\d+", "", sentence)  # Remove digits
    sentence = re.sub(r"\s+", " ", sentence)  # Remove extra spaces
    return sentence.strip()

def clean_telugu_sentence(sentence):
    sentence = re.sub(r"[^\u0C00-\u0C7F\s]", "", sentence)  # Keep only Telugu characters
    sentence = re.sub(r"\s+", " ", sentence)  # Remove extra spaces
    return sentence.strip()

# Clean sentences
train_english_sentences = [clean_english_sentence(sent) for sent in train_english_sentences]
train_telugu_sentences = [clean_telugu_sentence(sent) for sent in train_telugu_sentences]
test_english_sentences = [clean_english_sentence(sent) for sent in test_english_sentences]
test_telugu_sentences = [clean_telugu_sentence(sent) for sent in test_telugu_sentences]

# Load tokenizer
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Set source and target languages
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "te_IN"

# Custom dataset class
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, tokenizer, max_length=128):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        source = self.source_sentences[idx]
        target = self.target_sentences[idx]

        # Tokenize source and target
        source_encodings = self.tokenizer(
            source, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length
        )
        target_encodings = self.tokenizer(
            target, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length
        )

        input_ids = source_encodings["input_ids"].squeeze(0)
        attention_mask = source_encodings["attention_mask"].squeeze(0)
        labels = target_encodings["input_ids"].squeeze(0)

        # Replace padding token ID with -100 for loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Create datasets
train_dataset = TranslationDataset(train_english_sentences, train_telugu_sentences, tokenizer)
test_dataset = TranslationDataset(test_english_sentences, test_telugu_sentences, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Example usage: Inspect a single batch
for batch in train_loader:
    print("Batch Input IDs:", batch["input_ids"])
    print("Batch Attention Mask:", batch["attention_mask"])
    print("Batch Labels:", batch["labels"])
    break


Batch Input IDs: tensor([[250004,    831,    398,  ...,      1,      1,      1],
        [250004, 127773,    442,  ...,      1,      1,      1],
        [250004,     54,    398,  ...,      1,      1,      1],
        ...,
        [250004,    903,   5551,  ...,      1,      1,      1],
        [250004,   3627,      7,  ...,      1,      1,      1],
        [250004,     17,  13319,  ...,      1,      1,      1]])
Batch Attention Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Batch Labels: tensor([[250004,  22735, 218206,  ...,   -100,   -100,   -100],
        [250004, 166015,   2127,  ...,   -100,   -100,   -100],
        [250004,  22735, 204218,  ...,   -100,   -100,   -100],
        ...,
        [250004,   1767,  30049,  ...,   -100,   -100,   -100],
        [250004,      6,  20319,  ...,   -100,   -100,   -100]

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import matplotlib.pyplot as plt
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model and tokenizer
model_name = "facebook/mbart-large-50-one-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX")

# Modify dropout rates for regularization
model.config.decoder_dropout = 0.4
model.config.attention_dropout = 0.4

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last 4 layers of the decoder
for layer in model.model.decoder.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# Verify trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {trainable_params:,}")

# Define the optimizer with weight decay
optimizer = AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-5,
    weight_decay=0.05  # Apply weight decay for regularization
)

# Training loop with loss tracking
model.train()
epochs = 7
training_losses = []
validation_losses = []

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = 0.0

    for batch in train_loader:
        # Move batch to device
        batch = {key: value.to(device) for key, value in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    training_losses.append(avg_train_loss)
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    # Switch back to training mode
    model.train()

# Save the fine-tuned model
output_dir = "fine_tuned_mbart"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")



Using device: cuda
Trainable Parameters: 67,186,688
Epoch 1/7
Average Training Loss: 1.5153
Epoch 2/7
Average Training Loss: 0.9253
Epoch 3/7
Average Training Loss: 0.7496
Epoch 4/7
Average Training Loss: 0.6389
Epoch 5/7
Average Training Loss: 0.5598
Epoch 6/7
Average Training Loss: 0.4996
Epoch 7/7
Average Training Loss: 0.4477




Model and tokenizer saved to fine_tuned_mbart


In [4]:
# Reload the fine-tuned model
fine_tuned_model = MBartForConditionalGeneration.from_pretrained("fine_tuned_mbart").to(device)
fine_tuned_tokenizer = MBart50TokenizerFast.from_pretrained("fine_tuned_mbart")

# Set source and target languages
fine_tuned_tokenizer.src_lang = "en_XX"  # English
fine_tuned_tokenizer.tgt_lang = "te_IN"  # Telugu

# Translate a custom sentence
input_sentence = "Iam going to play"
input_ids = fine_tuned_tokenizer(input_sentence, return_tensors="pt", truncation=True, max_length=128).input_ids.to(device)

generated_tokens = fine_tuned_model.generate(
    input_ids=input_ids,
    max_length=128,
    num_beams=5,
    forced_bos_token_id=fine_tuned_tokenizer.lang_code_to_id["te_IN"]
)

translated_text = fine_tuned_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(f"Input: {input_sentence}")
print(f"Translated Text: {translated_text}")


Input: Iam going to play
Translated Text: ['నేను ఆడబోతున్నాను']


In [5]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [6]:
from bert_score import score
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from torch.utils.data import DataLoader
import torch

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the fine-tuned model and tokenizer
fine_tuned_model = MBartForConditionalGeneration.from_pretrained("fine_tuned_mbart").to(device)
fine_tuned_tokenizer = MBart50TokenizerFast.from_pretrained("fine_tuned_mbart")
fine_tuned_tokenizer.src_lang = "en_XX"  # Source language: English
fine_tuned_tokenizer.tgt_lang = "te_IN"  # Target language: Telugu

# Test dataset (use your actual test data here)
test_english_sentences =  test_english_sentences # Replace with all English test sentences
test_telugu_references = test_telugu_sentences  # Replace with all Telugu reference translations

# Ensure the number of sentences matches
assert len(test_english_sentences) == len(test_telugu_references), "Mismatch in test data sizes."

# Create a DataLoader for the test dataset
test_dataset = list(zip(test_english_sentences, test_telugu_references))
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Lists to store candidate and reference translations
candidate_translations = []
reference_translations = []

# Set model to evaluation mode
fine_tuned_model.eval()

with torch.no_grad():
    for batch in test_loader:
        # Unzip batch into input sentences and references
        input_sentences, reference_sentences = batch[0],batch[1]

        # Tokenize input sentences
        input_ids = fine_tuned_tokenizer(
            list(input_sentences),
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        ).input_ids.to(device)

        # Generate translations
        generated_tokens = fine_tuned_model.generate(
            input_ids=input_ids,
            max_length=128,
            num_beams=5,
            forced_bos_token_id=fine_tuned_tokenizer.lang_code_to_id["te_IN"]
        )

        # Decode translations and collect results
        batch_translations = fine_tuned_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        candidate_translations.extend(batch_translations)
        reference_translations.extend(reference_sentences)

# Calculate BERTScore for the entire test dataset
P, R, F1 = score(candidate_translations, reference_translations, lang="te", verbose=True)

# Print BERTScore metrics
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1 Score: {F1.mean().item():.4f}")


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/387 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/232 [00:00<?, ?it/s]



done in 20.65 seconds, 718.98 sentences/sec
Precision: 0.9369
Recall: 0.9350
F1 Score: 0.9358


In [None]:
pip install wordcloud matplotlib


