In [None]:
# ========================================
# English-Telugu Translation with mBART
# ========================================
# This notebook fine-tunes mBART for English‚ÜîTelugu translation
# Import this notebook into Google Colab to run it.

# ========================================
# CELL 1: Install Dependencies
# ========================================
!pip install -q transformers datasets pandas torch accelerate huggingface_hub

In [2]:
# ========================================
# CELL 2: Import Libraries
# ========================================
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import os

# Disable W&B if you don't want to use it
os.environ["WANDB_DISABLED"] = "true"

print("‚úÖ Libraries imported successfully!")
print(f"üñ•Ô∏è  Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

‚úÖ Libraries imported successfully!
üñ•Ô∏è  Using device: GPU


In [3]:
# ========================================
# CELL 3: Download and Prepare Dataset
# ========================================
def prepare_telugu_dataset():
    """
    Downloads Telugu-English parallel corpus and prepares train/val splits
    """
    print("üì• Downloading dataset...")

    # Load the Telugu translation dataset
    df = pd.read_parquet(
        "hf://datasets/Shreya3095/TeluguTranslator/data/train-00000-of-00001.parquet"
    )

    # Rename columns for clarity
    df = df.rename(columns={'question': 'english', 'answer': 'telugu'})

    # Clean the data
    print("üßπ Cleaning data...")
    df = df.dropna()
    df['english'] = df['english'].astype(str).str.strip()
    df['telugu'] = df['telugu'].astype(str).str.strip()

    # Filter by length (remove very short/long sentences)
    df = df[(df['english'].str.len() > 3) & (df['english'].str.len() < 200)]
    df = df[(df['telugu'].str.len() > 3) & (df['telugu'].str.len() < 300)]

    print(f"üìä Dataset size after cleaning: {len(df)} samples")

    # Split into train (90%) and validation (10%)
    from sklearn.model_selection import train_test_split
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

    print(f"üìö Train samples: {len(train_df)}")
    print(f"üìñ Validation samples: {len(val_df)}")

    # Save to disk (optional, for inspection)
    os.makedirs('data', exist_ok=True)
    train_df.to_csv('data/train.csv', index=False)
    val_df.to_csv('data/val.csv', index=False)

    # Convert to Hugging Face Dataset format
    train_dataset = Dataset.from_pandas(train_df[['english', 'telugu']])
    val_dataset = Dataset.from_pandas(val_df[['english', 'telugu']])

    return train_dataset, val_dataset

# Run the preparation
train_dataset, val_dataset = prepare_telugu_dataset()

# Show a sample
print("\nüîç Sample data:")
print(train_dataset[0])

üì• Downloading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


üßπ Cleaning data...
üìä Dataset size after cleaning: 1101 samples
üìö Train samples: 990
üìñ Validation samples: 111

üîç Sample data:
{'english': "It's time to get up.", 'telugu': '‡∞á‡∞¶‡∞ø ‡∞≤‡±á‡∞µ‡∞°‡∞æ‡∞®‡∞ø‡∞ï‡∞ø ‡∞∏‡∞Æ‡∞Ø‡∞Ç.', '__index_level_0__': 758}


In [4]:
# ========================================
# CELL 4: Load Pre-trained mBART Model
# ========================================
print("\nü§ñ Loading pre-trained mBART model...")

# Model name from Hugging Face
model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Load tokenizer and model
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Set source and target languages
tokenizer.src_lang = "en_XX"  # English
tokenizer.tgt_lang = "te_IN"  # Telugu

print("‚úÖ Model loaded successfully!")
print(f"üì¶ Model size: {sum(p.numel() for p in model.parameters()):,} parameters")


ü§ñ Loading pre-trained mBART model...


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

‚úÖ Model loaded successfully!
üì¶ Model size: 610,879,488 parameters


In [5]:
# ========================================
# CELL 5: Preprocess Data for Training
# ========================================
def preprocess_function(examples):
    """
    Tokenizes English and Telugu text for training
    """
    # Tokenize English (source)
    inputs = tokenizer(
        examples["english"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Tokenize Telugu (target)
    targets = tokenizer(
        examples["telugu"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Labels for the model to learn from
    inputs["labels"] = targets["input_ids"]

    return inputs

print("üîÑ Tokenizing datasets...")
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["english", "telugu"]
)

tokenized_val = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["english", "telugu"]
)

print("‚úÖ Tokenization complete!")

üîÑ Tokenizing datasets...


Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!


In [6]:
# ========================================
# CELL 6: Setup Training Configuration
# ========================================
print("\n‚öôÔ∏è Configuring training parameters...")

# Data collator - handles batching
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",                    # Where to save checkpoints
    eval_strategy="epoch",                      # Evaluate after each epoch
    save_strategy="epoch",                      # Save after each epoch
    learning_rate=5e-5,                         # Learning rate (standard for fine-tuning)
    per_device_train_batch_size=8,              # Batch size per GPU
    per_device_eval_batch_size=8,
    num_train_epochs=10,                        # Number of training epochs
    weight_decay=0.01,                          # Regularization
    save_total_limit=3,                         # Keep only last 3 checkpoints
    predict_with_generate=True,                 # Use generation for evaluation
    fp16=torch.cuda.is_available(),             # Use mixed precision if GPU available
    logging_dir="./logs",
    logging_steps=100,                          # Log every 100 steps
    load_best_model_at_end=True,               # Load best model at end
    metric_for_best_model="loss",               # Use loss to determine best model
    push_to_hub=False,                          # Don't auto-push to HF Hub
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("‚úÖ Trainer configured!")


‚öôÔ∏è Configuring training parameters...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


‚úÖ Trainer configured!


In [7]:
# ========================================
# CELL 7: Train the Model
# ========================================
print("\nüöÄ Starting training...")
print("‚è±Ô∏è  This will take ~20-30 minutes depending on your GPU\n")

# Start training
trainer.train()

print("\nüéâ Training complete!")


üöÄ Starting training...
‚è±Ô∏è  This will take ~20-30 minutes depending on your GPU



Epoch,Training Loss,Validation Loss
1,2.6668,0.159782
2,0.123,0.14052
3,0.0635,0.132383
4,0.0344,0.134058
5,0.0102,0.13284
6,0.0095,0.134899
7,0.0069,0.133653
8,0.0044,0.134699
9,0.0036,0.135549
10,0.0035,0.135425


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



üéâ Training complete!


In [10]:
# ========================================
# CELL 8: Save the Fine-tuned Model
# ========================================
print("\nüíæ Saving model...")

# Save to local directory
output_dir = "./mbart-finetuned-en-te"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"‚úÖ Model saved to: {output_dir}")


üíæ Saving model...
‚úÖ Model saved to: ./mbart-finetuned-en-te


In [8]:
# ========================================
# CELL 9: Test the Model
# ========================================
def translate(text, model, tokenizer, device="cuda", max_len=128):
    """
    Translates English text to Telugu
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_len,
        truncation=True
    ).to(device)

    # Generate translation
    generated_ids = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id["te_IN"],
        max_length=max_len,
        num_beams=5,  # Beam search for better quality
        early_stopping=True
    )

    # Decode the output
    translation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return translation

# Load the fine-tuned model for inference
print("\nüîÆ Testing translations...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

# Test sentences
test_sentences = [
    "Hello, how are you?",
    "Good morning everyone",
    "I will go to the temple daily",
    "What is your name?",
    "Thank you very much",
    "I went to school yesterday"
]

print("\n" + "="*60)
print("TRANSLATION RESULTS")
print("="*60)

for english_text in test_sentences:
    telugu_text = translate(english_text, model, tokenizer, device)
    print(f"\nüìù EN: {english_text}")
    print(f"üî§ TE: {telugu_text}")
    print("-" * 60)


üîÆ Testing translations...

TRANSLATION RESULTS

üìù EN: Hello, how are you?
üî§ TE: ‡∞π‡∞≤‡±ã, ‡∞é‡∞≤‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞µ‡±Å?
------------------------------------------------------------

üìù EN: Good morning everyone
üî§ TE: ‡∞Ö‡∞Ç‡∞¶‡∞∞‡±Ç ‡∞â‡∞¶‡∞Ø‡∞Ç ‡∞ó‡±Å‡∞°‡±ç ‡∞Æ‡∞æ‡∞∞‡±ç‡∞®‡∞ø‡∞Ç‡∞ó‡±ç
------------------------------------------------------------

üìù EN: I will go to the temple daily
üî§ TE: ‡∞®‡±á‡∞®‡±Å ‡∞™‡±ç‡∞∞‡∞§‡∞ø ‡∞∞‡±ã‡∞ú‡±Å ‡∞Ü‡∞≤‡∞Ø‡∞Ç‡∞ï‡∞ø ‡∞µ‡±Ü‡∞≥‡±ç‡∞§‡∞æ‡∞®‡±Å
------------------------------------------------------------

üìù EN: What is your name?
üî§ TE: ‡∞Æ‡±Ä ‡∞™‡±á‡∞∞‡±Å ‡∞è‡∞Æ‡∞ø‡∞ü‡∞ø?
------------------------------------------------------------

üìù EN: Thank you very much
üî§ TE: ‡∞ö‡∞æ‡∞≤‡∞æ ‡∞ß‡∞®‡±ç‡∞Ø‡∞µ‡∞æ‡∞¶‡∞æ‡∞≤‡±Å
------------------------------------------------------------

üìù EN: I went to school yesterday
üî§ TE: ‡∞®‡±á‡∞®‡±Å ‡∞®‡∞ø‡∞®‡±ç‡∞® ‡∞¨‡∞°‡∞ø‡∞ï‡∞ø ‡∞µ‡±Ü‡∞≥‡±ç‡∞≥‡∞æ‡∞®‡±Å
-----------------------------------------

In [None]:
# ========================================
# CELL 10: (Optional) Push to Hugging Face Hub
# ========================================
print("\nüì§ To upload your model to Hugging Face Hub:")
print("1. Run: !huggingface-cli login")
print("2. Enter your token from https://huggingface.co/settings/tokens")
print("3. Uncomment and run the code below:\n")

# Uncomment these lines after logging in:
# from huggingface_hub import HfApi
#
# # Replace with your username and desired model name
# repo_id = "your-username/mbart-en-te"
#
# model.push_to_hub(repo_id)
# tokenizer.push_to_hub(repo_id)
#
# print(f"‚úÖ Model uploaded to: https://huggingface.co/{repo_id}")

In [9]:
# ========================================
# CELL 11: Compare with Base Model
# ========================================
print("\nüìä Comparing Fine-tuned vs Base Model...")

# Load base model for comparison
base_model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
base_tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
base_tokenizer.src_lang = "en_XX"
base_tokenizer.tgt_lang = "te_IN"

test_text = "I love learning new languages"

print("\n" + "="*60)
print("BASE MODEL vs FINE-TUNED MODEL")
print("="*60)
print(f"\nüìù Input: {test_text}")
print(f"\nüîπ Base Model Output:")
base_output = translate(test_text, base_model, base_tokenizer, device)
print(f"   {base_output}")

print(f"\nüî∏ Fine-tuned Model Output:")
finetuned_output = translate(test_text, model, tokenizer, device)
print(f"   {finetuned_output}")
print("="*60)

print("\n‚ú® Notice the difference in quality!")
print("Fine-tuning adapts the model to your specific data distribution.")


üìä Comparing Fine-tuned vs Base Model...

BASE MODEL vs FINE-TUNED MODEL

üìù Input: I love learning new languages

üîπ Base Model Output:
   ‡∞®‡±á‡∞®‡±Å ‡∞ï‡±ä‡∞§‡±ç‡∞§ ‡∞≠‡∞æ‡∞∑‡∞≤‡±Å ‡∞§‡±Ü‡∞≤‡±Å‡∞∏‡±Å‡∞ï‡±ã‡∞µ‡∞°‡∞æ‡∞®‡∞ø‡∞ï‡∞ø ‡∞™‡±ç‡∞∞‡±á‡∞Æ

üî∏ Fine-tuned Model Output:
   ‡∞®‡∞æ‡∞ï‡±Å ‡∞ï‡±ä‡∞§‡±ç‡∞§ ‡∞≠‡∞æ‡∞∑‡∞≤‡±Å ‡∞®‡±á‡∞∞‡±ç‡∞ö‡±Å‡∞ï‡±ã‡∞µ‡∞°‡∞æ‡∞®‡∞ø‡∞ï‡∞ø ‡∞ö‡∞æ‡∞≤‡∞æ ‡∞á‡∞∑‡±ç‡∞ü‡∞Ç

‚ú® Notice the difference in quality!
Fine-tuning adapts the model to your specific data distribution.


In [None]:
# ========================================
# CELL 12: Create a Simple Translation Function
# ========================================
def create_translator(model_path="./mbart-finetuned-en-te"):
    """
    Creates a ready-to-use translation function
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = MBartForConditionalGeneration.from_pretrained(model_path).to(device)
    tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
    tokenizer.src_lang = "en_XX"

    def translate_text(text, target_lang="te_IN"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
            num_beams=5
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    return translate_text

# Create translator function
translator = create_translator()

# Test it
print("\nüéØ Quick Translation Test:")
print(translator("How are you today?"))

print("\n" + "="*60)
print("üéì TUTORIAL COMPLETE!")
print("="*60)
print("\nüìö What you learned:")
print("  ‚úì How to fine-tune mBART for translation")
print("  ‚úì Data preprocessing for sequence-to-sequence tasks")
print("  ‚úì Using Hugging Face Trainer API")
print("  ‚úì Evaluating translation quality")
print("  ‚úì Deploying models to Hugging Face Hub")
print("\nüí° Next steps:")
print("  ‚Ä¢ Try with more data for better results")
print("  ‚Ä¢ Experiment with different hyperparameters")
print("  ‚Ä¢ Add evaluation metrics (BLEU, METEOR)")
print("  ‚Ä¢ Deploy as an API using FastAPI")
print("\nHappy learning! üöÄ")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



üéØ Quick Translation Test:
‡∞à ‡∞∞‡±ã‡∞ú‡±Å ‡∞Æ‡±Ä‡∞∞‡±Å ‡∞é‡∞≤‡∞æ ‡∞â‡∞®‡±ç‡∞®‡∞æ‡∞∞‡±Å?

üéì TUTORIAL COMPLETE!

üìö What you learned:
  ‚úì How to fine-tune mBART for translation
  ‚úì Data preprocessing for sequence-to-sequence tasks
  ‚úì Using Hugging Face Trainer API
  ‚úì Evaluating translation quality
  ‚úì Deploying models to Hugging Face Hub

üí° Next steps:
  ‚Ä¢ Try with more data for better results
  ‚Ä¢ Experiment with different hyperparameters
  ‚Ä¢ Add evaluation metrics (BLEU, METEOR)
  ‚Ä¢ Deploy as an API using FastAPI

Happy learning! üöÄ


# Task
Zip the folder `./mbart-finetuned-en-te` and download it to the local system.

## Zip the folder

### Subtask:
Create a zip archive of the specified folder using a shell command.


**Reasoning**:
The subtask is to create a zip archive of the `./mbart-finetuned-en-te` folder using a shell command. The `zip -r` command is suitable for this purpose. I'll use a `code_block` to execute this shell command.



In [11]:
!zip -r mbart-finetuned-en-te.zip ./mbart-finetuned-en-te
print("‚úÖ Folder successfully zipped!")

  adding: mbart-finetuned-en-te/ (stored 0%)
  adding: mbart-finetuned-en-te/tokenizer_config.json (deflated 92%)
  adding: mbart-finetuned-en-te/training_args.bin (deflated 54%)
  adding: mbart-finetuned-en-te/sentencepiece.bpe.model (deflated 49%)
  adding: mbart-finetuned-en-te/config.json (deflated 60%)
  adding: mbart-finetuned-en-te/model.safetensors (deflated 7%)
  adding: mbart-finetuned-en-te/tokenizer.json (deflated 76%)
  adding: mbart-finetuned-en-te/special_tokens_map.json (deflated 61%)
  adding: mbart-finetuned-en-te/generation_config.json (deflated 41%)
‚úÖ Folder successfully zipped!


## Download the zip file

### Subtask:
Provide a Python script to download the generated zip file from the Colab environment.


## Summary:

### Data Analysis Key Findings
*   A zip archive named `mbart-finetuned-en-te.zip` was successfully created from the `./mbart-finetuned-en-te` folder using the `zip -r` shell command.
*   The creation of the zip file was confirmed by the message "‚úÖ Folder successfully zipped!".

### Insights or Next Steps
*   The next logical step is to download the `mbart-finetuned-en-te.zip` file to the local system, as indicated by the original task.


In [12]:
import os
from google.colab import files

zip_file_name = "mbart-finetuned-en-te.zip"

# Get file size
if os.path.exists(zip_file_name):
    size_bytes = os.path.getsize(zip_file_name)
    size_mb = size_bytes / (1024 * 1024)
    print(f"\nüì¶ Size of {zip_file_name}: {size_mb:.2f} MB")
else:
    print(f"Error: {zip_file_name} not found.")

# Code to download the file
print(f"\nüì• Preparing to download {zip_file_name}...")
files.download(zip_file_name)
print("‚úÖ Download initiated!")


üì¶ Size of mbart-finetuned-en-te.zip: 2164.60 MB

üì• Preparing to download mbart-finetuned-en-te.zip...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Download initiated!
