In [None]:
# ======================================
# 1. INSTALL & IMPORT REQUIRED LIBRARIES
# ======================================

!pip install transformers datasets accelerate sentencepiece evaluate huggingface_hub --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch

# Hugging Face Transformers / Datasets
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    MT5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)




[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Log in to the Hugging Face Hub so the fine-tuned model can be pushed later.
# notebook_login() renders an interactive widget asking for an access token
# (the VBox in this cell's output).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# ======================
# 2. LOAD THE DATASET
# ======================

# Source: the SKNahin/bengali-transliteration-data dataset on the Hugging Face
# Hub. The single train parquet shard is addressed through an hf:// path.

DATA_PATH = "hf://datasets/SKNahin/bengali-transliteration-data/data/train-00000-of-00001.parquet"

# Read the parquet shard into a pandas DataFrame
df = pd.read_parquet(DATA_PATH)

# Show the first rows, then the overall row count
print("Sample data:", df.head(), sep="\n")
print("\nTotal samples:", len(df))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Sample data:
                                                  bn  \
0      স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???   
1                         ও গুলা টরেন্ট সাইট এ পাবেন   
2  ভক্কর চক্কর পোস্ট একটা করলেই এপ্রুভড.… নিশ্চই ...   
3                           আমি টেস্ট করেই কোড দিছি…   
4  এতো কষ্টের কি আছে সাকিবওয়াপ.টক,সাকিবওয়াপ.মল&এআ...   

                                                  rm  
0      scroll kore 20/30 second er video pann nai???  
1                        o gula Torrent site e paben  
2  vokkor chokkor post akta korlei approved…. nis...  
3                         ami test koreii code disi…  
4  eto koster ki ache shakibwap.tk,shakibwap.ml&a...  

Total samples: 5006


In [None]:
# ========================================
# 3. SPLIT DATA INTO TRAIN & VALIDATION
# ========================================

# 80/20 split; random_state is fixed so the same rows land in each split on
# every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

print("Training samples:", len(train_df))
print("Validation samples:", len(val_df))

# Convert pandas DataFrames to Hugging Face Datasets.
# preserve_index=False: after the shuffled split each frame carries a
# non-default index, which from_pandas would otherwise store as an extra
# '__index_level_0__' feature next to 'bn' and 'rm'.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset   = Dataset.from_pandas(val_df, preserve_index=False)

# Bundle both splits into a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})

dataset_dict


Training samples: 4004
Validation samples: 1002


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm', '__index_level_0__'],
        num_rows: 4004
    })
    validation: Dataset({
        features: ['bn', 'rm', '__index_level_0__'],
        num_rows: 1002
    })
})

In [None]:
# =========================================================
# 4. TOKENIZATION & DATA PREPROCESSING FOR SEQ2SEQ MODELS
# =========================================================

# Settings for this stage: the mT5-small checkpoint on the Hugging Face Hub
# and the maximum number of tokens kept per sequence.
MODEL_CHECKPOINT = "google/mt5-small"
MAX_LENGTH = 128

# Column roles in the dataset:
#   'rm' -> Banglish (Romanized text), used as the source
#   'bn' -> Bengali text in native script, used as the target
# The function defined below tokenizes both with this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize a batch of Banglish/Bengali pairs for seq2seq training.

    Args:
        examples: Batch mapping holding the source texts under "rm"
            (Banglish) and the target texts under "bn" (Bengali), as
            supplied by ``map(..., batched=True)``.

    Returns:
        The tokenizer output for the source texts, with the target token
        ids stored under "labels". Sources and targets are both truncated
        to ``MAX_LENGTH`` tokens.
    """
    # `text_target` tokenizes the targets in the same call and places their
    # input ids under "labels". It replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    return tokenizer(
        examples["rm"],
        text_target=examples["bn"],
        max_length=MAX_LENGTH,
        truncation=True,
    )

# Tokenize every split in batches. The raw columns are dropped afterwards so
# that only the tokenizer outputs remain.
raw_columns = dataset_dict["train"].column_names
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    remove_columns=raw_columns,
    batched=True,
)

print("Tokenization complete!")
tokenized_datasets


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]



Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Tokenization complete!


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4004
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1002
    })
})

In [None]:
# ==========================
# 5. LOAD THE PRE-TRAINED MODEL
# ==========================

# Load the pre-trained weights for the same checkpoint the tokenizer uses
model = MT5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

# ==============================================
# 6. SET TRAINING ARGUMENTS & DATA COLLATOR
# ==============================================
batch_size = 8
num_epochs = 3
learning_rate = 1e-4

training_args = TrainingArguments(
    output_dir="mt5-banglish2bangla-checkpoints",
    # `eval_strategy` is the current name of this option; the old spelling
    # `evaluation_strategy` is deprecated, and the install cell does not pin
    # transformers, so a fresh run may get a release that rejects it.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,  # so we can push model directly to HF Hub
    report_to="none"   # or "wandb"/"tensorboard" if you want logging
)

# Pads inputs and labels per batch; the tokenized datasets are stored unpadded
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

# =======================================
# 7. DEFINE THE TRAINER AND START TRAINING
# =======================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer. NOTE(review): the truncated warning echoed for this call in the
    # recorded output is presumably that deprecation -- confirm against the
    # installed transformers version.
    processing_class=tokenizer,
    data_collator=data_collator
)

# Start training (this may take some time depending on your hardware)
trainer.train()


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,4.3468,3.362756
2,3.8689,3.02045
3,3.6486,2.917534


TrainOutput(global_step=1503, training_loss=5.145108370168639, metrics={'train_runtime': 598.0725, 'train_samples_per_second': 20.085, 'train_steps_per_second': 2.513, 'total_flos': 327899163770880.0, 'train_loss': 5.145108370168639, 'epoch': 3.0})

In [None]:
# ===============================
# 8. PUSH MODEL TO HUGGING FACE
# ===============================

# Name of the Hub repository that should receive the final model
model_repo_name = "mt5-banglish2bangla-demo"

# The first positional argument of `trainer.push_to_hub` is the commit
# message, not a repository name. Passing the name there only labels a commit
# in the repo the Trainer already manages (named after `output_dir`), so the
# intended repo is never created. Push the fine-tuned model and its tokenizer
# to the chosen repo explicitly instead.
tokenizer.push_to_hub(model_repo_name, commit_message="Upload tokenizer")
trainer.model.push_to_hub(model_repo_name, commit_message="Upload fine-tuned mT5 model")


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/jahangir37/mt5-banglish2bangla-checkpoints/commit/838e82db7d383e856ca6da12b7ee1d713bca3d9f', commit_message='mt5-banglish2bangla-demo', commit_description='', oid='838e82db7d383e856ca6da12b7ee1d713bca3d9f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jahangir37/mt5-banglish2bangla-checkpoints', endpoint='https://huggingface.co', repo_type='model', repo_id='jahangir37/mt5-banglish2bangla-checkpoints'), pr_revision=None, pr_num=None)

In [None]:
# 1. Identify the device: use the GPU when CUDA is available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Move the model to the device and put it in inference mode.
#    generate() does not switch the train/eval flag itself, so eval() is
#    called explicitly to keep dropout disabled whatever mode training left
#    the model in.
model.to(device)
model.eval()

def transliterate_banglish(text: str, max_length: int = 128) -> str:
    """Transliterate Banglish (Romanized Bengali) text with the fine-tuned model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Input string to transliterate.
        max_length: Limit used both for truncating the tokenized input and
            as ``max_length`` for generation.

    Returns:
        The first generated sequence decoded to text, with special tokens
        removed.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # The input tensors have to sit on the same device as the model
    encoded = encoded.to(device)

    # Beam search (4 beams) with early stopping enabled
    generated = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        num_beams=4,
        early_stopping=True,
        max_length=max_length,
    )

    # Only the first returned sequence is decoded
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Smoke test: transliterate one short Banglish phrase and show both sides
sample_text = "amar vai "
print("Banglish Input: ", sample_text)
bengali_output = transliterate_banglish(sample_text)
print("Bengali Output: ", bengali_output)

Banglish Input:  amar vai 
Bengali Output:  আমার ভাই
