In [1]:
import pandas as pd
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

df = pd.read_csv("train.csv").dropna()

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class BanglaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source = "translate Bangla to Bangla: " + self.data.iloc[index]["Input"]
        target = self.data.iloc[index]["Target"]

        source_enc = self.tokenizer(source, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt")
        target_enc = self.tokenizer(target, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": source_enc.input_ids.squeeze(),
            "attention_mask": source_enc.attention_mask.squeeze(),
            "labels": target_enc.input_ids.squeeze()
        }

train_data = BanglaDataset(df, tokenizer)
train_loader = DataLoader(train_data, batch_size=4, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(6):
    print(f"Epoch {epoch+1}")
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()
total_correct_tokens = 0
total_tokens = 0

with torch.no_grad():
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits.argmax(dim=-1)
        labels = batch['labels']

        mask = labels != tokenizer.pad_token_id
        correct = (logits == labels) & mask

        total_correct_tokens += correct.sum().item()
        total_tokens += mask.sum().item()

final_accuracy = total_correct_tokens / total_tokens if total_tokens > 0 else 0
print(f"\nFinal Token-level Accuracy: {final_accuracy:.4f}")

model.save_pretrained("mt5-bangla-correction")
tokenizer.save_pretrained("mt5-bangla-correction")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return self.fget.__get__(instance, owner)()


Epoch 1


100%|██████████| 20000/20000 [45:58<00:00,  7.25it/s]


Epoch 2


100%|██████████| 20000/20000 [46:15<00:00,  7.21it/s]


Epoch 3


100%|██████████| 20000/20000 [46:16<00:00,  7.20it/s]


Epoch 4


100%|██████████| 20000/20000 [46:16<00:00,  7.20it/s]


Epoch 5


100%|██████████| 20000/20000 [46:15<00:00,  7.21it/s]


Epoch 6


100%|██████████| 20000/20000 [46:16<00:00,  7.20it/s]
100%|██████████| 20000/20000 [11:40<00:00, 28.56it/s]



Final Token-level Accuracy: 0.9393


('mt5-bangla-correction/tokenizer_config.json',
 'mt5-bangla-correction/special_tokens_map.json',
 'mt5-bangla-correction/spiece.model',
 'mt5-bangla-correction/added_tokens.json')

In [2]:
import pandas as pd
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

df = pd.read_csv("val.csv").dropna()

tokenizer = MT5Tokenizer.from_pretrained("mt5-bangla-correction")
model = MT5ForConditionalGeneration.from_pretrained("mt5-bangla-correction")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class BanglaDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source = "translate Bangla to Bangla: " + self.data.iloc[index]["Input"]
        target = self.data.iloc[index]["Target"]

        source_enc = self.tokenizer(source, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt")
        target_enc = self.tokenizer(target, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": source_enc.input_ids.squeeze(),
            "attention_mask": source_enc.attention_mask.squeeze(),
            "labels": target_enc.input_ids.squeeze()
        }

val_data = BanglaDataset(df, tokenizer)
val_loader = DataLoader(val_data, batch_size=4, shuffle=True)

optimizer = AdamW(model.parameters(), lr=3e-5)

model.train()
for epoch in range(4):
    print(f"Fine-tune Epoch {epoch+1}")
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Loss: {loss.item():.4f}")

model.eval()
total_correct_tokens = 0
total_tokens = 0

with torch.no_grad():
    for batch in tqdm(val_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits.argmax(dim=-1)
        labels = batch['labels']

        mask = labels != tokenizer.pad_token_id
        correct = (logits == labels) & mask

        total_correct_tokens += correct.sum().item()
        total_tokens += mask.sum().item()

accuracy = total_correct_tokens / total_tokens if total_tokens > 0 else 0
print(f"\nFinal Token-level Accuracy after Fine-tuning: {accuracy:.4f}")

model.save_pretrained("mt5-bangla-finetuned")
tokenizer.save_pretrained("mt5-bangla-finetuned")




Fine-tune Epoch 1


100%|██████████| 2500/2500 [05:47<00:00,  7.20it/s]


Loss: 0.0656
Fine-tune Epoch 2


100%|██████████| 2500/2500 [05:46<00:00,  7.22it/s]


Loss: 0.0320
Fine-tune Epoch 3


100%|██████████| 2500/2500 [05:45<00:00,  7.23it/s]


Loss: 0.0581
Fine-tune Epoch 4


100%|██████████| 2500/2500 [05:45<00:00,  7.24it/s]


Loss: 0.0592


100%|██████████| 2500/2500 [01:26<00:00, 29.04it/s]



Final Token-level Accuracy after Fine-tuning: 0.9443


('mt5-bangla-finetuned/tokenizer_config.json',
 'mt5-bangla-finetuned/special_tokens_map.json',
 'mt5-bangla-finetuned/spiece.model',
 'mt5-bangla-finetuned/added_tokens.json')

In [4]:
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
import difflib

model = MT5ForConditionalGeneration.from_pretrained("mt5-bangla-finetuned")
tokenizer = MT5Tokenizer.from_pretrained("mt5-bangla-finetuned")
model.eval()

incorrect_sentence = "এই সিদডান্ত বাসটবায়ন অগরদূট ছলেন সিটির ফভরণর পাইস পার্কার ।"

input_text = "translate Bangla to Bangla: " + incorrect_sentence
inputs = tokenizer(input_text, return_tensors="pt", max_length=128, padding=True, truncation=True)

outputs = model.generate(**inputs, max_length=128)
corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Incorrect Sentence:")
print(incorrect_sentence)

print("\nCorrected Sentence:")
print(corrected_sentence)

print("\nDifference:")
diff = '\n'.join(difflib.ndiff(incorrect_sentence, corrected_sentence))
print(diff)


Incorrect Sentence:
এই সিদডান্ত বাসটবায়ন অগরদূট ছলেন সিটির ফভরণর পাইস পার্কার ।

Corrected Sentence:
এই সিদ্ধান্ত বাস্তবায়ন অগ্রদূট ছিলেন সিটির ফ্ল্যাটের পাইস পার্কার ।

Difference:
  এ
  ই
   
  স
  ি
  দ
- ড
+ ্
+ ধ
  া
  ন
  ্
  ত
   
  ব
  া
  স
- ট
+ ্
+ ত
  ব
  া
- য়
+ য
+ ়
  ন
   
  অ
  গ
+ ্
  র
  দ
  ূ
  ট
   
  ছ
+ ি
  ল
  ে
  ন
   
  স
  ি
  ট
  ি
  র
   
  ফ
- ভ
- র
- ণ
+ ্
+ ল
+ ্
+ য
+ া
+ ট
+ ে
  র
   
  প
  া
  ই
  স
   
  প
  া
  র
  ্
  ক
  া
  র
   
  ।


In [None]:
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
import difflib

tokenizer = MT5Tokenizer.from_pretrained("mt5-bangla-finetuned")
model = MT5ForConditionalGeneration.from_pretrained("mt5-bangla-finetuned")
model.eval()

def correct_sentence(sentence):
    input_text = "translate Bangla to Bangla: " + sentence
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    outputs = model.generate(inputs.input_ids, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def show_diff(orig, corrected):
    return '\n'.join(difflib.ndiff(orig, corrected))

while True:
    user_input = input("\nEnter Bangla sentence (or 'exit'): ")
    if user_input.strip().lower() == 'exit':
        break
    corrected = correct_sentence(user_input)
    print(f"\nCorrected Sentence:\n{corrected}")
    print(f"\nDifference:\n{show_diff(user_input, corrected)}")


Enter Bangla sentence (or 'exit'):  এতি ওবশা পুড়োনো অেভাস ।



Corrected Sentence:
এটি অবশ্য পুরোনো অভ্যাস ।

Difference:
  এ
- ত
+ ট
  ি
   
- ও
+ অ
  ব
  শ
- া
+ ্
+ য
   
  প
  ু
- ড়
+ র
  ো
  ন
  ো
   
  অ
- ে
  ভ
+ ্
+ য
  া
  স
   
  ।
