In [1]:
from datasets import load_dataset

ds = load_dataset("SKNahin/bengali-transliteration-data")
print(ds)

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})


In [6]:
split = ds['train'].train_test_split(test_size=0.2,seed=42)
ds_train = split['train']
ds_val = split['test']

In [3]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [8]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small',use_fast=True)

# Model Selection
We have used t5-small. Because it is multilingual and mBART doesn't support bengali
T5 is efficient and we have used T5-small for faster training

In [16]:
def preprocess(ds):
    def tokenize_func(ds1):
        inputs = ds1["rm"]
        outputs = ds1["bn"]
        model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=128)
        labels = tokenizer(outputs, padding="max_length", truncation=True, max_length=128)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    ds = ds.map(tokenize_func, batched=True)
    ds = ds.filter(
        lambda x: 1 <= len(x["rm"]) <= 120 and 1 <= len(x["bn"]) <= 120
    )
    return ds
ds_train = preprocess(ds_train)
ds_val = preprocess(ds_val)    

Map:   0%|          | 0/3938 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3938 [00:00<?, ? examples/s]

Map:   0%|          | 0/979 [00:00<?, ? examples/s]

Filter:   0%|          | 0/979 [00:00<?, ? examples/s]

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='/kaggle/working/results',            
    evaluation_strategy="epoch",                     
    learning_rate=5e-5,                              
    per_device_train_batch_size=8,                  
    per_device_eval_batch_size=8,                   
    num_train_epochs=1,                              
    weight_decay=0.01,                               
    logging_dir='/kaggle/working/logs',              
    save_strategy="epoch",                           
    load_best_model_at_end=True,                     
    metric_for_best_model="loss",                    
    gradient_accumulation_steps=2,                   
    fp16=True,                                       
    eval_steps=100,                                  
    save_steps=500,                                  
    push_to_hub=False,                               
    report_to="none",                                
)




# Hyperparameter
1. Learning Rate: We used 5e-5 for faster convergence
2. batch_size: We initially wanted to use 32 but kaggle couldn't afford it so we had to settle for 8
3. fp16 mixed precision: For faster and less memory
4. number of epochs: Have used 1 for quick results

In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=ds_train,              
    eval_dataset=ds_val,                 
    tokenizer=tokenizer,                 
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [19]:
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,0.198911


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=123, training_loss=1.8238436070884145, metrics={'train_runtime': 60.1665, 'train_samples_per_second': 65.452, 'train_steps_per_second': 2.044, 'total_flos': 133176332648448.0, 'train_loss': 1.8238436070884145, 'epoch': 0.9959514170040485})

In [20]:
results = trainer.evaluate()
print(results)



{'eval_loss': 0.19891144335269928, 'eval_runtime': 5.39, 'eval_samples_per_second': 181.633, 'eval_steps_per_second': 11.503, 'epoch': 0.9959514170040485}


In [21]:
trainer.save_model('/kaggle/working/bengali_transliteration_model')
tokenizer.save_pretrained('/kaggle/working/bengali_transliteration_model')

('/kaggle/working/bengali_transliteration_model/tokenizer_config.json',
 '/kaggle/working/bengali_transliteration_model/special_tokens_map.json',
 '/kaggle/working/bengali_transliteration_model/spiece.model',
 '/kaggle/working/bengali_transliteration_model/added_tokens.json')

In [25]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
from huggingface_hub import upload_folder
model_dir = '/kaggle/working/bengali_transliteration_model'
repo_name = 'DJ2003/my-bengali-transliteration-model'
upload_folder(
    folder_path=model_dir,          
    repo_id=repo_name,              
    path_in_repo=''                 
)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DJ2003/my-bengali-transliteration-model/commit/baeb28e4f4975c509c61eb46183603d1d8ae04bd', commit_message='Upload folder using huggingface_hub', commit_description='', oid='baeb28e4f4975c509c61eb46183603d1d8ae04bd', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-50-many-to-one-mmt')
print(tokenizer.lang_code_to_id)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


{'ar_AR': 250001, 'cs_CZ': 250002, 'de_DE': 250003, 'en_XX': 250004, 'es_XX': 250005, 'et_EE': 250006, 'fi_FI': 250007, 'fr_XX': 250008, 'gu_IN': 250009, 'hi_IN': 250010, 'it_IT': 250011, 'ja_XX': 250012, 'kk_KZ': 250013, 'ko_KR': 250014, 'lt_LT': 250015, 'lv_LV': 250016, 'my_MM': 250017, 'ne_NP': 250018, 'nl_XX': 250019, 'ro_RO': 250020, 'ru_RU': 250021, 'si_LK': 250022, 'tr_TR': 250023, 'vi_VN': 250024, 'zh_CN': 250025}
