## Get the data

In [0]:
!pip install evaluate datasets sacrebleu accelerate transformers[sentencepiece]

In [0]:
import torch

# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

In [0]:
from datasets import load_dataset

# Fetch the "en-ta" configuration of the "pib" dataset and display the result.
data = load_dataset(path="pib", name="en-ta")
data

In [0]:
# Split the "train" split into train/test parts: 80% for training, with a
# fixed seed so the same partition is produced on every run.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 40
split_datasets = data["train"].train_test_split(
    train_size=TRAIN_FRACTION,
    seed=SPLIT_SEED,
)

In [0]:
from transformers import AutoTokenizer, BertTokenizerFast  # BertTokenizerFast import kept; no longer used in this cell

# The tokenizer has to come from the same checkpoint as the model that is
# fine-tuned later ("t5-small"). A BERT tokenizer produces ids and special
# tokens ([CLS]/[SEP], no T5 EOS) that do not line up with T5's embedding
# table, so the model would train and generate on mismatched ids.
# NOTE(review): t5-small's vocabulary presumably has little or no Tamil
# coverage; confirm, and consider a multilingual checkpoint for en->ta.
tokenizer = AutoTokenizer.from_pretrained("t5-small")


def preprocess(example):
  """Tokenize one batch of English/Tamil translation pairs.

  Args:
    example: a batch from `Dataset.map(..., batched=True)`; its
      "translation" entry is a list of dicts with "en" and "ta" keys.

  Returns:
    The tokenizer output for the English texts, with the tokenized Tamil
    texts supplied via `text_target`. Both sides are truncated to 128 tokens.
  """
  english_text = [pair["en"] for pair in example["translation"]]
  tamil_text = [pair["ta"] for pair in example["translation"]]
  return tokenizer(english_text, text_target=tamil_text, max_length=128, truncation=True)


# Drop the raw columns so only tokenizer outputs remain in the mapped splits.
tokenized_data = split_datasets.map(
    preprocess,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)


In [0]:
# Load the pretrained seq2seq checkpoint and place it on the selected device.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model = model.to(device)


In [0]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for the seq2seq model; it returns PyTorch tensors and pads
# label sequences with the tokenizer's pad token id.
# NOTE(review): because labels are padded with the pad id rather than -100,
# the model's loss presumably also covers padded positions - confirm.
data_collate = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    return_tensors="pt",
)
data_collate

In [0]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32

# Both loaders share the seq2seq collator; only the training split is shuffled.
train_dataloader = DataLoader(
    tokenized_data["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collate,
)
test_dataloader = DataLoader(
    tokenized_data["test"],
    batch_size=BATCH_SIZE,
    collate_fn=data_collate,
)
# Number of training batches per epoch.
len(train_dataloader)

In [0]:
# TODO: try a lower learning rate.
# TODO: check how the learning rate changes over the course of training.
# TODO: log both the starting learning rate (as a param) and the scheduled learning rate.
# TODO: add early stopping (metric and threshold): stop if the metric does not improve after 5 batches/epochs.

In [0]:
import mlflow
from accelerate import Accelerator
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import EarlyStoppingCallback, get_scheduler

# Select the MLflow experiment that later runs are recorded under.
experiment_name = "/Users/Ajay.Kamalakannan@edelmandxi.com/PyTorch_Projects/text-translation-experiment"
mlflow.set_experiment(experiment_name)

# Loss object and optimizer over all model parameters.
loss = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-3)

# Linear schedule with no warmup over every training step; the progress bar
# is sized to the same number of steps.
epochs = 1
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
progress_bar = tqdm(range(num_training_steps))

model.train()
total_loss = 0

# Hand the loaders, model and optimizer to Accelerate.
accelerator = Accelerator()
train_dataloader, test_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, test_dataloader, model, optimizer
)

# Print the learning rate of each parameter group; `lr` keeps the last one.
for group in lr_scheduler.optimizer.param_groups:
  lr = group["lr"]
  print(lr)


In [0]:
from sacrebleu import raw_corpus_bleu

# Notes on experiment layout:
# - 2 experiments (training and testing).
# - Without fine-tuning: testing only.
# - 1 experiment with 2 separate runs (1 per hypothesis); keep 1 experiment per project when logging training runs.
#   - Run 1: load the base model and calculate performance metrics as a single run.
#   - Run 2: run the training loop with fine-tuning and show the results.
# - Train for a couple of epochs on the specific task.

In [0]:
# Tracking part of MLflow
# Artifact or model that is deployment-ready (pyfunc)
# Attach and detach cluster
import evaluate

# Load the "sacrebleu" metric object that accumulates prediction/reference batches.
metric = evaluate.load(path="sacrebleu")

In [0]:
# Fine-tune for `epochs` epochs, evaluate BLEU on the test loader after each
# epoch, and record learning rate, loss and BLEU in a single MLflow run.
accumulation_steps = 10  # NOTE(review): not used by the loop below; kept in case other cells read it
pad_token_id = tokenizer.pad_token_id
total_loss = 0  # reset so re-running this cell does not add to an earlier run's total
global_step = 0  # number of optimizer steps taken; also the x-axis for per-step metrics

with mlflow.start_run():
  for epoch in range(epochs):
    # ---- training ----
    model.train()  # evaluation below switches to eval mode, so re-enable training mode each epoch
    for batch in train_dataloader:
      # Labels arrive padded with the pad token id; set those positions to
      # -100 so the loss skips padding instead of learning to predict it.
      labels = batch["labels"].masked_fill(batch["labels"] == pad_token_id, -100)
      outputs = model(
          input_ids=batch["input_ids"],
          attention_mask=batch["attention_mask"],  # keep the encoder from attending to input padding
          labels=labels,
      )
      loss_values = outputs.loss
      step_loss = loss_values.item()

      # Log the learning rate actually in effect for this step, not the
      # value that was captured before training started.
      current_lr = lr_scheduler.get_last_lr()[0]
      mlflow.log_metric("Learning Rate", current_lr, step=global_step)
      mlflow.log_metric("Training Loss", step_loss, step=global_step)
      total_loss += step_loss

      optimizer.zero_grad()
      accelerator.backward(loss_values)
      optimizer.step()
      lr_scheduler.step()

      global_step += 1
      progress_bar.set_postfix(loss=step_loss)  # show the loss on the bar instead of printing every batch
      progress_bar.update(1)

    # ---- evaluation ----
    model.eval()  # turn dropout off so generation is deterministic
    for batch in tqdm(test_dataloader):
      with torch.inference_mode():
        gen_tokens = accelerator.unwrap_model(model).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=128,
        )
        labels = batch["labels"]

        # Pad to a common length across processes before gathering.
        gen_tokens = accelerator.pad_across_processes(
            gen_tokens, dim=1, pad_index=pad_token_id
        )
        labels = accelerator.pad_across_processes(
            labels, dim=1, pad_index=pad_token_id
        )
        # The tokenizer cannot decode -100, so map any such label back to the pad id.
        labels = labels.masked_fill(labels == -100, pad_token_id)

        pred_gathered = accelerator.gather(gen_tokens).cpu().numpy()
        # Decode the gathered labels so predictions and references cover the same rows.
        labels_gathered = accelerator.gather(labels).cpu().numpy()

        decoded_preds = tokenizer.batch_decode(pred_gathered, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels_gathered, skip_special_tokens=True)

        decoded_preds = [pred.strip() for pred in decoded_preds]
        # Each prediction gets a list of references (a single one here).
        decoded_labels = [[label.strip()] for label in decoded_labels]

        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Score once per epoch so batches from different epochs are never mixed.
    results = metric.compute()
    print(f"The average bleu score: {results['score']}")
    mlflow.log_metric("Bleu Metric", results['score'], step=epoch)

  # Average over the batches actually processed, across all epochs.
  avg_loss = total_loss / max(global_step, 1)
  print(f"Average Training Loss: {avg_loss}")
  mlflow.log_metric("Average Training Loss", avg_loss)