### If you don't want to use Weights & Biases (wandb), disable it here; otherwise this step is optional

References for wandb:
https://analyticsindiamag.com/hands-on-guide-to-weights-and-biases-wandb-with-python-implementation/

https://docs.wandb.ai/


In [None]:
import os

# Opt out of Weights & Biases logging for this session.
os.environ.update({"WANDB_DISABLED": "true"})

# Install required packages

In [None]:
# Install the third-party packages imported by the cells below.
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Make sure your version of Transformers is at least 4.11.0, since the functionality used below was introduced in that version.

In [None]:
import transformers
# Show the installed version so it can be compared with the minimum stated above.
print(transformers.__version__)

In [None]:
# Identifier of the pretrained checkpoint; both the tokenizer and the model are
# loaded from it via `from_pretrained` further down.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

You can directly call this tokenizer on one sentence or a pair of sentences:

In [None]:
# Demo: tokenize a batch of two sentences; the notebook displays the returned encoding.
tokenizer(["Hello, this one sentence!", "This is another sentence."])

To prepare the targets for our model, we need to tokenize them inside the as_target_tokenizer context manager. This will make sure the tokenizer uses the special tokens corresponding to the targets:

In [None]:
# Demo: the same two sentences tokenized while the tokenizer is in target mode.
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Hello, this one sentence!", "This is another sentence."]))

We can then write the function that will preprocess our samples. We just feed them to the tokenizer with the argument truncation=True. This ensures that an input longer than what the selected model can handle is truncated to the maximum length accepted by the model. Padding is normally dealt with later on (in a data collator), so that examples are padded to the longest length in the batch and not the whole dataset; note that the function below additionally passes padding=True, which pads every example to the longest sequence in the call.

In [None]:
prefix = ""  # text prepended to every source sentence (empty here)
max_input_length = 128  # truncation limit, in tokens, for source sentences
max_target_length = 128  # truncation limit, in tokens, for target sentences
source_lang = "eng"  # column holding the source-language sentences
target_lang = "hi"  # column holding the target-language sentences


def preprocess_function(examples):
    """Tokenize the source and target columns of ``examples`` for seq2seq training.

    Args:
        examples: Table indexed by column name whose ``source_lang`` and
            ``target_lang`` columns provide ``.tolist()`` (the notebook passes
            pandas DataFrames).

    Returns:
        The tokenizer output for the source sentences, with an added
        ``"labels"`` entry containing the target token ids. Padding positions
        in the labels are set to -100.
    """
    inputs = [prefix + ex for ex in examples[source_lang].tolist()]
    targets = examples[target_lang].tolist()
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding=True)
    # padding=True fills shorter targets with the pad token. Replace those ids
    # with -100 (the value compute_metrics later maps back to the pad token) so
    # that padded positions are not treated as real target tokens by the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

In [None]:
import pandas as pd

# Column names for the header-less, tab-separated Hindi Visual Genome files.
cols = ['image_id', 'X', 'Y', 'Width', 'Height', 'eng', 'hi']


def _read_split(split):
    """Read one Hindi Visual Genome split (``"train"`` or ``"dev"``) into a DataFrame."""
    return pd.read_csv(
        f'../input/wat-2022-shared-task-image-captioning-task/hindi-visual-genome-{split}.txt',
        sep="\t",
        names=cols,
        header=None,
    )


train_df = _read_split("train")
dev_df = _read_split("dev")

In [None]:
# Tokenize the complete training DataFrame in a single call.
train_tokenized_datasets = preprocess_function(train_df)#raw_datasets.map(preprocess_function, batched=True)

# Tokenize the complete dev DataFrame the same way.
dev_tokenized_datasets = preprocess_function(dev_df)

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
batch_size = 32
model_name = model_checkpoint.split("/")[-1]
# Language codes used only to build the output directory name. They live in
# their own variables so that the `source_lang` / `target_lang` globals, which
# `preprocess_function` uses as DataFrame column names ("eng" / "hi"), are not
# overwritten by running this cell.
run_source_lang = "en"
run_target_lang = "hi"
args = Seq2SeqTrainingArguments(
    # Output directory, e.g. "opus-mt-en-hi-finetuned-en-to-hi"; the inference
    # cell later loads a checkpoint from this directory.
    f"{model_name}-finetuned-{run_source_lang}-to-{run_target_lang}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    # Evaluate with generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
)

In [None]:
# Batch collator passed to the trainer below; built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

The last thing to define for our Seq2SeqTrainer is how to compute the metrics from the predictions. We need to define a function for this, which will just use the metric we load in the next cell, and we have to do a bit of pre-processing to decode the predictions into texts:

In [None]:
from datasets import  load_metric
# sacreBLEU scorer used by `compute_metrics`.
# NOTE(review): newer `datasets` releases appear to have deprecated `load_metric`
# in favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("sacrebleu")

In [None]:
import numpy as np
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the metric.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: the stripped predictions as a flat list,
        and the stripped references each wrapped in a one-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per prediction, hence the single-item list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels
def compute_metrics(eval_preds):
    """Compute sacreBLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element holds the token ids.

    Returns:
        Dict with ``"bleu"`` (the sacreBLEU score) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is a padding marker, not a vocabulary id. Predictions can contain it
    # as well as labels, so map it back to the pad token before decoding; this
    # also keeps those positions out of the gen_len count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Then we just need to pass all of this along with our datasets to the Seq2SeqTrainer:

In [None]:
prefix = ""  # text prepended to every source sentence (empty here)
max_input_length = 128  # truncation limit, in tokens, for source sentences
max_target_length = 128  # truncation limit, in tokens, for target sentences
source_lang = "eng"  # column holding the source-language sentences
# The DataFrames in this notebook are read with the columns
# image_id, X, Y, Width, Height, eng, hi - there is no "ml" column, so the
# target column must be "hi".
target_lang = "hi"


def preprocess_function(examples):
    """Tokenize the source and target columns of ``examples`` for seq2seq training.

    Args:
        examples: Table indexed by column name whose ``source_lang`` and
            ``target_lang`` columns provide ``.tolist()`` (the notebook passes
            pandas DataFrames).

    Returns:
        The tokenizer output for the source sentences, with an added
        ``"labels"`` entry containing the target token ids. Padding positions
        in the labels are set to -100.
    """
    inputs = [prefix + ex for ex in examples[source_lang].tolist()]
    targets = examples[target_lang].tolist()
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding=True)
    # padding=True fills shorter targets with the pad token. Replace those ids
    # with -100 (the value compute_metrics maps back to the pad token) so that
    # padded positions are not treated as real target tokens by the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

In [None]:
import torch
class ModelDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with their label sequences.

    Args:
        inputs: Indexable collection of input token-id sequences.
        targets: Indexable collection of label token-id sequences, aligned
            with ``inputs``. Its length defines the dataset length.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Return the number of examples (the number of target sequences)."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return example ``index`` as ``{"input_ids": tensor, "labels": tensor}``."""
        # Coerce to a plain int so non-int index objects work on list storage.
        index = int(index)
        return {
            "input_ids": self._as_sequence_tensor(self.inputs[index]),
            "labels": self._as_sequence_tensor(self.targets[index]),
        }

    @staticmethod
    def _as_sequence_tensor(ids):
        """Convert ``ids`` to a tensor with singleton dims removed, at least 1-D."""
        # squeeze() removes size-1 dimensions (e.g. a leading batch dim of 1),
        # but it would also turn a one-token sequence into a 0-dim scalar;
        # atleast_1d restores the sequence dimension in that case.
        return torch.atleast_1d(torch.tensor(ids).squeeze())

In [None]:
# Wrap the tokenized ids and labels of each split in a torch Dataset for the trainer.
traindata=ModelDataset(train_tokenized_datasets['input_ids'],train_tokenized_datasets['labels'])
devdata=ModelDataset(dev_tokenized_datasets['input_ids'],dev_tokenized_datasets['labels'])

In [None]:
# Assemble the trainer from the model, the training arguments, both datasets,
# the batch collator, the tokenizer and the metric function defined above.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=traindata,
    eval_dataset=devdata,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trained for a total of 20 epochs

We can now finetune our model by just calling the train method:

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

In [None]:
import torch
# model.to('cpu')

# inference

In [None]:
import pandas as pd

# Challenge test split: tab-separated, no header row, same column names as the
# training files.
ch = pd.read_csv(
    "../input/wat-2022-shared-task-image-captioning-task/hindi-visual-genome-challenge-test-set.txt",
    sep="\t",
    header=None,
    names=['image_id', 'X', 'Y', 'Width', 'Height', 'eng', 'hi'],
)
ch

In [None]:
# English sentences of the challenge set as a plain Python list.
inp_ch=ch['eng'].tolist()

In [None]:
# Number of challenge sentences to translate.
len(inp_ch)

In [None]:
import torch
# Display the name of CUDA device 0.
torch.cuda.get_device_name(0)

In [None]:
# Move the fine-tuned model to CUDA device 0.
model.to(0)

In [None]:
from numba import cuda
# NOTE(review): this selects CUDA device 0 and then closes it via numba, right
# after the model was moved to that device. Presumably meant to free GPU memory;
# confirm that torch can still use the GPU afterwards.
cuda.select_device(0)
cuda.close()

In [None]:
# Show current GPU status.
!nvidia-smi

In [None]:
# NOTE(review): `device` is computed but neither the model nor the inputs are
# moved to it in this cell, so generation runs wherever from_pretrained places
# the model - confirm whether GPU inference was intended.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the fine-tuned weights from a checkpoint saved in the training output directory.
inf_model = AutoModelForSeq2SeqLM.from_pretrained('./opus-mt-en-hi-finetuned-en-to-hi/checkpoint-27000')

# Write one translation per line, in the order of `inp_ch`.
# - "w" (as in the test-set cell) instead of "a": re-running the cell must not
#   append a second copy of the output.
# - explicit UTF-8 so the Hindi text does not depend on the platform default.
# - the context manager closes the file even if generation raises.
with open("ch_op.txt", "w", encoding="utf-8") as opf:
    for i, sentence in enumerate(inp_ch):
        print(i)  # progress indicator
        translated = inf_model.generate(**tokenizer([sentence], return_tensors="pt", padding=True))
        # A single input sentence is passed, so only the first output sequence is needed.
        op = tokenizer.decode(translated[0], skip_special_tokens=True)
        opf.write(str(op) + '\n')

In [None]:
# NOTE(review): `te` is only assigned in the following cell, so running the
# notebook top-to-bottom raises NameError here - run that cell first.
te

In [None]:

# Test split: tab-separated, no header row, same column names as the training files.
te = pd.read_csv(
    "../input/wat-2022-shared-task-image-captioning-task/hindi-visual-genome-test.txt",
    names=['image_id', 'X', 'Y', 'Width', 'Height', 'eng', 'hi'],
    sep="\t",
    header=None,
)
tlist = te['eng'].tolist()

# Write one translation per line, in the order of `tlist`. Explicit UTF-8 keeps
# the Hindi output independent of the platform default encoding, and the
# context manager closes the file even if generation raises.
with open("hi_test_op.txt", "w", encoding="utf-8") as opf:
    for i, sentence in enumerate(tlist):
        print(i)  # progress indicator
        translated = inf_model.generate(**tokenizer([sentence], return_tensors="pt", padding=True))
        # A single input sentence is passed, so only the first output sequence is needed.
        op = tokenizer.decode(translated[0], skip_special_tokens=True)
        opf.write(str(op) + '\n')

In [None]:
# Spot-check the fine-tuned model on one hand-written English sentence.
s = "A rectangular picture on a yellow wall."

encoded = tokenizer([s], return_tensors="pt", padding=True)
translated = inf_model.generate(**encoded)
# One input sentence, so the first generated sequence is the translation.
op = tokenizer.decode(translated[0], skip_special_tokens=True)
print(op)