In [None]:
import os
# Intended to switch off the Weights & Biases logging integration for this run.
# NOTE(review): the captured output of the training-arguments cell reports that
# this environment variable is deprecated in favour of the `report_to` option.
os.environ["WANDB_DISABLED"]="true"

# Install required packages

In [None]:
# Notebook dependencies. The captured output of this cell shows the run resolving
# to datasets 2.3.2, transformers 4.20.1, sacrebleu 2.1.0 and sentencepiece 0.1.96.
# NOTE(review): no versions are pinned here — consider pinning if a re-run must match.
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 4.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 66.6 MB/s 
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.1.0-py3-none-any.whl (92 kB)
[K     |████████████████████████████████| 92 kB 15.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 61.8 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 74.6 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux

In [None]:
import transformers
# Record the installed transformers version alongside the results.
print(transformers.__version__)

4.20.1


# Fine-tuning a model on a translation task

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-mul"

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to `model_checkpoint`; it is reused for
# preprocessing, metric decoding and inference in the cells below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/690k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35M [00:00<?, ?B/s]



In [None]:
# Sanity check: tokenize two sample sentences in the default (source) mode and
# display the resulting `input_ids` and `attention_mask`.
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[13078, 3, 72, 151, 18802, 58, 0], [304, 17, 675, 18802, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [None]:
# Tokenize the same sentences inside the target-tokenizer context; the printed
# ids differ from the source-mode ids shown by the previous cell.
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[17142, 146, 3, 72, 4, 1990, 4, 5724, 3811, 58, 0], [6720, 271, 17, 2058, 6438, 4, 5724, 3811, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
# Preprocessing configuration read by `preprocess_function`.
prefix = ""  # text prepended to every source sentence (empty: nothing is added)
max_input_length = 128  # source sequences are truncated to this many tokens
max_target_length = 128  # target sequences are truncated to this many tokens
source_lang = "en"  # column holding the source sentences
target_lang = "bn"  # column holding the reference translations


def preprocess_function(examples):
    """Tokenize the source and target columns of ``examples``.

    Args:
        examples: Table-like object whose ``source_lang`` and ``target_lang``
            columns support ``.tolist()`` (the notebook passes a pandas
            DataFrame).

    Returns:
        The tokenizer output for the source sentences with an additional
        ``"labels"`` entry: the target token ids, padded to a common length,
        where every padding position holds -100 instead of the pad token id.
    """
    inputs = [prefix + ex for ex in examples[source_lang].tolist()]
    targets = list(examples[target_lang].tolist())
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding=True)
    # The targets are padded here, so padded positions must be marked with -100
    # (the index ignored by the loss); otherwise the model is also trained to
    # predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
import pandas as pd
# Column names kept for reference; `cols` is not used by any later cell visible here.
cols=['image_id', 'X', 'Y', 'Width','Height','en','bn']
# Read the two CSV files from the working directory (presumably the training
# and development splits, going by the file names).
train_df=pd.read_csv('bn_train.csv')
dev_df=pd.read_csv('bn_dev.csv')

In [None]:
# Peek at the `en` column, which is the source side used for preprocessing.
train_df['en']

0                                 it is an indoor scene
1                            Computer screens turned on
2                                    man has short hair
3                    photo album open on an adult's lap
4        there is a group of girls beside the black car
                              ...                      
28925                            two halves of sandwich
28926                      an elephant standing outside
28927                             blue umbrella on deck
28928                                   this is a sheep
28929                         a train's left tail light
Name: en, Length: 28930, dtype: object

In [None]:
# Tokenize each DataFrame in one call; `preprocess_function` reads whole columns
# via `.tolist()`, so no `datasets.map(..., batched=True)` step is involved.
train_tokenized_datasets = preprocess_function(train_df)  # returns the tokenizer output plus a "labels" entry

dev_tokenized_datasets = preprocess_function(dev_df)

## Fine-tuning the model

Now that our data is ready, we can download the pretrained model and fine-tune it. Since our task is of the sequence-to-sequence kind, we use the `AutoModelForSeq2SeqLM` class. As with the tokenizer, the `from_pretrained` method will download and cache the model for us.

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained sequence-to-sequence model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/296M [00:00<?, ?B/s]

To instantiate a Seq2SeqTrainer, we will need to define three more things. The most important is the [Seq2SeqTrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#transformers.Seq2SeqTrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [None]:
# Training configuration.
batch_size = 128  # used for both the per-device train and eval batch size
model_name = model_checkpoint.split("/")[-1]  # checkpoint name without the organisation prefix
source_lang = "en"
target_lang = "bn"
args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints, named after the model and language pair.
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=25,
    # Evaluate with `generate` so the metric function receives token ids.
    predict_with_generate=True,
    # Turn logging integrations off explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable. Note that this disables
    # every integration (e.g. TensorBoard as well), not just Weights & Biases.
    report_to="none",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Collator built from the tokenizer and model; it is passed to the trainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from datasets import  load_metric
# SacreBLEU metric object; `compute_metrics` calls `metric.compute(...)` and reads its "score" entry.
metric = load_metric("sacrebleu")

In [None]:
import numpy as np
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped prediction strings and a
        list with one single-element list per stripped reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        # Each prediction gets exactly one reference, wrapped in its own list.
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the sacreBLEU score) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), both rounded to four
        decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    # -100 is a padding marker, not a vocabulary id, and cannot be decoded.
    # Predictions can carry it as well as labels, so replace it in both.
    preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Preprocessing configuration read by `preprocess_function`.
# The language codes match the `en` / `bn` columns of the CSV files and the
# values used for the training arguments; the previous "eng" / "ml" values did
# not correspond to any column used in this notebook.
prefix = ""  # text prepended to every source sentence (empty: nothing is added)
max_input_length = 128  # source sequences are truncated to this many tokens
max_target_length = 128  # target sequences are truncated to this many tokens
source_lang = "en"
target_lang = "bn"


def preprocess_function(examples):
    """Tokenize the source and target columns of ``examples``.

    Args:
        examples: Table-like object whose ``source_lang`` and ``target_lang``
            columns support ``.tolist()`` (the notebook passes a pandas
            DataFrame).

    Returns:
        The tokenizer output for the source sentences with an additional
        ``"labels"`` entry: the target token ids, padded to a common length,
        where every padding position holds -100 instead of the pad token id.
    """
    inputs = [prefix + ex for ex in examples[source_lang].tolist()]
    targets = list(examples[target_lang].tolist())
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding=True)
    # The targets are padded here, so padded positions must be marked with -100
    # (the index ignored by the loss); otherwise the model is also trained to
    # predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

In [None]:
import torch
class ModelDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        inputs: Indexable collection; ``inputs[i]`` is converted to the
            ``"input_ids"`` tensor of example ``i``.
        targets: Indexable collection; ``targets[i]`` is converted to the
            ``"labels"`` tensor of example ``i``. Its length defines the
            dataset length.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Return the number of examples, taken from ``targets``."""
        return len(self.targets)

    @staticmethod
    def _as_sequence(values):
        """Convert ``values`` to a tensor with size-1 dimensions squeezed out.

        A one-token sequence would be squeezed down to a 0-dim scalar, which no
        longer has a length; it is kept as a 1-D tensor with a single element.
        """
        tensor = torch.tensor(values).squeeze()
        if tensor.dim() == 0:
            tensor = tensor.reshape(1)
        return tensor

    def __getitem__(self, index):
        """Return ``{"input_ids": ..., "labels": ...}`` for example ``index``."""
        # Samplers may hand over non-builtin integer types; normalise first.
        index = int(index)
        return {
            "input_ids": self._as_sequence(self.inputs[index]),
            "labels": self._as_sequence(self.targets[index]),
        }

In [None]:
# Wrap the tokenized ids in datasets for the trainer. Only `input_ids` and
# `labels` are passed on; the tokenizer's `attention_mask` is not.
# NOTE(review): confirm the collator's handling of the already-padded inputs is as intended.
traindata=ModelDataset(train_tokenized_datasets['input_ids'],train_tokenized_datasets['labels'])
devdata=ModelDataset(dev_tokenized_datasets['input_ids'],dev_tokenized_datasets['labels'])

In [None]:
# Assemble the trainer from the pieces prepared above: model, training
# arguments, train/dev datasets, collator, tokenizer and metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=traindata,
    eval_dataset=devdata,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# trained for a total of 20 epochs

We can now fine-tune our model by calling the trainer's `train` method:

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
import torch
# model.to('cpu')

# inference

In [None]:
import pandas as pd

# Load the challenge-set CSV from the working directory and display it.
ch=pd.read_csv("bn_challenge.csv")
ch

In [None]:
# Source sentences of the challenge set as a plain Python list.
inp_ch=ch['en'].tolist()

In [None]:
# Number of challenge sentences to translate.
len(inp_ch)

In [None]:
# torch.cuda.get_device_name(0)

In [None]:
# Move the model to the CPU before inference. As the last expression of the
# cell, the call's return value is also displayed.
model.to('cpu')

In [None]:
# from numba import cuda
# cuda.select_device(0)
# cuda.close()

In [None]:
# Show the current GPU status reported by the NVIDIA command-line tool.
!nvidia-smi

In [None]:
# model = MarianMTModel.from_pretrained(model_name)
# src_text = ['silver car is parked']
# src.

# Kept for reference; inference below runs on whatever device `model` currently occupies.
device = "cuda" if torch.cuda.is_available() else "cpu"
# To translate with a saved checkpoint instead, load it with
# AutoModelForSeq2SeqLM.from_pretrained(<checkpoint directory>) and use it in place of `model`.

# Translate the challenge sentences one at a time, writing one translation per line.
# - "w" instead of "a": the loop always starts at the first sentence, so
#   appending on a re-run would duplicate lines and misalign the file with the inputs.
# - explicit UTF-8: the output is non-ASCII text.
# - `with`: the file is closed even if generation fails midway.
with open("ch_op_bn.txt", "w", encoding="utf-8") as opf:
    for i, sentence in enumerate(inp_ch):
        print(i)
        # Put the inputs on the model's device so this works on CPU and GPU alike.
        batch = tokenizer([sentence], return_tensors="pt", padding=True).to(model.device)
        translated = model.generate(**batch)
        op = tokenizer.decode(translated[0], skip_special_tokens=True)
        opf.write(str(op) + '\n')

In [None]:
# Load the test split here so this preview also works on a fresh kernel:
# `te` was otherwise only assigned by a later cell, which made this cell raise
# NameError under Restart & Run All.
te = pd.read_csv("bn_test.csv")
te

In [None]:

# Load the test split and collect its source sentences.
te = pd.read_csv("bn_test.csv")
tlist = te['en'].tolist()

# Translate the test sentences one at a time, writing one translation per line.
# - explicit UTF-8: the output is non-ASCII text.
# - `with`: the file is closed even if generation fails midway.
with open("bn_test_op.txt", "w", encoding="utf-8") as opf:
    for i, sentence in enumerate(tlist):
        print(i)
        # Put the inputs on the model's device so this works on CPU and GPU alike.
        batch = tokenizer([sentence], return_tensors="pt", padding=True).to(model.device)
        translated = model.generate(**batch)
        op = tokenizer.decode(translated[0], skip_special_tokens=True)
        opf.write(str(op) + '\n')

In [None]:
# Translate a single example sentence and print the result.
s = "A rectangular picture on a yellow wall."

# `inf_model` is not defined in any executable cell (only in a commented-out
# line), so the fine-tuned `model` is used here.
batch = tokenizer([s], return_tensors="pt", padding=True).to(model.device)
translated = model.generate(**batch)
op = tokenizer.decode(translated[0], skip_special_tokens=True)
print(op)