In [None]:
import json

# Forward slashes are accepted by Python on Windows and POSIX alike, and avoid
# the invalid "\d" / "\w" escape sequences the backslash spelling contained.
file_path = "../datasets/wiki-cloze/bn.json"

# Parse the JSON file into a Python object. The context manager closes the
# file handle even if parsing raises; `f` stays bound (closed) afterwards.
with open(file_path, mode="r", encoding="utf-8") as f:
    json_data = json.load(f)

In [None]:
# Show the top-level keys of the parsed JSON object.
json_data.keys()

In [None]:
# Number of entries stored under the 'cloze_data' key.
len(json_data['cloze_data'])

In [None]:
from datasets import load_dataset

# Forward slashes keep the path valid on every OS and avoid the invalid
# "\d" / "\c" escape sequences of the backslash spelling.
# split="train": files passed through `data_files` without a split mapping are
# exposed under the "train" split by the json loader.
train_dataset = load_dataset(
    "json",
    data_files="../datasets/copa-translated/hi/train.jsonl",
    split="train",
)

In [None]:
# Forward slashes keep the path valid on every OS and avoid the invalid
# "\d" / "\c" escape sequences of the backslash spelling.
# split="train" is intentional: a file passed through `data_files` without a
# split mapping is exposed under the "train" split, whatever the file holds.
val_dataset = load_dataset(
    "json",
    data_files="../datasets/copa-translated/hi/val.jsonl",
    split="train",
)

In [None]:
# Forward slashes keep the path valid on every OS and avoid the invalid
# "\d" / "\c" escape sequences of the backslash spelling.
# split="train" is intentional: a file passed through `data_files` without a
# split mapping is exposed under the "train" split, whatever the file holds.
test_dataset = load_dataset(
    "json",
    data_files="../datasets/copa-translated/hi/test.jsonl",
    split="train",
)

In [None]:
from datasets import Dataset, DatasetDict

# Bundle the three separately loaded splits into one DatasetDict so they can
# be formatted and mapped together.
datasets = DatasetDict(
    {
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset,
    }
)

In [None]:
# from datasets import load_dataset

# datasets = load_dataset("indic_glue","copa.hi")

In [None]:
# Display the DatasetDict to check its splits.
datasets

In [None]:
# Switch the output format to pandas; the next cell calls the pandas method
# value_counts() on the "label" column.
datasets.set_format("pandas")

In [None]:
# Tally how often each label value occurs in the training split, then derive
# the number of distinct labels from the size of that tally's index.
label_counts = datasets["train"]["label"].value_counts()
num_labels = len(label_counts.index)

In [None]:
# Display the per-label counts of the training split.
label_counts

In [None]:
# Undo the earlier set_format("pandas") call so later cells get the default output format.
datasets.reset_format()

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    PreTrainedTokenizerFast,
)

# Tokenizer matching the 'ai4bharat/indic-bert' checkpoint.
# NOTE(review): keep_accents=True presumably stops the tokenizer from stripping
# diacritics in Indic scripts - confirm against the model card.
tokenizer = AutoTokenizer.from_pretrained(
    'ai4bharat/indic-bert',
    keep_accents=True,
)
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased", max_length=128)

In [None]:
# Column names holding the candidate answers; read by preprocess_function.
choice_names = ['choice1', 'choice2']

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples as (premise, choice) sentence pairs.

    Args:
        examples: Batch mapping that provides a "premise" column and one
            column per entry of the module-level ``choice_names``; every
            column is indexed by example position.

    Returns:
        A dict with the same keys the tokenizer returns. Each value is a list
        with one entry per example, and each entry is the list of
        ``len(choice_names)`` encodings for that example, in the order of
        ``choice_names``.
    """
    # Derive the group size from choice_names so the premise repetition, the
    # choice list and the regrouping below can never disagree.
    num_choices = len(choice_names)
    num_examples = len(examples["premise"])

    # Repeat every premise once per choice: [p0, p0, p1, p1, ...].
    premises = [
        context for context in examples["premise"] for _ in range(num_choices)
    ]
    # Matching flat list of choices: [p0-choice1, p0-choice2, p1-choice1, ...].
    choices = [
        str(examples[name][i]) for i in range(num_examples) for name in choice_names
    ]

    tokenized_examples = tokenizer(premises, choices, max_length=128, truncation=True)

    # Regroup the flat encodings into one group of num_choices per example.
    return {
        key: [
            values[start : start + num_choices]
            for start in range(0, len(values), num_choices)
        ]
        for key, values in tokenized_examples.items()
    }

In [None]:
# Smoke-test the preprocessing on a one-example slice of the training split.
temp = preprocess_function(datasets["train"][:1])
temp

In [None]:
# Display the same one-example slice in its raw, untokenized form.
datasets["train"][:1]

In [None]:
# Decode each encoded (premise, choice) sequence of the first example back to
# text to verify the pairing.
for token_ids in temp['input_ids'][0]:
    decoded_text = tokenizer.decode(token_ids)
    print(decoded_text)

In [None]:
# Run preprocess_function over the DatasetDict in batched mode.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

In [None]:
# Inspect the first preprocessed training example.
tokenized_datasets["train"][0]

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Every feature is a mapping holding a label (under "label" or "labels")
    plus tokenizer outputs whose values are lists with one entry per choice.
    Calling the collator returns a dict of tensors viewed as
    (batch_size, num_choices, -1), plus an int64 "labels" tensor.

    The input features are not modified, so the same list can be collated
    more than once.
    """

    # Tokenizer whose pad() method is used to pad the flattened encodings.
    tokenizer: PreTrainedTokenizerBase
    # The next three fields are forwarded unchanged to tokenizer.pad().
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into padded tensors.

        Args:
            features: Non-empty list of mappings as described on the class.

        Returns:
            Dict of tensors, one per key returned by ``tokenizer.pad`` (each
            viewed as (batch_size, num_choices, -1)) plus "labels".
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without pop(): popping would mutate the caller's
        # dicts and make a second call on the same features raise KeyError.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat dict per (example, choice) pair, built in a single pass
        # rather than by repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (example, choice) grouping that flattening removed.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [None]:
import evaluate

# Load the "accuracy" metric through the evaluate library; compute_metrics calls its compute().
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into (predictions, labels). The predicted class of
    each row is the argmax of the predictions along axis 1, and the result of
    ``accuracy.compute`` is returned as-is.
    """
    scores, references = eval_pred
    predicted_choice = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_choice, references=references)

In [None]:
from transformers import set_seed
# Fix the random seed through transformers.set_seed before the model is created.
set_seed(2)
# set_seed(80)

In [None]:
from transformers import (
    AutoModel,
    AutoModelForMultipleChoice,
    Trainer,
    TrainingArguments,
)

# Load the 'ai4bharat/indic-bert' checkpoint through the multiple-choice
# auto-model class.
model = AutoModelForMultipleChoice.from_pretrained(
    'ai4bharat/indic-bert',
)
# model =  AutoModelForMultipleChoice.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Disable Weights & Biases logging by setting WANDB_DISABLED before the Trainer is built.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# temp_data = tokenized_datasets.remove_columns(['premise','choice1','choice2','question','idx'])
# temp = [temp_data["train"][i]for i in range(5)]

In [None]:
# batch = data_collator(temp)
# batch

In [None]:
# for sample in batch["input_ids"].tolist():
#     for choice in sample:
#         print(tokenizer.decode(choice))

In [None]:
# datasets["train"][:5]

In [None]:
# Hyper-parameters for fine-tuning; evaluation and checkpointing both happen
# once per epoch.
training_args = TrainingArguments(
    output_dir="mbert_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #learning_rate=5e-5,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    #push_to_hub=True,
    # fp16 mixed precision needs a GPU; requesting it unconditionally makes the
    # notebook fail on a CPU-only machine, so enable it only when CUDA exists.
    fp16=torch.cuda.is_available(),
)

# Train on the tokenized train split, evaluate on the validation split, and
# batch examples with the multiple-choice collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on the validation split before fine-tuning, as a baseline.
trainer.evaluate()

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate on the held-out test split.
trainer.evaluate(tokenized_datasets["test"])