In [1]:
import json
# Path to the Bengali wiki-cloze file. A raw string keeps the Windows-style
# backslashes literal; the previous spelling only worked because "\d" and "\w"
# are unrecognised escape sequences, which newer Python versions warn about.
# The resulting path value is unchanged.
file_path = r"..\datasets\wiki-cloze\bn.json"

# Parse the JSON document into a dictionary. The context manager closes the
# file handle as soon as parsing is done instead of leaking it for the rest of
# the kernel session.
with open(file_path, mode="r", encoding="utf-8") as f:
    json_data = json.load(f)

In [2]:
# Show the top-level keys of the parsed JSON object.
json_data.keys()

dict_keys(['params', 'metadata', 'cloze_data'])

In [3]:
# Number of entries stored under the 'cloze_data' key.
len(json_data['cloze_data'])

38845

In [6]:
from datasets import load_dataset

# Load the Hindi COPA training file (JSON lines) from disk.
# Raw string: keeps the backslashes literal without depending on "\d" / "\c"
# being unrecognised escape sequences; the path value itself is unchanged.
# split="train" selects the split under which the "json" loader exposes a
# single data file, so a Dataset (not a DatasetDict) is returned.
train_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\train.jsonl",
    split="train",
)

Using custom data configuration default-6ecfa560884c9a31
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-6ecfa560884c9a31/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [7]:
# Load the Hindi COPA validation file (JSON lines) from disk.
# Raw string: keeps the backslashes literal without depending on "\d" / "\c"
# being unrecognised escape sequences; the path value itself is unchanged.
# split="train" is still correct here: it names the split under which the
# "json" loader exposes a single data file, not the role of the data.
val_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\val.jsonl",
    split="train",
)

Using custom data configuration default-d361c8987e918d36
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-d361c8987e918d36/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [8]:
# Load the Hindi COPA test file (JSON lines) from disk.
# Raw string: keeps the backslashes literal (in particular "\t" of "\test"
# must not become a tab) without depending on "\d" / "\c" being unrecognised
# escape sequences; the path value itself is unchanged.
# split="train" is still correct here: it names the split under which the
# "json" loader exposes a single data file, not the role of the data.
test_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\test.jsonl",
    split="train",
)

Using custom data configuration default-07180908d3559f11
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-07180908d3559f11/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [9]:
from datasets import Dataset, DatasetDict

# Bundle the three separately loaded splits into one DatasetDict keyed by
# split name.
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [10]:
# from datasets import load_dataset

# datasets = load_dataset("indic_glue","copa.hi")

In [11]:
# Display the splits with their column names and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 88
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 449
    })
})

In [12]:
# Switch the output format of all splits to pandas, so that column access
# yields pandas objects (the label counting that follows uses value_counts()).
datasets.set_format("pandas")

In [13]:
# Count how many training examples carry each label value.
label_counts = datasets["train"]["label"].value_counts()
# Number of distinct label values seen in the training split.
num_labels = len(label_counts.index)

In [14]:
# Show the per-label example counts of the training split.
label_counts

1    186
0    176
Name: label, dtype: int64

In [15]:
# Undo the pandas output format set earlier before continuing with tokenization.
datasets.reset_format()

In [16]:
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer belonging to the 'ai4bharat/indic-bert' checkpoint.
# NOTE(review): keep_accents=True presumably stops the tokenizer from stripping
# accent/diacritic marks from the input text — confirm against the model card.
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased", max_length=128)

In [17]:
# Names of the dataset columns that hold the two candidate answers.
choice_names = ['choice1', 'choice2']

In [18]:
def preprocess_function(examples):
    """Tokenize a batch of examples as (premise, choice) sentence pairs.

    Relies on the module-level ``tokenizer`` and ``choice_names``.

    Args:
        examples: batch in dict-of-lists form with a "premise" column and one
            column per entry of ``choice_names``.

    Returns:
        dict mapping every key of the tokenizer output to a list with one
        entry per example; each entry holds ``len(choice_names)`` encodings,
        ordered like ``choice_names``.

    Pairs are truncated to at most 128 tokens; no padding is requested here.
    NOTE(review): the "question" column is not read by this function — confirm
    that ignoring it is intended.
    """
    num_choices = len(choice_names)
    premises = examples["premise"]

    # Repeat each premise once per choice and line the choices up next to it.
    # Both lists are built flat in one pass (example-major order) instead of
    # nesting and then flattening with the quadratic ``sum(lists, [])`` idiom.
    first_sentences = [premise for premise in premises for _ in range(num_choices)]
    second_sentences = [
        str(examples[choice][i])
        for i in range(len(premises))
        for choice in choice_names
    ]

    tokenized_examples = tokenizer(first_sentences, second_sentences, max_length=128, truncation=True)

    # Regroup the flat encodings so each example owns its `num_choices` entries.
    return {
        key: [values[i : i + num_choices] for i in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }

In [19]:
# Sanity check: run the preprocessing on a one-example slice of the training
# split and display the result.
temp = preprocess_function(datasets["train"][:1])
temp

{'input_ids': [[[2,
    1865,
    2384,
    34,
    34144,
    37,
    27193,
    24859,
    15,
    3,
    16481,
    25155,
    205,
    106,
    15,
    3],
   [2,
    1865,
    2384,
    34,
    34144,
    37,
    27193,
    24859,
    15,
    3,
    34144,
    29,
    3758,
    241,
    15,
    3]]],
 'token_type_ids': [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]],
 'attention_mask': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]}

In [20]:
# Untokenized view of the first training example, for comparison.
datasets["train"][:1]

{'premise': ['मेरे शरीर ने घास पर छाया डाली।'],
 'choice1': ['सूरज उग रहा था।'],
 'choice2': ['घास काटी गई।'],
 'question': ['cause'],
 'idx': [0],
 'label': [0]}

In [21]:
# Decode each tokenized premise/choice pair of the first example back to text
# to check how the two sentences were joined.
for chunk in temp['input_ids'][0]:
    print(tokenizer.decode(chunk))

[CLS] मेरे शरीर ने घास पर छाया डाली।[SEP] सूरज उग रहा था।[SEP]
[CLS] मेरे शरीर ने घास पर छाया डाली।[SEP] घास काटी गई।[SEP]


In [22]:
# Tokenize every split; batched=True hands the function batches of examples
# (dict of lists) rather than single rows.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-6ecfa560884c9a31\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-689f636bb2dd6bbc.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-d361c8987e918d36\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-4481a68d9cb49cf1.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-07180908d3559f11\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-cf304c7ea628f93c.arrow


In [23]:
# Inspect the first tokenized training example (original columns plus tokenizer outputs).
tokenized_datasets["train"][0]

{'premise': 'मेरे शरीर ने घास पर छाया डाली।',
 'choice1': 'सूरज उग रहा था।',
 'choice2': 'घास काटी गई।',
 'question': 'cause',
 'idx': 0,
 'label': 0,
 'input_ids': [[2,
   1865,
   2384,
   34,
   34144,
   37,
   27193,
   24859,
   15,
   3,
   16481,
   25155,
   205,
   106,
   15,
   3],
  [2,
   1865,
   2384,
   34,
   34144,
   37,
   27193,
   24859,
   15,
   3,
   34144,
   29,
   3758,
   241,
   15,
   3]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [24]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature is a mapping with a ``label`` (or ``labels``) entry
    plus per-choice lists such as ``input_ids`` (one inner sequence per answer
    choice). The choices of all features are flattened, padded together via
    ``tokenizer.pad`` and reshaped to ``(batch_size, num_choices, -1)``.

    Attributes:
        tokenizer: object whose ``pad`` method performs the padding.
        padding: forwarded unchanged to ``tokenizer.pad``.
        max_length: forwarded unchanged to ``tokenizer.pad``.
        pad_to_multiple_of: forwarded unchanged to ``tokenizer.pad``.
    """

    # The transformers type names are quoted so they act as forward references
    # and are not evaluated when the class object is created.
    tokenizer: "PreTrainedTokenizerBase"
    padding: Union[bool, str, "PaddingStrategy"] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into one padded batch of tensors.

        Args:
            features: non-empty list of per-example mappings as described on
                the class. The mappings are NOT modified.

        Returns:
            dict of tensors: every key returned by ``tokenizer.pad`` viewed as
            ``(batch_size, num_choices, -1)``, plus ``"labels"`` as an int64
            tensor with one entry per example.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of popping them: popping mutated the caller's
        # dicts, so collating the same features a second time raised KeyError.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair in example-major order,
        # without the label. A single comprehension replaces the quadratic
        # ``sum(list_of_lists, [])`` flattening.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (example, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [25]:
# data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [26]:
import evaluate

# Load the "accuracy" metric through the evaluate library.
accuracy = evaluate.load("accuracy")

In [27]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the module-level ``accuracy`` metric.

    The index of the largest value along axis 1 of the predictions is taken as
    the predicted class and compared with the reference labels.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions/references.
    """
    scores, references = eval_pred
    predicted_choice = np.asarray(scores).argmax(axis=1)
    return accuracy.compute(predictions=predicted_choice, references=references)

In [28]:
from transformers import set_seed
# Seed the random number generators so the run can be repeated.
set_seed(2)
# set_seed(80)

In [29]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel

# Load the 'ai4bharat/indic-bert' checkpoint with a multiple-choice head.
# NOTE: the load log reports that some weights of the multiple-choice model are
# not in the checkpoint and are newly initialized.
model = AutoModelForMultipleChoice.from_pretrained('ai4bharat/indic-bert')
# model =  AutoModelForMultipleChoice.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForMultipleChoice: ['predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'sop_classifier.classifier.bias', 'predictions.dense.bias', 'predictions.bias', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized

In [30]:
#  disable weights and biases logging
# NOTE: the Trainer log recorded later in this notebook reports the
# WANDB_DISABLED environment variable as deprecated and points to `report_to`.
import os
os.environ["WANDB_DISABLED"] = "true"

In [31]:
# temp_data = tokenized_datasets.remove_columns(['premise','choice1','choice2','question','idx'])
# temp = [temp_data["train"][i]for i in range(5)]

In [32]:
# batch = data_collator(temp)
# batch

In [33]:
# for sample in batch["input_ids"].tolist():
#     for choice in sample:
#         print(tokenizer.decode(choice))

In [34]:
# datasets["train"][:5]

In [35]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch.
# NOTE(review): output_dir says "mbert_swag", but the model loaded in this
# notebook is 'ai4bharat/indic-bert' — the name looks like a leftover; confirm.
training_args = TrainingArguments(
    output_dir="mbert_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #learning_rate=5e-5,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    #push_to_hub=True,
    # Mixed precision needs a CUDA device; fall back to full precision when
    # none is present so the cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    # Supported way to switch off logging integrations; replaces the deprecated
    # WANDB_DISABLED environment variable.
    report_to="none",
)

# The Trainer evaluates on the validation split by default; the custom collator
# pads the per-choice encodings of each batch and compute_metrics reports
# accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [36]:
# Evaluate on the validation split before any fine-tuning, as a baseline.
trainer.evaluate()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6932427883148193,
 'eval_accuracy': 0.4431818181818182,
 'eval_runtime': 1.7892,
 'eval_samples_per_second': 49.184,
 'eval_steps_per_second': 1.677}

In [37]:
# Fine-tune the model with the settings configured on `trainer`.
# NOTE(review): in the recorded run, training and validation loss stay at
# ~0.693 (about ln 2, i.e. chance level for two choices) in every epoch and
# validation accuracy does not improve — the model does not appear to learn.
# Investigate before relying on these results.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.693221,0.465909
2,No log,0.693248,0.386364
3,No log,0.693226,0.375


TrainOutput(global_step=36, training_loss=0.6930760807461209, metrics={'train_runtime': 5.9853, 'train_samples_per_second': 181.443, 'train_steps_per_second': 6.015, 'total_flos': 2862634904976.0, 'train_loss': 0.6930760807461209, 'epoch': 3.0})

In [38]:
# Evaluate the fine-tuned model on the test split.
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.6931473016738892,
 'eval_accuracy': 0.5100222717149221,
 'eval_runtime': 0.5854,
 'eval_samples_per_second': 767.038,
 'eval_steps_per_second': 25.625,
 'epoch': 3.0}