In [36]:
import json
# Raw string: the original literal relied on the invalid escape sequences "\d"
# and "\w" being passed through unchanged; the resulting path value is identical.
file_path = r"..\datasets\wiki-cloze\bn.json"

# Parse the JSON file into a dictionary; the context manager guarantees the
# file handle is closed even if parsing fails.
with open(file_path, mode="r", encoding="utf-8") as f:
    json_data = json.load(f)

In [37]:
# Inspect the top-level keys of the loaded JSON object.
json_data.keys()

dict_keys(['params', 'metadata', 'cloze_data'])

In [38]:
# Number of entries stored under the 'cloze_data' key.
len(json_data['cloze_data'])

38845

In [39]:
from datasets import load_dataset

# Load the Hindi COPA training file with the generic "json" loader.
# Raw string: the original literal depended on the invalid escapes "\d" and "\c"
# being kept verbatim (a deprecation warning); the path value is unchanged.
train_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\train.jsonl",
    split="train",
)

Using custom data configuration default-6ecfa560884c9a31
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-6ecfa560884c9a31/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [40]:
# Load the Hindi COPA validation file with the generic "json" loader.
# Raw string: the original literal depended on the invalid escapes "\d" and "\c"
# being kept verbatim (a deprecation warning); the path value is unchanged.
# NOTE(review): split="train" appears to name the loader's default split for a
# lone data file, not the COPA training set - confirm.
val_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\val.jsonl",
    split="train",
)

Using custom data configuration default-d361c8987e918d36
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-d361c8987e918d36/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [41]:
# Load the Hindi COPA test file with the generic "json" loader.
# Raw string: the original literal depended on the invalid escapes "\d" and "\c"
# being kept verbatim (a deprecation warning); the path value is unchanged.
# NOTE(review): split="train" appears to name the loader's default split for a
# lone data file, not the COPA training set - confirm.
test_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\test.jsonl",
    split="train",
)

Using custom data configuration default-07180908d3559f11
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-07180908d3559f11/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [42]:
from datasets import Dataset, DatasetDict

# Bundle the three separately loaded splits into one DatasetDict so they can be
# inspected and mapped together.
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [43]:
# Display the assembled splits with their features and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 88
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 449
    })
})

In [44]:
# Switch the output format to pandas so the label column can be summarised
# with pandas methods in the next cell.
datasets.set_format("pandas")

In [45]:
# Tally how many training examples carry each label value.
train_labels = datasets["train"]["label"]
label_counts = train_labels.value_counts()
# One index entry per distinct label, so its length is the number of classes.
num_labels = len(label_counts.index)

In [46]:
# Show the per-label counts computed from the training split.
label_counts

1    186
0    176
Name: label, dtype: int64

In [47]:
# Undo the pandas output format set earlier before tokenizing the data.
datasets.reset_format()

In [52]:
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification, AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
# Active choice: the multilingual cased BERT tokenizer (the IndicBERT
# alternative above is left commented out).
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [53]:
# Dataset columns holding the candidate sentences paired with each premise.
choice_names = ['choice1', 'choice2']

In [54]:
def preprocess_function(examples):
    """Tokenize a batch of examples as (premise, choice) sentence pairs.

    Every premise is paired with each column listed in the module-level
    ``choice_names``; the pairs are tokenized in one call to the module-level
    ``tokenizer`` and then regrouped so that each example owns one list of
    encodings per choice.

    Args:
        examples: Batched mapping of column name -> list of values; must hold
            ``"premise"`` and every column named in ``choice_names``.

    Returns:
        dict mapping each tokenizer output field to a list with one entry per
        example, each entry being a list of ``len(choice_names)`` sequences.

    NOTE(review): the ``question`` column is not used when building the pairs
    - confirm this is intended.
    """
    # Derive the group size from choice_names instead of hard-coding 2, so the
    # repetition and the regrouping below cannot drift apart.
    num_choices = len(choice_names)
    premises = examples["premise"]

    # Flat, example-major lists: [p0, p0, p1, p1, ...] and [c1_0, c2_0, c1_1, ...].
    first_sentences = [premise for premise in premises for _ in range(num_choices)]
    second_sentences = [
        f"{examples[choice][i]}" for i in range(len(premises)) for choice in choice_names
    ]

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: consecutive runs of num_choices encodings belong to one example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [55]:
# Sanity check: run the preprocessing on the first training example only and
# display the resulting encodings.
temp = preprocess_function(datasets["train"][:1])
temp

{'input_ids': [[[101,
    40265,
    28546,
    80686,
    13088,
    868,
    34646,
    12213,
    871,
    67341,
    877,
    88113,
    920,
    102,
    898,
    45028,
    17413,
    855,
    19741,
    36475,
    13794,
    920,
    102],
   [101,
    40265,
    28546,
    80686,
    13088,
    868,
    34646,
    12213,
    871,
    67341,
    877,
    88113,
    920,
    102,
    868,
    34646,
    11081,
    58580,
    32447,
    920,
    102]]],
 'token_type_ids': [[[0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]],
 'attention_mask': [[[1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]}

In [56]:
# Show the first two raw training examples for comparison with the encodings.
datasets["train"][:2]

{'premise': ['मेरे शरीर ने घास पर छाया डाली।',
  'महिला ने अपने दोस्त के कठिन व्यवहार को सहन किया।'],
 'choice1': ['सूरज उग रहा था।',
  'महिला को पता था कि उसका दोस्त कठिन समय से गुजर रहा है।'],
 'choice2': ['घास काटी गई।',
  'महिला को लगा कि उसके दोस्त ने उसकी दया का फायदा उठाया।'],
 'question': ['cause', 'cause'],
 'idx': [0, 1],
 'label': [0, 0]}

In [57]:
# Decode each encoded (premise, choice) pair of the first example back to text
# to confirm the pairing looks right.
first_example_choices = temp['input_ids'][0]
for choice_ids in first_example_choices:
    print(tokenizer.decode(choice_ids))

[CLS] मेरे शरीर ने घास पर छाया डाली । [SEP] सूरज उग रहा था । [SEP]
[CLS] मेरे शरीर ने घास पर छाया डाली । [SEP] घास काटी गई । [SEP]


In [58]:
# Apply the preprocessing to every split in batched mode.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-6ecfa560884c9a31\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-040743c82eaf2b14.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-d361c8987e918d36\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-bdce96870761d06a.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-07180908d3559f11\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-f572362a488d5494.arrow


In [59]:
# Inspect the first processed training example (original columns plus encodings).
tokenized_datasets["train"][0]

{'premise': 'मेरे शरीर ने घास पर छाया डाली।',
 'choice1': 'सूरज उग रहा था।',
 'choice2': 'घास काटी गई।',
 'question': 'cause',
 'idx': 0,
 'label': 0,
 'input_ids': [[101,
   40265,
   28546,
   80686,
   13088,
   868,
   34646,
   12213,
   871,
   67341,
   877,
   88113,
   920,
   102,
   898,
   45028,
   17413,
   855,
   19741,
   36475,
   13794,
   920,
   102],
  [101,
   40265,
   28546,
   80686,
   13088,
   868,
   34646,
   12213,
   871,
   67341,
   877,
   88113,
   920,
   102,
   868,
   34646,
   11081,
   58580,
   32447,
   920,
   102]],
 'token_type_ids': [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]],
 'attention_mask': [[1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [60]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature is a dict whose non-label fields hold one sequence per
    choice, plus a ``label`` (or ``labels``) entry. The per-choice sequences are
    flattened, padded together through ``tokenizer.pad`` and reshaped to
    ``(batch_size, num_choices, -1)``.

    Unlike a ``pop``-based implementation, the input feature dicts are left
    untouched, so the same list of features can be collated more than once.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a padded batch.

        Args:
            features: Non-empty list of dicts as described in the class docstring.

        Returns:
            dict of tensors reshaped to ``(batch_size, num_choices, -1)`` plus an
            int64 ``"labels"`` tensor with one entry per feature.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read (do not pop) the labels so the caller's dicts are not mutated.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat dict per (example, choice) pair, example-major, label excluded.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (example, choice) grouping that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [61]:
# data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [62]:
import evaluate

# Load the accuracy metric that compute_metrics reports.
accuracy = evaluate.load("accuracy")

In [63]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: Pair of (predictions, labels). Predictions are reduced with
            an argmax over axis 1 - presumably one score per choice; confirm
            against the model output.

    Returns:
        Whatever the module-level ``accuracy`` metric's ``compute`` returns.
    """
    scores, references = eval_pred
    # Index of the highest-scoring entry along axis 1 is the predicted class.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [64]:
from transformers import set_seed
# Seed the random number generators via transformers.set_seed for reproducibility.
set_seed(2)

In [65]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel

# model = AutoModelForMultipleChoice.from_pretrained('ai4bharat/indic-bert')
# `model` is handed to the Trainer further down but was not assigned in any
# visible cell, so a fresh-kernel run failed with NameError. Load the
# multiple-choice head on the same checkpoint as the tokenizer.
# NOTE(review): checkpoint inferred from the tokenizer and output_dir name - confirm.
model = AutoModelForMultipleChoice.from_pretrained("bert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [66]:
#  disable weights and biases logging
# NOTE(review): the warning printed when the Trainer is created reports this
# environment variable as deprecated and recommends `report_to` instead.
import os
os.environ["WANDB_DISABLED"] = "true"

In [67]:
# temp_data = tokenized_datasets.remove_columns(['premise','choice1','choice2','question','idx'])
# temp = [temp_data["train"][i]for i in range(5)]

In [68]:
# batch = data_collator(temp)
# batch

In [69]:
# for sample in batch["input_ids"].tolist():
#     for choice in sample:
#         print(tokenizer.decode(choice))

In [70]:
# datasets["train"][:5]

In [71]:
# Training configuration: evaluate and checkpoint once per epoch and reload the
# best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="mbert_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    #learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    #num_train_epochs=3,
    weight_decay=0.01,
    #push_to_hub=True,
    fp16=True,
    # Supported replacement for the deprecated WANDB_DISABLED environment
    # variable: turn off logging integrations explicitly.
    report_to="none",
)

# The custom collator pads the per-choice encodings of each batch at collation
# time; accuracy is reported through compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [72]:
# Baseline: evaluate on the validation split before any training step is run.
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.693287193775177,
 'eval_accuracy': 0.4659090909090909,
 'eval_runtime': 0.392,
 'eval_samples_per_second': 224.506,
 'eval_steps_per_second': 7.654}

In [73]:
# Fine-tune; evaluation and checkpointing run once per epoch per training_args.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692993,0.534091
2,No log,0.692999,0.5
3,No log,0.693337,0.465909


TrainOutput(global_step=36, training_loss=0.6926099989149306, metrics={'train_runtime': 7.2491, 'train_samples_per_second': 149.812, 'train_steps_per_second': 4.966, 'total_flos': 4565766393048.0, 'train_loss': 0.6926099989149306, 'epoch': 3.0})

In [74]:
# Evaluate the trained model on the test split.
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.6931788325309753,
 'eval_accuracy': 0.49888641425389757,
 'eval_runtime': 0.6558,
 'eval_samples_per_second': 684.676,
 'eval_steps_per_second': 22.873,
 'epoch': 3.0}