In [114]:
from datasets import load_dataset

train_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\train.jsonl", \
                             split="train")

Using custom data configuration default-6ecfa560884c9a31
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-6ecfa560884c9a31/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [115]:
val_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\val.jsonl", \
                            split="train")

Using custom data configuration default-d361c8987e918d36
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-d361c8987e918d36/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [116]:
test_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\test.jsonl", \
                             split="train")

Using custom data configuration default-07180908d3559f11
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-07180908d3559f11/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [117]:
from datasets import Dataset, DatasetDict

datasets = DatasetDict()
datasets['train'] = train_dataset
datasets['validation'] = val_dataset
datasets['test'] = test_dataset

In [118]:
datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 88
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 449
    })
})

In [119]:
datasets["train"][:5]

{'premise': ['मेरे शरीर ने घास पर छाया डाली।',
  'महिला ने अपने दोस्त के कठिन व्यवहार को सहन किया।',
  'महिलाएं कॉफी के लिए मिलीं।',
  'धावक ने शॉर्ट्स पहनी थी।',
  'पार्टी के मेहमान सोफे के पीछे छिप गए।'],
 'choice1': ['सूरज उग रहा था।',
  'महिला को पता था कि उसका दोस्त कठिन समय से गुजर रहा है।',
  'एक नए स्थान में कैफे फिर से खुल गया।',
  'पूर्वानुमान में उच्च तापमान की भविष्यवाणी की गई थी।',
  'यह एक सरप्राइज पार्टी थी।'],
 'choice2': ['घास काटी गई।',
  'महिला को लगा कि उसके दोस्त ने उसकी दया का फायदा उठाया।',
  'वे एक-दूसरे को पकड़ना चाहते थे।',
  'उसने समुद्र तट के साथ दौड़ने की योजना बनाई।',
  'यह जन्मदिन की पार्टी थी।'],
 'question': ['cause', 'cause', 'cause', 'cause', 'cause'],
 'idx': [0, 1, 2, 3, 4],
 'label': [0, 0, 1, 0, 0]}

In [120]:
datasets["test"][:2]

{'premise': ['आइटम को बबल रैप में पैक किया गया था।',
  'मैंने अपनी जेबें खाली कर दीं।'],
 'choice1': ['यह नाजुक था।', 'मैंने एक टिकट स्टब को पुनः प्राप्त किया।'],
 'choice2': ['छोटा था।', 'मुझे एक हथियार मिला।'],
 'question': ['cause', 'effect'],
 'idx': [0, 1],
 'label': [0, 0]}

In [121]:
datasets.set_format("pandas")

In [122]:
# get label counts for both classes
label_counts = datasets["train"]["label"].value_counts()
num_labels = (len(label_counts.keys()))

In [123]:
label_counts

1    186
0    176
Name: label, dtype: int64

In [124]:
datasets.reset_format()

In [125]:
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification, AutoTokenizer

tokenizer = PreTrainedTokenizerFast.from_pretrained("../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [126]:
choice_names = ['choice1', 'choice2']

In [54]:
CONTEXT_COL = "premise"
QUESTION_COL = "question"
ANSWER_1_COL = "choice1"
ANSWER_2_COL = "choice2"

def preprocess_function(examples):
    """
    The preprocessing function needs to:
    1. Make two copies of the CONTEXT_COL field and combine each of them with QUESTION_COL to recreate how a sentence starts.
    2. Combine QUESTION_COL with each of the two possible choices.
    3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding input_ids, attention_mask, and labels field.
    """

    question_headers = examples[QUESTION_COL]
    first_sentences = [
        [f"{examples[CONTEXT_COL][i]} कारण क्या है? "]*2 if header == "cause" else\
        [f"{examples[CONTEXT_COL][i]} परिणाम क्या है? "]*2\
            for i, header in enumerate(question_headers)
    ]
    first_sentences = sum(first_sentences, [])
    
    second_sentences = [
        [examples[end][i] for end in [ANSWER_1_COL, ANSWER_2_COL]] for i, header in enumerate(question_headers)
    ]
    second_sentences = sum(second_sentences, [])
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    return {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [55]:
temp = preprocess_function(datasets["train"][:1])
temp

{'input_ids': [[[2,
    198,
    437,
    18,
    4279,
    17,
    3065,
    2989,
    6,
    134,
    77,
    7,
    57,
    5,
    3,
    2196,
    7215,
    44,
    32,
    6,
    3],
   [2,
    198,
    437,
    18,
    4279,
    17,
    3065,
    2989,
    6,
    134,
    77,
    7,
    57,
    5,
    3,
    4279,
    14,
    998,
    51,
    6,
    3]]],
 'token_type_ids': [[[0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    1,
    1,
    1,
    1,
    1,
    1],
   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]],
 'attention_mask': [[[1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1],
   [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]}

In [57]:
datasets["train"][:1]

{'premise': ['मेरे शरीर ने घास पर छाया डाली।'],
 'choice1': ['सूरज उग रहा था।'],
 'choice2': ['घास काटी गई।'],
 'question': ['cause'],
 'idx': [0],
 'label': [0]}

In [58]:
for chunk in temp['input_ids'][0]:
    print(tokenizer.decode(chunk))

[CLS] मेरे शरीर ने घास पर छाया डाली। कारण क्या है? [SEP] सूरज उग रहा था।[SEP]
[CLS] मेरे शरीर ने घास पर छाया डाली। कारण क्या है? [SEP] घास काटी गई।[SEP]


In [59]:
tokenized_datasets = datasets.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-6ecfa560884c9a31\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-119644ae4f89a3f0.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-d361c8987e918d36\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-a746333cbbcdcfce.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-07180908d3559f11\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-5abe6ab721a84571.arrow


In [60]:
tokenized_datasets["train"][0]

{'premise': 'मेरे शरीर ने घास पर छाया डाली।',
 'choice1': 'सूरज उग रहा था।',
 'choice2': 'घास काटी गई।',
 'question': 'cause',
 'idx': 0,
 'label': 0,
 'input_ids': [[2,
   198,
   437,
   18,
   4279,
   17,
   3065,
   2989,
   6,
   134,
   77,
   7,
   57,
   5,
   3,
   2196,
   7215,
   44,
   32,
   6,
   3],
  [2,
   198,
   437,
   18,
   4279,
   17,
   3065,
   2989,
   6,
   134,
   77,
   7,
   57,
   5,
   3,
   4279,
   14,
   998,
   51,
   6,
   3]],
 'token_type_ids': [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   1,
   1,
   1,
   1,
   1],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]],
 'attention_mask': [[1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [61]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [62]:
data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [63]:
tokenized_datasets["train"]

Dataset({
    features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 362
})

In [64]:
temp_data = tokenized_datasets["train"].remove_columns(['premise', 'choice1', 'choice2', 'question', 'idx'])
temp_data

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 362
})

In [65]:
samples = [temp_data[i] for i in range(1)]
temp = data_collator(samples)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [66]:
samples = [temp_data[i] for i in range(3)]
temp = data_collator(samples)

In [67]:
import evaluate

accuracy = evaluate.load("accuracy")

In [68]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [102]:
from transformers import set_seed
# set_seed(30)
set_seed(42)

In [103]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, AutoModel

model = AutoModelForMultipleChoice.from_pretrained("../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi")
# model = AutoModelForMultipleChoice.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at ../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at ../Hindi Pretr

In [104]:
model

BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [105]:
# temp_data = tokenized_datasets.remove_columns(['premise','choice1','choice2','question','idx'])
# temp = [temp_data["train"][i]for i in range(5)]

In [106]:
# batch = data_collator(temp)
# batch

In [107]:
# for sample in batch["input_ids"].tolist():
#     for choice in sample:
#         print(tokenizer.decode(choice))

In [108]:
# datasets["train"][:5]

In [109]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [110]:
training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    #learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    #num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    #weight_decay=0.04,
    #fp16=True,
    metric_for_best_model = 'accuracy',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    #eval_dataset=tokenized_datasets["validation"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [111]:
trainer.evaluate()

{'eval_loss': 0.6942782998085022,
 'eval_accuracy': 0.46325167037861914,
 'eval_runtime': 1.8903,
 'eval_samples_per_second': 237.526,
 'eval_steps_per_second': 7.935}

In [112]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.689553,0.547884
2,No log,0.686235,0.563474


TrainOutput(global_step=24, training_loss=0.6699581146240234, metrics={'train_runtime': 21.333, 'train_samples_per_second': 33.938, 'train_steps_per_second': 1.125, 'total_flos': 24732217141248.0, 'train_loss': 0.6699581146240234, 'epoch': 2.0})

In [287]:
trainer.save_model()

In [113]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.686234712600708,
 'eval_accuracy': 0.5634743875278396,
 'eval_runtime': 1.9933,
 'eval_samples_per_second': 225.257,
 'eval_steps_per_second': 7.525,
 'epoch': 2.0}

In [72]:
model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")