In [45]:
from transformers import set_seed
set_seed(80)

In [46]:
from transformers import BertModel
import torch.nn as nn

class CharacterBERTForMultipleChoice(nn.Module):
    def __init__(self):
          super(CharacterBERTForMultipleChoice, self).__init__()
          self.bert = BertModel.from_pretrained("../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi")
          ### New layers:
          self.linear1 = nn.Linear(768, 1) ## 1 is the number of classes in this example

    def forward(self, input_ids,attention_mask,token_type_ids,position_ids,head_mask,\
                inputs_embeds,output_attentions,output_hidden_states,return_dict):
        outputs = self.bert(
            input_ids,
            attention_mask,
            token_type_ids,
            position_ids,
            head_mask,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict)
        
        pooled_output = outputs[1]
        logits = self.linear1(pooled_output)

        return logits

In [47]:
from datasets import load_dataset

train_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\train.jsonl", \
                             split="train")

Using custom data configuration default-6ecfa560884c9a31
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-6ecfa560884c9a31/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [48]:
val_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\val.jsonl", \
                            split="train")

Using custom data configuration default-d361c8987e918d36
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-d361c8987e918d36/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [49]:
test_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\test.jsonl", \
                             split="train")

Using custom data configuration default-07180908d3559f11
Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-07180908d3559f11/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [50]:
from datasets import Dataset, DatasetDict

datasets = DatasetDict()
datasets['train'] = train_dataset
datasets['validation'] = val_dataset
datasets['test'] = test_dataset

In [51]:
datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 88
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label'],
        num_rows: 449
    })
})

In [52]:
datasets["train"][:5]

{'premise': ['मेरे शरीर ने घास पर छाया डाली।',
  'महिला ने अपने दोस्त के कठिन व्यवहार को सहन किया।',
  'महिलाएं कॉफी के लिए मिलीं।',
  'धावक ने शॉर्ट्स पहनी थी।',
  'पार्टी के मेहमान सोफे के पीछे छिप गए।'],
 'choice1': ['सूरज उग रहा था।',
  'महिला को पता था कि उसका दोस्त कठिन समय से गुजर रहा है।',
  'एक नए स्थान में कैफे फिर से खुल गया।',
  'पूर्वानुमान में उच्च तापमान की भविष्यवाणी की गई थी।',
  'यह एक सरप्राइज पार्टी थी।'],
 'choice2': ['घास काटी गई।',
  'महिला को लगा कि उसके दोस्त ने उसकी दया का फायदा उठाया।',
  'वे एक-दूसरे को पकड़ना चाहते थे।',
  'उसने समुद्र तट के साथ दौड़ने की योजना बनाई।',
  'यह जन्मदिन की पार्टी थी।'],
 'question': ['cause', 'cause', 'cause', 'cause', 'cause'],
 'idx': [0, 1, 2, 3, 4],
 'label': [0, 0, 1, 0, 0]}

In [53]:
datasets["test"][:2]

{'premise': ['आइटम को बबल रैप में पैक किया गया था।',
  'मैंने अपनी जेबें खाली कर दीं।'],
 'choice1': ['यह नाजुक था।', 'मैंने एक टिकट स्टब को पुनः प्राप्त किया।'],
 'choice2': ['छोटा था।', 'मुझे एक हथियार मिला।'],
 'question': ['cause', 'effect'],
 'idx': [0, 1],
 'label': [0, 0]}

In [54]:
datasets.set_format("pandas")

In [55]:
# get label counts for both classes
label_counts = datasets["train"]["label"].value_counts()
num_labels = (len(label_counts.keys()))

In [56]:
label_counts

1    186
0    176
Name: label, dtype: int64

In [57]:
datasets.reset_format()

In [58]:
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification, AutoTokenizer

tokenizer = PreTrainedTokenizerFast.from_pretrained("../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi")

In [59]:
choice_names = ['choice1', 'choice2']

In [60]:
def preprocess_function(examples):
    premise = [[context] * 2 for context in examples["premise"]]
    cause = [[f"{examples[choice][i]}" for choice in choice_names] for i,_ in enumerate(premise)]

    premise = sum(premise, [])
    cause = sum(cause, [])
    
#     print(premise)
#     print(cause)
    

    tokenized_examples = tokenizer(premise, cause, truncation=True, max_length=128, padding='max_length')
#     print(len(tokenized_examples))
    return {k: [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}
    return tokenized_examples

In [61]:
temp = preprocess_function(datasets["train"][:1])
len(temp)

3

In [62]:
tokenized_datasets = datasets.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-6ecfa560884c9a31\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-8939e5bd4b02951d.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-d361c8987e918d36\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-1efcbb1b6b9b27eb.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\json\default-07180908d3559f11\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-dc7c9d0a99be6f58.arrow


In [63]:
from transformers import Trainer

model = CharacterBERTForMultipleChoice()

Some weights of the model checkpoint at ../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi

In [64]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 88
    })
    test: Dataset({
        features: ['premise', 'choice1', 'choice2', 'question', 'idx', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 449
    })
})

In [65]:
tokenized_datasets = tokenized_datasets.remove_columns(['premise', 'choice1', 'choice2', 'question', 'idx'])
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

{'train': ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
 'validation': ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
 'test': ['label', 'input_ids', 'token_type_ids', 'attention_mask']}

In [66]:
# tokenized_datasets["train"][0]['input_ids'].size()

In [67]:
# tokenized_datasets["train"][0]['attention_mask'].size()

In [68]:
# tokenized_datasets["train"][0]['token_type_ids'].size()

In [69]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [70]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [71]:
from transformers import get_scheduler

num_epochs = 2
num_training_steps = num_epochs * len(tokenized_datasets["train"])
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    #num_warmup_steps=0,
    num_warmup_steps=0.1 * num_training_steps,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

724


In [72]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [73]:
import torch.nn as nn
from torch.nn import CrossEntropyLoss

# classifier = nn.Linear(768, 1).to(device)
loss_fct = CrossEntropyLoss()

In [74]:
for item in tokenized_datasets["train"]:
    item = {k: v.to(device) for k, v in item.items()}
    position_ids=None
    head_mask=None
    inputs_embeds=None
    output_attentions=None
    output_hidden_states=None
    return_dict=None
    logits = model(
            input_ids=item["input_ids"],
            attention_mask=item["attention_mask"],
            token_type_ids=item["token_type_ids"],
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    reshaped_logits = logits.view(-1, 2)
    print(reshaped_logits)
    print(item["label"].unsqueeze(0))
    loss = loss_fct(reshaped_logits, item["label"].unsqueeze(0))
    print(loss)
    break

tensor([[-0.2238, -0.2424]], device='cuda:0', grad_fn=<ViewBackward0>)
tensor([0], device='cuda:0')
tensor(0.6839, device='cuda:0', grad_fn=<NllLossBackward0>)


In [75]:
import evaluate

metric = evaluate.load("accuracy")

In [76]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    model.train()
    training_losses = []
    for item in tokenized_datasets["train"]:
        item = {k: v.to(device) for k, v in item.items()}
        logits = model(
                input_ids=item["input_ids"],
                attention_mask=item["attention_mask"],
                token_type_ids=item["token_type_ids"],
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None
            )
        reshaped_logits = logits.view(-1, 2)
        loss = loss_fct(reshaped_logits, item["label"].unsqueeze(0))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        training_losses.append(loss.repeat(1))
    
    training_losses = torch.cat(training_losses)
    training_losses = training_losses[: len(tokenized_datasets["train"])]
    
    print(f">>> Epoch {epoch}: Training Loss: {torch.mean(training_losses)}")
    
    progress_bar2 = tqdm(range(len(tokenized_datasets["validation"])))

    model.eval()

    training_losses = []
    for item in tokenized_datasets["validation"]:
        item = {k: v.to(device) for k, v in item.items()}
        logits = model(
                input_ids=item["input_ids"],
                attention_mask=item["attention_mask"],
                token_type_ids=item["token_type_ids"],
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None
            )
        reshaped_logits = logits.view(-1, 2)
        pred = torch.argmax(reshaped_logits)
        metric.add_batch(predictions=pred.unsqueeze(0), references=item["label"].unsqueeze(0))
        progress_bar2.update(1)

    acc = metric.compute()
    print("accuracy", acc)

  0%|          | 0/724 [00:00<?, ?it/s]

>>> Epoch 0: Training Loss: 0.6967911720275879


  0%|          | 0/88 [00:00<?, ?it/s]

accuracy {'accuracy': 0.5}
>>> Epoch 1: Training Loss: 0.5518900752067566


  0%|          | 0/88 [00:00<?, ?it/s]

accuracy {'accuracy': 0.5909090909090909}


In [77]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(len(tokenized_datasets["test"])))

model.eval()

training_losses = []
for item in tokenized_datasets["test"]:
    item = {k: v.to(device) for k, v in item.items()}
    logits = model(
            input_ids=item["input_ids"],
            attention_mask=item["attention_mask"],
            token_type_ids=item["token_type_ids"],
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None
        )
    reshaped_logits = logits.view(-1, 2)
    pred = torch.argmax(reshaped_logits)
    metric.add_batch(predictions=pred.unsqueeze(0), references=item["label"].unsqueeze(0))
    progress_bar.update(1)
        
acc = metric.compute()

  0%|          | 0/449 [00:00<?, ?it/s]

In [78]:
acc

{'accuracy': 0.5746102449888641}