In [None]:
from transformers import set_seed
import torch

# random_seed = 42
# Seed used for this run; the commented-out line above keeps an alternative value.
random_seed = 80

# Pass the seed to the set_seed helper imported from transformers.
set_seed(random_seed)

In [None]:
from transformers import CharacterBertModel
import torch.nn as nn

class CharacterBERTForMultipleChoice(nn.Module):
    """CharacterBERT encoder followed by a single-logit scoring head.

    Each encoded sequence is mapped to one score. Callers in this notebook
    encode the two (premise, choice) pairs of an example together and then
    reshape the returned scores to ``(-1, 2)`` before applying cross-entropy.

    Args:
        pretrained_path: Directory (or identifier) handed to
            ``CharacterBertModel.from_pretrained``. Defaults to
            ``DEFAULT_PRETRAINED_PATH`` so ``CharacterBERTForMultipleChoice()``
            keeps working unchanged.
    """

    # Raw string: the Windows separators stay literal backslashes without
    # relying on invalid escape sequences (\D, \C, \H, \c), which newer Python
    # versions warn about.
    DEFAULT_PRETRAINED_PATH = r"E:\Documents\Character Bert\Hate Speech\character-bert-hindi"

    def __init__(self, pretrained_path=None):
        super().__init__()
        if pretrained_path is None:
            pretrained_path = self.DEFAULT_PRETRAINED_PATH
        self.bert = CharacterBertModel.from_pretrained(pretrained_path)
        # Scoring head: 768 input features -> 1 score per encoded sequence.
        # NOTE(review): 768 is assumed to equal the encoder's hidden size — confirm
        # against the checkpoint's config.
        self.linear1 = nn.Linear(768, 1)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None,
                output_attentions=None, output_hidden_states=None, return_dict=None):
        """Score every encoded sequence.

        All arguments are forwarded to the encoder by keyword and default to
        ``None``, so callers only need to supply the inputs they actually have.

        Returns:
            Tensor produced by applying the linear head to the encoder's second
            output (``outputs[1]``), i.e. one score per encoded sequence.
        """
        # Keyword arguments guard against positional misalignment: in Hugging
        # Face BERT-style encoders the parameters following ``inputs_embeds``
        # are not the output flags, so passing these positionally could bind
        # them to the wrong parameters.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Second encoder output is used as the pooled sequence representation.
        pooled_output = outputs[1]
        return self.linear1(pooled_output)

In [None]:
from datasets import load_dataset

# Load the Hindi COPA training file. The raw string keeps the Windows-style
# backslashes literal without relying on invalid escape sequences (\d, \c);
# the resulting path is identical to the previous literal.
# split="train" selects the split that load_dataset assigns to a bare data_files path.
train_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\train.jsonl",
    split="train",
)

In [None]:
# Load the Hindi COPA validation file. The raw string keeps the Windows-style
# backslashes literal without relying on invalid escape sequences (\d, \c);
# the resulting path is identical to the previous literal.
# split="train" is still required: it names the split load_dataset assigns to
# a bare data_files path, not the role this file plays in the experiment.
val_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\val.jsonl",
    split="train",
)

In [None]:
# Load the Hindi COPA test file. The raw string keeps the Windows-style
# backslashes literal without relying on invalid escape sequences (\d, \c);
# the resulting path is identical to the previous literal.
# split="train" is still required: it names the split load_dataset assigns to
# a bare data_files path, not the role this file plays in the experiment.
test_dataset = load_dataset(
    "json",
    data_files=r"..\datasets\copa-translated\hi\test.jsonl",
    split="train",
)

In [None]:
from datasets import Dataset, DatasetDict

# Bundle the three separately loaded splits into one DatasetDict so later
# cells can address them as datasets["train"], ["validation"] and ["test"].
datasets = DatasetDict(
    {
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset,
    }
)

In [None]:
# from datasets import load_dataset

# datasets = load_dataset("indic_glue","copa.hi")

In [None]:
# datasets

In [None]:
# Peek at the first 50 training records.
datasets["train"][:50]

In [None]:
# Peek at the first 2 test records.
datasets["test"][:2]

In [None]:
# Switch the output format to pandas so the next cell can call value_counts() on the label column.
datasets.set_format("pandas")

In [None]:
# Tally how many training examples carry each label value, then record
# how many distinct label values occur.
label_counts = datasets["train"]["label"].value_counts()
num_labels = len(label_counts)

In [None]:
# Display the per-label example counts for the training split.
label_counts

In [None]:
# Undo the pandas output format set for the label inspection.
datasets.reset_format()

In [None]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer

# Instantiate the CharacterBERT tokenizer with strip_accents and do_lower_case both passed as None.
# NOTE(review): presumably None leaves accents and casing untouched (relevant for Devanagari text) — confirm against the tokenizer's defaults.
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [None]:
# Dataset columns holding the two candidate answers; preprocess_function pairs each with the premise.
choice_names = ['choice1', 'choice2']

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples as (premise, choice) pairs grouped per example.

    Args:
        examples: Mapping of column name to a list of values. Must contain
            ``"premise"`` and every column listed in the module-level
            ``choice_names``.

    Returns:
        Dict with one entry per tokenizer output field. Each entry is a list
        with one element per example, and each element is a list holding the
        encodings of that example's choices in ``choice_names`` order.
    """
    num_choices = len(choice_names)
    num_examples = len(examples["premise"])

    # Repeat every premise once per choice so the two flat lists line up pairwise.
    premises = [context for context in examples["premise"] for _ in range(num_choices)]
    candidates = [
        str(examples[choice][i])
        for i in range(num_examples)
        for choice in choice_names
    ]

    tokenized_examples = tokenizer(premises, candidates, truncation=True, max_length=128, padding='max_length')

    # Regroup the flat encodings into consecutive chunks of num_choices per example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# Smoke-test the preprocessing on a single training example.
temp = preprocess_function(datasets["train"][:1])
# Number of entries in the returned dict (one per tokenizer output field).
len(temp)

In [None]:
# Apply preprocess_function to every split; batched=True passes it batches of examples at a time.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

In [None]:
from transformers import BertConfig, CharacterBertTokenizer, BertModel,\
                        TrainingArguments, Trainer, CharacterBertConfig, CharacterBertModel

# Instantiate the CharacterBERT multiple-choice model defined earlier in this notebook.
model = CharacterBERTForMultipleChoice()

In [None]:
# Inspect the tokenized splits.
tokenized_datasets

In [None]:
# Drop the raw text columns now that they have been tokenized.
# The commented-out variant below additionally removed an 'idx' column.
# tokenized_datasets = tokenized_datasets.remove_columns(['premise', 'choice1', 'choice2', 'question', 'idx'])
tokenized_datasets = tokenized_datasets.remove_columns(['premise', 'choice1', 'choice2', 'question'])
# Return torch tensors on indexing; the training and evaluation loops call .to(device) on every field.
tokenized_datasets.set_format("torch")
# Show which columns remain in each split.
tokenized_datasets.column_names

In [None]:
# tokenized_datasets["train"][0]['input_ids'].size()

In [None]:
# tokenized_datasets["train"][0]['attention_mask'].size()

In [None]:
# tokenized_datasets["train"][0]['token_type_ids'].size()

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built around the tokenizer.
# NOTE(review): no other cell visible in this notebook references data_collator — the loops iterate the datasets directly.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AdamW

# optimizer = AdamW(model.parameters(), lr=2e-5)
# AdamW over all parameters of the model (encoder and scoring head) with learning rate 5e-5;
# the commented-out line above keeps the alternative rate 2e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

# num_epochs = 3
# Number of full passes over the training split.
num_epochs = 2
# One scheduler step is budgeted per training example (not per optimizer update);
# the training loop in this notebook calls lr_scheduler.step() once per example to match.
num_training_steps = num_epochs * len(tokenized_datasets["train"])
# Linear schedule with no warmup; the commented-out argument keeps a 5% warmup alternative.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    #num_warmup_steps=0.05 * num_training_steps,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
import torch

# Run on the GPU when CUDA is available, otherwise stay on the CPU,
# then move the model there and echo the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
import torch.nn as nn
from torch.nn import CrossEntropyLoss

# classifier = nn.Linear(768, 1).to(device)
# Cross-entropy over the per-choice scores; the loops in this notebook reshape logits to (-1, 2) before applying it.
loss_fct = CrossEntropyLoss()

In [None]:
# Smoke test: push the first training example through the model and the loss,
# print the intermediate values, and stop after that single example.
for item in tokenized_datasets["train"]:
    item = {k: v.to(device) for k, v in item.items()}

    # Optional encoder inputs and output flags are all left unset.
    position_ids = head_mask = inputs_embeds = None
    output_attentions = output_hidden_states = return_dict = None

    logits = model(
        input_ids=item["input_ids"],
        attention_mask=item["attention_mask"],
        token_type_ids=item["token_type_ids"],
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # One row per example, one column per choice.
    reshaped_logits = logits.view(-1, 2)
    print(reshaped_logits)
    print(item["label"].unsqueeze(0))

    loss = loss_fct(reshaped_logits, item["label"].unsqueeze(0))
    print(loss)
    break

In [None]:
import evaluate

# Accuracy metric from the evaluate library; used by the validation and test loops in this notebook.
metric = evaluate.load("accuracy")

In [None]:
from tqdm.auto import tqdm

# Fine-tune with gradient accumulation (one example per forward pass), then
# report validation accuracy after every epoch.
progress_bar = tqdm(range(num_training_steps))
accumulation_steps = 32
num_train_examples = len(tokenized_datasets["train"])

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()  # never start an epoch with stale gradients
    training_losses = []
    for i, item in enumerate(tokenized_datasets["train"]):
        item = {k: v.to(device) for k, v in item.items()}
        logits = model(
            input_ids=item["input_ids"],
            attention_mask=item["attention_mask"],
            token_type_ids=item["token_type_ids"],
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        )
        # One row per example, one column per choice.
        reshaped_logits = logits.view(-1, 2)
        loss = loss_fct(reshaped_logits, item["label"].unsqueeze(0))

        # Normalise by the size of the current accumulation group so the
        # accumulated gradient is the group's mean rather than its sum.
        # The final group of an epoch may be shorter than accumulation_steps.
        group_start = i - (i % accumulation_steps)
        group_size = min(accumulation_steps, num_train_examples - group_start)
        (loss / group_size).backward()

        # Step on every full group and also on the last example of the epoch,
        # so a trailing partial group is applied instead of leaking into the
        # next epoch (or being dropped after the final one).
        is_last_example = (i + 1) == num_train_examples
        if (i + 1) % accumulation_steps == 0 or is_last_example:
            optimizer.step()
            optimizer.zero_grad()

        # The scheduler is stepped once per example because num_training_steps
        # is defined as num_epochs * number of training examples.
        lr_scheduler.step()
        progress_bar.update(1)
        # Detach so the stored losses do not keep autograd history alive.
        training_losses.append(loss.detach().reshape(1))

    training_losses = torch.cat(training_losses)

    print(f">>> Epoch {epoch}: Training Loss: {torch.mean(training_losses)}")

    progress_bar2 = tqdm(range(len(tokenized_datasets["validation"])))

    model.eval()

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for item in tokenized_datasets["validation"]:
            item = {k: v.to(device) for k, v in item.items()}
            logits = model(
                input_ids=item["input_ids"],
                attention_mask=item["attention_mask"],
                token_type_ids=item["token_type_ids"],
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
            )
            reshaped_logits = logits.view(-1, 2)
            # Index of the higher-scoring choice for this single example.
            pred = torch.argmax(reshaped_logits)
            metric.add_batch(predictions=pred.unsqueeze(0), references=item["label"].unsqueeze(0))
            progress_bar2.update(1)

    acc = metric.compute()
    print("accuracy", acc)

In [None]:
from tqdm.auto import tqdm

# Evaluate on the test split: collect the predicted choice per example and
# accumulate accuracy.
progress_bar = tqdm(range(len(tokenized_datasets["test"])))

model.eval()

predictions = []
# Inference only: disable autograd so no graph is built for each example.
with torch.no_grad():
    for item in tokenized_datasets["test"]:
        item = {k: v.to(device) for k, v in item.items()}
        logits = model(
            input_ids=item["input_ids"],
            attention_mask=item["attention_mask"],
            token_type_ids=item["token_type_ids"],
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        )
        # One row per example, one column per choice.
        reshaped_logits = logits.view(-1, 2)
        # Index of the higher-scoring choice for this single example.
        pred = torch.argmax(reshaped_logits)
        predictions.append(pred.item())
        metric.add_batch(predictions=pred.unsqueeze(0), references=item["label"].unsqueeze(0))
        progress_bar.update(1)

acc = metric.compute()

In [None]:
# Display the accuracy computed by the test loop.
acc

In [None]:
# Gold labels of the test split.
y_true = tokenized_datasets['test']['label']

In [None]:
# Predicted choice indices collected during the test loop.
y_preds = predictions

In [None]:
from sklearn.metrics import classification_report
# Display names for the two classes; used as tick labels in the confusion-matrix plot.
target_names = ['choice1', 'choice2']

In [None]:
import matplotlib.pyplot as plt
from seaborn import heatmap
from sklearn.metrics import confusion_matrix

# Plot the confusion matrix of the test predictions as an annotated heatmap.
# labels=[0, 1] pins the matrix to 2x2 so its rows/columns always line up with
# the two entries of target_names, even if one class is never predicted.
mat = confusion_matrix(y_true, y_preds, labels=[0, 1])
ax = heatmap(mat, cmap="Pastel1_r", fmt="d", xticklabels=target_names, yticklabels=target_names, annot=True)

# confusion_matrix puts true labels on rows and predictions on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

# Overall title of the plot (font size 12).
ax.set_title('Confusion matrix for COPA', fontsize=12)
plt.show()