## Install and Import Libraries

In [None]:
! pip install -U accelerate
! pip install -U transformers
! pip install datasets

In [None]:
import pandas as pd
import re
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import json
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, XLNetForMultipleChoice, LongformerForMultipleChoice, BigBirdForMultipleChoice, RobertaForMultipleChoice
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

## Import Data

In [None]:
# Load the raw training data.
# NOTE(review): absolute '/content/...' path, presumably a Colab runtime; adjust when running elsewhere.
df = pd.read_csv('/content/train_data.csv')

In [None]:
# Preview the first rows of the raw training data.
df.head()

## Data Preprocessing

In [None]:
def split_question(row):
    """Split a raw question string into the question stem and four answer choices.

    The stem is the text before the first matching "Answer Choices" marker
    (with or without a capital C / trailing colon). When no marker is present,
    the string is cut after its last '?'. The remaining text is read as one
    choice per non-blank line.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'question' (a DataFrame row or a plain dict).

    Returns
    -------
    pd.Series
        Five strings: the question stem followed by exactly four choices.
        Missing choices are padded with "" and extra choices are dropped.
        A non-string question (e.g. NaN from an empty CSV cell) gives five
        empty strings instead of raising.
    """
    raw_question = row['question']

    # pandas reads an empty cell as NaN (a float); substring tests on it would raise TypeError.
    if not isinstance(raw_question, str):
        return pd.Series(["", "", "", "", ""])

    question_text = ""
    choices_text = ""

    # Markers are tried from most to least specific; the first one present wins.
    markers = ('Answer Choices:', 'Answer choices:', 'Answer Choices', 'Answer choices')
    for marker in markers:
        if marker in raw_question:
            parts = raw_question.split(marker)
            question_text = parts[0].strip()
            choices_text = parts[1]
            break
    else:
        # No marker at all: fall back to cutting after the last question mark.
        if '?' in raw_question:
            cut = raw_question.rfind('?') + 1
            question_text = raw_question[:cut].strip()
            choices_text = raw_question[cut:]

    # Handle cases where there are no choices
    if not choices_text.strip():
        return pd.Series([question_text, "", "", "", ""])

    # Drop the first three characters of every non-blank line
    # (presumably a label such as "A) " -- confirm against the data).
    choices = [line[3:].strip() for line in choices_text.split('\n') if line.strip()]

    # Pad to four entries; a negative multiplier yields an empty list, so longer lists are untouched here.
    choices += [""] * (4 - len(choices))

    return pd.Series([question_text] + choices[:4])



In [None]:
# Derive the question stem and the four choice columns, one row at a time
new_columns = df.apply(split_question, axis=1)
new_columns.columns = ['question_text', 'choice1', 'choice2', 'choice3', 'choice4']

# Attach the derived columns, drop the first three characters of each answer,
# and keep only the columns used downstream.
result_df = (
    pd.concat([df, new_columns], axis=1)
    .assign(answer=lambda frame: frame['answer'].str[3:])
    [['story', 'question_text', 'choice1', 'choice2', 'choice3', 'choice4', 'answer']]
)

In [None]:
# Inspect the first ten preprocessed rows.
result_df.head(10)

### Convert answer to label

In [None]:
def answer_to_label(row):
    """Return the 0-based index of the choice that equals the answer text.

    Choices and answer are compared after stripping surrounding whitespace,
    lower-casing and removing every '.' character.

    Parameters
    ----------
    row : mapping
        Must provide 'choice1'..'choice4' and 'answer'.

    Returns
    -------
    int or str
        Index (0-3) of the first matching choice, or the sentinel string
        'unknown' when nothing matches or the answer is not a string
        (e.g. NaN from an empty CSV cell).
    """
    def _normalize(text):
        return text.strip().lower().replace('.', '')

    answer = row['answer']
    # A missing answer arrives as NaN (float); calling .strip() on it would raise AttributeError.
    if not isinstance(answer, str):
        return 'unknown'
    correct_answer = _normalize(answer)

    # Check which choice matches the correct answer
    for i, key in enumerate(('choice1', 'choice2', 'choice3', 'choice4')):
        choice = row[key]
        if isinstance(choice, str) and _normalize(choice) == correct_answer:
            return i
    return 'unknown'

# Replace each answer text with its 0-based choice index; rows without a matching choice get 'unknown'
result_df['answer'] = result_df.apply(answer_to_label, axis=1)

### Remove rows whose answer matches none of the choices

In [None]:
# Rows whose answer text matched none of the four choices carry the sentinel 'unknown'.
unmatched = result_df['answer'] == 'unknown'
# Report how many rows are removed (a bare expression in the middle of a cell is never displayed).
print(f"Dropping {int(unmatched.sum())} rows whose answer matches no choice")
result_df = result_df[~unmatched].rename(columns={"answer": "label"})

### Remove data with missing answer choices

In [None]:
 # Keep only the rows in which all four answer choices are non-empty strings.
 result_df = result_df[(result_df[['choice1', 'choice2', 'choice3', 'choice4']] != "").all(axis=1)]

## Convert preprocessed data to CSV

In [None]:
# Persist the cleaned frame to the working directory.
# NOTE(review): with pandas defaults the DataFrame index is written as an extra first column;
# pass index=False if that column is not wanted.
result_df.to_csv("preprocessed_data.csv")

## Convert data to form usable by model

In [None]:
def create_dataset_dict(df, test_size=0.2, random_state=None):
    """Split a DataFrame into train/validation sets and wrap them in a DatasetDict.

    Parameters
    ----------
    df : pd.DataFrame
        Preprocessed examples.
    test_size : float
        Fraction of rows placed in the validation split.
    random_state : int or None
        Seed forwarded to train_test_split; None keeps the split random per run.

    Returns
    -------
    DatasetDict
        With the keys "train" and "validation".
    """
    # Split the DataFrame into training and validation sets
    train_df, val_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # preserve_index=False keeps the (filtered, shuffled) pandas index from being
    # stored as an extra dataset column.
    return DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
    })

# A fixed seed makes the train/validation split identical on every run.
datasets = create_dataset_dict(result_df, test_size=0.2, random_state=42)

# `datasets` now contains the train and validation splits
print(datasets)


### Example of processed Data

In [None]:
def show_one(example):
    """Print one example: its context, the question paired with each choice, and the gold option letter.

    `example` must provide 'story', 'question_text', 'choice1'..'choice4'
    and an integer 'label' in 0-3.
    """
    letters = ['A', 'B', 'C', 'D']
    choice_keys = ('choice1', 'choice2', 'choice3', 'choice4')

    print(f"Context: {example['story']}")
    for letter, key in zip(letters, choice_keys):
        print(f"  {letter} - {example['question_text']} {example[key]}")
    print(f"\nGround truth: option {letters[example['label']]}")
# Display the first training example as a quick check of the preprocessing.
show_one(datasets["train"][0])

## Model definition and Selection

In [None]:
# Checkpoint identifier for each supported architecture, keyed by a short alias.
models = {'bert':'bert-base-uncased', 'xlnet': 'xlnet-base-cased','longformer':'allenai/longformer-base-4096','bigbird':'google/bigbird-roberta-base','roberta':'roberta-base'}
# Same mapping for tokenizers; identical to `models` and not referenced in the cells shown here.
tokenizers = {'bert':'bert-base-uncased', 'xlnet': 'xlnet-base-cased','longformer':'allenai/longformer-base-4096','bigbird':'google/bigbird-roberta-base','roberta':'roberta-base'}
# Per-device batch size, passed to TrainingArguments for both training and evaluation.
batch_size = 4


# Model choices (keys of `models`): bert, roberta, xlnet, longformer, bigbird
model_choice = models['roberta']

# Initialize the tokenizer that matches the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_choice, use_fast=True)

# Initialize the model with a multiple-choice head
model = AutoModelForMultipleChoice.from_pretrained(model_choice)
# Architecture-specific alternatives to the Auto class (kept for reference):
# model = XLNetForMultipleChoice.from_pretrained(models['xlnet'])
# model = LongformerForMultipleChoice.from_pretrained(models['longformer'])
# model = BigBirdForMultipleChoice.from_pretrained(models['bigbird'])
# model = RobertaForMultipleChoice.from_pretrained("roberta-base")

In [None]:
ending_names = ["choice1", "choice2", "choice3", "choice4"]

def preprocess_function(examples):
    """Tokenize a batch of examples as (story, "question choice") sequence pairs.

    Parameters
    ----------
    examples : mapping of column name -> list
        Must provide "story", "question_text" and every column in `ending_names`.

    Returns
    -------
    dict
        Each tokenizer output field, regrouped so that every example holds one
        entry per answer choice (len(ending_names) entries).
    """
    num_choices = len(ending_names)

    # One copy of the story per answer choice, built flat
    # (avoids the quadratic list concatenation of sum(list_of_lists, [])).
    first_sentences = [story for story in examples["story"] for _ in range(num_choices)]

    # Question stem followed by each choice, in the same example-major order.
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["question_text"])
        for end in ending_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: regroup consecutive runs of num_choices encodings per example
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# Tokenize both splits; batched=True passes preprocess_function a batch of examples per call.
encoded_datasets = datasets.map(preprocess_function, batched=True)

#### Data collator

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Calling the instance with a list of feature dicts (each holding per-choice
    lists plus a "label" or "labels" entry) returns a dict of tensors viewed as
    (number of examples, number of choices, -1) plus a "labels" int64 tensor.
    The input feature dicts are left unmodified.
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The next three fields are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # The label key is decided from the first feature and assumed for all others.
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read (do not pop) the labels so the caller's dicts are not mutated.
        labels = [feature[label_name] for feature in features]
        num_examples = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice), with the label key left out.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(num_examples, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Sanity check: collate the first 10 training examples by hand.
# Only these keys are kept; every other column of the encoded dataset is dropped before collating.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
# print(features[0])
batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [None]:
# Decode the four choice sequences of the example at batch index 8 to eyeball the collated input.
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(4)]

### Define Evaluation metrics

In [None]:
def compute_metrics(eval_predictions):
    """Return {"accuracy": float} for a (predictions, label_ids) pair.

    The predicted class of each row is the argmax over axis 1 of `predictions`;
    accuracy is the mean of exact matches with `label_ids`, computed in float32.
    """
    scores, gold = eval_predictions
    picked = np.argmax(scores, axis=1)
    hits = (picked == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
# Release cached GPU memory that PyTorch is holding but not using, before training starts.
torch.cuda.empty_cache()

### Define training arguments and trainer

In [None]:
# Output directory is named after the checkpoint, without any organisation prefix.
model_name = model_choice.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-tr",
    # `eval_strategy` is the current name of the former `evaluation_strategy` argument;
    # the install cell upgrades transformers to the latest release, so the new name is used.
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # Gradients are accumulated over 4 steps before each optimizer update.
    gradient_accumulation_steps=4,
    weight_decay=0.01,
)

In [None]:
# Wire the model, arguments, encoded splits, collator and metric into a Trainer.
# NOTE(review): recent transformers releases document `processing_class` as the replacement
# for the `tokenizer` argument -- confirm against the installed version.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; evaluation on the validation split is configured in TrainingArguments.
trainer.train()

### Save the model

In [None]:
# Save the fine-tuned model to ./my_model; the inference cell reloads it from this directory.
trainer.save_model("my_model")

In [None]:
# Save the model checkpoint
#!zip -r /content/roberta-base-finetuned-tr /content

## Preprocess the held-out validation data (used as the test set)

In [None]:
# Load the held-out data used for the final predictions.
# NOTE(review): absolute '/content/...' path, presumably a Colab runtime; adjust when running elsewhere.
test_df = pd.read_csv('/content/validation_data.csv')

In [None]:
# Derive the question stem and the four choice columns for every held-out row
new_columns = test_df.apply(split_question, axis=1)
new_columns.columns = ['question_text', 'choice1', 'choice2', 'choice3', 'choice4']

# Attach the derived columns, drop the first three characters of each answer,
# and keep only the columns used downstream.
result_test_df = (
    pd.concat([test_df, new_columns], axis=1)
    .assign(answer=lambda frame: frame['answer'].str[3:])
    [['story', 'question_text', 'choice1', 'choice2', 'choice3', 'choice4', 'answer']]
)

In [None]:
# Preview the first two preprocessed held-out rows.
result_test_df.head(2)

In [None]:
# answer_to_label is already defined in the "Convert answer to label" section; it is reused
# here instead of being redefined, so training and held-out labels come from the same code.
result_test_df['answer'] = result_test_df.apply(answer_to_label, axis=1)

# Drop rows whose answer matched no choice, then rows with any empty choice.
result_test_df = result_test_df[result_test_df['answer'] != 'unknown']
result_test_df =  result_test_df[(result_test_df['choice1'] != "") & (result_test_df['choice2'] != "") & (result_test_df['choice3'] != "") & (result_test_df['choice4'] != "")]

In [None]:
# Keep the gold labels aside and remove them from the frame that is fed to the model.
answers = result_test_df['answer']
result_test_df = result_test_df.drop(columns=['answer'])

In [None]:
# Confirm the answer column is gone.
result_test_df.head(2)

In [None]:
# Load the tokenizer of the base checkpoint and the fine-tuned model saved as 'my_model'
tokenizer = AutoTokenizer.from_pretrained(model_choice)
model = AutoModelForMultipleChoice.from_pretrained('my_model')
model.eval()


predicted_labels = []

for _, row in result_test_df.iterrows():
    prompt = row['story']
    question = row['question_text']
    # Build the second segment exactly as in preprocess_function: "<question> <choice>".
    # Scoring the bare choice would feed the model inputs it was never trained on.
    candidates = [f"{question} {row[f'choice{i}']}" for i in range(1, 5)]

    # One (story, candidate) pair per choice; truncation matches training and keeps
    # long stories within the model's maximum input length.
    inputs = tokenizer(
        [prompt] * len(candidates),
        candidates,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Perform inference without tracking gradients. No labels are passed:
    # only the logits are needed, and a dummy label would just compute a meaningless loss.
    with torch.no_grad():
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})

    # logits cover the choices of this single example, so the flat argmax is the choice index
    predicted_class = outputs.logits.argmax().item()

    # Append the predicted label to the list
    predicted_labels.append(predicted_class)
