## Install and Import Libraries

In [None]:
! pip install -U accelerate
! pip install -U transformers
! pip install datasets

In [None]:
import pandas as pd
import re
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import json
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, XLNetForMultipleChoice, LongformerForMultipleChoice, BigBirdForMultipleChoice, RobertaForMultipleChoice
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

## Import Data

## Data Preprocessing

## Preprocess other data

In [None]:
# Turn one raw JSON record into a flat row for the multiple-choice dataset
def convert_json_to_dataset(entry):
    """Flatten one question record into the column layout used for training.

    Args:
        entry: Mapping with the keys 'Story', 'Question', 'Answer Choices'
            (a sequence whose first four items are used) and 'Answer'.

    Returns:
        A dict with 'story', 'question_text', 'choice1'..'choice4' and
        'label', where 'label' is the position of 'Answer' within
        'Answer Choices'.

    Raises:
        KeyError: If one of the expected keys is missing from ``entry``.
        ValueError: If the answer is not one of the answer choices.
        IndexError: If fewer than four answer choices are present.
    """
    story, question, options, correct = (
        entry[key] for key in ('Story', 'Question', 'Answer Choices', 'Answer')
    )

    # The label is the position of the correct answer among the choices
    label = options.index(correct)

    row = {'story': story, 'question_text': question}
    row.update((f'choice{n}', options[n - 1]) for n in range(1, 5))
    row['label'] = label
    return row

# Function to read and transform a JSON file
def process_json_file(file_path):
    """Load a JSON file of question records and convert every record.

    Args:
        file_path: Path to a JSON file whose top-level value is a list of
            records accepted by ``convert_json_to_dataset``.

    Returns:
        A list with one converted row dict per record, in file order.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale default,
    # so non-ASCII story text is read the same way on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    return [convert_json_to_dataset(entry) for entry in json_data]

# JSON files that make up the additional training data
train_file_paths = [
    '/content/dataset_1.json',
    '/content/dataset_2.json',
    '/content/dataset_3.json',
]

# One DataFrame per training file, in the order listed above
train_dfs = [pd.DataFrame(rows) for rows in map(process_json_file, train_file_paths)]

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
train_df_2nd = pd.concat(train_dfs, ignore_index=True)


# JSON files that make up the additional validation data
val_file_paths = [
    '/content/validation_1.json',
    '/content/validation_2.json',
    '/content/validation_3.json',
]

# One DataFrame per validation file, in the order listed above
val_dfs = [pd.DataFrame(rows) for rows in map(process_json_file, val_file_paths)]

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
val_df_2nd = pd.concat(val_dfs, ignore_index=True)

# Write both combined frames to CSV without the index column
val_df_2nd.to_csv('other_val_data.csv', index=False)
train_df_2nd.to_csv('other_train_data.csv', index=False)



def create_dataset_dict(df, test_size=0.2, random_state=None):
    """Split a DataFrame and wrap both parts in a Hugging Face DatasetDict.

    Args:
        df: pandas DataFrame holding all examples.
        test_size: Share (or count) of rows placed in the validation split;
            forwarded to ``train_test_split``.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` keeps the previous behaviour of
            a different random split on every call.

    Returns:
        A ``DatasetDict`` with the keys "train" and "validation".
    """
    # Split the DataFrame into training and validation sets
    train_df, val_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # The split keeps the shuffled original row index. Drop it during the
    # conversion, otherwise it is stored as an extra `__index_level_0__`
    # column in each dataset.
    return DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
    })

# Hold out 20% of the combined training frame as the validation split
other_datasets = create_dataset_dict(df=train_df_2nd, test_size=0.2)

## Model definition and Selection

In [None]:
# Pretrained checkpoint identifier for each supported architecture
models = {
    'bert': 'bert-base-uncased',
    'xlnet': 'xlnet-base-cased',
    'longformer': 'allenai/longformer-base-4096',
    'bigbird': 'google/bigbird-roberta-base',
    'roberta': 'roberta-base',
}
# Each tokenizer comes from the same checkpoint as its model
tokenizers = dict(models)
# Per-device batch size used for both training and evaluation
batch_size = 4


# Checkpoint to fine-tune; pick any key of `models`
# (bert, roberta, xlnet, longformer, bigbird)
model_choice = models['roberta']

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_choice, use_fast=True)

# Initialize the model
model = AutoModelForMultipleChoice.from_pretrained(model_choice)
# Architecture-specific alternatives to the Auto class:
# model = XLNetForMultipleChoice.from_pretrained(models['xlnet'])
# model = LongformerForMultipleChoice.from_pretrained(models['longformer'])
# model = BigBirdForMultipleChoice.from_pretrained(models['bigbird'])
# model = RobertaForMultipleChoice.from_pretrained("roberta-base")

In [None]:
ending_names = ["choice1", "choice2", "choice3", "choice4"]

def preprocess_function(examples):
    """Tokenize a batch of multiple-choice examples.

    Every story is paired once with each "<question> <choice>" string. All
    pairs are tokenized in one flat call and then regrouped per example.

    Args:
        examples: Batch mapping column names to lists. Reads "story",
            "question_text" and every column listed in ``ending_names``.

    Returns:
        A dict mapping each tokenizer output key to a list with one entry per
        example; each entry is a list of ``len(ending_names)`` encodings.
    """
    # Derive the group size from the column list rather than hard-coding 4,
    # so the flattening and regrouping cannot drift apart from `ending_names`.
    num_choices = len(ending_names)

    # Build both flat lists directly with nested comprehensions. Flattening
    # with `sum(lists, [])` copies the growing result on every addition,
    # which is quadratic in the batch size.
    first_sentences = [
        context for context in examples["story"] for _ in range(num_choices)
    ]
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["question_text"])
        for end in ending_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: consecutive runs of `num_choices` encodings belong to one example
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# Tokenize every split, handing the preprocessing function whole batches of rows
encoded_datasets = other_datasets.map(function=preprocess_function, batched=True)

#### Data collator

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature holds one list entry per answer choice for every
    tokenizer field, plus a "label" (or "labels") entry. The collator pads all
    choices of all examples together and returns tensors reshaped to
    (number of examples, number of choices, padded length), together with a
    "labels" tensor.
    """

    # Tokenizer whose `pad` method is used to pad the flattened choices
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to `tokenizer.pad`
    max_length: Optional[int] = None
    # Forwarded unchanged to `tokenizer.pad`
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: Non-empty list of dicts. Apart from the label entry,
                every value must be indexable by choice position.

        Returns:
            A dict of tensors, one per tokenizer field, each viewed as
            (len(features), num_choices, -1), plus "labels" as an int64 tensor.
        """
        label_name = "label" if "label" in features[0] else "labels"
        # Read the labels without popping them, so the caller's dicts are left
        # intact and the same feature list can be collated more than once.
        labels = [feature[label_name] for feature in features]
        num_examples = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair in example-major order. Built
        # flat in a single pass rather than concatenated with `sum(..., [])`.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(num_examples, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Fields kept for the sample batch below
accepted_keys = ["input_ids", "attention_mask", "label"]
# First ten training examples, reduced to the accepted fields
features = [
    {key: value for key, value in example.items() if key in accepted_keys}
    for example in (encoded_datasets["train"][row] for row in range(10))
]
# Collate the sample into one padded batch
batch = DataCollatorForMultipleChoice(tokenizer=tokenizer)(features)

In [None]:
# Decode the four choice sequences of the example at batch index 8
[tokenizer.decode(choice_ids.tolist()) for choice_ids in batch["input_ids"][8][:4]]

### Define Evaluation metrics

In [None]:
def compute_metrics(eval_predictions):
    """Compute accuracy from a (predictions, label_ids) pair.

    Args:
        eval_predictions: Two-item iterable of per-choice scores and the
            reference label ids.

    Returns:
        A dict with the single key "accuracy", holding the fraction of rows
        whose highest-scoring column equals the label, as a Python float.
    """
    scores, gold = eval_predictions
    # The predicted choice is the column with the highest score in each row
    hits = np.argmax(scores, axis=1) == gold
    accuracy = hits.astype(np.float32).mean().item()
    return {"accuracy": accuracy}

In [None]:
# Ask PyTorch to release its cached CUDA memory before the trainer is set up
torch.cuda.empty_cache()

### Define training arguments and trainer

In [None]:
# Last path component of the checkpoint id, e.g. "longformer-base-4096"
model_name = model_choice.split("/")[-1]

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. This notebook installs transformers with `-U`, so pass
# whichever field the installed TrainingArguments dataclass declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-tr",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    # Evaluate once at the end of every epoch
    **{eval_strategy_kwarg: "epoch"},
)

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer accepts.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Fine-tune the model with the arguments and datasets configured above
trainer.train()

# Save the fine-tuned model under 'my_model', the directory the inference
# cell loads from; nothing else in this notebook creates that directory.
trainer.save_model("my_model")

In [None]:
# Save the model checkpoint
#!zip -r /content/roberta-base-finetuned-tr /content

In [None]:
# Load the tokenizer and the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_choice)
model = AutoModelForMultipleChoice.from_pretrained('my_model')
# Inference only: make sure dropout and similar layers are disabled
model.eval()


predicted_labels = []

choice_columns = [f'choice{i}' for i in range(1, 5)]

for _, row in val_df_2nd.iterrows():
    prompt = row['story']
    # Build the second segment the same way as in training,
    # "<question> <choice>", so the model sees the input format it was
    # fine-tuned on.
    candidates = [f"{row['question_text']} {row[column]}" for column in choice_columns]

    # One (story, question + choice) pair per candidate. Truncate like the
    # training preprocessing so long stories stay within the model's limit.
    inputs = tokenizer(
        [prompt] * len(candidates),
        candidates,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Perform inference. No labels are passed, because only the logits are
    # needed, and gradients are disabled to save memory.
    with torch.no_grad():
        # Add the leading batch dimension: (num_choices, len) -> (1, num_choices, len)
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})
    logits = outputs.logits
    predicted_class = logits.argmax().item()

    # Append the predicted label to the list
    predicted_labels.append(predicted_class)
