## Install and Import Libraries

In [1]:
! pip install -U accelerate
! pip install -U transformers
! pip install datasets

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 

In [2]:
import pandas as pd
import re
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import json
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer, XLNetForMultipleChoice, LongformerForMultipleChoice, BigBirdForMultipleChoice, RobertaForMultipleChoice
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

## Import Data

In [3]:
df = pd.read_csv('/content/train_data.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,story,story_id,question,answer,reason,question_id
0,0,"In 2011, a group of geography students from Ox...",50,When did the team start focusing on Eastern Eu...,B. 2015,"In the story, it is mentioned that the team sw...",2
1,1,"In 2011, a group of geography students from Ox...",50,What era did the students focus on when compil...,C. Post-Cold War era,The students focused on the post-Cold War era ...,3
2,2,"In 2011, a group of geography students from Ox...",50,What period did the students focus on while ga...,C. From the end of the Second World War till t...,"As per the text, the students compiled data fo...",4
3,3,"In 2011, a group of geography students from Ox...",50,When was the project completed?\nAnswer choice...,C. 2020,The text explicitly mentions that the project ...,5
4,4,"In 2011, a group of geography students from Ox...",50,In what region did they first start their data...,B. Western Europe,The text mentions that the geography students ...,6


## Data Preprocessing

In [5]:
def split_question(row):
    question_text = ""
    choices_text = ""

    if 'Answer Choices:' in row['question']:
        # Split using "Answer Choices:"
        parts = row['question'].split('Answer Choices:')
        question_text = parts[0].strip()
        choices_text = parts[1]
    elif 'Answer choices:' in row['question']:
        # Handle case with just "Answer choices"
        parts = row['question'].split('Answer choices:')
        question_text = parts[0].strip()
        choices_text = parts[1]
    elif 'Answer Choices' in row['question']:
        # Handle case with just "Answer choices"
        parts = row['question'].split('Answer Choices')
        question_text = parts[0].strip()
        choices_text = parts[1]
    elif 'Answer choices' in row['question']:
        # Handle case with just "Answer choices"
        parts = row['question'].split('Answer choices')
        question_text = parts[0].strip()
        choices_text = parts[1]
    elif '?' in row['question']:
        # Split on the last question mark '?'
        last_question_mark = row['question'].rfind('?')
        question_text = row['question'][:last_question_mark + 1].strip()
        choices_text = row['question'][last_question_mark + 1:]


    # Handle cases where there are no choices
    if not choices_text.strip():
        return pd.Series([question_text, "", "", "", ""])

    # Split the choices and remove the first three characters from each choice
    choices = [choice[3:].strip() for choice in choices_text.split('\n') if choice.strip()]

    # Pad the choices list if there are fewer than 4 choices
    choices += [""] * (4 - len(choices))

    return pd.Series([question_text] + choices[:4])



In [6]:
# Apply the function to each row
new_columns = df.apply(split_question, axis=1)
new_columns.columns = ['question_text', 'choice1', 'choice2', 'choice3', 'choice4']

# Concatenate the new columns with the original dataframe
result_df = pd.concat([df, new_columns], axis=1)

result_df['answer'] = result_df['answer'].str[3:]

result_df = result_df[['story','question_text',
       'choice1', 'choice2', 'choice3', 'choice4','answer']]

In [7]:
result_df.head(10)

Unnamed: 0,story,question_text,choice1,choice2,choice3,choice4,answer
0,"In 2011, a group of geography students from Ox...",When did the team start focusing on Eastern Eu...,2011,2015,2020,2000,2015
1,"In 2011, a group of geography students from Ox...",What era did the students focus on when compil...,World War II,Cold War era,Post-Cold War era,The start of the 21st century,Post-Cold War era
2,"In 2011, a group of geography students from Ox...",What period did the students focus on while ga...,Post-Cold War era,Pre-Second World War,From the end of the Second World War till the ...,21st-century era,From the end of the Second World War till the ...
3,"In 2011, a group of geography students from Ox...",When was the project completed?,2011,2015,2020,Not yet completed,2020
4,"In 2011, a group of geography students from Ox...",In what region did they first start their data...,Eastern Europe,Western Europe,Asia,America,Western Europe
5,"In the 1950s, the United States implemented a ...",Which country was the first to extend its comp...,Japan,Finland,United States,Canada,United States
6,"In the 1950s, the United States implemented a ...",Which country followed the Canadian model of e...,United States,Australia,Japan,Finland,Australia
7,"In the 1950s, the United States implemented a ...",Which country made changes to their compulsory...,Japan,Finland,Canada,Australia,Japan
8,"In the 1950s, the United States implemented a ...",Which country included an extra year in their ...,Canada,United States,Finland,Australia,Canada
9,"In the 1950s, the United States implemented a ...",Which country was the last to reform its educa...,Japan,Finland,Canada,Australia,Australia


### Convert answer to label

In [8]:
def answer_to_label(row):
    choices = [row['choice1'].strip().lower().replace('.',''), row['choice2'].strip().lower().replace('.',''),
               row['choice3'].strip().lower().replace('.',''), row['choice4'].strip().lower().replace('.','')]
    correct_answer = row['answer'].strip().lower().replace('.','')

    # Check which choice matches the correct answer
    for i, choice in enumerate(choices):
        if choice == correct_answer:
            return i
    return 'unknown'

# Apply the function to each row
result_df['answer'] = result_df.apply(answer_to_label, axis=1)

### Remove data with wrong answers

In [9]:
result_df[result_df['answer']=='unknown']
result_df = result_df[result_df['answer'] != 'unknown']
result_df = result_df.rename(columns={"answer": "label"})

### Remove data with missing answer choices

In [10]:
 result_df[(result_df['choice1'] == "") | (result_df['choice2'] == "") | (result_df['choice3'] == "") | (result_df['choice4'] == "")]
 result_df =  result_df[(result_df['choice1'] != "") & (result_df['choice2'] != "") & (result_df['choice3'] != "") & (result_df['choice4'] != "")]

## Convert preprocessed data to CSV

In [11]:
result_df.to_csv("preprocessed_data.csv")

## Convert data to form usable by model

In [12]:
def create_dataset_dict(df, test_size=0.2):

    # Split the DataFrame into training and validation sets
    train_df, val_df = train_test_split(df, test_size=test_size)

    # Convert pandas DataFrames to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_df)
    validation_dataset = Dataset.from_pandas(val_df)

    # Create a DatasetDict
    dataset_dict = DatasetDict({
        "train": train_dataset,
        "validation": validation_dataset
    })

    return dataset_dict

# Example usage
datasets = create_dataset_dict(result_df, test_size=0.2)

# Now dataset_dict contains train and validation datasets
print(datasets)


DatasetDict({
    train: Dataset({
        features: ['story', 'question_text', 'choice1', 'choice2', 'choice3', 'choice4', 'label', '__index_level_0__'],
        num_rows: 2867
    })
    validation: Dataset({
        features: ['story', 'question_text', 'choice1', 'choice2', 'choice3', 'choice4', 'label', '__index_level_0__'],
        num_rows: 717
    })
})


### Example of processed Data

In [13]:
def show_one(example):
    print(f"Context: {example['story']}")
    print(f"  A - {example['question_text']} {example['choice1']}")
    print(f"  B - {example['question_text']} {example['choice2']}")
    print(f"  C - {example['question_text']} {example['choice3']}")
    print(f"  D - {example['question_text']} {example['choice4']}")
    print(f"\nGround truth: option {['A', 'B', 'C', 'D'][example['label']]}")
show_one(datasets["train"][0])

Context: In 2028, an 8.2 earthquake devastated the cities of Long Beach and Santa Monica in California on March 30. Just two months later, Midwest was rattled by a series of deadly tornado outbreaks which obliterated rural communities in Nebraska and Kansas in the final week of May. In the ensuing summer, Central America faced its worst drought in Hurricane Alley leading to devastating famine in Belize and Costa Rica. The following December, extreme snowstorms engulfed Greenland causing a significant rise in sea levels. The subsequent spring of 2029, the monsoons came early to South Asia causing massive floods in Bangladesh, India, and Sri Lanka.
  A - Which region did not face a disaster in 2028? Greenland
  B - Which region did not face a disaster in 2028? South Asia
  C - Which region did not face a disaster in 2028? Central America
  D - Which region did not face a disaster in 2028? Nebraska and Kansas

Ground truth: option B


## Model definition and Selection

In [14]:
models = {'bert':'bert-base-uncased', 'xlnet': 'xlnet-base-cased','longformer':'allenai/longformer-base-4096','bigbird':'google/bigbird-roberta-base','roberta':'roberta-base'}
tokenizers = {'bert':'bert-base-uncased', 'xlnet': 'xlnet-base-cased','longformer':'allenai/longformer-base-4096','bigbird':'google/bigbird-roberta-base','roberta':'roberta-base'}
batch_size = 8


#model choices - Bert, Roberta, Xlnet, Longformer, Bigbird
model_choice = models['xlnet']

#initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_choice, use_fast=True)

#Initalize Model
model = AutoModelForMultipleChoice.from_pretrained(model_choice)
# model = XLNetForMultipleChoice.from_pretrained(models['xlnet'])
# model = LongformerForMultipleChoice.from_pretrained(models['longformer'])
# model = BigBirdForMultipleChoice.from_pretrained(models['bigbird'])
# model = RobertaForMultipleChoice.from_pretrained("roberta-base")

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForMultipleChoice were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
ending_names = ["choice1", "choice2", "choice3", "choice4"]

def preprocess_function(examples):
    # Repeat each first sentence four times to go with the four possibilities of answer choices.
    first_sentences = [[context] * 4 for context in examples["story"]]
    # Grab all choices for each context.
    question_headers = examples["question_text"]
    second_sentences = [[f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)]

    # Flatten everything
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten
    return {k: [v[i:i+4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

In [16]:
encoded_datasets = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/2867 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/717 [00:00<?, ? examples/s]

#### Data collator

In [17]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [18]:
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
# print(features[0])
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(4)]

['<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> Jeremy, a climatologist, was sent to Antarctica in the beginning of January to analyze weather and climate change data. Upon arrival, he set up his weather station and data monitoring devices. In March, he shifted to the base near the South Pole to conduct further research. In July, he moved to Greenland to compare data collected from both polar regions. After analyzing the data in September, he returned to his office in Seattle to finalize his report.<sep> What was Jeremy doing in July? Setting up weather stations in Antarctica<sep><cls>',
 '<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

### Define Evaluation metrics

In [20]:
def compute_metrics(eval_predictions):
    predictions, label_ids = eval_predictions
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

In [21]:
torch.cuda.empty_cache()

### Define training arguments and trainer

In [22]:
model_name = model_choice.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-tr",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
)

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,1.248682,0.423989
1,No log,1.09207,0.577406
2,No log,1.047492,0.591353


TrainOutput(global_step=267, training_loss=1.16791743017761, metrics={'train_runtime': 2023.529, 'train_samples_per_second': 4.25, 'train_steps_per_second': 0.132, 'total_flos': 3733879281269592.0, 'train_loss': 1.16791743017761, 'epoch': 2.97})

### Save the model

In [25]:
trainer.save_model("my_model")

In [34]:
# Save the model checkpoint
!zip -r /content/my_model.zip /content/my_model


  adding: content/my_model/ (stored 0%)
  adding: content/my_model/training_args.bin (deflated 51%)
  adding: content/my_model/model.safetensors (deflated 7%)
  adding: content/my_model/config.json (deflated 52%)
  adding: content/my_model/tokenizer_config.json (deflated 82%)
  adding: content/my_model/tokenizer.json (deflated 75%)
  adding: content/my_model/special_tokens_map.json (deflated 52%)


In [35]:
from google.colab import files
files.download("/content/my_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Preprocess the training data

In [29]:
test_df = pd.read_csv('/content/validation_data.csv')

In [36]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,story,story_id,question,answer,reason,question_id,Label 1,Label 2 Pujith,Label 3,Label 4
0,0.0,"In 1987, the residents of the small coastal vi...",0.0,Which city suffered from storms twice?\nAnswer...,b) Vladost,The story explicitly mentions that Vladost was...,2.0,True,True,True,
1,1.0,"In 1987, the residents of the small coastal vi...",0.0,"If there were to be a storm in 2016, which cit...",a) Islywid,The story states that after the 1987 storm in ...,3.0,True,True,True,
2,2.0,"In 1987, the residents of the small coastal vi...",0.0,Which city was the latest to experience a disa...,a) Ellban,The story says Ellban was hit by a flood in 20...,4.0,True,True,True,
3,3.0,"In 1987, the residents of the small coastal vi...",0.0,Which city experienced a disaster first in the...,c) Islywid,According to the progression of events given i...,5.0,True,True,True,
4,4.0,"In 1987, the residents of the small coastal vi...",0.0,Which city has never faced any severe storm?\n...,c) Ellban,The storyline indicates that a flood (not a st...,6.0,True,False,True,


In [38]:
test_df = test_df.dropna(subset=['question'])

In [39]:
# Apply the function to each row
new_columns = test_df.apply(split_question, axis=1)
new_columns.columns = ['question_text', 'choice1', 'choice2', 'choice3', 'choice4']

# Concatenate the new columns with the original dataframe
result_test_df = pd.concat([test_df, new_columns], axis=1)

result_test_df['answer'] = result_test_df['answer'].str[3:]

result_test_df = result_test_df[['story','question_text',
       'choice1', 'choice2', 'choice3', 'choice4','answer']]

In [42]:
result_test_df.head(2)

Unnamed: 0,story,question_text,choice1,choice2,choice3,choice4
0,"In 1987, the residents of the small coastal vi...",Which city suffered from storms twice?,Ellban,Vladost,Islywid,None of the above
1,"In 1987, the residents of the small coastal vi...","If there were to be a storm in 2016, which cit...",Islywid,Vladost,Ellban,None of the above


In [40]:
def answer_to_label(row):
    choices = [row['choice1'].strip().lower().replace('.',''), row['choice2'].strip().lower().replace('.',''),
               row['choice3'].strip().lower().replace('.',''), row['choice4'].strip().lower().replace('.','')]
    correct_answer = row['answer'].strip().lower().replace('.','')

    # Check which choice matches the correct answer
    for i, choice in enumerate(choices):
        if choice == correct_answer:
            return i
    return 'unknown'

# Apply the function to each row
result_test_df['answer'] = result_test_df.apply(answer_to_label, axis=1)
result_test_df = result_test_df[result_test_df['answer'] != 'unknown']
result_test_df =  result_test_df[(result_test_df['choice1'] != "") & (result_test_df['choice2'] != "") & (result_test_df['choice3'] != "") & (result_test_df['choice4'] != "")]

In [41]:
answers = result_test_df['answer']
result_test_df = result_test_df.drop(columns=['answer'])

In [45]:
result_test_df.head(2)

Unnamed: 0,story,question_text,choice1,choice2,choice3,choice4
0,"In 1987, the residents of the small coastal vi...",Which city suffered from storms twice?,Ellban,Vladost,Islywid,None of the above
1,"In 1987, the residents of the small coastal vi...","If there were to be a storm in 2016, which cit...",Islywid,Vladost,Ellban,None of the above


In [43]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_choice)
model = AutoModelForMultipleChoice.from_pretrained('my_model')


predicted_labels = []

for index, row in result_test_df.iterrows():
    prompt = row['story']
    candidates = [row[f'choice{i}'] for i in range(1, 5)]

    # Create inputs for each candidate
    inputs = tokenizer([[prompt, candidate] for candidate in candidates], return_tensors="pt", padding=True)

    # Define labels (0 for the first choice, 1 for the second, etc.)
    labels = torch.tensor(0).unsqueeze(0)  # Adjust the label index as needed

    # Perform inference
    outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
    logits = outputs.logits
    predicted_class = logits.argmax().item()

    # Append the predicted label to the list
    predicted_labels.append(predicted_class)


In [48]:
result_test_df['answer'] = list(answers)
result_test_df['predicted_labels'] = predicted_labels
result_test_df.head(2)

Unnamed: 0,story,question_text,choice1,choice2,choice3,choice4,answer,predicted_labels
0,"In 1987, the residents of the small coastal vi...",Which city suffered from storms twice?,Ellban,Vladost,Islywid,None of the above,1,2
1,"In 1987, the residents of the small coastal vi...","If there were to be a storm in 2016, which cit...",Islywid,Vladost,Ellban,None of the above,0,0


In [50]:
# Assuming you have a DataFrame named result_test_df
correct_predictions = result_test_df['answer'] == result_test_df['predicted_labels']

# Calculate accuracy
accuracy = correct_predictions.sum() / len(correct_predictions)

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 40.98%


In [49]:
result_test_df.to_csv('results.csv')