In [2]:
import json
import torch
from transformers import BertTokenizer, BertForMultipleChoice
import json
from transformers import RobertaTokenizer, RobertaForMultipleChoice
from word2number import w2n

In [3]:
def load_data(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

In [4]:
def input_reframing(text):
    #Convert numbers in text to words for improved numeracy handling.
    words = text.split()
    for i, word in enumerate(words):
        try:
            words[i] = w2n.word_to_num(word)
        except:
            pass
    return ' '.join(map(str, words))

In [5]:
def preprocess_data_wo_ir(data):
    # Preprocess data by extracting question, options, and correct answer wihtout any input reframing
    questions, choices, labels = [], [], []
    for item in data:
        questions.append(item['question'])
        choices.append([item['Option1'], item['Option2']])
        labels.append(0 if item['answer'] == 'Option 1' else 1)
    return questions, choices, labels


In [6]:
def preprocess_data_ir(data):
    #Preprocess data by extracting question, options, and correct answer and applying input reframing on questions
    questions, choices, labels = [], [], []
    for item in data:
        question = input_reframing(item['question'])
        questions.append(question)
        choices.append([input_reframing(item['Option1']), input_reframing(item['Option2'])])
        labels.append(0 if item['answer'] == 'Option 1' else 1)
    return questions, choices, labels

In [7]:
# Load datasets
train_data = load_data("QQA_train.json")
dev_data = load_data("QQA_dev.json")
test_data = load_data("QQA_test.json")

In [8]:
# Preprocess datasets without any input reframing
train_qs_1, train_choices_1, train_labels_1 = preprocess_data_wo_ir(train_data)
dev_qs_1, dev_choices_1, dev_labels_1 = preprocess_data_wo_ir(dev_data)
test_qs_1, test_choices_1, test_labels_1 = preprocess_data_wo_ir(test_data)

In [9]:
# Preprocess datasets with input reframing
train_qs_2, train_choices_2, train_labels_2 = preprocess_data_ir(train_data)
dev_qs_2, dev_choices_2, dev_labels_2 = preprocess_data_ir(dev_data)
test_qs_2, test_choices_2, test_labels_2 = preprocess_data_ir(test_data)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [11]:
# Load BERT and RoBERTa tokenizer and pretrained model
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForMultipleChoice.from_pretrained("bert-base-uncased").to(device)
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForMultipleChoice.from_pretrained("roberta-base").to(device)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Example tokenization for one sample
def encode_example(tokenizer,question, choices):
    encodings = tokenizer([question] * 2, choices, padding=True, truncation=True, return_tensors="pt")
    return {key: val.unsqueeze(0).to(device) for key, val in encodings.items()}

In [13]:
# Evaluate on dev set
def evaluate(model,tokenizer,dev_qs, dev_choices, dev_labels):
    correct = 0
    model.eval()
    with torch.no_grad():
        for q, c, label in zip(dev_qs, dev_choices, dev_labels):
            inputs = encode_example(tokenizer,q, c)
            logits = model(**inputs).logits
            pred = torch.argmax(logits, dim=1).item()
            correct += (pred == label)
    return correct / len(dev_labels)

In [14]:
accuracy = evaluate(bert_model,bert_tokenizer,dev_qs_1, dev_choices_1, dev_labels_1)
print("BERT without input reframing")
print(f"Baseline Accuracy: {accuracy:.4f}")

BERT without input reframing
Baseline Accuracy: 0.5556


In [15]:
accuracy = evaluate(roberta_model,roberta_tokenizer,dev_qs_1, dev_choices_1, dev_labels_1)
print("RoBERTa without input reframing")
print(f"Baseline Accuracy: {accuracy:.4f}")

RoBERTa without input reframing
Baseline Accuracy: 0.4691


In [16]:
accuracy = evaluate(bert_model,bert_tokenizer,dev_qs_2, dev_choices_2, dev_labels_2)
print("BERT with input reframing")
print(f"Baseline Accuracy: {accuracy:.4f}")

BERT with input reframing
Baseline Accuracy: 0.5309


In [17]:
accuracy = evaluate(roberta_model,roberta_tokenizer,dev_qs_2, dev_choices_2, dev_labels_2)
print("RoBERTa with input reframing")
print(f"Baseline Accuracy: {accuracy:.4f}")

RoBERTa with input reframing
Baseline Accuracy: 0.4321
