In [5]:
from datasets import load_dataset
import matplotlib.pyplot as plt

This is my (JC) contribution to this project. I've implemented the code for splitting the dataset into separate datasets that we can then use for evaluation, and hence for scoring the pre-trained models we're attempting to use.

In [6]:
# Define a function to categorize questions
def naive_categorize_question(example):
    """
    Assign a coarse question-type label using case-insensitive substring matching.

    Reads `example['question']`, lower-cases it, and returns the label of the
    first rule (checked top to bottom) whose keyword occurs anywhere in the
    text. Returns 'undefined' when none of the keywords is present.

    Because matching is on raw substrings and the first hit wins:
    - specific phrases such as 'what time' are listed ahead of the generic 'what';
    - a keyword also matches inside longer words (e.g. 'date' inside 'update');
    - NOTE(review): 'whom' can never be returned, because any text containing
      'whom' also contains 'who', which is checked first.
    """
    text = example['question'].lower()
    # (keywords, label) pairs; the order is significant.
    rules = (
        (('date',), 'date'),
        (('during',), 'during'),
        (('how are',), 'how are'),
        (('how big', 'how large'), 'how big/size'),
        (('how many', 'how much'), 'how m/m'),
        (('how old',), 'how old'),
        (('what time',), 'what time'),
        (('what', 'which'), 'what'),
        (('when',), 'when'),
        (('where',), 'where'),
        (('who',), 'who'),
        (('whom',), 'whom'),
        (('why',), 'why'),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return 'undefined'

Run the cell below to split the SQuAD dataset into question classes and save them in `question_classes.txt`.

In [7]:
# Load the validation split of SQuAD
import json
dataset = load_dataset('squad', split='validation')

# Tag every example with its question category (stored in a new "category" column)
dataset = dataset.map(lambda entry: {"category": naive_categorize_question(entry)})

# Build one filtered sub-dataset per distinct category value
categories = dataset.unique('category')
categorized_datasets = {category: dataset.filter(lambda example: example['category'] == category) for category in categories}

# Materialise the rows of each sub-dataset as plain lists so they can be serialised
jsonfile = {cat: list(categorized_datasets.get(cat)) for cat in categorized_datasets}

# Despite the .txt extension, the file content is a single JSON object keyed by category
with open("question_classes.txt", "w") as f:
    json.dump(jsonfile, f)

# Report how many questions ended up in each category
for category in categorized_datasets:
    print(f"Category '{category}': {len(categorized_datasets[category])} questions")

# If you want to save these subsets, you can do so like this:
# for category, subset in categorized_datasets.items():
#     subset.to_csv(f'squad_{category}.csv')

Found cached dataset squad (/Users/justincheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Loading cached processed dataset at /Users/justincheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-cfa73881a5e79563.arrow
Loading cached processed dataset at /Users/justincheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-fddd1f6518fa3d11.arrow
Loading cached processed dataset at /Users/justincheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-5a08184c3489f6af.arrow
Loading cached processed dataset at /Users/justincheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-bef8d2ee0b44d316.arrow
Loading cached processed dataset at 

Category 'what': 6595 questions
Category 'where': 464 questions
Category 'who': 1151 questions
Category 'undefined': 421 questions
Category 'how m/m': 771 questions
Category 'during': 172 questions
Category 'when': 710 questions
Category 'date': 60 questions
Category 'how old': 21 questions
Category 'why': 148 questions
Category 'how big/size': 12 questions
Category 'what time': 10 questions
Category 'how are': 35 questions


In [2]:
import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from transformers import pipeline




In [8]:
from transformers.pipelines.pt_utils import KeyPairDataset
from datasets import load_metric
import numpy as np

# SQuAD metric object shared by the evaluation code in this cell
metric = load_metric("squad")


def compute_metrics(eval_preds):
    """
    Score a (logits, labels) pair with the module-level `metric`.

    `eval_preds` is unpacked into two items: the first is reduced with
    `np.argmax` over its last axis and passed as `predictions`, the second is
    passed unchanged as `references`.

    NOTE(review): no visible cell calls this helper, and the evaluation loops
    pass `metric.compute` lists of dicts rather than argmax indices — confirm
    whether it is still needed.
    """
    raw_scores, gold = eval_preds
    return metric.compute(predictions=np.argmax(raw_scores, axis=-1), references=gold)

# Pretrained models to evaluate
model_names = [
    'distilbert-base-cased-distilled-squad',
    'distilbert-base-uncased-distilled-squad',
    'quangb1910128/bert-finetuned-squad'
]

# Categories to score in this run. Only 'what time' is evaluated here because
# running every model over every category is slow and resource-intensive;
# use list(categorized_datasets.keys()) for a full run.
eval_categories = ['what time']

# Evaluate each model
for model_name in model_names:
    print(f"Evaluating {model_name}...")

    # Create a QA pipeline
    qa_pipeline = pipeline("question-answering", model=model_name)

    for category in eval_categories:
        print(f"Starting eval on category: {category}")
        # Use a dedicated name: rebinding the notebook-level `dataset` here would
        # leave later cells iterating over a small category subset.
        category_dataset = categorized_datasets[category]

        # One prediction per question, in the format expected by the SQuAD metric
        predictions = [
            {
                'id': entry['id'],
                'prediction_text': qa_pipeline(entry["question"], entry["context"])['answer'],
            }
            for entry in category_dataset
        ]

        # Gold answers for the same questions
        ref_ds = category_dataset.select_columns(['id', 'answers'])
        references = [{'id': entry['id'], 'answers': entry['answers']} for entry in ref_ds]

        results = metric.compute(predictions=predictions, references=references)

        print(f"Model: {model_name} Category: {category}")
        print(f"Exact Match (EM): {results['exact_match']:.2f}")
        print(f"F1 Score: {results['f1']:.2f}\n")

Evaluating distilbert-base-cased-distilled-squad...
Starting eval on category: what time
Model: distilbert-base-cased-distilled-squad Category: what time
Exact Match (EM): 80.00
F1 Score: 87.33

Evaluating distilbert-base-uncased-distilled-squad...
Starting eval on category: what time
Model: distilbert-base-uncased-distilled-squad Category: what time
Exact Match (EM): 80.00
F1 Score: 88.33

Evaluating quangb1910128/bert-finetuned-squad...
Starting eval on category: what time
Model: quangb1910128/bert-finetuned-squad Category: what time
Exact Match (EM): 90.00
F1 Score: 93.33



## Custom ensemble method (by Justin)

Using the results of the pretrained, finetuned and custom finetuned models on the validation set, we can create a customised ensemble method that:

1. chooses a model out of the top 3 performing models of that particular category based on probability, where the top performing model gets chosen 95% of the time, the 2nd best performing model gets chosen 3% of the time, and the 3rd best performing model gets chosen 2% of the time.

OR

2. chooses the best performing model based on the particular category of question

In [52]:
# Load each model's saved predictions (one JSON file per model) into its own dictionary

import json


def _read_preds(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Models fine-tuned on SQuAD
squad_albert_base_v2_data_dict = _read_preds('../preds_for_ensemble/finetune/squad-albert-base-v2_preds.json')
squad_bert_base_uncased_data_dict = _read_preds('../preds_for_ensemble/finetune/squad-bert-base-uncased_preds.json')
squad_distilbert_base_uncased_data_dict = _read_preds('../preds_for_ensemble/finetune/squad-distilbert-base-uncased_preds.json')
# NOTE(review): unlike its siblings this file name has no "_preds" suffix — confirm it matches the file on disk
squad_roberta_base_data_dict = _read_preds('../preds_for_ensemble/finetune/squad-roberta-base.json')
squad_xlm_roberta_base_data_dict = _read_preds('../preds_for_ensemble/finetune/squad-xlm-roberta-base_preds.json')
squad_xlnet_base_cased_data_dict = _read_preds('../preds_for_ensemble/finetune/squad-xlnet-base-cased_preds.json')

# Off-the-shelf pretrained models
pretrained_distilbert_base_cased = _read_preds('../preds_for_ensemble/pretrained/distilbert-base-cased-distilled-squad_preds.json')
pretrained_distilbert_base_uncased = _read_preds('../preds_for_ensemble/pretrained/distilbert-base-uncased-distilled-squad_preds.json')
pretrained_quang_bert = _read_preds('../preds_for_ensemble/pretrained/quangb1910128_bert-finetuned-squad_preds.json')

# Custom fine-tuned models
custom_frozen_bert = _read_preds('../preds_for_ensemble/finetune_custom/frozen-bert-custom_preds.json')
custom_frozen_roberta = _read_preds('../preds_for_ensemble/finetune_custom/frozen-roberta-custom_preds.json')

### Data preprocessing

In [73]:
import csv
import random

# SQuAD metric used to score the ensemble predictions
metric = load_metric("squad")

# Validation split of SQuAD
dataset = load_dataset('squad', split='validation')

print(dataset)

# Per-category scores of every model, exported from the Google Sheet.
# Keep this file up to date with the sheet before running the ensemble cells.
model_data_csv = 'model_data.csv'

# Map each question category to its column position in the scores file
# (position 0 is read separately as the model name)
cols = {
    name: position
    for position, name in enumerate(
        ['what', 'where', 'who', 'undefined', 'how m/m', 'during', 'when',
         'date', 'how old', 'why', 'how big/size', 'what time', 'how are'],
        start=1,
    )
}

# Predictions of every model, one dictionary per model.
# NOTE: the order of the models must be the same as in the csv file / Google Sheet,
# because models are looked up by their row position.
list_of_data_dict = [
    # pretrained
    pretrained_distilbert_base_cased,
    pretrained_distilbert_base_uncased,
    pretrained_quang_bert,
    # fine-tuned
    squad_distilbert_base_uncased_data_dict,
    squad_xlm_roberta_base_data_dict,
    squad_roberta_base_data_dict,
    squad_bert_base_uncased_data_dict,
    squad_xlnet_base_cased_data_dict,
    squad_albert_base_v2_data_dict,
    # custom fine-tuned
    custom_frozen_roberta,
    custom_frozen_bert,
]
    


Found cached dataset squad (/Users/justincheng/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})


### First method: Using probabilities

For each category of questions, we will choose the best model 95% of the time, the 2nd best model 3% of the time, and the 3rd best model 2% of the time.

In [69]:

def get_values_of_col(index, csv_path=None):
    """
    Return one column of the model-scores file as a list of strings.

    The first line of the file is treated as a header and skipped. For every
    remaining line, the first CSV cell is split on whitespace and the token at
    position `index` is collected.

    Parameters:
        index: position of the whitespace-separated token to extract from each row.
        csv_path: file to read; defaults to the notebook-level `model_data_csv`.

    Returns:
        A list with one string per non-blank data row, in file order.
        An empty file yields an empty list.

    Raises:
        IndexError: if a non-blank row has fewer than `index + 1` tokens.

    NOTE(review): only the first comma-separated cell of each row is used, so
    the file is presumably whitespace-delimited — confirm against model_data.csv.
    """
    if csv_path is None:
        csv_path = model_data_csv
    result_list = []
    # newline='' is the mode recommended by the csv module for files handed to csv.reader
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        # Skip the header line; the default avoids StopIteration on an empty file
        next(reader, None)
        for row in reader:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no data
            if not row or not row[0].strip():
                continue
            result_list.append(row[0].split()[index])
    return result_list

def random_choice(model_indices):
    """
    Pick one entry of `model_indices` at random with fixed 95% / 3% / 2% odds.

    A single draw from `random.random()` decides the rank: the first entry is
    returned for draws in [0, 0.95), the second for draws in [0.95, 0.98), and
    the third otherwise. `model_indices` is expected to be ordered from best
    to worst model and is indexed at positions 0, 1 and 2.
    """
    draw = random.random()
    if 0 <= draw < 0.95:
        rank = 0
    elif 0.95 <= draw < 0.98:
        rank = 1
    else:
        rank = 2
    return model_indices[rank]
        

# Seed the RNG so the stochastic model selection below is reproducible across runs
ENSEMBLE_SEED = 42
random.seed(ENSEMBLE_SEED)

# get list of models in csv file
model_names = get_values_of_col(0)

print(model_names)

# Top-3 model positions per category (best first). Filled on first use so the
# scores file is read once per category instead of once per question.
top_3_by_category = {}

predictions = []

for curr, entry in enumerate(dataset):
    # identify the id of the question
    question_id = entry['id']
    # identify the category of the question
    category = naive_categorize_question(entry)
    if category not in top_3_by_category:
        # scores of all models in that particular category, in model order
        scores = list(map(float, get_values_of_col(cols[category])))
        # Rank model positions by score, highest first. Ranking positions (rather
        # than looking each top score up with scores.index) keeps the three
        # entries distinct even when two models have the same score; ties keep
        # their original order because sorted() is stable.
        ranked_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        top_3_by_category[category] = ranked_positions[:3]
    model_indices = top_3_by_category[category]
    # pick one of the top 3 models according to the 95/3/2 odds
    chosen_model_index = random_choice(model_indices)
    # obtain predictions of the chosen model
    chosen_model_preds = list_of_data_dict[chosen_model_index]
    # look up that model's answer for this question (KeyError if the model has no entry for it)
    question = chosen_model_preds[question_id]
    pred = {'id': question_id, 'prediction_text': question['answer']}
    predictions.append(pred)
    print("curr iter: ", curr)

print("prediction done")

# Gold answers in the format expected by the SQuAD metric
ref_ds = dataset.select_columns(['id', 'answers'])
references = [{'id': entry['id'], 'answers': entry['answers']} for entry in ref_ds]

results = metric.compute(predictions=predictions, references=references)

['distilbert-base-cased-distilled-squad', 'distilbert-base-uncased-distilled-squad', 'quangb1910128/bert-finetuned-squad', 'diffuserconfuser/squad-distilbert-base-uncased', 'diffuserconfuser/squad-xlm-roberta-base', 'diffuserconfuser/squad-roberta-base', 'diffuserconfuser/squad-bert-base-uncased', 'diffuserconfuser/squad-xlnet-base-cased', 'diffuserconfuser/squad-albert-base-v2', 'amaachii/frozen-roberta-custom', 'amaachii/frozen-bert-custom']
curr iter:  0
curr iter:  1
curr iter:  2
curr iter:  3
curr iter:  4
curr iter:  5
curr iter:  6
curr iter:  7
curr iter:  8
curr iter:  9
curr iter:  10
curr iter:  11
curr iter:  12
curr iter:  13
curr iter:  14
curr iter:  15
curr iter:  16
curr iter:  17
curr iter:  18
curr iter:  19
curr iter:  20
curr iter:  21
curr iter:  22
curr iter:  23
curr iter:  24
curr iter:  25
curr iter:  26
curr iter:  27
curr iter:  28
curr iter:  29
curr iter:  30
curr iter:  31
curr iter:  32
curr iter:  33
curr iter:  34
curr iter:  35
curr iter:  36
curr it

### Results of the first method

In [70]:
# Report the SQuAD scores stored in `results` by the cell above, rounded to two decimals
print("Exact Match (EM): {:.2f}".format(results['exact_match']))
print("F1 Score: {:.2f}\n".format(results['f1']))

Exact Match (EM): 85.34
F1 Score: 91.98



### Second method: Using the best model for each category of question

This will produce a deterministic result, in contrast to the first method.

In [71]:
# Position of the best-scoring model per category. Filled on first use so the
# scores file is read once per category instead of once per question.
best_model_by_category = {}

predictions = []

for curr, entry in enumerate(dataset):
    # identify the id of the question
    question_id = entry['id']
    # identify the category of the question
    category = naive_categorize_question(entry)
    if category not in best_model_by_category:
        # scores of all models in that particular category, in model order
        scores = list(map(float, get_values_of_col(cols[category])))
        # choose the best performing model (the first one when several share the top score)
        best_model_by_category[category] = scores.index(max(scores))
    chosen_model_index = best_model_by_category[category]
    # obtain predictions of the chosen model
    chosen_model_preds = list_of_data_dict[chosen_model_index]
    # look up that model's answer for this question (KeyError if the model has no entry for it)
    question = chosen_model_preds[question_id]
    pred = {'id': question_id, 'prediction_text': question['answer']}
    predictions.append(pred)
    print("curr iter: ", curr)

print("prediction done")

# Gold answers in the format expected by the SQuAD metric
ref_ds = dataset.select_columns(['id', 'answers'])
references = [{'id': entry['id'], 'answers': entry['answers']} for entry in ref_ds]

results = metric.compute(predictions=predictions, references=references)

curr iter:  0
curr iter:  1
curr iter:  2
curr iter:  3
curr iter:  4
curr iter:  5
curr iter:  6
curr iter:  7
curr iter:  8
curr iter:  9
curr iter:  10
curr iter:  11
curr iter:  12
curr iter:  13
curr iter:  14
curr iter:  15
curr iter:  16
curr iter:  17
curr iter:  18
curr iter:  19
curr iter:  20
curr iter:  21
curr iter:  22
curr iter:  23
curr iter:  24
curr iter:  25
curr iter:  26
curr iter:  27
curr iter:  28
curr iter:  29
curr iter:  30
curr iter:  31
curr iter:  32
curr iter:  33
curr iter:  34
curr iter:  35
curr iter:  36
curr iter:  37
curr iter:  38
curr iter:  39
curr iter:  40
curr iter:  41
curr iter:  42
curr iter:  43
curr iter:  44
curr iter:  45
curr iter:  46
curr iter:  47
curr iter:  48
curr iter:  49
curr iter:  50
curr iter:  51
curr iter:  52
curr iter:  53
curr iter:  54
curr iter:  55
curr iter:  56
curr iter:  57
curr iter:  58
curr iter:  59
curr iter:  60
curr iter:  61
curr iter:  62
curr iter:  63
curr iter:  64
curr iter:  65
curr iter:  66
curr 

### Results of the second method



In [72]:
# Report the SQuAD scores stored in `results` by the cell above, rounded to two decimals
print("Exact Match (EM): {:.2f}".format(results['exact_match']))
print("F1 Score: {:.2f}\n".format(results['f1']))

Exact Match (EM): 85.50
F1 Score: 92.06

