In [3]:
from datasets import load_dataset
import matplotlib.pyplot as plt

This is my (JC) contribution to this project. I've implemented the code for splitting the dataset into seperate datasets that we can then use for evaulation, and hence scoring of the pre-trained models we're attempting to use

In [4]:
# Define a function to categorize questions
def naive_categorize_question(example):
    question = example['question'].lower()
    if 'date' in question:
        return 'date'
    if 'during' in question:
        return 'during'
    if 'how are' in question:
        return 'how are'
    if 'how big' in question or 'how large' in question:
        return 'how big/size'
    if 'how many' in question or 'how much' in question:
        return 'how m/m'
    if 'how old' in question:
        return 'how old'
    if 'what time' in question:
        return 'what time'
    if 'what' in question or 'which' in question:
        return 'what'
    if 'when' in question:
        return 'when'
    if 'where' in question:
        return 'where'
    if 'who' in question:
        return 'who'
    if 'whom' in question:
        return 'whom'
    if 'why' in question:
        return 'why'
    else:
        return 'undefined'

In [12]:
# Load the SQuAD dataset
import json
dataset = load_dataset('squad', split='validation')

# Add a new field for question categories
dataset = dataset.map(lambda entry: {"category": naive_categorize_question(entry)})

# Split the dataset into categories
categories = dataset.unique('category')
categorized_datasets = {category: dataset.filter(lambda example: example['category'] == category) for category in categories}
jsonfile = {}
for cat in categorized_datasets:
    catclasses = []
    for row in categorized_datasets.get(cat):
        catclasses.append(row)
    jsonfile[cat] = catclasses

with open("question_classes.txt", "w") as f:
    json.dump(jsonfile, f)

# Print out the size of each categorized dataset
for category, subset in categorized_datasets.items():
    print(f"Category '{category}': {len(subset)} questions")

# If you want to save these subsets, you can do so like this:
# for category, subset in categorized_datasets.items():
#     subset.to_csv(f'squad_{category}.csv')

{'what': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 6595
}), 'where': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 464
}), 'who': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 1151
}), 'undefined': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 421
}), 'how m/m': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 771
}), 'during': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 172
}), 'when': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 710
}), 'date': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 60
}), 'how old': Dataset({
    features: ['id', 'title', '

In [1]:
import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm
2023-11-05 15:38:46.865895: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers.pipelines.pt_utils import KeyPairDataset
from datasets import load_metric
import numpy as np

metric = load_metric("squad")
# Function to calculate metrics
def compute_metrics(eval_preds):
    
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Pretrained models to evaluate
model_names = [

    'distilbert-base-cased-distilled-squad',
    'distilbert-base-uncased-distilled-squad',
    'quangb1910128/bert-finetuned-squad'
]

# Evaluate each model
for model_name in model_names:
    print(f"Evaluating {model_name}...")

    # Create a QA pipeline
    qa_pipeline = pipeline("question-answering", model=model_name)

    # for category in categorized_datasets.keys():
    for category in ['what time']:
        print(f"Starting eval on category: {category}")
        dataset = categorized_datasets[category]
        # Making predictions and evaluating
        
        predictions = []
        for entry in dataset:
            result = qa_pipeline(entry["question"], entry["context"])
            pred = {'id': entry['id'], 'prediction_text': result['answer']}
            predictions.append(pred)

        ref_ds = dataset.select_columns(['id', 'answers'])
        references = []
        for entry in ref_ds:
            answer = {'id': entry['id'], 'answers': entry['answers']}
            references.append(answer)

        results = metric.compute(predictions=predictions, references=references)

        print(f"Model: {model_name} Category: {category}")
        print(f"Exact Match (EM): {results['exact_match']:.2f}")
        print(f"F1 Score: {results['f1']:.2f}\n")

# You would need to implement the compute_f1 function based on the SQuAD evaluation script or import it if available.
# Also note that making predictions on the full dataset would be slow and resource-intensive;
# you might want to batch the predictions or use a subset of the data.

Evaluating distilbert-base-cased-distilled-squad...
creating QA pipeline
Starting eval on category: what time
Model: distilbert-base-cased-distilled-squad Category: what time
Exact Match (EM): 80.00
F1 Score: 87.33

Evaluating distilbert-base-uncased-distilled-squad...
creating QA pipeline


(…)distilled-squad/resolve/main/config.json: 100%|██████████| 451/451 [00:00<00:00, 49.1kB/s]
model.safetensors: 100%|██████████| 265M/265M [00:35<00:00, 7.55MB/s] 
(…)squad/resolve/main/tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 21.1kB/s]
(…)d-distilled-squad/resolve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.51MB/s]
(…)tilled-squad/resolve/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 5.45MB/s]


Starting eval on category: what time
Model: distilbert-base-uncased-distilled-squad Category: what time
Exact Match (EM): 80.00
F1 Score: 88.33

Evaluating quangb1910128/bert-finetuned-squad...
creating QA pipeline
Starting eval on category: what time
Model: quangb1910128/bert-finetuned-squad Category: what time
Exact Match (EM): 90.00
F1 Score: 93.33

