In [6]:
from datasets import load_dataset
import matplotlib.pyplot as plt

This is my (JC) contribution to this project. I've implemented the code for splitting the dataset into seperate datasets that we can then use for evaulation, and hence scoring of the pre-trained models we're attempting to use

In [7]:
# Define a function to categorize questions
def naive_categorize_question(example):
    question = example['question'].lower()
    if 'date' in question:
        return 'date'
    if 'during' in question:
        return 'during'
    if 'how are' in question:
        return 'how are'
    if 'how big' in question or 'how large' in question:
        return 'how big/size'
    if 'how many' in question or 'how much' in question:
        return 'how m/m'
    if 'how old' in question:
        return 'how old'
    if 'what time' in question:
        return 'what time'
    if 'what' in question or 'which' in question:
        return 'what'
    if 'when' in question:
        return 'when'
    if 'where' in question:
        return 'where'
    if 'who' in question:
        return 'who'
    if 'whom' in question:
        return 'whom'
    if 'why' in question:
        return 'why'
    else:
        return 'undefined'

In [8]:
# Load the SQuAD dataset
dataset = load_dataset('squad', split='validation')

# Add a new field for question categories
dataset = dataset.map(lambda entry: {"category": naive_categorize_question(entry)})

# Split the dataset into categories
categories = dataset.unique('category')
categorized_datasets = {category: dataset.filter(lambda example: example['category'] == category) for category in categories}

# Print out the size of each categorized dataset
for category, subset in categorized_datasets.items():
    print(f"Category '{category}': {len(subset)} questions")

# If you want to save these subsets, you can do so like this:
# for category, subset in categorized_datasets.items():
#     subset.to_csv(f'squad_{category}.csv')

Category 'what': 6595 questions
Category 'where': 464 questions
Category 'who': 1151 questions
Category 'undefined': 421 questions
Category 'how m/m': 771 questions
Category 'during': 172 questions
Category 'when': 710 questions
Category 'date': 60 questions
Category 'how old': 21 questions
Category 'why': 148 questions
Category 'how big/size': 12 questions
Category 'what time': 10 questions
Category 'how are': 35 questions


In [10]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

bbc = AutoModelForQuestionAnswering.from_pretrained('bert-base-cased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from transformers.pipelines.pt_utils import KeyPairDataset
from datasets import load_metric
import numpy as np

metric = load_metric("squad")
# Function to calculate metrics
def compute_metrics(eval_preds):
    
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Pretrained models to evaluate
model_names = [
    #
    # Amadeus I'm relying on you here HAHA
    #
    'bert-base-cased'
]

# Evaluate each model
for model_name in model_names:
    print(f"Evaluating {model_name}...")
    
    # Load the model and tokenizer
    # model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = bbc
    tokenizer = tokenizer

    # Create a QA pipeline
    print("creating QA pipeline")
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    # for category in categorized_datasets.keys():
    for category in ['how big/size', 'what time']:
        print(f"starting eval on category: {category}")
        dataset = categorized_datasets[category]
        # Making predictions and evaluating
        print(dataset)
        
        predictions = []
        for entry in dataset:
            predictions.append(qa_pipeline(entry["question"], entry["context"]))
        print("predictions finished")
        print(predictions[0])
        print(predictions[1])

        references = dataset.select_columns(['answers', 'id'])
        results = metric.compute(predictions=predictions, references=references)
        print(f"Model: {model_name} Category: {category}")
        print(f"Exact Match (EM): {results['exact_match']:.2f}")
        print(f"F1 Score: {results['f1']:.2f}\n")

# You would need to implement the compute_f1 function based on the SQuAD evaluation script or import it if available.
# Also note that making predictions on the full dataset would be slow and resource-intensive;
# you might want to batch the predictions or use a subset of the data.

Evaluating bert-base-cased...
creating QA pipeline
starting eval on category: how big/size
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'category'],
    num_rows: 12
})
predictions finished
{'score': 0.00016254832735285163, 'start': 267, 'end': 312, 'answer': 'and the ground. Sparks sprang from water line'}
{'score': 0.00021144031779840589, 'start': 373, 'end': 409, 'answer': '4m customers had subscribed to BSkyB'}


KeyError: 'id'

In [19]:
question_answerer = pipeline("question-answering", model=bbc, tokenizer=tokenizer)

print(question_answerer(question="question", context="context"))

{'score': 0.2803937792778015, 'start': 0, 'end': 7, 'answer': 'context'}
