# Model 4 - DistilBERT

In [1]:
import numpy as np

import pandas as pd

import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!python --version

Python 3.11.2


In [3]:
tf. __version__

'2.12.0-rc1'

# 1. Load data

In [4]:
# Load and split into training and validation datasets
dataset = tfds.load('squad/v1.1')
train_ds = dataset['train']
val_ds = dataset['validation']



In [5]:
# Extract relevant data from training and validation datasets
def extract_data(instance):
    paragraph = instance['context']
    question = instance['question']
    answer = instance['answers']['text'][0]
    answer_start = instance['answers']['answer_start'][0]
    return paragraph, question, answer, answer_start

train_ds = pd.DataFrame(train_ds.map(extract_data).as_numpy_iterator())
val_ds = pd.DataFrame(val_ds.map(extract_data).as_numpy_iterator())

headers = ['Paragraph', 'Question', 'Answer', 'Answer Start']
train_ds.columns = headers
val_ds.columns = headers

In [6]:
train_ds.head()

Unnamed: 0,Paragraph,Question,Answer,Answer Start
0,b'The difference in the above factors for the ...,b'What is one use that would require an antenn...,b'mobile phones',427
1,"b""The coronation of Charlemagne as emperor on ...",b'About how many counts existed in the Carolin...,b'300',1020
2,b'Plant responses to climate and other environ...,b'How can climate changes be determined from s...,b'fossil pollen deposits in sediments',339
3,b'The Tucson metro area is served by many loca...,"b""What is Tucson's Fox station?""",b'KMSB-TV 11',347
4,"b""Situated on one of the world's largest natur...",b'What is the size of New York City in square ...,b'305',367


# 2. Get answer's end character position

In [7]:
# Get start and end character position of answer in paragraph
def get_answer_char_pos(row):
    paragraph, answer, answer_start = row['Paragraph'], row['Answer'], row['Answer Start']
    answer_end = answer_start + len(answer)

    # Check if SQuAD answers are off by one or two characters
    if paragraph[answer_start-1:answer_end-1] == answer:
        return [answer_start-1, answer_end-1]
    elif paragraph[answer_start-2:answer_end-2] == answer:
        return [answer_start-2, answer_end-2]
    else:
        return [answer_start, answer_end]

train_ds['Answer'] = train_ds.apply(get_answer_char_pos, axis=1)
train_ds = train_ds.drop('Answer Start', axis=1)

val_ds['Answer'] = val_ds.apply(get_answer_char_pos, axis=1)
val_ds = val_ds.drop('Answer Start', axis=1)

In [8]:
train_ds.head()

Unnamed: 0,Paragraph,Question,Answer
0,b'The difference in the above factors for the ...,b'What is one use that would require an antenn...,"[427, 440]"
1,"b""The coronation of Charlemagne as emperor on ...",b'About how many counts existed in the Carolin...,"[1020, 1023]"
2,b'Plant responses to climate and other environ...,b'How can climate changes be determined from s...,"[339, 374]"
3,b'The Tucson metro area is served by many loca...,"b""What is Tucson's Fox station?""","[347, 357]"
4,"b""Situated on one of the world's largest natur...",b'What is the size of New York City in square ...,"[367, 370]"


# 3. Run DistilBERT model

In [9]:
paragraph_val, question_val = val_ds['Paragraph'].tolist(), val_ds['Question'].tolist()
paragraph_val = [text.decode('utf-8') for text in paragraph_val]
question_val = [text.decode('utf-8') for text in question_val]

labels = val_ds['Answer'].tolist()

In [10]:
model = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')

predictions = []

for paragraph, question in zip(paragraph_val, question_val):
    prediction = model(question=question, context=paragraph)
    start_pred, end_pred = prediction['start'], prediction['end']
    predictions.append([start_pred, end_pred])

All model checkpoint layers were used when initializing TFDistilBertForQuestionAnswering.

All the layers of TFDistilBertForQuestionAnswering were initialized from the model checkpoint at distilbert-base-uncased-distilled-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


# 4. Evaluate DistilBERT model

In [11]:
print(labels[:5])
print(predictions[:5])

[[479, 499], [218, 242], [826, 867], [28, 42], [402, 410]]
[[479, 499], [218, 234], [770, 782], [28, 42], [402, 410]]


In [12]:
def exact_match(y_true, y_pred):
    start_true, end_true = y_true[0], y_true[1]
    start_pred, end_pred = y_pred[0], y_pred[1]

    if start_pred != start_true or end_pred != end_true:
        return float(0)
    else:
        return float(1)

def f1_score(y_true, y_pred):
    start_true, end_true = y_true[0], y_true[1]
    start_pred, end_pred = y_pred[0], y_pred[1]

    predicted_tokens = set(range(start_pred, end_pred + 1))
    true_tokens = set(range(start_true, end_true + 1))
    common_tokens = predicted_tokens.intersection(true_tokens)
    precision = len(common_tokens) / len(predicted_tokens) if len(predicted_tokens) > 0 else 0
    recall = len(common_tokens) / len(true_tokens) if len(true_tokens) > 0 else 0
    return float(2 * precision * recall / (precision + recall) if precision + recall > 0 else 0)

In [13]:
exact_match_res = 0
f1_score_res = 0
count = 0

for y_true, y_pred in zip(labels, predictions):
    exact_match_res += exact_match(y_true, y_pred)
    f1_score_res += f1_score(y_true, y_pred)
    count += 1

exact_match_res /= count
f1_score_res /= count

print('Exact match:', exact_match_res)
print('F1 score:', f1_score_res)

Exact match: 0.5698202459791863
F1 score: 0.7725855536405953


In [14]:
with open('model4.npy', 'wb') as f:
    np.save(f, np.array(predictions))