In [None]:
import torch
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters


def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

def getData(file):
    train_contexts, train_questions, train_answers = read_squad(file)
    add_end_idx(train_answers, train_contexts)
    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
    add_token_positions(train_encodings, train_answers)
    train_dataset = SquadDataset(train_encodings)
    return train_dataset

In [26]:
from transformers import DistilBertForQuestionAnswering, Trainer, TrainingArguments
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

datasets = [{
    'name': 'bert',
    'train_dataset': model,
}, {
    'name': 'combined',
    'train_dataset': model,
}, {
    'name': 'custom',
    'train_dataset': getData('../data/squad_questions_v1.json')
}]

loading configuration file config.json from cache at C:\Users\James/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at C:\Users\James/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQues

In [None]:
#ONLY RUN THIS FOR TRAINING

for train in datasets:
    training_args = TrainingArguments(
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=0,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,
        logging_dir='../logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train['train_dataset'],
    )

    trainer.train()
    trainer.save_model('../models/' + train['name'])

In [58]:
data = {}
def make_entry(question, context, data, QAer):
    data['question'] = question
    data['result'] = QAer(question=question, context=context)
    return data['result']


def run_eval(question_answerer): 
    with open("../data/Passages/crime_scene.txt", "r", encoding="utf8") as f:
        data['crime_scene'] = {}
        context = f.read()
        result = make_entry("How was he Killed?", context, data['crime_scene'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("What was the murder weapon?", context, data['crime_scene'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("Where did the murder take place?", context, data['crime_scene'], question_answerer)
        print(f"Answer: '{result['answer']}'")

    with open("../data/Passages/introduction_of_antagonist.txt", "r", encoding="utf8") as f:
        data['introduction_of_antagonist'] = {}
        context = f.read()
        result = make_entry("Where is Charlie Jones from?", context, data['introduction_of_antagonist'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("How would Charlie Jones be discribed?", context, data['introduction_of_antagonist'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("Where are they in this scene?", context, data['introduction_of_antagonist'], question_answerer)
        print(f"Answer: '{result['answer']}'")

    with open("../data/Passages/introduction_of_protagonist.txt", "r", encoding="utf8") as f:
        data['introduction_of_protagonist'] = {}
        context = f.read()
        result = make_entry("What is his education?", context, data['introduction_of_protagonist'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("When was he on the boat?", context, data['introduction_of_protagonist'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("How old is he?", context, data['introduction_of_protagonist'], question_answerer)
        print(f"Answer: '{result['answer']}'")

    with open("../data/Passages/resolution_of_the_crime.txt", "r", encoding="utf8") as f:
        data['resolution_of_the_crime'] = {}
        context = f.read()
        result = make_entry("Who murdered him?", context, data['resolution_of_the_crime'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("When was he at the wheel?", context, data['resolution_of_the_crime'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("Who did Charlie Jones intend to kill?", context, data['resolution_of_the_crime'], question_answerer)
        print(f"Answer: '{result['answer']}'")

    with open("../data/Passages/significant_evidence.txt", "r", encoding="utf8") as f:
        data['significant_evidence'] = {}
        context = f.read()
        result = make_entry("Where was the bell coming form?", context, data['significant_evidence'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("Whose body was laying on the ground?", context, data['significant_evidence'], question_answerer)
        print(f"Answer: '{result['answer']}'")
        result = make_entry("Who did Charlie Jones intend to kill?", context, data['significant_evidence'], question_answerer)
        print(f"Answer: '{result['answer']}'")
    
    return data

In [59]:
from transformers import pipeline
from evaluate import evaluator
d = {}

for val in datasets:
    model = DistilBertForQuestionAnswering.from_pretrained('../models/' + val['name'])
    pl = pipeline("question-answering", model=model, tokenizer=tokenizer)
    d[val['name']] = run_eval(pl)
    print(d)

    #task_evaluator = evaluator("question-answering")
    #results = task_evaluator.compute(
    #    model_or_pipeline=pl,
    #    data=val['train_dataset'],
    #    metric="squad"
    #)


loading configuration file ../models/bert\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading weights file ../models/bert\pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForQuestionAnswering.

All the weights of DistilBertForQuestionAnswering were initialized from the model checkpoint at ../models/bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already us

Answer: 'Charlie Jones'
Answer: 'Charlie Jones'
Answer: 'Charlie Jones'
Answer: 'bigger fish'
Answer: 'in circles around the masts'
Answer: 'bigger fish'
Answer: 'Ella'
Answer: 'August 12'
Answer: 'Ella'
Answer: 'Charlie Jones'
Answer: 'a meteor or some
strange appearance of the heavens'
Answer: 'The story of the early morning of August 12 will never be fully known'
Answer: 'it seemed to shine'
Answer: 'it seemed to shine'
Answer: 'it seemed to shine'
{'bert': {'crime_scene': {'question': 'Where did the murder take place?', 'result': {'score': 0.7744999527931213, 'start': 872, 'end': 885, 'answer': 'Charlie Jones'}}, 'introduction_of_antagonist': {'question': 'Where are they in this scene?', 'result': {'score': 0.5930498242378235, 'start': 1565, 'end': 1576, 'answer': 'bigger fish'}}, 'introduction_of_protagonist': {'question': 'How old is he?', 'result': {'score': 0.2655108869075775, 'start': 429, 'end': 433, 'answer': 'Ella'}}, 'resolution_of_the_crime': {'question': 'Who did Charlie

loading configuration file ../models/combined\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading weights file ../models/combined\pytorch_model.bin





All model checkpoint weights were used when initializing DistilBertForQuestionAnswering.

All the weights of DistilBertForQuestionAnswering were initialized from the model checkpoint at ../models/combined.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForQuestionAnswering for predictions without further training.


Answer: 'He had been almost cut to pieces with an axe'
Answer: 'Vail’s been murdered'
Answer: 'Vail’s been murdered'
Answer: 'between the sky and the sea'
