In [1]:
import torch
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

import json
from pathlib import Path

def read_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    The file is expected to hold ``data -> paragraphs -> qas -> answers``.
    One row is produced per answer, so a question with several answers
    contributes several rows that repeat its context and question.

    Args:
        path: Location of the JSON file (anything ``Path`` accepts).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists;
        ``answers`` holds the raw answer objects from the file.
    """
    with Path(path).open('rb') as handle:
        squad = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                prompt = qa['question']
                for gold in qa['answers']:
                    # Keep the three lists index-aligned: one row per answer.
                    contexts += [passage_text]
                    questions += [prompt]
                    answers += [gold]

    return contexts, questions, answers

def add_end_idx(answers, contexts):
    """Add an ``answer_end`` character offset to every answer, in place.

    The annotated ``answer_start`` is sometimes one or two characters too
    far to the right. The answer text is therefore compared against the
    context at the annotated offset and at offsets shifted left by one and
    two characters; the first alignment that reproduces the text wins and
    ``answer_start`` is corrected accordingly.

    If no alignment reproduces the text, the annotated span
    (``answer_start`` .. ``answer_start + len(text)``) is kept so that
    ``answer_end`` is always present for downstream code.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys;
            each dict is mutated.
        contexts: List of context strings, index-aligned with ``answers``.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # First left-shift (0, 1 or 2 chars) at which the context matches the
        # gold text. Shifts that would produce a negative start are skipped,
        # because a negative slice index would wrap around to the end of the
        # context. Falls back to 0 (the annotated span) when nothing matches.
        shift = next(
            (
                candidate
                for candidate in (0, 1, 2)
                if start_idx - candidate >= 0
                and context[start_idx - candidate:end_idx - candidate] == gold_text
            ),
            0,
        )
        answer['answer_start'] = start_idx - shift
        answer['answer_end'] = end_idx - shift


def add_token_positions(encodings, answers):
    """Attach token-level answer spans to ``encodings``, in place.

    For row ``i`` the character offsets ``answer_start`` and
    ``answer_end - 1`` are mapped to token indices through
    ``encodings.char_to_token``. A ``None`` result is replaced by the
    module-level ``tokenizer.model_max_length`` (per the original note, this
    happens when the answer passage has been truncated).

    Args:
        encodings: Tokenizer output offering ``char_to_token`` and ``update``.
        answers: Answer dicts with ``'answer_start'`` and ``'answer_end'``,
            index-aligned with the rows of ``encodings``.
    """
    token_starts, token_ends = [], []
    for row, answer in enumerate(answers):
        first_token = encodings.char_to_token(row, answer['answer_start'])
        # answer_end is exclusive, so the last answer character sits one before it.
        last_token = encodings.char_to_token(row, answer['answer_end'] - 1)

        token_starts.append(
            tokenizer.model_max_length if first_token is None else first_token
        )
        token_ends.append(
            tokenizer.model_max_length if last_token is None else last_token
        )

    encodings.update({'start_positions': token_starts, 'end_positions': token_ends})

class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over tokenizer encodings.

    ``encodings`` must expose ``items()`` yielding ``(name, column)`` pairs
    and an ``input_ids`` attribute whose length is the number of samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Slice row ``idx`` out of every column and wrap each in a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

def getData(file):
    """Build a ``SquadDataset`` from a SQuAD-format JSON file.

    Reads the file, adds end offsets to the answers, tokenizes the
    (context, question) pairs with truncation and padding using the
    module-level ``tokenizer``, and attaches token-level answer positions.

    Args:
        file: Path to the SQuAD-format JSON file.

    Returns:
        A ``SquadDataset`` wrapping the resulting encodings.
    """
    contexts, questions, answers = read_squad(file)
    add_end_idx(answers, contexts)

    encodings = tokenizer(contexts, questions, truncation=True, padding=True)
    add_token_positions(encodings, answers)

    return SquadDataset(encodings)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import DistilBertForQuestionAnswering, Trainer, TrainingArguments
# Base checkpoint; the QA head is newly initialised and has to be fine-tuned.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# One entry per model variant. 'name' is the sub-directory under ../models/
# that the training cell saves to and the evaluation cell loads from.
# NOTE(review): 'bert' and 'combined' pass the *model* object as
# 'train_dataset'. That looks like a placeholder - the training cell hands
# this value to Trainer(train_dataset=...), which needs a dataset. Confirm the
# intended data files for these two entries before running the training cell.
datasets = [{
    'name': 'bert',
    'train_dataset': model,
}, {
    'name': 'combined',
    'train_dataset': model,
}, {
    'name': 'custom',
    'train_dataset': getData('../data/squad_questions_v1.json')
}]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [None]:
#ONLY RUN THIS FOR TRAINING
#
# Fine-tunes one model per entry in `datasets` and saves it to
# ../models/<name>. Every run starts from a fresh copy of the base checkpoint
# so that a later entry is not trained on top of the weights an earlier entry
# has already fine-tuned.

for train in datasets:
    train_dataset = train['train_dataset']

    # Entries whose 'train_dataset' is a model object are placeholders with no
    # data to train on; handing them to Trainer would fail, so skip them loudly.
    if isinstance(train_dataset, torch.nn.Module):
        print(f"Skipping '{train['name']}': 'train_dataset' is a model, not a dataset")
        continue

    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

    training_args = TrainingArguments(
        output_dir='../checkpoints/' + train['name'],  # checkpoints; required by some transformers releases
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=0,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,
        logging_dir='../logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model('../models/' + train['name'])

In [5]:
# Module-level store that run_eval fills in, keyed by passage name.
data = {}


def make_entry(question, context, data, QAer):
    """Ask one question and record the outcome.

    Args:
        question: Question text passed to the answerer.
        context: Passage text the answer is looked up in.
        data: List that receives a ``{'question': ..., 'result': ...}`` record.
            (This parameter shadows the module-level ``data`` dict.)
        QAer: Callable invoked as ``QAer(question=..., context=...)``.

    Returns:
        Whatever ``QAer`` returned for this question.
    """
    outcome = QAer(question=question, context=context)
    data.append({'question': question, 'result': outcome})
    return outcome


# Passage name -> questions asked about it. The passage text is read from
# ../data/Passages/<name>.txt and <name> is also the key in the result dict.
_EVAL_PASSAGES = (
    ("crime_scene", (
        "How was he Killed?",
        "What was the murder weapon?",
        "Where did the murder take place?",
    )),
    ("introduction_of_antagonist", (
        "Where is Charlie Jones from?",
        "How would Charlie Jones be discribed?",
        "Where are they in this scene?",
    )),
    ("introduction_of_protagonist", (
        "What is his education?",
        "When was he on the boat?",
        "How old is he?",
    )),
    ("resolution_of_the_crime", (
        "Who murdered him?",
        "When was he at the wheel?",
        "Who did Charlie Jones intend to kill?",
    )),
    ("significant_evidence", (
        "Where was the bell coming form?",
        "Whose body was laying on the ground?",
        "Who did Charlie Jones intend to kill?",
    )),
)


def run_eval(question_answerer):
    """Run the fixed question set against every passage and collect the answers.

    Each answer is printed as it is produced.

    Args:
        question_answerer: Callable invoked as
            ``question_answerer(question=..., context=...)`` that returns a
            mapping with at least an ``'answer'`` key.

    Returns:
        A new dict mapping passage name to a list of
        ``{'question': ..., 'result': ...}`` records. A fresh dict is built on
        every call, so results from different models never share storage.
    """
    results = {}
    for passage_name, questions in _EVAL_PASSAGES:
        with open("../data/Passages/" + passage_name + ".txt", "r", encoding="utf8") as f:
            context = f.read()

        entries = results[passage_name] = []
        for question in questions:
            result = make_entry(question, context, entries, question_answerer)
            print(f"Answer: '{result['answer']}'")

    # Keep the module-level `data` dict in sync with the latest run for any
    # code that still reads it, without handing that shared object to callers.
    data.update(results)
    return results

In [6]:
from transformers import pipeline
from evaluate import evaluator
import copy
import json

# Evaluate every saved model on the passage questions and write all results,
# keyed by model name, to ./results.json.
d = {}
for val in datasets:
    model = DistilBertForQuestionAnswering.from_pretrained('../models/' + val['name'])
    pl = pipeline("question-answering", model=model, tokenizer=tokenizer)
    # Store a snapshot: run_eval may hand back a dict it reuses across calls,
    # and keeping that object directly would make every model's entry alias
    # the same (last) set of results.
    d[val['name']] = copy.deepcopy(run_eval(pl))

    #task_evaluator = evaluator("question-answering")
    #results = task_evaluator.compute(
    #    model_or_pipeline=pl,
    #    data=val['train_dataset'],
    #    metric="squad"
    #)

with open("./results.json", "w") as outfile:
    json.dump(d, outfile)


Answer: 'Charlie Jones'
Answer: 'Charlie Jones'
Answer: 'Charlie Jones'


AttributeError: 'dict' object has no attribute 'append'