<a href="https://colab.research.google.com/github/alexpod1000/SQuAD-QA/blob/main/ModelTrainExperimentalCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#%%bash
#[[ ! -e /colabtools ]] && exit  # Continue only if running on Google Colab

# Clone repository
# https://sysadmins.co.za/clone-a-private-github-repo-with-personal-access-token/
# For cloning the main branch:
#!git clone https://fb5b65b126107273e595ce8b6c9d2d533103c6e2:x-oauth-basic@github.com/alexpod1000/SQuAD-QA.git
# For cloning the "evaluation-features" branch
#!git clone --branch evaluation-features https://fb5b65b126107273e595ce8b6c9d2d533103c6e2:x-oauth-basic@github.com/alexpod1000/SQuAD-QA.git
# Change current working directory to match project
#%cd SQuAD-QA/
#!pwd

In [2]:
# External imports
import copy
import nltk
import numpy as np
import pandas as pd
import string
import torch

from functools import partial
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
from typing import Tuple, List, Dict, Any, Union

# Project imports
from squad_data.parser import SquadFileParser
from squad_data.utils import build_mappers_and_dataframe, add_paragraphs_spans
from evaluation.evaluation_metrics import Evaluator
from evaluation.utils import extract_answer, build_evaluation_dict

### Download Embedding

In [3]:
from utils.embedding_utils import EmbeddingDownloader

embedding_downloader = EmbeddingDownloader(
    "embedding_models", 
    "embedding_model.kv", 
    model_name="fasttext-wiki-news-subwords-300"
)

embedding_model = embedding_downloader.load()

Loading pre-downloaded embeddings from /home/alexpod/uni/magistrale_ai/secondo_anno/nlp/project/SQuAD-QA/embedding_models/embedding_model.kv
End!
Embedding dimension: 300


### Parse the json and get the data

In [4]:
parser = SquadFileParser("squad_data/data/training_set.json")
data = parser.parse_documents()

########################### DEBUG
# reduce size for faster testing
#full_data = data
#data = []
#for i in range(1): # use only the first 1 documents
#  data.append(full_data[i])

### Prepare the mappers and datafram

In [5]:
def bert_tokenizer_fn(question, paragraph, tokenizer, max_length=384):
    doc_stride = 128
    pad_on_right = tokenizer.padding_side == "right"
    # TODO: add max_length and doc_stride
    tokenized_input_pair = tokenizer(
        question,
        paragraph,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    return tokenized_input_pair

In [6]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from typing import Union, Tuple, List, Dict, Any
from squad_data import Document

def index_of_first(lst, pred):
    for i, v in enumerate(lst):
        if pred(v):
            return i
    return None

def split_paragraph_if_needed(paragraph, question, answer_span, tokenizer_fn):
    """
    Attempts to tokenize a paragraph and question together, if too long
    because of tokenizer's max length, then will split the paragraph into
    multiple slices.
    
    Returns a list of paragraph slices with answer span, such that:
        - a paragraph slice with no answer will have answer mapped to (CLS, CLS)
        - a paragraph slice with answer will be mapped to the index of answer.
    """
    tokenized_input_pair = tokenizer_fn(question, paragraph)
    # outputs
    paragraph_splits = []
    answer_spans = []
    # get answer end char idx
    ans_start = answer_span[0]
    ans_end = answer_span[1]
    
    #n_special_tokens = 3 # there are at least 3 special tokens introduced by tokenizer
    """
    1) Find index of context segments in the tokenized example
    2) Within the context segments (start from context_segment_idx), 
       find the token corresponding to span of answer: start and end.
    """
    for offset_idx, offset in enumerate(tokenized_input_pair.offset_mapping):
        # get sequence ids
        sequence_ids = tokenized_input_pair.sequence_ids(offset_idx)
        # find start index of context segment
        context_segment_idx = sequence_ids.index(1)
        # TODO(Alex): ADD QUICK FIX WITH n_special_tokens (but it's not a proper solution)
        span_start_offset_idx = index_of_first(
            tokenized_input_pair.offset_mapping[offset_idx][context_segment_idx:], 
            lambda span: span[0] <= ans_start <= span[1]
        )
        span_end_offset_idx = index_of_first(
            tokenized_input_pair.offset_mapping[offset_idx][context_segment_idx:], 
            lambda span: span[0] <= ans_end <= span[1]
        )
        # Decode split into a string
        decoded_split = tokenizer.decode(tokenized_input_pair.input_ids[offset_idx][context_segment_idx:], skip_special_tokens=True)
        # 
        paragraph_splits.append(decoded_split)
        if span_start_offset_idx is not None and span_end_offset_idx is not None:
            # If answer span is fully in current slice
            # add segment idx offset
            span_start_offset_idx += context_segment_idx
            span_end_offset_idx += context_segment_idx + 1 # the plus 1 is needed for correct slicing
            answer_spans.append((span_start_offset_idx, span_end_offset_idx))
            # TODO: we have span indexes now, map them to groundtruth (?)
        elif span_start_offset_idx is None and span_end_offset_idx is None:
            # If span not in this slice, but in another slice
            # map answer to (CLS, CLS)
            cls_idx = tokenized_input_pair.input_ids[offset_idx].index(tokenizer.cls_token_id)
            # NOTE(Alex): although I think it's always 0
            answer_spans.append((cls_idx, cls_idx))
        else:
            # span spans along multiple slices -> throw the sample away 
            # (should be only like 4 samples across the whole dataset)
            # Discard sample
            pass
    
    return (paragraph_splits, answer_spans)

def build_bert_mappers_and_dataframe(
    tokenizer_fn,
    documents_list: List[Document], 
    limit_answers: int = -1
    ) -> Tuple[Dict[str, str], pd.DataFrame]:
    """
    Given a list of SQuAD Document objects, returns mapper to transform from
    paragraph id to paragraph text and a dataframe containing paragraph id, 
    question id, text and answer details.
    Args:
        tokenizer_fn: Huggingface tokenizer
        documents_list (List[Document]): list of parsed SQuAD document objects.
        limit_answers (int): limit number of returned answers per question
            to this amount (-1 to return all the available answers).

    Returns:
        paragraphs_mapper: mapper from paragraph id to paragraph text
        dataframe: Pandas dataframe with the following schema
            (paragraph_id, question_id, question_text, answer_id, answer_start, answer_text)
    """

    # type for np array: np.ndarray
    # given a paragraph id, maps the paragraph to its text or embeddings (or both)
    split_paragraphs_mapper = {}
    # dataframe
    dataframe_list = []
    for doc_idx, document in enumerate(documents_list):
        # for each paragraph
        for par_idx, paragraph in enumerate(document.paragraphs):
            par_text = paragraph.context.strip()

            # for each question
            for question in paragraph.questions:
                question_id = question.id
                question_text = question.question.strip()

                # TODO(Alex): at this level, could need to start building par/question

                # take only "limit_answers" answers for every question.
                answer_range = len(question.answers) if limit_answers == -1 else limit_answers
                for answer_id, answer in enumerate(question.answers[:answer_range]):
                    # NOTE: in training set, there's only one answer per question.
                    answer_text = answer.text.strip()
                    # get span
                    answer_start = answer.answer_start
                    answer_end = answer.answer_start + len(answer_text)

                    par_splits, split_answer_spans = split_paragraph_if_needed(par_text, question_text, (answer_start, answer_end), tokenizer_fn)
                    
                    pair_overflows = len(par_splits) > 1
                    
                    for split_idx, (split_text, split_ans_span) in enumerate(zip(par_splits, split_answer_spans)):
                        """
                        NOTE(Alex): since in tokenization phase we also use question, our ID depends on question too
                                    For example if for question1, the pair <question1, par> goes above the limit,
                                    but for <question2, par> it does not, then we'll still need to keep track of
                                    different splits of par, depending on each question.
                        """
                        if pair_overflows:
                            split_par_id = "{}_{}_{}_{}".format(doc_idx, par_idx, question_id, split_idx)
                        else:
                            """
                            If no length overflow, then we don't need question_id or split_idx
                            To optimize memory, we can map same splits to same id 
                            (some pairs <question, par> won't overflow anyway)
                            """
                            split_par_id = "{}_{}".format(doc_idx, par_idx)
                        split_paragraphs_mapper[split_par_id] = split_text
                        # build dataframe entry
                        dataframe_list.append({
                            "paragraph_id": split_par_id,
                            "question_id": question_id,
                            "answer_id": answer_id,
                            "answer_start": answer_start,
                            "answer_text": answer_text,
                            "question_text": question_text,
                            "tokenizer_answer_start": split_ans_span[0],
                            "tokenizer_answer_end": split_ans_span[1],
                        })
    return split_paragraphs_mapper, pd.DataFrame(dataframe_list)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_fn_preprocess = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=380)
tokenizer_fn_train = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=384)

In [8]:
#par_text, ques_text, ans_start, ans_text = build_bert_mappers_and_dataframe(tokenizer, data, limit_answers=1)
paragraphs_mapper, df = build_bert_mappers_and_dataframe(tokenizer_fn_preprocess, data, limit_answers=1)

In [9]:
import transformers

In [13]:
import torch

class Custom_BERT_QADataset(torch.utils.data.Dataset):
    """Custom text dataset."""

    def __init__(self, tokenizer_fn, df, paragraphs_mapper):
        self.input_list = df[["paragraph_id", "question_text", "question_id"]]
        self.output_list = df[["tokenizer_answer_start", "tokenizer_answer_end"]]
        self.paragraphs_mapper = paragraphs_mapper
        self.tokenizer_fn = tokenizer_fn

    def __len__(self):
        return len(self.input_list)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        paragraph_id = self.input_list.iloc[idx]["paragraph_id"]
        question_id = self.input_list.iloc[idx]["question_id"]
        question_text = self.input_list.iloc[idx]["question_text"]
        tokenizer_answer_start = self.output_list.iloc[idx]["tokenizer_answer_start"]
        tokenizer_answer_end = self.output_list.iloc[idx]["tokenizer_answer_end"]

        paragraph_text = self.paragraphs_mapper[paragraph_id]
        tokenized_input_pair = self.tokenizer_fn(question_text, paragraph_text)
        
        #input_ids = torch.tensor(tokenized_input_pair["input_ids"], dtype=torch.long)
        #attention_mask = torch.tensor(tokenized_input_pair["attention_mask"], dtype=torch.long)
        input_ids = tokenized_input_pair["input_ids"]
        attention_mask = tokenized_input_pair["attention_mask"]

        out_span = torch.tensor([tokenizer_answer_start, tokenizer_answer_end])
        
        # DistilBERT doesn’t have token_type_ids
        
        return input_ids, attention_mask, out_span, paragraph_id, question_id, idx
    
def bert_padder_collate_fn(sample_list):
    # NOTE: the tokenizer in dataloader already pads inputs to have same length of 384
    input_ids_padded = [sample[0] for sample in sample_list]
    attention_mask_padded = [sample[1] for sample in sample_list]
    out = [sample[2] for sample in sample_list]
    paragraph_id = [sample[3] for sample in sample_list]
    question_id = [sample[4] for sample in sample_list]
    debug_idx = [sample[5] for sample in sample_list]
    
    #input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    #attention_mask_padded = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True)
    
    prev_shape_inp = None
    prev_shape_attn = None
    
    try:
        input_ids_padded = torch.tensor(input_ids_padded, dtype=torch.long)
        attention_mask_padded = torch.tensor(attention_mask_padded, dtype=torch.long)
        
        prev_shape_inp = input_ids_padded.shape
        prev_shape_attn = attention_mask_padded.shape
        
        input_ids_padded = input_ids_padded[:, 0, :]
        attention_mask_padded = attention_mask_padded[:, 0, :]
    except:
        print(paragraph_id)
        print(question_id)
        print(debug_idx)
        print(prev_shape_inp)
        print(prev_shape_attn)
    
    #answer_emb_padded = torch.nn.utils.rnn.pad_sequence(out, batch_first=True)
    return {"input_ids": input_ids_padded,
            "attention_mask": attention_mask_padded,
            "y_gt":torch.stack(out),
            "paragraph_id":paragraph_id,
            "question_id":question_id}


In [11]:
class DistilBertBaseQA(torch.nn.Module):

    def __init__(self, hidden_size, num_labels):
        super(DistilBertBaseQA, self).__init__()

        """
        DistilBertConfig Model Test
        """
        bert_config = transformers.DistilBertConfig(max_position_embeddings=384)
        bert_model = transformers.DistilBertModel(bert_config)

        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.config = transformers.DistilBertConfig(max_position_embeddings=384)
        self.bert = transformers.DistilBertModel(bert_config)
        self.qa_outputs = torch.nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids = input_ids, attention_mask = attention_mask)

        sequence_output = output[0]   #(None, seq_len, hidden_size)
        logits = self.qa_outputs(sequence_output) #(None, seq_len, hidden_size)*(hidden_size, 2)=(None, seq_len, 2)
        start_logits, end_logits = logits.split(1, dim=-1)    #(None, seq_len, 1), (None, seq_len, 1)
        start_logits = start_logits.squeeze(-1)  #(None, seq_len)
        end_logits = end_logits.squeeze(-1)    #(None, seq_len)


        outputs = (start_logits, end_logits,) 

        return outputs

In [14]:
datasetBertQA = Custom_BERT_QADataset(tokenizer_fn_train, df, paragraphs_mapper)
data_loader = torch.utils.data.DataLoader(datasetBertQA, collate_fn = bert_padder_collate_fn, batch_size=10, shuffle=True)

test_batch = next(iter(data_loader))
#print(test_batch["paragraph_emb"].shape)
#print(test_batch["y_gt"].shape)

In [21]:
model = DistilBertBaseQA(768, 2).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001, amsgrad=True)

In [22]:
train_data_loader = torch.utils.data.DataLoader(datasetBertQA, collate_fn = bert_padder_collate_fn, batch_size=16, shuffle=True)

In [23]:
idxs = [23768, 22675, 44840, 70921, 17668, 83128, 2196, 24744, 76377, 20475, 61752, 74694, 84931, 77013, 8950, 88488]
failed = bert_padder_collate_fn([datasetBertQA[i] for i in idxs])

Failed it failed precisely on: 418_19_57312110a5e9cc1400cdbc57_0

['102_68', '99_18', '210_62', '359_5', '76_48', '418_19_57312110a5e9cc1400cdbc57_0', '5_74', '107_9', '383_22', '89_25', '304_55', '374_28', '424_63', '388_5', '22_24', '441_55']
['56fb85aab28b3419009f1dfc', '56fa74058f12f31900630149', '5727ea07ff5b5019007d9864', '572e83d3c246551400ce429f', '56e78a2e00c9c71400d77270', '57312110a5e9cc1400cdbc57', '56d3a2cd59d6e4140014684a', '5705e91d52bb891400689681', '572f51f5a23a5019007fc538', '56f7f909a6d7ea1400e17343', '5728a4d44b864d1900164b4e', '57307b79069b531400832116', '573212f70fdd8d15006c6760', '572f6db904bcaa1900d76935', '5733229dd058e614000b571a', '5735d0f46c16ec1900b92824']
[23768, 22675, 44840, 70921, 17668, 83128, 2196, 24744, 76377, 20475, 61752, 74694, 84931, 77013, 8950, 88488]
None
None


Might be because we use original, not tokenized question together with a detokenized context.

In [132]:
print(data[418].paragraphs[19].context)
print(data[418].paragraphs[19].questions[4].question)

Like the reptiles, birds are primarily uricotelic, that is, their kidneys extract nitrogenous waste from their bloodstream and excrete it as uric acid instead of urea or ammonia through the ureters into the intestine. Birds do not have a urinary bladder or external urethral opening and (with exception of the ostrich) uric acid is excreted along with feces as a semisolid waste. However, birds such as hummingbirds can be facultatively ammonotelic, excreting most of the nitrogenous wastes as ammonia. They also excrete creatine, rather than creatinine like mammals. This material, as well as the output of the intestines, emerges from the bird's cloaca. The cloaca is a multi-purpose opening: waste is expelled through it, most birds mate by joining cloaca, and females lay eggs from it. In addition, many species of birds regurgitate pellets. Males within Palaeognathae (with the exception of the kiwis), the Anseriformes (with the exception of screamers), and in rudimentary forms in Galliformes 

In [133]:
thefuck = tokenizer_fn(data[418].paragraphs[19].questions[4].question, data[418].paragraphs[19].context)

In [134]:
qq = tokenizer.decode(thefuck.input_ids[0][:thefuck.sequence_ids(0).index(1)], skip_special_tokens=True)

In [135]:
qq

'what aids birds with flight?'

In [147]:
omg = tokenizer.decode(thefuck.input_ids[0])
#omg = tokenizer.decode(thefuck.input_ids[0][thefuck.sequence_ids(0).index(1):-2], skip_special_tokens=True)

In [148]:
omg

"[CLS] what aids birds with flight? [SEP] like the reptiles, birds are primarily uricotelic, that is, their kidneys extract nitrogenous waste from their bloodstream and excrete it as uric acid instead of urea or ammonia through the ureters into the intestine. birds don't have a urinary bladder or external urethral opening and ( with exception of the ostrich ) uric acid is excreted along with feces as a semisolid waste. however, birds such as hummingbirds can be facultatively ammonotelic, excreting most of the nitrogenous wastes as ammonia. they also excrete creatine, rather than creatinine like mammals. this material, as well as the output of the intestines, emerges from the bird's cloaca. the cloaca is a multi - purpose opening : waste is expelled through it, most birds mate by joining cloaca, and females lay eggs from it. in addition, many species of birds regurgitate pellets. males within palaeognathae ( with the exception of the kiwis ), the anseriformes ( with the exception of scr

In [138]:
thefuck2 = tokenizer_fn(data[418].paragraphs[19].questions[4].question, omg)

In [139]:
len(thefuck2.input_ids)

1

In [140]:
thefuck3 = tokenizer_fn('what aids birds with flight?', omg)

In [141]:
len(thefuck3.input_ids)

1

In [104]:
paragraphs_mapper["418_19_57312110a5e9cc1400cdbc57_1"]

'and in rudimentary forms in galliformes ( but fully developed in cracidae ) possess a penis, which is never present in neoaves. the length is thought to be related to sperm competition. when not copulating, it is hidden within the proctodeum compartment within the cloaca, just inside the vent. the digestive system of birds is unique, with a crop for storage and a gizzard that contains swallowed stones for grinding food to compensate for the lack of teeth. most birds are highly adapted for rapid digestion to aid with flight. some migratory birds have adapted to use protein from many parts of their bodies, including protein from the intestines, as additional energy during migration.'

In [146]:
offset_idx = 0
sequence_ids = tttt.sequence_ids(offset_idx)
# find start index of context segment
context_segment_idx = sequence_ids.index(1)
# Decode split into a string
decoded_split = tokenizer.decode(tttt.input_ids[offset_idx])
print(decoded_split)

[CLS] what aids birds with flight? [SEP] like the reptiles, birds are primarily uricotelic, that is, their kidneys extract nitrogenous waste from their bloodstream and excrete it as uric acid instead of urea or ammonia through the ureters into the intestine. birds don't have a urinary bladder or external urethral opening and ( with exception of the ostrich ) uric acid is excreted along with feces as a semisolid waste. however, birds such as hummingbirds can be facultatively ammonotelic, excreting most of the nitrogenous wastes as ammonia. they also excrete creatine, rather than creatinine like mammals. this material, as well as the output of the intestines, emerges from the bird's cloaca. the cloaca is a multi - purpose opening : waste is expelled through it, most birds mate by joining cloaca, and females lay eggs from it. in addition, many species of birds regurgitate pellets. males within palaeognathae ( with the exception of the kiwis ), the anseriformes ( with the exception of scre

In [75]:
print(len(decoded_split.split()))

259


In [28]:
df[df.paragraph_id == "418_19_57312110a5e9cc1400cdbc57_0"]

Unnamed: 0,paragraph_id,question_id,answer_id,answer_start,answer_text,question_text,tokenizer_answer_start,tokenizer_answer_end
83178,418_19_57312110a5e9cc1400cdbc57_0,57312110a5e9cc1400cdbc57,0,1452,rapid digestion,What aids birds with flight?,363,366


In [86]:
df[df.paragraph_id == "418_19_57312110a5e9cc1400cdbc57_1"]

Unnamed: 0,paragraph_id,question_id,answer_id,answer_start,answer_text,question_text,tokenizer_answer_start,tokenizer_answer_end
83129,418_19_57312110a5e9cc1400cdbc57_1,57312110a5e9cc1400cdbc57,0,1452,rapid digestion,What aids birds with flight?,116,119


In [26]:
tttt = tokenizer_fn_train("What aids birds with flight?", paragraphs_mapper["418_19_57312110a5e9cc1400cdbc57_0"])

In [32]:
outs_mod = model(torch.tensor(tttt["input_ids"]).to(device), torch.tensor(tttt["attention_mask"]).to(device))

In [35]:
SpanExtractor.extract_most_probable(outs_mod[0], outs_mod[1])

(tensor([2]), tensor([184]))

In [34]:
outs_mod[0].shape

torch.Size([1, 384])

In [27]:
tttt

{'input_ids': [[101, 2054, 8387, 5055, 2007, 3462, 1029, 102, 2066, 1996, 20978, 1010, 5055, 2024, 3952, 24471, 11261, 9834, 2594, 1010, 2008, 2003, 1010, 2037, 14234, 2015, 14817, 14114, 3560, 5949, 2013, 2037, 2668, 21422, 1998, 4654, 16748, 2618, 2009, 2004, 24471, 2594, 5648, 2612, 1997, 24471, 5243, 2030, 25874, 2083, 1996, 24471, 15141, 2015, 2046, 1996, 20014, 4355, 3170, 1012, 5055, 2123, 1005, 1056, 2031, 1037, 24471, 3981, 2854, 24176, 2030, 6327, 24471, 11031, 7941, 3098, 1998, 1006, 2007, 6453, 1997, 1996, 9808, 12412, 2232, 1007, 24471, 2594, 5648, 2003, 4654, 16748, 3064, 2247, 2007, 10768, 9623, 2004, 1037, 4100, 19454, 3593, 5949, 1012, 2174, 1010, 5055, 2107, 2004, 20364, 12887, 2064, 2022, 6904, 10841, 24458, 25499, 2572, 8202, 12184, 10415, 1010, 4654, 16748, 3436, 2087, 1997, 1996, 14114, 3560, 5949, 2015, 2004, 25874, 1012, 2027, 2036, 4654, 16748, 2618, 13675, 5243, 10196, 1010, 2738, 2084, 13675, 5243, 7629, 3170, 2066, 11993, 1012, 2023, 3430, 1010, 2004, 2092, 

In [83]:
#tttt["input_ids"][1]

In [61]:
len(paragraphs_mapper["418_19_57312110a5e9cc1400cdbc57_0"].split())

260

In [60]:
paragraphs_mapper["418_19_57312110a5e9cc1400cdbc57_1"]

'and in rudimentary forms in galliformes ( but fully developed in cracidae ) possess a penis, which is never present in neoaves. the length is thought to be related to sperm competition. when not copulating, it is hidden within the proctodeum compartment within the cloaca, just inside the vent. the digestive system of birds is unique, with a crop for storage and a gizzard that contains swallowed stones for grinding food to compensate for the lack of teeth. most birds are highly adapted for rapid digestion to aid with flight. some migratory birds have adapted to use protein from many parts of their bodies, including protein from the intestines, as additional energy during migration.'

In [53]:
df.iloc[83128]

paragraph_id              418_19_57312110a5e9cc1400cdbc57_0
question_id                        57312110a5e9cc1400cdbc57
answer_id                                                 0
answer_start                                           1452
answer_text                                 rapid digestion
question_text                  What aids birds with flight?
tokenizer_answer_start                                  363
tokenizer_answer_end                                    366
Name: 83128, dtype: object

In [93]:
#failed['input_ids'][5]

In [46]:
for failed_id in failed['input_ids']:
    print(len(failed_id))

1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1


In [35]:
torch.tensor(failed['input_ids'])

ValueError: expected sequence of length 1 at dim 1 (got 2)

In [24]:
history = {"train_loss": [], "train_acc_start": [], "train_acc_end": []}
loop_start = timer()
# lr scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5, threshold=0.01)
for epoch in range(50):
    train_dict = train_step(model, optimizer, loss_function, train_data_loader, device=device)
    #eval_results = evaluate_model_on_data(model, evaluator, train_data_loader, paragraphs_mapper, device, debug=True)
    cur_lr = optimizer.param_groups[0]['lr']
    print(f'Epoch: {epoch}, lr: {cur_lr}, Train loss: {train_dict["loss"]:.4f},  Train acc start: {train_dict["accuracy_start"]:.4f}, Train acc end: {train_dict["accuracy_end"]:.4f}, Time: {train_dict["time"]:.4f}')
    history["train_loss"].append(train_dict["loss"]);history["train_acc_start"].append(train_dict["accuracy_start"]);history["train_acc_end"].append(train_dict["accuracy_end"]);
    #history["val_loss"].append(val_dict["loss"]);history["val_acc"].append(val_dict["accuracy"]);
    #scheduler.step(val_dict["loss"])
    #print(f"Evaluation Results: {eval_results}")
loop_end = timer()
print(f"Elapsed time: {(loop_end - loop_start):.4f}")

Epoch: 0, lr: 0.01, Train loss: 11.9244,  Train acc start: 0.0037, Train acc end: 0.0015, Time: 2007.7466


KeyboardInterrupt: 

In [38]:
"""
NOTE: this logic is used for sample creation only, such that each sample is "short enough" for BERT; 
      a duplicate of this logic will need to be used in QADataset Dataloader class when we'll take
      short samples' text, tokenize them again, and find the correct index
ALTERNATIVE: for BERT models we could directly get the answer spans, and pass them in dataframe to another QADataset
             built specifically for BERT, that will just take the data from dataframe (way nicer and faster solution).
SUGGESTION: we could also use specific dict keys and in QADataset pick stuff from these keys: 
                - if these keys are absent then don't use BERT logic (eg span_start and span_end) and use previous logic
                - if these keys are present, then just use them and gather the BERT samples.
                Call these keys like "tokenizer_span_idx" (to make them kinda unique)
"""

'\nNOTE: this logic is used for sample creation only, such that each sample is "short enough" for BERT; \n      a duplicate of this logic will need to be used in QADataset Dataloader class when we\'ll take\n      short samples\' text, tokenize them again, and find the correct index\nALTERNATIVE: for BERT models we could directly get the answer spans, and pass them in dataframe to another QADataset\n             built specifically for BERT, that will just take the data from dataframe (way nicer and faster solution).\nSUGGESTION: we could also use specific dict keys and in QADataset pick stuff from these keys: \n                - if these keys are absent then don\'t use BERT logic (eg span_start and span_end) and use previous logic\n                - if these keys are present, then just use them and gather the BERT samples.\n                Call these keys like "tokenizer_span_idx" (to make them kinda unique)\n'

In [5]:
paragraphs_mapper, df = build_mappers_and_dataframe(data, limit_answers=1)
print(paragraphs_mapper[next(iter(paragraphs_mapper))])
df.head()

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


Unnamed: 0,paragraph_id,question_id,answer_id,answer_start,answer_text,question_text
0,0_0,5733be284776f41900661182,0,515,Saint Bernadette Soubirous,To whom did the Virgin Mary allegedly appear i...
1,0_0,5733be284776f4190066117f,0,188,a copper statue of Christ,What is in front of the Notre Dame Main Building?
2,0_0,5733be284776f41900661180,0,279,the Main Building,The Basilica of the Sacred heart at Notre Dame...
3,0_0,5733be284776f41900661181,0,381,a Marian place of prayer and reflection,What is the Grotto at Notre Dame?
4,0_0,5733be284776f4190066117e,0,92,a golden statue of the Virgin Mary,What sits on top of the Main Building at Notre...


In [6]:
def preprocess_text(text_dict: Dict[str, Any], text_key: Union[str, None] = None) -> Any:
    text_dict = copy.deepcopy(text_dict)
    # just tokenize and remove punctuation for now
    # TODO: add better punctuation removal later
    tokenizer = SpaceTokenizer()#TreebankWordTokenizer()
    for key in text_dict.keys():
        if text_key is not None:
            text = tokenizer.tokenize(text_dict[key][text_key])
            text_dict[key][text_key] = text
        else:
            text = tokenizer.tokenize(text_dict[key])
            text_dict[key] = text
    return text_dict

In [7]:
paragraphs_mapper = preprocess_text(paragraphs_mapper)
df['question_text'] = df.apply(lambda row: nltk.word_tokenize(row['question_text']), axis=1)

In [8]:
# Extend the paragraphs mapper to include spans
paragraphs_spans_mapper = add_paragraphs_spans(paragraphs_mapper)

In [9]:
print(paragraphs_spans_mapper['0_0']['text'])
print(paragraphs_spans_mapper['0_0']['spans'])

['Architecturally,', 'the', 'school', 'has', 'a', 'Catholic', 'character.', 'Atop', 'the', 'Main', "Building's", 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'Virgin', 'Mary.', 'Immediately', 'in', 'front', 'of', 'the', 'Main', 'Building', 'and', 'facing', 'it,', 'is', 'a', 'copper', 'statue', 'of', 'Christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', '"Venite', 'Ad', 'Me', 'Omnes".', 'Next', 'to', 'the', 'Main', 'Building', 'is', 'the', 'Basilica', 'of', 'the', 'Sacred', 'Heart.', 'Immediately', 'behind', 'the', 'basilica', 'is', 'the', 'Grotto,', 'a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection.', 'It', 'is', 'a', 'replica', 'of', 'the', 'grotto', 'at', 'Lourdes,', 'France', 'where', 'the', 'Virgin', 'Mary', 'reputedly', 'appeared', 'to', 'Saint', 'Bernadette', 'Soubirous', 'in', '1858.', 'At', 'the', 'end', 'of', 'the', 'main', 'drive', '(and', 'in', 'a', 'direct', 'line', 'that', 'connects', 'through', '3', 'statues', 'and', 'the', 'Gold', 'Dome),

In [10]:
len(df)

87599

### DataConverter and CustomQADataset

In [95]:
from data_loading.utils import DataConverter, padder_collate_fn
from data_loading.qa_dataset import CustomQADataset

data_converter = DataConverter(embedding_model, paragraphs_spans_mapper)
datasetQA = CustomQADataset(data_converter, df, paragraphs_mapper)
data_loader = torch.utils.data.DataLoader(datasetQA, collate_fn = padder_collate_fn, batch_size=10, shuffle=True)

test_batch = next(iter(data_loader))
print(test_batch["paragraph_emb"].shape)
print(test_batch["y_gt"].shape)

NameError: name 'paragraphs_spans_mapper' is not defined

# Model train

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from timeit import default_timer as timer

from models.utils import SpanExtractor

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"The device is {device}")

The device is cuda


Model:

(paragraph_emb, question_emb) -> (answer_start, answer_end) // for each token in paragraph_emb

In [18]:
def train_step(model, optimizer, loss_function, dataloader, device="cpu"):
    acc_loss = 0
    acc_start_accuracy = 0
    acc_end_accuracy = 0
    count = 0

    time_start = timer()
    
    model.train()
    for batch in dataloader:
        input_ids_in = batch["input_ids"]
        atten_mask_in = batch["attention_mask"]
        answer_spans_start = batch["y_gt"][:, 0]
        answer_spans_end = batch["y_gt"][:, 1]
        # Clear gradients
        model.zero_grad()
        # Place to right device
        input_ids_in = input_ids_in.to(device)
        atten_mask_in = atten_mask_in.to(device)
        answer_spans_start = answer_spans_start.to(device)
        answer_spans_end = answer_spans_end.to(device)
        # Run forward pass
        pred_answer_start_scores, pred_answer_end_scores = model(input_ids_in, atten_mask_in)
        # Compute the CrossEntropyLoss
        loss = loss_function(pred_answer_start_scores, answer_spans_start) + loss_function(pred_answer_end_scores, answer_spans_end)
        # Compute gradients
        loss.backward()
        # Optimizer step
        optimizer.step()
        # --- Compute metrics ---
        # Get span indexes
        pred_span_start_idxs, pred_span_end_idxs = SpanExtractor.extract_most_probable(pred_answer_start_scores, pred_answer_end_scores)
        gt_start_idxs = answer_spans_start.cpu().detach()
        gt_end_idxs = answer_spans_end.cpu().detach()
        # two accs
        start_accuracy = torch.sum(gt_start_idxs == pred_span_start_idxs) / len(pred_span_start_idxs)
        end_accuracy = torch.sum(gt_end_idxs == pred_span_end_idxs) / len(pred_span_end_idxs)
        # Gather stats
        acc_loss += loss.item()
        acc_start_accuracy += start_accuracy.item()
        acc_end_accuracy += end_accuracy.item()
        count += 1
    time_end = timer()
    return {
        "loss": acc_loss / count, 
        "accuracy_start": acc_start_accuracy / count, 
        "accuracy_end": acc_end_accuracy / count,
        "time": time_end - time_start
    }

In [19]:
# create Evaluator object
evaluator = Evaluator(documents_list=data)

In [20]:
def evaluate_model_on_data(model, evaluator, dataloader, paragraphs_mapper, device, debug=False):
    eval_dict = build_evaluation_dict(model, dataloader, paragraphs_mapper, device)
    if debug:
        print(f"DEBUG: Eval_dict: {eval_dict}")
    stats = {}
    stats['exact_match'] = evaluator.ExactMatch(eval_dict)
    stats['f1'] = evaluator.F1(eval_dict)
    return stats

In [17]:
class WeightedSum(nn.Module):
    def __init__(self, input_dim):
        """
        General idea, given a random dummy weights vector, 
        learn to weight it based on query
        """
        super(WeightedSum, self).__init__()
        self.weights = nn.Parameter(torch.randn(input_dim))

    def forward(self, input_emb, mask=None):
        # TODO: if needed, implement time masking
        batch, timesteps, embed_dim = input_emb.shape
        # w dot q_j
        dot_prods = torch.matmul(input_emb, self.weights)
        # exp(w dot q_j)
        exp_prods = torch.exp(dot_prods)
        # normalization factor
        sum_exp_prods = torch.sum(exp_prods, dim=1)
        sum_exp_prods = sum_exp_prods.repeat(timesteps, 1).T
        # b_j
        b = exp_prods / sum_exp_prods
        # q (embedding) = sum_t(b_t * q_t)
        b_scal_q = input_emb * b[:, :, None]
        # now sum along correct axis
        q = torch.sum(b_scal_q, axis=1)
        return q

**Compatibility functions**

**Multiplicative (dot)**:

p = paragraph emb shape: [B, T, E] (Query)

q = question weighted shape: [B, E] reshaped to [B, E, 1] (Keys)

scores = p @ q (of shape: [B, T, 1])

**General bilinear**:

p = paragraph emb shape: [B, T, Ep] (Query)

q = question weighted shape: [B, Eq] reshaped to [B, Eq, 1] (Keys)

W = parameter matrix of shape: [Ep, Eq]

scores = p @ W @ q (of shape: [B, T, 1])

In [18]:
class BilinearCompatibility(nn.Module):
    def __init__(self, query_dim, keys_dim):
        """
        Perform bilinear compatibility f(q, K) = q.T @ W @ K
        Recall: multiplicative/dot compatibility is f(q, K) = q.T @ K
        
        Where: 
            q -> embedded paragraphs (p in DrQA)
            K -> embedded question (q in DrQA)
        """
        super(BilinearCompatibility, self).__init__()
        self.weights = nn.Parameter(torch.randn(query_dim, keys_dim))

    def forward(self, query, keys):
        """
        query: batch of shape (batch, seq_len, query_dim) (Query)
        keys = batch of shape (batch, key_dim) which will be reshaped into [batch, key_dim, 1] (Keys)
        """
        return query @ self.weights @ keys[:, :, None]

In [19]:
class LSTM_QA(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, tagset_size):
        super(LSTM_QA, self).__init__()
        self.tagset_size = tagset_size
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.paragraph_embedder = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.question_embedder = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.weighted_sum = WeightedSum(hidden_dim * 2)
        # used to compute similarity scores
        self.general_bilinear_start = BilinearCompatibility(hidden_dim * 2, hidden_dim * 2)
        self.general_bilinear_end = BilinearCompatibility(hidden_dim * 2, hidden_dim * 2)
        # to classify from similarity to prob of start and prob of end
        self.sim_to_start = nn.Linear(1, 1) # given a similarity score, predict P(start)
        self.sim_to_end = nn.Linear(1, 1) # given a similarity score, predict P(end)

    def forward(self, paragraphs, questions):
        batch_size, seq_len, n_feat = paragraphs.shape
        # As we assume batch_first true, then our sentence_embeddings will have correct shape
        paragraphs_seq_emb, _ = self.paragraph_embedder(paragraphs) # (batch, seq_len, n_feats * n_dirs)
        questions_seq_emb, _ = self.question_embedder(questions) # (batch, seq_len, n_feats * n_dirs)
        # weighted sum
        questions_state_repr = self.weighted_sum(questions_seq_emb)
        #return paragraphs_seq_emb, questions_state_repr
        # compute similarities -> (batch, timestep, 1)
        similarities_start = self.general_bilinear_start(paragraphs_seq_emb, questions_state_repr)
        similarities_end = self.general_bilinear_start(paragraphs_seq_emb, questions_state_repr)
        #print(f"INSIDE MODEL: similarities shape: {similarities.shape}") #DEBUG
        # --- Given a similarity score, predict P(start), P(end) ---
        # similarities flattened
        similarities_start = similarities_start.contiguous()
        similarities_start = similarities_start.view(-1, 1) # as similarity dim is 1 -> viewed shape is (batch*timestep, 1)
        start_scores = self.sim_to_start(similarities_start)
        start_logits = start_scores.view(batch_size, seq_len) # P(start)
        
        similarities_end = similarities_end.contiguous()
        similarities_end = similarities_end.view(-1, 1) # as similarity dim is 1 -> viewed shape is (batch*timestep, 1)
        end_scores = self.sim_to_end(similarities_end)
        end_logits = end_scores.view(batch_size, seq_len) # P(end)
        
        # if we view each sequence of tokens as a feature vector
        # we can interpret the start/end assignation problem as 
        # a classification with a variable number of classes
        # thus assume that our model outputs logits that will just be passed
        # to a softmax, to build a probable distribution of the start token
        return start_logits, end_logits

In [20]:
#torch.nn.functional.softmax(outs_mod[0])

In [30]:
# Define baseline model
model = LSTM_QA(embedding_model.vector_size, 128, 10).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001, amsgrad=True)

In [31]:
data_converter = DataConverter(embedding_model, paragraphs_spans_mapper)
datasetQA = CustomQADataset(data_converter, df, paragraphs_mapper)

In [32]:
train_data_loader = torch.utils.data.DataLoader(datasetQA, collate_fn = padder_collate_fn, batch_size=64, shuffle=True)

In [33]:
history = {"train_loss": [], "train_acc_start": [], "train_acc_end": []}
loop_start = timer()
# lr scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5, threshold=0.01)
for epoch in range(50):
    train_dict = train_step(model, optimizer, loss_function, train_data_loader, device=device)
    eval_results = evaluate_model_on_data(model, evaluator, train_data_loader, paragraphs_mapper, device, debug=True)
    cur_lr = optimizer.param_groups[0]['lr']
    print(f'Epoch: {epoch}, lr: {cur_lr}, Train loss: {train_dict["loss"]:.4f},  Train acc start: {train_dict["accuracy_start"]:.4f}, Train acc end: {train_dict["accuracy_end"]:.4f}, Time: {train_dict["time"]:.4f}')
    history["train_loss"].append(train_dict["loss"]);history["train_acc_start"].append(train_dict["accuracy_start"]);history["train_acc_end"].append(train_dict["accuracy_end"]);
    #history["val_loss"].append(val_dict["loss"]);history["val_acc"].append(val_dict["accuracy"]);
    #scheduler.step(val_dict["loss"])
    print(f"Evaluation Results: {eval_results}")
loop_end = timer()
print(f"Elapsed time: {(loop_end - loop_start):.4f}")

Might fail ['When', 'the', 'news', 'arrived', 'in', 'England', 'it', 'caused', 'an', 'outcry.', 'In', 'response,', 'a', 'combined', 'bounty', 'of', '£1,000', 'was', 'offered', 'for', "Every's", 'capture', 'by', 'the', 'Privy', 'Council', 'and', 'East', 'India', 'Company,', 'leading', 'to', 'the', 'first', 'worldwide', 'manhunt', 'in', 'recorded', 'history.', 'The', 'plunder', 'of', "Aurangzeb's", 'treasure', 'ship', 'had', 'serious', 'consequences', 'for', 'the', 'English', 'East', 'India', 'Company.', 'The', 'furious', 'Mughal', 'Emperor', 'Aurangzeb', 'ordered', 'Sidi', 'Yaqub', 'and', 'Nawab', 'Daud', 'Khan', 'to', 'attack', 'and', 'close', 'four', 'of', 'the', "company's", 'factories', 'in', 'India', 'and', 'imprison', 'their', 'officers,', 'who', 'were', 'almost', 'lynched', 'by', 'a', 'mob', 'of', 'angry', 'Mughals,', 'blaming', 'them', 'for', 'their', "countryman's", 'depredations,', 'and', 'threatened', 'to', 'put', 'an', 'end', 'to', 'all', 'English', 'trading', 'in', 'India.'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch: 0, lr: 0.01, Train loss: 8.3310,  Train acc start: 0.0423, Train acc end: 0.0603, Time: 398.3827
Evaluation Results: {'exact_match': 5.293439422824462, 'f1': 9.209568529959142}


KeyboardInterrupt: 