In [None]:
import json
import re

In [None]:
def load_json(path):
    """Parse the JSON document stored at ``path`` and return the decoded object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
def parse_data(data):
    """Flatten a nested QA dataset into one record per (question, answer) pair.

    Walks ``data['data']`` -> ``'paragraphs'`` -> ``'qas'`` -> ``'answers'`` and
    returns a list of dicts with the keys ``id``, ``context``, ``question``,
    ``label`` and ``answer``. ``label`` is ``[start, end]`` where ``start`` is
    the answer's ``answer_start`` and ``end`` is ``start + len(answer text)``.
    A question whose ``answers`` list is empty contributes no record.
    """
    records = []
    for article in data['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                qa_id, query = qa['id'], qa['question']
                for candidate in qa['answers']:
                    text = candidate['text']
                    begin = candidate['answer_start']
                    records.append({
                        'id': qa_id,
                        'context': passage,
                        'question': query,
                        # Character span of the answer: [start, end).
                        'label': [begin, begin + len(text)],
                        'answer': text,
                    })
    return records

In [None]:
def text_to_ids(text, word2idx, nlp):
    """Tokenize ``text`` with ``nlp`` and map each token to a vocabulary id.

    Args:
        text: string handed to ``nlp``.
        word2idx: mapping indexed by token text (``word2idx[token.text]``).
        nlp: callable returning an iterable of tokens that expose ``.text``,
            ``.pos`` and ``.ent_type`` (presumably a spaCy pipeline -- confirm).

    Returns:
        A pair ``(ids, features)``: ``ids`` is the list of vocabulary ids and
        ``features`` the list of ``(pos, ent_type)`` tuples, both in token order.
    """
    words = []
    features = []
    for token in nlp(text):
        words.append(token.text)
        features.append((token.pos, token.ent_type))
    # Vocabulary lookup happens only after every token has been read.
    ids = [word2idx[word] for word in words]
    return ids, features

In [None]:
def index_answer(row, idx2word, nlp):
    """Convert a row's character-level answer span into token indices.

    The context is tokenized with ``nlp`` and the offsets in ``row.label`` are
    matched against token boundaries.

    NOTE(review): a later cell defines ``index_answer`` again (with extra
    token checks); on a top-to-bottom run that definition replaces this one.

    Args:
        row: object exposing ``.context`` (str) and ``.label`` (a
            ``[answer_start, answer_end]`` pair of character offsets).
        idx2word: unused here; kept so the signature is unchanged.
        nlp: tokenizer called as ``nlp(text, disable=[...])``; its tokens must
            expose ``.idx`` and ``.text``.

    Returns:
        ``[start_idx, end_idx]``: index of the token that starts at
        ``answer_start`` and index of the token that ends at ``answer_end``.

    Raises:
        ValueError: if either offset does not coincide with a token boundary
            (or the context yields no tokens).
    """
    spans = [(tok.idx, tok.idx + len(tok.text))
             for tok in nlp(row.context, disable=['parser', 'tagger', 'ner'])]
    starts, ends = zip(*spans)
    answer_start, answer_end = row.label
    # The answer text is deliberately not tokenized: nothing here consumes
    # its tokens, and doing so only cost an extra pipeline pass.
    return [starts.index(answer_start), ends.index(answer_end)]

In [None]:
def normalize_spaces(text):
    """Replace every whitespace character in ``text`` with a plain space.

    The pattern matches one whitespace character at a time and each match is
    replaced by exactly one space, so runs are not collapsed and the length
    of the string is unchanged.
    """
    whitespace = re.compile(r'\s')
    return whitespace.sub(' ', text)

In [None]:
def test_indices(df, idx2word,nlp):
    start_value_error = []
    end_value_error = []
    assert_error = []
    for index, row in df.iterrows():

        answer_tokens = [w.text for w in nlp(row['answer'], disable=['parser','tagger','ner'])]

        start_token = answer_tokens[0]
        end_token = answer_tokens[-1]
        
        context_span  = [(word.idx, word.idx + len(word.text)) 
                         for word in nlp(row['context'], disable=['parser','tagger','ner'])]

        starts, ends = zip(*context_span)

        answer_start, answer_end = row['label']

        try:
            start_idx = starts.index(answer_start)
        except:
            start_value_error.append(index)
        try:
            end_idx  = ends.index(answer_end)
        except:
            end_value_error.append(index)

        try:
            assert idx2word[row['context_ids'][0][start_idx]] == answer_tokens[0]
            assert idx2word[row['context_ids'][0][end_idx]] == answer_tokens[-1]
        except:
            assert_error.append(index)


    return start_value_error, end_value_error, assert_error



def get_error_indices(df, idx2word, nlp):
    """Return the set of row indices that ``test_indices`` reports in any list.

    The three lists returned by ``test_indices(df, idx2word, nlp)`` are
    concatenated and de-duplicated.
    """
    start_errs, end_errs, mismatch_errs = test_indices(df, idx2word, nlp)
    return set(start_errs + end_errs + mismatch_errs)

In [None]:
def index_answer(row, idx2word, nlp):
    """Convert a row's character-level answer span into verified token indices.

    The context is tokenized with ``nlp`` and the offsets in ``row.label`` are
    matched against token boundaries. The context token ids at the resulting
    positions are then checked against the first and last answer token.

    Args:
        row: object exposing ``.context``, ``.answer``, ``.label`` (a
            ``[answer_start, answer_end]`` pair of character offsets) and
            ``.context_ids`` (only element ``[0]`` is read; presumably the
            id list produced by ``text_to_ids`` -- confirm against caller).
        idx2word: lookup from token id to token text.
        nlp: tokenizer called as ``nlp(text, disable=[...])``; its tokens must
            expose ``.idx`` and ``.text``.

    Returns:
        ``[start_idx, end_idx]``: index of the token that starts at
        ``answer_start`` and index of the token that ends at ``answer_end``.

    Raises:
        ValueError: if an offset does not coincide with a token boundary.
        IndexError: if the answer yields no tokens.
        AssertionError: if the context token at ``start_idx`` / ``end_idx``
            differs from the first / last answer token.
    """
    disabled = ['parser', 'tagger', 'ner']
    spans = [(tok.idx, tok.idx + len(tok.text))
             for tok in nlp(row.context, disable=disabled)]
    starts, ends = zip(*spans)
    answer_start, answer_end = row.label
    start_idx = starts.index(answer_start)
    end_idx = ends.index(answer_end)

    ans_toks = [tok.text for tok in nlp(row.answer, disable=disabled)]
    ans_start = ans_toks[0]
    ans_end = ans_toks[-1]

    # Raise AssertionError explicitly (instead of using `assert`) so the
    # validation is not stripped under `python -O` and carries a message.
    context_ids = row.context_ids[0]
    found_start = idx2word[context_ids[start_idx]]
    if found_start != ans_start:
        raise AssertionError(
            'context token {!r} at index {} does not match first answer token {!r}'
            .format(found_start, start_idx, ans_start))
    found_end = idx2word[context_ids[end_idx]]
    if found_end != ans_end:
        raise AssertionError(
            'context token {!r} at index {} does not match last answer token {!r}'
            .format(found_end, end_idx, ans_end))
    return [start_idx, end_idx]
