In [1]:
import pandas as pd
import numpy as np
import spacy
import json,re
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.append('../..')

# Dataset
Download dataset from [here](https://rajpurkar.github.io/SQuAD-explorer/) and save it under the `offline_training/data/raw/squad` folder.

Task: Question and Answering

Dataset: SQuAD2.0 - The Stanford Question Answering Dataset 


In [3]:
from offline_training.utils.preprocessing_squad import *

### Load Data into DataFrames

In [4]:
train_dataset= load_json('../data/raw/squad/train-v2.0.json')
validation_dataset= load_json('../data/raw/squad/dev-v2.0.json') 

Length of data:  442
Keys of data:  dict_keys(['title', 'paragraphs'])
Title of data:  Beyoncé 

Length of data:  35
Keys of data:  dict_keys(['title', 'paragraphs'])
Title of data:  Normans 



In [5]:
train_data_list=parse_data(train_dataset['data'])
val_data_list=parse_data(validation_dataset['data'])

In [6]:
train_df=pd.DataFrame(train_data_list)
val_df=pd.DataFrame(val_data_list)
train_df.context=train_df.context.apply(remove_redundant_symbols)
val_df.context=val_df.context.apply(remove_redundant_symbols)

In [7]:
train_df.context.head()

0    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
Name: context, dtype: object

### Build a word vocabulary

In [8]:
agg_txt_list=aggregate_text([train_df,val_df])

In [9]:
agg_txt_list[1]

'Following the disbandment of Destiny\'s Child in June 2005, she released her second solo album, B\'Day (2006), which contained hits "Déjà Vu", "Irreplaceable", and "Beautiful Liar". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for "Single Ladies (Put a Ring on It)". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploratio

In [10]:
word2idx, idx2word, word_vocabulary=create_word_vocabulary(agg_txt_list)

raw-vocab length: 107739
final vocab length: 107741
word2idx-length: 107741


In [None]:
word2idx 

In [None]:
idx2word

In [None]:
word_vocabulary

In [None]:

def cq_to_id_converter(text, word2idx):
    '''
    Converts context and questions text to their respective ids by mapping each word
    using word2idx. Input text is tokenized using spacy tokenizer first.

    :param str text: context or question text to be converted
    :param dict word2idx: word to id mapping
    :returns list text_ids: list of mapped ids

    :raises assertion error: sanity check

    '''

    text_tokens = [w.text for w in nlp(text, disable=['parser', 'tagger', 'ner', 'lemmatizer'])]
    text_ids = [word2idx[word] for word in text_tokens]

    assert len(text_ids) == len(text_tokens)
    return text_ids

In [None]:
# numericalize context and questions for training and validation set
train_df['context_ids'] = train_df.context.apply(cq_to_id_converter, word2idx=word2idx)
train_df['question_ids'] = train_df.question.apply(cq_to_id_converter,  word2idx=word2idx)


In [None]:

val_df['context_ids'] = val_df.context.apply(cq_to_id_converter, word2idx=word2idx)
val_df['question_ids'] = val_df.question.apply(cq_to_id_converter,  word2idx=word2idx)

In [14]:
word_vocabulary

['<unk>',
 '<pad>',
 'the',
 ',',
 'of',
 '.',
 '?',
 'and',
 'in',
 'to',
 'a',
 'What',
 'is',
 'was',
 '"',
 '-',
 'The',
 'as',
 'for',
 "'s",
 'that',
 'by',
 '(',
 ')',
 'with',
 'on',
 'did',
 'are',
 'from',
 'what',
 'were',
 'In',
 'which',
 'be',
 'or',
 'an',
 'at',
 'have',
 'many',
 'it',
 'How',
 'Who',
 'has',
 'not',
 'their',
 'his',
 'first',
 'its',
 'had',
 'also',
 'When',
 'other',
 'one',
 'most',
 ';',
 'does',
 'used',
 'year',
 'can',
 'this',
 'been',
 'but',
 'more',
 'such',
 'city',
 'into',
 'who',
 'two',
 'Which',
 'do',
 'than',
 'they',
 "'",
 'time',
 'between',
 'all',
 'during',
 'after',
 'name',
 'he',
 'some',
 'when',
 'century',
 'people',
 '%',
 'Where',
 ':',
 'would',
 'state',
 'known',
 'use',
 'over',
 'called',
 'new',
 'only',
 'United',
 'about',
 'New',
 'A',
 'This',
 'system',
 'there',
 'years',
 'It',
 'government',
 'part',
 'made',
 'type',
 'population',
 'up',
 'may',
 'American',
 'much',
 'world',
 'these',
 'States',
 'wh

In [15]:

def cq_to_id_converter(text, word2idx):
    '''
    Converts context and questions text to their respective ids by mapping each word
    using word2idx. Input text is tokenized using spacy tokenizer first.

    :param str text: context or question text to be converted
    :param dict word2idx: word to id mapping
    :returns list text_ids: list of mapped ids

    :raises assertion error: sanity check

    '''

    text_tokens = [w.text for w in nlp(text, disable=['parser', 'tagger', 'ner', 'lemmatizer'])]
    text_ids = [word2idx[word] for word in text_tokens]

    assert len(text_ids) == len(text_tokens)
    return text_ids

In [17]:
# numericalize context and questions for training and validation set
train_df['context_ids'] = train_df.context.apply(cq_to_id_converter, word2idx=word2idx)
train_df['question_ids'] = train_df.question.apply(cq_to_id_converter,  word2idx=word2idx)


In [18]:

val_df['context_ids'] = val_df.context.apply(cq_to_id_converter, word2idx=word2idx)
val_df['question_ids'] = val_df.question.apply(cq_to_id_converter,  word2idx=word2idx)

In [None]:
# get indices with tokenization errors and drop those indices 

train_err = get_error_indices(train_df, idx2word)
valid_err = get_error_indices(valid_df, idx2word)

train_df.drop(train_err, inplace=True)
valid_df.drop(valid_err, inplace=True)

In [None]:
# get start and end positions of answers from the context
# this is basically the label for training QA models

train_label_idx = train_df.apply(index_answer, axis=1, idx2word=idx2word)
valid_label_idx = valid_df.apply(index_answer, axis=1, idx2word=idx2word)

train_df['label_idx'] = train_label_idx
valid_df['label_idx'] = valid_label_idx

### Build a character vocabulary

In [None]:
char_vocabulary=create_char_vocabulary(agg_txt_list)