In [1]:
import pandas as pd
import numpy as np
import spacy
import json,re
import tqdm

In [2]:
import sys
sys.path.append('../..')

# Dataset
Download dataset from [here](https://rajpurkar.github.io/SQuAD-explorer/) and save it under the `offline_training/data/raw/squad` folder.

Task: Question and Answering

Dataset: SQuAD2.0 - The Stanford Question Answering Dataset 


In [3]:
from offline_training.utils.preprocessing_squad import *

### Load Data into DataFrames

In [4]:
train_dataset= load_json('../data/raw/squad/train-v2.0.json')
validation_dataset= load_json('../data/raw/squad/dev-v2.0.json') 

Length of data:  442
Keys of data:  dict_keys(['title', 'paragraphs'])
Title of data:  Beyoncé 

Length of data:  35
Keys of data:  dict_keys(['title', 'paragraphs'])
Title of data:  Normans 



In [5]:
train_data_list=parse_data(train_dataset['data'])
val_data_list=parse_data(validation_dataset['data'])

In [6]:
train_df=pd.DataFrame(train_data_list)
val_df=pd.DataFrame(val_data_list)
train_df.context=train_df.context.apply(remove_redundant_symbols)
val_df.context=val_df.context.apply(remove_redundant_symbols)

In [7]:
train_df.context.head()

0    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
2    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
3    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
4    Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
Name: context, dtype: object

### Build a word vocabulary

In [8]:
agg_txt_list=aggregate_text([train_df,val_df])

In [9]:
agg_txt_list[1]

'Following the disbandment of Destiny\'s Child in June 2005, she released her second solo album, B\'Day (2006), which contained hits "Déjà Vu", "Irreplaceable", and "Beautiful Liar". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for "Single Ladies (Put a Ring on It)". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploratio

In [10]:
word2idx, idx2word, word_vocabulary=create_word_vocabulary(agg_txt_list)

raw-vocab length: 107739
final vocab length: 107741
word2idx-length: 107741


In [11]:
word2idx 

{'<unk>': 0,
 '<pad>': 1,
 'the': 2,
 ',': 3,
 'of': 4,
 '.': 5,
 '?': 6,
 'and': 7,
 'in': 8,
 'to': 9,
 'a': 10,
 'What': 11,
 'is': 12,
 'was': 13,
 '"': 14,
 '-': 15,
 'The': 16,
 'as': 17,
 'for': 18,
 "'s": 19,
 'that': 20,
 'by': 21,
 '(': 22,
 ')': 23,
 'with': 24,
 'on': 25,
 'did': 26,
 'are': 27,
 'from': 28,
 'what': 29,
 'were': 30,
 'In': 31,
 'which': 32,
 'be': 33,
 'or': 34,
 'an': 35,
 'at': 36,
 'have': 37,
 'many': 38,
 'it': 39,
 'How': 40,
 'Who': 41,
 'has': 42,
 'not': 43,
 'their': 44,
 'his': 45,
 'first': 46,
 'its': 47,
 'had': 48,
 'also': 49,
 'When': 50,
 'other': 51,
 'one': 52,
 'most': 53,
 ';': 54,
 'does': 55,
 'used': 56,
 'year': 57,
 'can': 58,
 'this': 59,
 'been': 60,
 'but': 61,
 'more': 62,
 'such': 63,
 'city': 64,
 'into': 65,
 'who': 66,
 'two': 67,
 'Which': 68,
 'do': 69,
 'than': 70,
 'they': 71,
 "'": 72,
 'time': 73,
 'between': 74,
 'all': 75,
 'during': 76,
 'after': 77,
 'name': 78,
 'he': 79,
 'some': 80,
 'when': 81,
 'century': 8

In [12]:
idx2word

{0: '<unk>',
 1: '<pad>',
 2: 'the',
 3: ',',
 4: 'of',
 5: '.',
 6: '?',
 7: 'and',
 8: 'in',
 9: 'to',
 10: 'a',
 11: 'What',
 12: 'is',
 13: 'was',
 14: '"',
 15: '-',
 16: 'The',
 17: 'as',
 18: 'for',
 19: "'s",
 20: 'that',
 21: 'by',
 22: '(',
 23: ')',
 24: 'with',
 25: 'on',
 26: 'did',
 27: 'are',
 28: 'from',
 29: 'what',
 30: 'were',
 31: 'In',
 32: 'which',
 33: 'be',
 34: 'or',
 35: 'an',
 36: 'at',
 37: 'have',
 38: 'many',
 39: 'it',
 40: 'How',
 41: 'Who',
 42: 'has',
 43: 'not',
 44: 'their',
 45: 'his',
 46: 'first',
 47: 'its',
 48: 'had',
 49: 'also',
 50: 'When',
 51: 'other',
 52: 'one',
 53: 'most',
 54: ';',
 55: 'does',
 56: 'used',
 57: 'year',
 58: 'can',
 59: 'this',
 60: 'been',
 61: 'but',
 62: 'more',
 63: 'such',
 64: 'city',
 65: 'into',
 66: 'who',
 67: 'two',
 68: 'Which',
 69: 'do',
 70: 'than',
 71: 'they',
 72: "'",
 73: 'time',
 74: 'between',
 75: 'all',
 76: 'during',
 77: 'after',
 78: 'name',
 79: 'he',
 80: 'some',
 81: 'when',
 82: 'century

In [13]:
word_vocabulary

['<unk>',
 '<pad>',
 'the',
 ',',
 'of',
 '.',
 '?',
 'and',
 'in',
 'to',
 'a',
 'What',
 'is',
 'was',
 '"',
 '-',
 'The',
 'as',
 'for',
 "'s",
 'that',
 'by',
 '(',
 ')',
 'with',
 'on',
 'did',
 'are',
 'from',
 'what',
 'were',
 'In',
 'which',
 'be',
 'or',
 'an',
 'at',
 'have',
 'many',
 'it',
 'How',
 'Who',
 'has',
 'not',
 'their',
 'his',
 'first',
 'its',
 'had',
 'also',
 'When',
 'other',
 'one',
 'most',
 ';',
 'does',
 'used',
 'year',
 'can',
 'this',
 'been',
 'but',
 'more',
 'such',
 'city',
 'into',
 'who',
 'two',
 'Which',
 'do',
 'than',
 'they',
 "'",
 'time',
 'between',
 'all',
 'during',
 'after',
 'name',
 'he',
 'some',
 'when',
 'century',
 'people',
 '%',
 'Where',
 ':',
 'would',
 'state',
 'known',
 'use',
 'over',
 'called',
 'new',
 'only',
 'United',
 'about',
 'New',
 'A',
 'This',
 'system',
 'there',
 'years',
 'It',
 'government',
 'part',
 'made',
 'type',
 'population',
 'up',
 'may',
 'American',
 'much',
 'world',
 'these',
 'States',
 'wh

In [15]:
# numericalize context and questions for training and validation set
train_df['context_ids'] = train_df.context.apply(cq_to_id_converter, word2idx=word2idx)
train_df['question_ids'] = train_df.question.apply(cq_to_id_converter,  word2idx=word2idx)


In [16]:

val_df['context_ids'] = val_df.context.apply(cq_to_id_converter, word2idx=word2idx)
val_df['question_ids'] = val_df.question.apply(cq_to_id_converter,  word2idx=word2idx)

In [19]:
# get indices with tokenization errors and drop those indices 
train_err = get_error_indices(train_df, idx2word)
train_df.drop(train_err, inplace=True)


Number of error indices: 907


In [20]:
val_err = get_error_indices(val_df, idx2word)
val_df.drop(val_err, inplace=True)

Number of error indices: 197


In [21]:
# get start and end positions of answers from the context
# this is basically the label for training QA models
train_label_idx = train_df.apply(index_answer, axis=1, idx2word=idx2word)
train_df['label_idx'] = train_label_idx


In [22]:
val_label_idx = val_df.apply(index_answer, axis=1, idx2word=idx2word)
val_df['label_idx'] = val_label_idx

## Save processed data to json files
We use JSON for saving the data, due to its serializing speed, readability and suitability and safety for using in combination with other programming languages.

In [23]:
train_df.to_json('../data/processed/squad/train_data.json')
val_df.to_json('../data/processed/squad/val_data.json')

In [24]:
import json 

with open("../data/processed/squad/word2idx.json", "w") as outfile:
    # Serializing json  
    json.dump(word2idx, outfile)



In [25]:

with open("../data/processed/squad/idx2word.json", "w") as outfile:
    # Serializing json  
    json.dump(idx2word, outfile)

In [26]:

with open("../data/processed/squad/word_vocabulary.json", "w") as outfile:
    # Serializing json  
    json.dump(word_vocabulary, outfile)