In [32]:
import pandas as pd
import numpy as np
import json
import nltk
from nltk.tokenize import word_tokenize

## Load JSON files

In [3]:
# Read both SQuAD splits. JSON is decoded as UTF-8 explicitly (the contexts contain
# non-ASCII characters) and each handle is closed as soon as parsing finishes.
with open('../data/train-v2.0.json', encoding='utf-8') as f_train:
    train = json.load(f_train)
with open('../data/dev-v2.0.json', encoding='utf-8') as f_dev:
    dev = json.load(f_dev)

In [4]:
# Article titles of each split, in file order.
titles_train, titles_dev = (
    [article['title'] for article in split['data']]
    for split in (train, dev)
)

In [5]:
# Number of titles in the train and dev splits.
tuple(len(titles) for titles in (titles_train, titles_dev))

(442, 35)

## Transform JSON to pandas

In [6]:
# Top-level keys of the parsed training JSON.
train.keys()

dict_keys(['version', 'data'])

In [7]:
# Value stored under the 'version' key of the training file.
train['version']

'v2.0'

In [8]:
# Keys of the first entry in train['data'].
train['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [9]:
# Title of the first entry in train['data'].
train['data'][0]['title']

'Beyoncé'

In [10]:
# Keys of the first paragraph of the first entry.
train['data'][0]['paragraphs'][0].keys()

dict_keys(['qas', 'context'])

In [11]:
# Context text of the first paragraph of the first entry.
train['data'][0]['paragraphs'][0]['context']

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [12]:
# Keys of the first 'qas' record attached to that first paragraph.
train['data'][0]['paragraphs'][0]['qas'][0].keys()

dict_keys(['question', 'id', 'answers', 'is_impossible'])

In [13]:
# Question text of the first 'qas' record of the first paragraph.
train['data'][0]['paragraphs'][0]['qas'][0]['question']

'When did Beyonce start becoming popular?'

In [14]:
# Identifier of the first 'qas' record of the first paragraph.
train['data'][0]['paragraphs'][0]['qas'][0]['id']

'56be85543aeaaa14008c9063'

In [15]:
# Answers of the first 'qas' record: a list of dicts with 'text' and 'answer_start'.
train['data'][0]['paragraphs'][0]['qas'][0]['answers']

[{'text': 'in the late 1990s', 'answer_start': 269}]

In [16]:
# 'is_impossible' flag of the first 'qas' record of the first paragraph.
train['data'][0]['paragraphs'][0]['qas'][0]['is_impossible']

False

In [17]:
def turn_json_to_pd(df_json):
    """Flatten a SQuAD-style JSON dict into a DataFrame with one row per question.

    Parameters
    ----------
    df_json : dict
        Parsed JSON whose ``'data'`` list holds entries with a ``'title'`` and a
        ``'paragraphs'`` list. Each paragraph has a ``'context'`` and a ``'qas'``
        list; each qa has ``'question'``, ``'answers'`` and, optionally,
        ``'is_impossible'``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``title``, ``context``, ``question``, ``answer_start``,
        ``text``, ``is_impossible``. Only the first answer of each question is
        kept. A question with an empty answer list gets ``text == ''`` and
        ``answer_start == -1``.

    Raises
    ------
    KeyError
        If a required key (anything except ``'is_impossible'``) is missing.
    """
    columns = {
        "title": [],
        "context": [],
        "question": [],
        "answer_start": [],
        "text": [],
        "is_impossible": [],
    }
    for subject in df_json['data']:
        for para in subject['paragraphs']:
            for qas in para['qas']:
                answers = qas['answers']
                if answers:
                    first_answer = answers[0]
                    text = first_answer['text']
                    start = first_answer['answer_start']
                else:
                    # Sentinels for questions that come without any answer.
                    text = ''
                    start = -1

                columns["title"].append(subject['title'])
                columns["context"].append(para['context'])
                columns["question"].append(qas['question'])
                columns["answer_start"].append(start)
                columns["text"].append(text)
                # Files without the flag (e.g. the v1.1 layout) are treated as
                # answerable instead of raising KeyError.
                columns["is_impossible"].append(qas.get('is_impossible', False))

    df_pandas = pd.DataFrame(columns)

    return df_pandas

In [18]:
# Flatten both splits with the same converter, train first and dev second.
df_pandas_train, df_pandas_dev = map(turn_json_to_pd, (train, dev))

In [19]:
# Store the flag as 0/1 integers in BOTH splits so that train.pkl and dev.pkl
# end up with the same dtype for this column (previously only train was cast).
df_pandas_train['is_impossible'] = df_pandas_train['is_impossible'].astype(int)
df_pandas_dev['is_impossible'] = df_pandas_dev['is_impossible'].astype(int)

In [20]:
# Shape of the training rows whose is_impossible flag equals 1.
df_pandas_train.query('is_impossible == 1').shape

(43498, 6)

In [21]:
# Overall (rows, columns) of the training frame.
df_pandas_train.shape

(130319, 6)

In [22]:
# Shape of the dev rows whose is_impossible flag equals 1.
df_pandas_dev.query('is_impossible == 1').shape

(5945, 6)

In [23]:
# Overall (rows, columns) of the dev frame.
df_pandas_dev.shape

(11873, 6)

In [24]:
# Persist both flattened splits next to the raw JSON files.
df_pandas_train.to_pickle('../data/train.pkl')
df_pandas_dev.to_pickle('../data/dev.pkl')

## Some exploration

In [25]:
# Same check as the earlier cell: shape of the training rows with is_impossible == 1.
df_pandas_train.query('is_impossible == 1').shape

(43498, 6)

In [26]:
# Preview the first rows of the training frame.
df_pandas_train.head()

Unnamed: 0,title,context,question,answer_start,text,is_impossible
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,269,in the late 1990s,0
1,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,207,singing and dancing,0
2,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,526,2003,0
3,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,166,"Houston, Texas",0
4,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,276,late 1990s,0


In [55]:
# Rows with a negative answer_start, i.e. the -1 sentinel that turn_json_to_pd
# writes when a question has an empty answer list.
df_pandas_train.query('answer_start < 0').shape

(43498, 6)

In [38]:
# Interrogative words searched for in each question (capitalised here).
question_words = [
    'What',
    'Which',
    'Who',
    'Where',
    'Why',
    'When',
    'How',
    'Whose',
    'Whom',
]

In [39]:
# Lower-case the question words so they can be compared with lower-cased tokens.
question_words = list(map(str.lower, question_words))

In [40]:
# All training questions as a plain Python list, in row order.
questions = df_pandas_train['question'].tolist()

In [41]:
# Number of training questions (one per row of df_pandas_train).
len(questions)

130319

In [42]:
# Map each question text to the question words it contains.
# NOTE: the dict is keyed by question text, so repeated questions collapse into one entry.
# De-duplicated once, up front, instead of rebuilding a set on every iteration.
ordered_question_words = list(dict.fromkeys(question_words))
questions00 = {}
for e in questions:
    tokens = {word.lower() for word in word_tokenize(e)}
    # Matches are listed in question_words order, so the result is identical on
    # every run (list(set(...)) ordering depends on string hashing).
    questions00[e] = [w for w in ordered_question_words if w in tokens]

In [43]:
# Number of distinct questions that contain exactly 0, 1, 2, 3 and 4 question words.
tuple(
    sum(1 for matched in questions00.values() if len(matched) == k)
    for k in range(5)
)


(1820, 124673, 3693, 31, 0)

In [45]:
# Share of questions with exactly one question word, computed from questions00
# instead of a count pasted from a previous output.
# NOTE: the numerator counts distinct question texts (questions00 is keyed by text),
# while the denominator counts every row in `questions`.
sum(1 for matched in questions00.values() if len(matched) == 1) / len(questions)

0.9566755423230687

In [46]:
# Questions (dict keys) that matched exactly one question word, in insertion order.
questions01 = [text for text, matched in questions00.items() if len(matched) == 1]

In [58]:
# Spot-check one single-keyword question together with the word it matched.
i = 1000
picked_question = questions01[i]
(picked_question, questions00[picked_question])

('What was the date that LIszt first saw Chopin perform?', ['what'])

In [60]:
# questions01 is built from the keys of questions00, so its entries are already
# distinct; this confirms the count.
len(set(questions01))

124673

In [61]:
# Tally, per question word, how many single-keyword questions use it.
dict00 = dict.fromkeys(question_words, 0)
for single_question in questions01:
    keyword = questions00[single_question][0]
    dict00[keyword] = dict00[keyword] + 1

In [64]:
# Display the per-question-word tallies.
dict00

{'what': 74852,
 'which': 8056,
 'who': 13088,
 'where': 5157,
 'why': 1870,
 'when': 7804,
 'how': 12957,
 'whose': 447,
 'whom': 442}

In [65]:
# NOTE(review): stand-alone arithmetic; nothing in the visible notebook uses the
# result — confirm whether this scratch cell is still needed.
256 * 1024 + 1024

263168