Cloning the GitHub repository

In [1]:
! git clone https://github.com/amrlnic/SQuAD.git

fatal: destination path 'SQuAD' already exists and is not an empty directory.


Importing of necessary libraries

In [2]:
import json
import os
import io
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import gensim.downloader as gloader
from sklearn.model_selection import train_test_split
import re
import pickle
import tensorflow as tf

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Glove download function

In [3]:
EMBEDDING_SIZE = 300

def download_glove_model(embedding_dimension = 50):

  """
  Load the "glove-wiki-gigaword-<embedding_dimension>" vectors through the
  gensim downloader (`gloader`) and return the loaded model.

  If the downloader raises a ValueError, the sizes this notebook expects
  (50, 100, 200, 300) are printed and the same error is raised again.
  """

  model_name = "glove-wiki-gigaword-{}".format(embedding_dimension)
  print('[INFO] downloading glove {}'.format(embedding_dimension))

  try:
    vectors = gloader.load(model_name)
  except ValueError as error:
    # remind the caller which sizes are expected before propagating the failure
    print("Glove: 50, 100, 200, 300")
    raise error

  print('[INFO] done !')
  return vectors

Extracting the dataset from the GitHub folder

In [4]:

# Be explicit about the encoding: without it `open` falls back to the platform
# default, which is not UTF-8 everywhere and would corrupt non-ASCII text.
with open("SQuAD/data/training_set.json", encoding = "utf-8") as f:
    data = json.load(f)


def load_dataset(file, record_path = ['data', 'paragraphs', 'qas', 'answers'], verbose = True):

  """
  Flatten an already-parsed SQuAD dictionary into a dataframe with one row
  per answer.

  Parameters
  ----------
  file : dict
      The parsed json content (nothing is read from disk here).
  record_path : list of str
      Keys leading from the root of `file` down to the answers.
  verbose : bool
      Print progress and a few statistics about the result.

  Returns
  -------
  pandas.DataFrame with the columns
  'index' (question id), 'question', 'context', 'title', followed by the
  answer fields ('answer_start' and 'text' for SQuAD) and 'c_id', an integer
  identifying each distinct context in order of first appearance.
  """

  if verbose:
      print("Reading the json file")
      print("[INFO] processing...")

  record_path = list(record_path)

  # one dataframe per nesting level of the json file
  answers = pd.json_normalize(file, record_path)
  questions = pd.json_normalize(file, record_path[:-1])
  paragraphs = pd.json_normalize(file, record_path[:-2])
  articles = pd.json_normalize(file, record_path[0])

  # same rows as `paragraphs`, with the title of the enclosing article attached
  titles = pd.json_normalize(file['data'], record_path = ['paragraphs'], meta = 'title')

  # propagate the parent values down to the question / answer level
  questions_per_paragraph = paragraphs['qas'].str.len()
  answers_per_question = questions['answers'].str.len()

  questions['context'] = np.repeat(paragraphs['context'].values, questions_per_paragraph)
  questions['title'] = np.repeat(titles['title'].values, questions_per_paragraph)
  answers['q_idx'] = np.repeat(questions['id'].values, answers_per_question)

  # `axis` has to be passed by keyword: pandas 2.0 made every argument of
  # pd.concat other than `objs` keyword-only.
  main = pd.concat(
      [
          questions[['id', 'question', 'context', 'title']].set_index('id'),
          answers.set_index('q_idx'),
      ],
      axis = 1,
      sort = False,
  )
  # name the index explicitly so that the question id always ends up in 'index'
  main = main.rename_axis('index').reset_index()
  main['c_id'] = main['context'].factorize()[0]

  if verbose:
      print(f"[INFO] there are {main.shape[0]} questions with single answer")
      # nunique() counts the groups without aggregating every other column
      print(f"[INFO] there are {main['c_id'].nunique()} different contexts")
      print(f"[INFO] there are {len(articles)} unrelated subjects")
      print("[INFO] Done")
  return main

squad_dataset = load_dataset(data)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


Examining the head of the dataset

In [5]:
squad_dataset.head()

Unnamed: 0,index,question,context,title,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,92,a golden statue of the Virgin Mary,0


In [6]:
SAMPLES = squad_dataset.shape[0]

#---lowercase and strip the given text---#
def preprocess_sentence(text):
  """
  return `text` in lowercase, without leading / trailing whitespace
  """
  return text.lower().strip()
#----------------------------------------#

#---preprocess the dataset---------------#
def clean_dataset(dataset):

  """
  Return a copy of `dataset` whose 'question', 'text' and 'context' columns
  went through preprocess_sentence. The given dataframe is left untouched.

  The cleaned contexts are attached back by value, so the result is correct
  whatever the row order or the index of `dataset`.

  NOTE: 'answer_start' is not adjusted here, so the character offsets can
  drift whenever stripping removes leading whitespace from a context.
  """

  _dataset = dataset.copy()

  _dataset['question'] = _dataset['question'].apply(preprocess_sentence)
  _dataset['text'] = _dataset['text'].apply(preprocess_sentence)

  #---each distinct context is processed once, then mapped back on its rows---#
  cleaned_by_raw = {raw: preprocess_sentence(raw) for raw in _dataset['context'].unique()}
  _dataset['context'] = _dataset['context'].map(cleaned_by_raw)

  return _dataset
#----------------------------------------#
squad_dataset = clean_dataset(squad_dataset)

Train and validation datasets creation

In [7]:
def get_tokenizer(dataset, glove_model = None):

  """
  Create the word tokenizer and the char tokenizer and fit them on `dataset`.

  The word vocabulary is made of the glove vocabulary followed by the tokens
  of the questions and of the distinct contexts. The char tokenizer is fitted
  on the raw questions and distinct contexts.

  dataset     : dataframe with a 'question' and a 'context' column
  glove_model : already loaded glove vectors; when None, the vectors of size
                EMBEDDING_SIZE are downloaded

  returns (tokenizer, char_tokenizer)
  """

  tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token = 'UNK', filters = '')

  # num_words = 200 restricts the generated sequences to the indices 1..199
  # (otherwise oom issue). Index 1 is the UNK token, so only the 198 most
  # frequent characters are kept, every other character is replaced by UNK.
  char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = True, filters = '', oov_token = 'UNK', num_words = 200)

  if glove_model is None:
    glove_model = download_glove_model(EMBEDDING_SIZE)

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  # gensim >= 4 exposes the vocabulary as `index_to_key`; `index2entity` only
  # exists in older releases, so it is used as a fallback.
  glove_vocabulary = getattr(glove_model, 'index_to_key', None)
  if glove_vocabulary is None:
    glove_vocabulary = glove_model.index2entity

  sequences = list(glove_vocabulary) + tokenized_questions + tokenized_contexts

  del glove_model, glove_vocabulary # we don't need the glove model anymore

  tokenizer.fit_on_texts(sequences)
  char_tokenizer.fit_on_texts(dataset['question'].to_list() + contexts.to_list())

  return tokenizer, char_tokenizer


def update_tokenizer(dataset, tokenizer, char_tokenizer):

  """
  extend the vocabulary of already fitted word / char tokenizers with the
  questions and the distinct contexts of `dataset`

  both tokenizers are fitted again in place, nothing is returned
  """

  questions = dataset['question']
  distinct_contexts = pd.Series(dataset['context'].unique())

  # word level: the tokenizer receives lists of tokens
  word_corpus = questions.apply(word_tokenize).to_list()
  word_corpus = word_corpus + distinct_contexts.apply(word_tokenize).to_list()
  tokenizer.fit_on_texts(word_corpus)

  # char level: the tokenizer receives the raw strings
  char_corpus = questions.to_list() + distinct_contexts.to_list()
  char_tokenizer.fit_on_texts(char_corpus)

def get_start_end(row):

  """
  Add the token span of the answer to `row` and return it.

  row['start'] is the number of tokens found before the answer in the
  context and row['end'] is `start` plus the number of tokens of the answer
  (so `end` is exclusive). Both are set to -1 when the answer cannot be
  found in the context.

  The annotated character offset row['answer_start'] is used whenever the
  answer really is at that position. Otherwise (offset missing, invalid or
  stale) the first occurrence of the answer in the context is used.
  """

  context = row['context']
  answer = row['text']

  # Trust the annotation first: searching the text alone would always pick
  # the first occurrence, which is wrong when the answer string is repeated.
  annotated = row.get('answer_start', -1)
  try:
    annotated = int(annotated)
  except (TypeError, ValueError):
    annotated = -1

  if annotated >= 0 and context.startswith(answer, annotated):
    _start = annotated
  else:
    _start = context.find(answer)

  if _start == -1:
    # the answer is not in the context
    # maybe due to a typo
    row['start'] = -1
    row['end'] = -1
    return row

  # tokens on the left of the answer give the index of its first token
  start = len(word_tokenize(context[:_start]))
  end = start + len(word_tokenize(answer))

  row['start'] = start
  row['end'] = end

  return row

def tokenize(dataset, tokenizer, char_tokenizer):

  """
  return a copy of `dataset` with four extra columns:

  tokenized_question / tokenized_context           : word ids of each sample
  char_tokenized_question / char_tokenized_context : for each sample, one
                                                     list of char ids per word
  """

  question_words = dataset['question'].apply(word_tokenize).to_list()
  context_words = dataset['context'].apply(word_tokenize).to_list()

  # word level
  question_ids = tokenizer.texts_to_sequences(question_words)
  context_ids = tokenizer.texts_to_sequences(context_words)

  # char level: every word of a sample is encoded on its own
  question_chars = [char_tokenizer.texts_to_sequences(words) for words in question_words]
  context_chars = [char_tokenizer.texts_to_sequences(words) for words in context_words]

  return dataset.assign(
      tokenized_question = question_ids,
      tokenized_context = context_ids,
      char_tokenized_question = question_chars,
      char_tokenized_context = context_chars,
  )

def split(dataset, test_size = 0.2, random_state = 42):

  """
  cut the dataset into a training part and a validation part,
  both re-indexed from 0

  returns (training, validation)
  """

  # a fixed random_state keeps the split deterministic
  parts = train_test_split(dataset, test_size = test_size, random_state = random_state)
  training, validation = (part.reset_index(drop = True) for part in parts)

  return training, validation

def df_to_json(df, path):

  """
  write the given dataframe to `path` using the SQUAD json layout:
  titles -> paragraphs (one per context) -> qas (one single-answer entry per row)

  the columns 'title', 'context', 'question', 'index', 'answer_start' and
  'text' are read from `df`
  """

  def _qas(rows):
    # one question entry, holding its single answer, per row
    return [
        {
            'answers': [{'answer_start': row['answer_start'], 'text': row['text']}],
            'question': row['question'],
            'id': row['index'],
        }
        for _, row in rows.iterrows()
    ]

  def _paragraphs(article_rows):
    # one paragraph per distinct context of the article
    return [
        {'context': context, 'qas': _qas(rows)}
        for context, rows in article_rows.groupby('context')
    ]

  raw_data = {
      'data': [
          {'title': title, 'paragraphs': _paragraphs(article_rows)}
          for title, article_rows in df.groupby('title')
      ]
  }

  with open(path, 'w') as handle:
    json.dump(raw_data, handle)

  print(f'dataset saved in {path}')

In [8]:
tr_df, vl_df = split(squad_dataset)
tr_df.shape[0],vl_df.shape[0]

(70079, 17520)

Our vocabulary is based on the Glove vocabulary, and we add terms from the training set

In [9]:
tokenizer, char_tokenizer = get_tokenizer(tr_df)

[INFO] downloading glove 300
[INFO] done !


In [10]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429064


1263

We then update our vocabulary with terms from the validation set

In [11]:
update_tokenizer(vl_df, tokenizer, char_tokenizer)
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429758


1265

Training and validation datasets update

In [12]:
# take a while
tr_df = tr_df.apply(get_start_end, axis = 1)
vl_df = vl_df.apply(get_start_end, axis = 1)

We get rid of samples where the answer cannot be found in the context (maybe there is a typo in the answer or in the context).  
To avoid discarding many samples, we could lemmatize / stem the text.   
Obviously, lemmatization is a better choice for our task, but a really accurate lemmatization requires POS tagging first.

In [13]:
tr_df[tr_df['start'] == -1].shape[0], vl_df[vl_df['start'] == -1].shape[0]

(69, 15)

In [14]:
tr_df[tr_df['start'] == -1]

Unnamed: 0,index,question,context,title,answer_start,text,c_id,start,end
87,56de31984396321400ee2672,on what date was the 2013 human development re...,some countries were not included for various r...,Human_Development_Index,92,"march 14, 2013",2185,-1,-1
3133,56e17f5de3433e1400422f8c,what field studies the placement of catalan in...,"in central catalan, unstressed vowels reduce t...",Catalan_language,0,catalan sociolinguistics,3470,-1,-1
3983,56e1b97fcd28a01900c67ad8,what is the official regulating body of valen...,"valencian is classified as a western dialect, ...",Catalan_language,168,the valencian academy of language,3488,-1,-1
6198,56e1b4decd28a01900c67a91,what language is the regulator meant to standa...,"in alghero, the iec has adapted its standard t...",Catalan_language,103,catalan,3486,-1,-1
6994,56e1b738cd28a01900c67aae,where are the provinces of lleida and tarragona?,"in 2011, the aragonese government passed a dec...",Catalan_language,94,western catalonia,3487,-1,-1
...,...,...,...,...,...,...,...,...,...
66889,572e8003c246551400ce425f,what did great britain gain in the west indies...,"many middle and small powers in europe, unlike...",Seven_Years%27_War,113,some individual caribbean islands in the west ...,15282,-1,-1
66972,572e81f2cb0c0d14000f1206,"what is the precedent for the ""second hundred ...","the war was successful for great britain, whic...",Seven_Years%27_War,446,reminiscent of the more famous and compact str...,15283,-1,-1
67376,572e8578c246551400ce42bd,who would sicily and savoy normally align with?,"realizing that war was imminent, prussia preem...",Seven_Years%27_War,434,"sicily, and savoy, although sided with franco-...",15281,-1,-1
69867,56e180f5e3433e1400422f96,what do the dialects of catalan feature?,catalan sociolinguistics studies the situation...,Catalan_language,56,uniformity,3471,-1,-1


In [15]:
vl_df[vl_df['start'] == -1]

Unnamed: 0,index,question,context,title,answer_start,text,c_id,start,end
171,56e18a90e3433e1400422fac,in what densely populated area is it spoken?,western catalan comprises the two dialects of ...,Catalan_language,166,barcelona province,3474,-1,-1
1536,56e1a3cbe3433e1400423066,where is iec's standard used?,"standard catalan, virtually accepted by all sp...",Catalan_language,3,the balearic islands,3484,-1,-1
3007,56e18710cd28a01900c679b9,what have a and e done in eastern dialects?,the dialects of the catalan language feature a...,Catalan_language,162,merged,3472,-1,-1
4925,56e1b4decd28a01900c67a8e,where is the catalan speaking part of aragon?,"in alghero, the iec has adapted its standard t...",Catalan_language,114,la franja,3486,-1,-1
5782,56e18bfbe3433e1400422fb5,how many stressed phonemes are there in catalan?,central catalan is considered the standard pro...,Catalan_language,69,seven,3468,-1,-1
5897,56e18710cd28a01900c679b7,what is the major difference between the two b...,the dialects of the catalan language feature a...,Catalan_language,118,treatment of unstressed a and e,3472,-1,-1
5937,56e18bfbe3433e1400422fb4,what is the vowel system of catalan?,western catalan comprises the two dialects of ...,Catalan_language,50,vulgar latin,3468,-1,-1
6561,56e1b264e3433e14004230a6,where has the iec adapted its standard to the ...,the most notable difference between both stand...,Catalan_language,3,alghero,3485,-1,-1
7378,572e81f2cb0c0d14000f1207,what was a later conflict that some considered...,"the war was successful for great britain, whic...",Seven_Years%27_War,246,to later conflicts like the napoleonic wars,15283,-1,-1
11252,56e17b08cd28a01900c679af,where do you find dialectic vowel reductions?,catalan has inherited the typical vowel system...,Catalan_language,176,section pronunciation,3469,-1,-1


In [16]:
print(len(tr_df))
print(len(vl_df))

70079
17520


Now we get rid of samples where the answer doesn't match the context

In [17]:
# we get rid of samples where the answer doesn't match the context
tr_df = tr_df[tr_df['start'] != -1]
vl_df = vl_df[vl_df['start'] != -1]

We also need to eliminate samples where the answer span extends past the end of the context.

In [18]:
# Look at one training sample to sanity-check the answer offset against the
# lengths of the answer and of the context.
test = 20461
sample = tr_df.iloc[test]

print(sample['answer_start'])
print(len(sample['text']))
print(len(sample['context']))
print()
print(sample['text'])
print()
# access the column by label: positional indexing of a labelled Series with
# an integer key is deprecated in recent pandas versions
sample['context']

301
67
629

at least one appointee from the state where the project is to occur



'the question to be answered is whether a listed species will be harmed by the action and, if so, how the harm can be minimized. if harm cannot be avoided, the project agency can seek an exemption from the endangered species committee, an ad hoc panel composed of members from the executive branch and at least one appointee from the state where the project is to occur. five of the seven committee members must vote for the exemption to allow taking (to harass, harm, pursue, hunt, shoot, wound, kill, trap, capture, or collect, or significant habitat modification, or to attempt to engage in any such conduct) of listed species.'

In [28]:
def answer_in_bounds(df):

  """
  boolean numpy mask, True for the rows whose answer span
  [answer_start, answer_start + len(text)) lies inside the context

  an answer ending exactly on the last character of the context is in
  bounds, hence the `<=` comparison
  """

  answer_end = pd.to_numeric(df['answer_start']) + df['text'].str.len()
  return (answer_end <= df['context'].str.len()).to_numpy(dtype = bool)

tr_mask = answer_in_bounds(tr_df)
vl_mask = answer_in_bounds(vl_df)

In [21]:
#debug window
"""
tr_df[ mask ].iloc[0]['answer_start'] + len(tr_df[mask].iloc[0]['text'])
print(tr_df[ mask ].iloc[0]['answer_start'])
print(tr_df[mask].iloc[0]['text'])
print(len(tr_df[mask].iloc[0]['text']))
print(tr_df[mask].iloc[0]['context'])
print(len(tr_df[mask].iloc[0]['context']))
print()
print(tr_df[mask].iloc[0]['context'][195:])
"""

"\ntr_df[ mask ].iloc[0]['answer_start'] + len(tr_df[mask].iloc[0]['text'])\nprint(tr_df[ mask ].iloc[0]['answer_start'])\nprint(tr_df[mask].iloc[0]['text'])\nprint(len(tr_df[mask].iloc[0]['text']))\nprint(tr_df[mask].iloc[0]['context'])\nprint(len(tr_df[mask].iloc[0]['context']))\nprint()\nprint(tr_df[mask].iloc[0]['context'][195:])\n"

In [29]:
tr_df[tr_mask]

Unnamed: 0,index,question,context,title,answer_start,text,c_id,start,end
0,572667e6708984140094c4f9,what team had dallas green managed in 1980?,"after over a dozen more subpar seasons, in 198...",Chicago_Cubs,154,phillies,8880,29,30
1,56dec2483277331400b4d712,which candidate withdrew from the presidential...,schwarzenegger's endorsement in the republican...,Arnold_Schwarzenegger,156,rudy giuliani,2311,23,25
2,5726e5995951b619008f81bb,captive animals can distinguish co-inhabitats ...,it has been observed that well-fed predator an...,Predation,224,wild ones outside the area,9822,38,43
3,5726486f708984140094c157,the results of which battle allowed the britis...,"after returning from egypt, napoleon engineere...",Napoleon,919,the battle of trafalgar,8418,158,162
4,5730299db2c2fd14005689a7,how was vesey executed in 1822?,"by 1820, charleston's population had grown to ...","Charleston,_South_Carolina",382,hanged,15719,74,75
...,...,...,...,...,...,...,...,...,...
70073,57320f26e17f3d1400422651,where do male emporer penguins keep eggs?,bird eggs are usually laid in a nest. most spe...,Bird,730,between their body and feet,17809,136,141
70074,56d12d3c17492d1400aabb6b,on what day did the final coroner's report sho...,adams sent condolences to donda west's family ...,Kanye_West,640,"january 10, 2008",1082,117,121
70075,5727c461ff5b5019007d94b1,in which u.s. state was the oldest definitive ...,tuberculosis has been present in humans since ...,Tuberculosis,171,wyoming,11805,27,28
70076,57343f804776f41900661afa,what major cities later adopted tucson's city ...,tucson is known for being a trailblazer in vol...,"Tucson,_Arizona",757,san francisco and new york city,16641,145,151


In [30]:
vl_df[vl_mask]

Unnamed: 0,index,question,context,title,answer_start,text,c_id,start,end
0,56de4d9ecffd8e1900b4b7e2,what year was the banská akadémia founded?,the world's first institution of technology or...,Institute_of_technology,167,1735,1860,26,27
1,572674a05951b619008f7319,what is another speed that can also be reporte...,the standard specifies how speed ratings shoul...,Film_speed,793,sos-based speed,9354,145,147
2,5730bb058ab72b1400f9c72c,where were the use of advanced materials and t...,the most impressive and famous of sumerian bui...,Sumer,421,sumerian temples and palaces,17505,74,78
3,572781a5f1498d1400e8fa1f,who is elected every even numbered year?,ann arbor has a council-manager form of govern...,"Ann_Arbor,_Michigan",192,mayor,10585,18,19
4,572843ce4b864d190016485c,what was the purpose of top secret icbm commit...,"shortly before his death, when he was already ...",John_von_Neumann,194,decide on the feasibility of building an icbm ...,11497,38,53
...,...,...,...,...,...,...,...,...,...
17515,570b4f4dec8fbc190045b976,what country refused to allow forces to stage ...,after the lengthy iraq disarmament crisis culm...,Military_history_of_the_United_States,443,turkey,6586,76,77
17516,571a2b2410f8ca1400304f2a,who is the representative for seattle's district?,"like most parts of the united states, governme...",Seattle,438,jim mcdermott,7943,76,78
17517,572970803f37b319004783c8,at what temperature to zinc become brittle?,"zinc is a bluish-white, lustrous, diamagnetic ...",Zinc,477,210 °c,14548,93,95
17518,572acb23f75d5e190021fcdb,how expensive was kerry's yacht?,"according to the boston herald, dated july 23,...",John_Kerry,94,$7 million,15037,18,21


In [25]:
~tr_mask

array([ True,  True,  True, ...,  True,  True, False])

In [32]:
# we get rid of samples where we have answer overflow
tr_df2 = tr_df[tr_mask] 
vl_df2 = vl_df[vl_mask] 

In [33]:
print(len(tr_df2))
print(len(vl_df2))

69572
17394


In [34]:
tr_df2 = tokenize(tr_df2, tokenizer, char_tokenizer)
vl_df2 = tokenize(vl_df2, tokenizer, char_tokenizer)

Now we explore the cleaned datasets

In [35]:
tr_df2.head()

Unnamed: 0,index,question,context,title,answer_start,text,c_id,start,end,tokenized_question,tokenized_context,char_tokenized_question,char_tokenized_context
0,572667e6708984140094c4f9,what team had dallas green managed in 1980?,"after over a dozen more subpar seasons, in 198...",Chicago_Cubs,154,phillies,8880,29,30,"[11, 309, 49, 11808, 646, 2132, 6, 2627, 9]","[61, 83, 10, 6737, 62, 70020, 1740, 3, 6, 3372...","[[20, 11, 5, 4], [4, 3, 5, 16], [11, 5, 13], [...","[[5, 17, 4, 3, 10], [8, 24, 3, 10], [5], [13, ..."
1,56dec2483277331400b4d712,which candidate withdrew from the presidential...,schwarzenegger's endorsement in the republican...,Arnold_Schwarzenegger,156,rudy giuliani,2311,23,25,"[27, 2789, 4161, 23, 2, 1534, 698, 6, 417, 4, ...","[1084, 19, 9106, 6, 2, 1467, 477, 4, 2, 420, 1...","[[20, 11, 6, 14, 11], [14, 5, 7, 13, 6, 13, 5,...","[[9, 14, 11, 20, 5, 10, 39, 3, 7, 3, 19, 19, 3..."
2,5726e5995951b619008f81bb,captive animals can distinguish co-inhabitats ...,it has been observed that well-fed predator an...,Predation,224,wild ones outside the area,9822,38,43,"[11888, 727, 65, 3733, 419169, 23, 11, 48, 136...","[30, 40, 59, 2316, 20, 63225, 4421, 727, 6, 10...","[[14, 5, 18, 4, 6, 24, 3], [5, 7, 6, 16, 5, 12...","[[6, 4], [11, 5, 9], [22, 3, 3, 7], [8, 22, 9,..."
3,5726486f708984140094c157,the results of which battle allowed the britis...,"after returning from egypt, napoleon engineere...",Napoleon,919,the battle of trafalgar,8418,158,162,"[2, 1324, 4, 27, 326, 495, 2, 132, 8, 6280, 15...","[61, 3986, 23, 598, 3, 545, 9789, 10, 2313, 6,...","[[4, 11, 3], [10, 3, 9, 15, 12, 4, 9], [8, 17]...","[[5, 17, 4, 3, 10], [10, 3, 4, 15, 10, 7, 6, 7..."
4,5730299db2c2fd14005689a7,how was vesey executed in 1822?,"by 1820, charleston's population had grown to ...","Charleston,_South_Carolina",382,hanged,15719,74,75,"[44, 13, 25121, 2181, 6, 10202, 9]","[18, 9015, 3, 1909, 19, 104, 49, 2555, 8, 2106...","[[11, 8, 20], [20, 5, 9], [24, 3, 9, 3, 21], [...","[[22, 21], [28, 40, 31, 29], [23], [14, 11, 5,..."


In [36]:
print(tr_df2['tokenized_question'].str.len().describe())
vl_df2['tokenized_question'].str.len().describe()

count    69572.000000
mean        11.274579
std          3.714049
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         60.000000
Name: tokenized_question, dtype: float64


count    17394.000000
mean        11.336438
std          3.754368
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         38.000000
Name: tokenized_question, dtype: float64

In [37]:
print(tr_df2['tokenized_question'].str.len().quantile(0.99))
vl_df2['tokenized_question'].str.len().quantile(0.99)

22.0


23.0

In [38]:
print(tr_df2['tokenized_context'].str.len().describe())
vl_df2['tokenized_context'].str.len().describe()

count    69572.000000
mean       137.928678
std         56.993429
min         22.000000
25%        102.000000
50%        127.000000
75%        164.000000
max        766.000000
Name: tokenized_context, dtype: float64


count    17394.000000
mean       137.330229
std         55.948282
min         22.000000
25%        102.000000
50%        126.000000
75%        163.000000
max        766.000000
Name: tokenized_context, dtype: float64

In [39]:
print(tr_df2['tokenized_context'].str.len().quantile(0.99))
vl_df2['tokenized_context'].str.len().quantile(0.99)

324.0


324.0

In [40]:
def len_words(dataset):

  """
  return two series holding the number of characters of every word:
  the first one for the questions, the second one for the contexts

  the lengths are read from the 'char_tokenized_question' and
  'char_tokenized_context' columns, in row order
  """

  def _lengths(column):
    # flatten: one entry per word of every sample
    return [len(word) for words in dataset[column] for word in words]

  question_lengths = _lengths('char_tokenized_question')
  context_lengths = _lengths('char_tokenized_context')

  return pd.Series(question_lengths), pd.Series(context_lengths)

t_q,t_c = len_words(tr_df2)
v_q,v_c = len_words(vl_df2)

In [41]:
print(t_q.describe())
t_c.describe()

count    784395.000000
mean          4.447990
std           2.677571
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          30.000000
dtype: float64


count    9.595974e+06
mean     4.625712e+00
std      2.969451e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [42]:
print(v_q.describe())
v_c.describe()

count    197186.000000
mean          4.452943
std           2.686307
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          24.000000
dtype: float64


count    2.388722e+06
mean     4.629325e+00
std      2.972493e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [43]:
print(t_q.quantile(0.99))
t_c.quantile(0.99)

12.0


13.0

In [44]:
print(v_q.quantile(0.99))
v_c.quantile(0.99)

12.0


13.0

There are obviously some outliers. We are compelled to get rid of some samples because of memory issues.

We will get rid of contexts that have more than 350 tokens and questions that have more than 25 tokens.

We will set the maximum length of a word to 15 characters.

**EDIT :** These numbers are huge but we won't get out of memory errors if we build a sequence generator. If you don't want to use the sequence generator, you should reduce these numbers.

In [46]:
QUESTION_MAXLEN = 25
CONTEXT_MAXLEN = 350
WORD_MAXLEN = 15
BATCH_SIZE = 10

In [47]:
tr_df2.shape, vl_df2.shape

((69572, 13), (17394, 13))

In [48]:
def _fits_model_limits(df):
  """
  boolean mask of the rows whose question length, context length and answer
  span all stay within QUESTION_MAXLEN / CONTEXT_MAXLEN
  """
  question_ok = df['tokenized_question'].str.len() <= QUESTION_MAXLEN
  context_ok = df['tokenized_context'].str.len() <= CONTEXT_MAXLEN
  span_ok = (df['start'] <= CONTEXT_MAXLEN) & (df['end'] <= CONTEXT_MAXLEN)
  return question_ok & context_ok & span_ok

tr_df3 = tr_df2[_fits_model_limits(tr_df2)].reset_index(drop = True)
vl_df3 = vl_df2[_fits_model_limits(vl_df2)].reset_index(drop = True)

In [49]:
tr_df3.shape[0], vl_df3.shape[0]

(68920, 17237)

In [50]:
 print(f' we get rid of : {SAMPLES - (tr_df3.shape[0] + vl_df3.shape[0])} samples')

 we get rid of : 1442 samples


In [51]:
# save datasets in json format
path_to_train_set = os.path.join(os.getcwd(), 'BERT_train_set3.json')
df_to_json(tr_df3, path_to_train_set)

path_to_valid_set = os.path.join(os.getcwd(), 'BERT_valid_set3.json')
df_to_json(vl_df3, path_to_valid_set)

dataset saved in /content/BERT_train_set3.json
dataset saved in /content/BERT_valid_set3.json
