In order to run this notebook, the first things you should do are :
* pip install pandas numpy tensorflow nltk gensim sklearn
* modify the `SQUAD_PATH` variable (path to squad file)
* modify all others paths (where to save datasets, tokenizers...)



In [16]:
import json
import os
import random
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import gensim.downloader as gloader
from sklearn.model_selection import train_test_split
import re
import pickle
import tensorflow as tf

random.seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
EMBEDDING_SIZE = 300

url = "https://raw.githubusercontent.com/amrlnic/SQuAD/main/data/training_set.json" 
download = requests.get(url).content
data = json.loads(download)

def load_dataset(file, record_path = ['data', 'paragraphs', 'qas', 'answers'], verbose = True):

  """
  parse the SQUAD dataset into a dataframe
  """

  if verbose:
      print("Reading the json file")

  if verbose:
      print("[INFO] processing...")

  # parsing different level's in the json file
  js = pd.json_normalize(file , record_path )
  m = pd.json_normalize(file, record_path[:-1] )
  r = pd.json_normalize(file, record_path[:-2])
  t = pd.json_normalize(file, record_path[0])

  title = pd.json_normalize(file['data'], record_path = ['paragraphs'], meta = 'title')

  #combining it into single dataframe
  idx = np.repeat(r['context'].values, r.qas.str.len())
  ndx  = np.repeat(m['id'].values, m['answers'].str.len())
  m['context'] = idx
  m['title'] = np.repeat(title['title'].values, r.qas.str.len())
  js['q_idx'] = ndx
  main = pd.concat([ m[['id','question','context', 'title']].set_index('id'), js.set_index('q_idx')], 1, sort = False).reset_index()
  main['c_id'] = main['context'].factorize()[0]
  if verbose:
      print(f"[INFO] there are {main.shape[0]} questions with single answer")
      print(f"[INFO] there are {main.groupby('c_id').sum().shape[0]} different contexts")
      print(f"[INFO] there are {len(t)} unrelated subjects")
      print("[INFO] Done")
  return main


# Load data
squad_dataset = load_dataset(data)
def download_glove_model(embedding_dimension = 50):

  """
  download glove model
  """

  download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
  try:
    print('[INFO] downloading glove {}'.format(embedding_dimension))
    emb_model = gloader.load(download_path)
    print('[INFO] done !')
  except ValueError as e:
      print("Glove: 50, 100, 200, 300")
      raise e
  return emb_model

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [7]:
squad_dataset = load_dataset(data)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [8]:
squad_dataset.head()

Unnamed: 0,index,question,context,title,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,92,a golden statue of the Virgin Mary,0


In [9]:
SAMPLES = squad_dataset.shape[0]

def preprocess_sentence(text):

  """
  lowercase and strip the given text
  """

  text = text.lower()
  text = text.strip()
  return text

def clean_dataset(dataset, with_answer = True):

  """
  preprocess the dataset
  """

  _dataset = dataset.copy()

  cleaned_questions = _dataset['question'].apply(preprocess_sentence)

  # we process only different contexts and then we duplicate them
  unique_context = pd.Series(_dataset['context'].unique())
  count_c = _dataset.groupby('c_id').size()
  cleaned_contexts = unique_context.apply(preprocess_sentence)

  _dataset['question'] = cleaned_questions

  if with_answer:
    cleaned_texts = _dataset['text'].apply(preprocess_sentence)
    _dataset['text'] = cleaned_texts
  _dataset['context'] = pd.Series(np.repeat(cleaned_contexts, count_c).tolist())

  return _dataset

In [10]:
squad_dataset = clean_dataset(squad_dataset)

In [17]:
def get_tokenizer(dataset, glove_model = None):

  """
  create the word and char tokenizers and feed them 
  on the given dataset and the glove vocabulary
  """

  tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token = 'UNK', filters = '')

  # we will only keep the 200 - 1 most frequent characters (otherwise oom issue)
  # others tokens are replaced by UNK token 
  # we keep 199 most frequent tokens and indice 1 is UNK token (so we keep 198 tokens)
  char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = True, filters = '', oov_token = 'UNK', num_words = 200)

  if glove_model == None:
    glove_model = download_glove_model(EMBEDDING_SIZE)

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  sequences = glove_model.index2entity + tokenized_questions + tokenized_contexts

  del glove_model # we  don't need anymore the glove model

  tokenizer.fit_on_texts(sequences)
  char_tokenizer.fit_on_texts(dataset['question'].to_list() + contexts.to_list())

  return tokenizer, char_tokenizer
  


def update_tokenizer(dataset, tokenizer, char_tokenizer):

  """
  update the existing word/char vocabulary on a new dataset
  """

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  sequences = tokenized_questions + tokenized_contexts
  tokenizer.fit_on_texts(sequences)

  char_tokenizer.fit_on_texts(dataset['question'].to_list() + contexts.to_list())



def get_start_end(row):

  """
  get the start and end span for each sample,
  if the span cannot be found return -1
  """

  context = row['context']
  answer = row['text']
  tok_answer = word_tokenize(answer)

  _start = context.find(answer)

  if _start == -1:
    # the answer is not in the context
    # maybe due to a typo
    row['start'] = -1
    row['end'] = -1
    return row

  lc = context[:_start]
  lc = word_tokenize(lc)

  start = len(lc)
  end = start + len(tok_answer)

  row['start'] = start
  row['end'] = end

  return row

def tokenize(dataset, tokenizer, char_tokenizer):

  """
  tokenize the given dataset
  """

  _dataset = dataset.copy()

  tokenized_questions = _dataset['question'].apply(word_tokenize).to_list()
  tokenized_contexts = _dataset['context'].apply(word_tokenize).to_list()

  t_q = tokenizer.texts_to_sequences(tokenized_questions)
  t_c = tokenizer.texts_to_sequences(tokenized_contexts)

  c_q = []
  c_c = []

  for question, context in zip(tokenized_questions, tokenized_contexts):
    _q = char_tokenizer.texts_to_sequences(question)
    _c = char_tokenizer.texts_to_sequences(context)
    c_q.append(_q)
    c_c.append(_c)

  _dataset['tokenized_question'] = t_q
  _dataset['tokenized_context'] = t_c

  _dataset['char_tokenized_question'] = c_q
  _dataset['char_tokenized_context'] = c_c

  return _dataset



def split(dataset, train_size = 0.8):

  """
  split the dataset in two part: the training and the validation based on titles
  """

  # find unique titles
  titles = squad_dataset['title']
  unique_titles = titles.unique()


  n_titles = len(unique_titles)
  titles_seq = list(range(n_titles))

  train_len = int(n_titles*train_size)

  # sample train indexes
  train_ind = random.sample(titles_seq, train_len)
  test_ind = list(set(titles_seq) - set(train_ind))

  train_titles = unique_titles[train_ind]
  test_titles = unique_titles[test_ind]

  squad_columns = list(squad_dataset.columns)

  # initialize empty train and test df
  train_data = pd.DataFrame(columns = squad_columns)
  test_data = pd.DataFrame(columns = squad_columns)

  for train_title in train_titles:

    train_section = squad_dataset[squad_dataset['title'] == train_title]
    train_data = train_data.append(train_section)

  for test_title in test_titles:

    test_section = squad_dataset[squad_dataset['title'] == test_title]
    test_data = test_data.append(test_section)


  return train_data, test_data



def df_to_json(df, path, with_answer = True):

  """
  parse the given dataframe into the SQUAD json format and
  save it
  """
  
  data = []

  for title, articles in df.groupby('title'):
    chapter = {'title': title}
    paragraphs = []
    for context, contents in articles.groupby('context'):
      paragraph = {'context': context}
      qas = []
      for i, content in contents.iterrows():
        if with_answer:
          qa = {'answers': [{'answer_start': content['answer_start'], 'text': content['text']}], 'question': content['question'], 'id': content['id']}
        else:
          qa = {'question': content['question'], 'id': content['id']}
        qas.append(qa)
      paragraph.update({'qas': qas})
      paragraphs.append(paragraph)
    chapter.update({'paragraphs': paragraphs})
    data.append(chapter)
  raw_data = {'data': data}

  with open(path, 'w') as handle:
    json.dump(raw_data, handle)

  print(f'dataset saved in {path}')

In [18]:
tr_df, vl_df = split(squad_dataset)

In [19]:
tr_df.shape[0],vl_df.shape[0]

(69129, 18470)

Our vocabulary is based on the Glove vocabulary, and we add terms from the training set

In [20]:
tokenizer, char_tokenizer = get_tokenizer(tr_df)

[INFO] downloading glove 300
[INFO] done !


In [None]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429064


1263

We then update our vocabulary with terms from the validation set

In [21]:
update_tokenizer(vl_df, tokenizer, char_tokenizer)

In [None]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429758


1265

In [22]:
# take a while
tr_df = tr_df.apply(get_start_end, axis = 1)
vl_df = vl_df.apply(get_start_end, axis = 1)

we get rid of samples where the answer doesn't match the context (maybe there is a typo in the answer or the context).  
To avoid to discard many samples, we could lemmatize / stem the text.   
Obviously, lemmatization is a better choice for our task, but if we want a really accurate lemmatization processing, we need to do POS tagging.

In [27]:
# we get rid of samples where the answer doesn't match the context
tr_df = tr_df[tr_df['start'] != -1]
vl_df = vl_df[vl_df['start'] != -1]

In [28]:
tr_df = tokenize(tr_df, tokenizer, char_tokenizer)
vl_df = tokenize(vl_df, tokenizer, char_tokenizer)

In [29]:
tr_df.head()

Unnamed: 0,index,question,context,title,answer_start,text,c_id,start,end,tokenized_question,tokenized_context,char_tokenized_question,char_tokenized_context
65306,5728b2912ca10214002da5fa,what is the aggregate population of paris?,"since the 19th century, the built-up area of p...",Paris,171,10550350,14151,30,31,"[10, 12, 2, 9969, 101, 4, 376, 7]","[120, 2, 503, 83, 3, 2, 19402, 114, 4, 376, 40...","[[19, 11, 5, 4], [6, 9], [4, 11, 3], [5, 20, 2...","[[9, 6, 7, 14, 3], [4, 11, 3], [28, 36, 4, 11]..."
65307,5728b2912ca10214002da5fb,from what census is this information from?,"since the 19th century, the built-up area of p...",Paris,188,2012 census,14151,34,36,"[25, 10, 799, 12, 42, 506, 25, 7]","[120, 2, 503, 83, 3, 2, 19402, 114, 4, 376, 40...","[[17, 10, 8, 16], [19, 11, 5, 4], [14, 3, 7, 9...","[[9, 6, 7, 14, 3], [4, 11, 3], [28, 36, 4, 11]..."
65308,5728b2912ca10214002da5fc,what is the population of paris' metropolitan ...,"since the 19th century, the built-up area of p...",Paris,282,12341418,14151,52,53,"[10, 12, 2, 101, 4, 376, 98, 1172, 114, 7]","[120, 2, 503, 83, 3, 2, 19402, 114, 4, 376, 40...","[[19, 11, 5, 4], [6, 9], [4, 11, 3], [18, 8, 1...","[[9, 6, 7, 14, 3], [4, 11, 3], [28, 36, 4, 11]..."
65309,5728b2912ca10214002da5fd,how many kilometers does the administrative re...,"since the 19th century, the built-up area of p...",Paris,388,"12,012 km²",14151,71,73,"[36, 38, 3763, 56, 2, 1666, 221, 1437, 7]","[120, 2, 503, 83, 3, 2, 19402, 114, 4, 376, 40...","[[11, 8, 19], [16, 5, 7, 21], [27, 6, 12, 8, 1...","[[9, 6, 7, 14, 3], [4, 11, 3], [28, 36, 4, 11]..."
65310,5728b2912ca10214002da5fe,as of 2014 how many inhabitants lived in the a...,"since the 19th century, the built-up area of p...",Paris,431,12 million,14151,80,82,"[14, 4, 500, 36, 38, 1976, 1064, 6, 2, 1666, 2...","[120, 2, 503, 83, 3, 2, 19402, 114, 4, 376, 40...","[[5, 9], [8, 17], [31, 29, 28, 44], [11, 8, 19...","[[9, 6, 7, 14, 3], [4, 11, 3], [28, 36, 4, 11]..."


We display some useful stats in order to define the padding size (at the word and character level, for both question and context)

In [30]:
print(tr_df['tokenized_question'].str.len().describe())
vl_df['tokenized_question'].str.len().describe()

count    69045.000000
mean        11.337113
std          3.746685
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         49.000000
Name: tokenized_question, dtype: float64


count    18470.000000
mean        11.102166
std          3.630045
min          3.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         60.000000
Name: tokenized_question, dtype: float64

In [31]:
print(tr_df['tokenized_question'].str.len().quantile(0.99))
vl_df['tokenized_question'].str.len().quantile(0.99)

23.0


22.0

In [32]:
print(tr_df['tokenized_context'].str.len().describe())
vl_df['tokenized_context'].str.len().describe()

count    69045.000000
mean       138.315707
std         56.355860
min         22.000000
25%        102.000000
50%        127.000000
75%        164.000000
max        766.000000
Name: tokenized_context, dtype: float64


count    18470.000000
mean       135.406659
std         58.085212
min         25.000000
25%        100.000000
50%        124.000000
75%        161.000000
max        638.000000
Name: tokenized_context, dtype: float64

In [33]:
print(tr_df['tokenized_context'].str.len().quantile(0.99))
vl_df['tokenized_context'].str.len().quantile(0.99)

323.0


342.0

In [34]:
def len_words(dataset):

  """
  return the word's length
  """

  count_q = []
  count_c = []

  for idx, row in dataset.iterrows():
    for w in row['char_tokenized_question']:
      l = len(w)
      count_q.append(l)
      
    for w in row['char_tokenized_context']:
      m = len(w)
      count_c.append(m)
  
  return pd.Series(count_q), pd.Series(count_c)

t_q,t_c = len_words(tr_df)
v_q,v_c = len_words(vl_df)

In [35]:
print(t_q.describe())
t_c.describe()

count    782771.000000
mean          4.450666
std           2.687285
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          30.000000
dtype: float64


count    9.550008e+06
mean     4.623795e+00
std      2.974525e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [36]:
print(v_q.describe())
v_c.describe()

count    205057.000000
mean          4.442604
std           2.649141
min           1.000000
25%           3.000000
50%           4.000000
75%           6.000000
max          26.000000
dtype: float64


count    2.500961e+06
mean     4.638254e+00
std      2.953675e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.200000e+01
dtype: float64

In [37]:
print(t_q.quantile(0.99))
t_c.quantile(0.99)

12.0


13.0

In [38]:
print(v_q.quantile(0.99))
v_c.quantile(0.99)

12.0


13.0

There are obviously some outliers. We are compeled to get rid of some samples because of memory issues.

We will get rid of contexts that have more than 400 words and questions that have more than 25 words.

We will set the length of a word to 15 characters

**EDIT :** These numbers are huge but we won't get out of memory errors if we build a sequence generator. If you don't want to use the sequence generator, you should reduce these numbers.

**EDIT :** Now that we use a sequence generator, we could define `*_MAXLEN` variables according to the stats provided by the training set

In [39]:
QUESTION_MAXLEN = 25
CONTEXT_MAXLEN = 400
WORD_MAXLEN = 15
BATCH_SIZE = 10

In [40]:
tr_df.shape, vl_df.shape

((69045, 13), (18470, 13))

In [41]:
tr_df = tr_df[(tr_df['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (tr_df['tokenized_context'].str.len() <= CONTEXT_MAXLEN) & (tr_df['start'] <= CONTEXT_MAXLEN) & (tr_df['end'] <= CONTEXT_MAXLEN) ].reset_index(drop = True)
vl_df = vl_df[(vl_df['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (vl_df['tokenized_context'].str.len() <= CONTEXT_MAXLEN) & (vl_df['start'] <= CONTEXT_MAXLEN) & (vl_df['end'] <= CONTEXT_MAXLEN) ].reset_index(drop = True)

In [42]:
tr_df.shape[0], vl_df.shape[0]

(68630, 18389)

In [43]:
 print(f' we get rid of : {SAMPLES - (tr_df.shape[0] + vl_df.shape[0])} samples')

 we get rid of : 580 samples


In [None]:
# save datasets in json format
path_to_train_set = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/train_set.json')
df_to_json(tr_df, path_to_train_set)

path_to_valid_set = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/valid_set.json')
df_to_json(vl_df, path_to_valid_set)

dataset saved in /content/drive/MyDrive/NLP/BIDAF/utils/data/train_set.json
dataset saved in /content/drive/MyDrive/NLP/BIDAF/utils/data/valid_set.json


In [None]:
# we save both tokenizers
tokenizers_folder = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils', 'tokenizers')
if not os.path.exists(tokenizers_folder):
  os.makedirs(tokenizers_folder)

path_word_tokenizer = os.path.join(tokenizers_folder, 'word_tokenizer.pkl')
with open(path_word_tokenizer, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)

path_char_tokenizer = os.path.join(tokenizers_folder, 'char_tokenizer.pkl')
with open(path_char_tokenizer, 'wb') as handle:
    pickle.dump(char_tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)

We create the iterator. The iterator allows us to work with much bigger data, because it is loaded into memory only when we need them

In [44]:
# utils/datasets/dataset.py
class SQUAD_dataset(tf.keras.utils.Sequence):

  """
  utility class to create a working dataset that
  can be given to a neural network
  """

  def __init__(self, data, question_maxlen, context_maxlen, word_maxlen, batch_size, with_answer = True):
    self.QUESTION_MAXLEN = question_maxlen
    self.CONTEXT_MAXLEN = context_maxlen
    self.WORD_MAXLEN = word_maxlen
    self.batch_size = batch_size
    self.with_answer = with_answer
    self.__get_batches(data)

  def __len__(self):
    return len(self.batches)

  def __get_batches(self, data):
    batches = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
    self.batches = batches

  def __repr__(self):
    template = '''SQUAD_dataset : questions : ({0}, {1}), contexts : ({0}, {2}), char_questions : ({0}, {1}, {3}), char_contexts : ({0}, {2}, {3}), id : ({0}, 1)'''.format(self.batch_size, self.QUESTION_MAXLEN, self.CONTEXT_MAXLEN, self.WORD_MAXLEN)
    return template

  @classmethod
  def from_file(cls, path):
    path = os.path.join(os.getcwd(), path)
    with open(path, 'rb') as handle:
      dataset = pickle.load(handle)
    return dataset

  def to_pickle(self, path):
    path = os.path.join(os.getcwd(), path)
    folder = os.path.dirname(path)

    if not os.path.exists(folder):
      os.makedirs(folder)

    with open(path, 'wb') as handle:
      pickle.dump(self, handle, protocol = pickle.HIGHEST_PROTOCOL)

  def __getitem__(self, idx):
    batch = self.batches[idx].reset_index(drop = True)

    id = np.asarray(batch['id'])

    # questions and contexts words padding
    q_w = tf.keras.preprocessing.sequence.pad_sequences(batch['tokenized_question'], padding = 'post', maxlen = self.QUESTION_MAXLEN)
    c_w = tf.keras.preprocessing.sequence.pad_sequences(batch['tokenized_context'], padding = 'post', maxlen = self.CONTEXT_MAXLEN)

    # question_char padding
    q_c = np.zeros((q_w.shape[0], self.QUESTION_MAXLEN, self.WORD_MAXLEN), dtype = np.int32)

    for i, value in batch['char_tokenized_question'].iteritems():
      v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = self.WORD_MAXLEN, truncating = 'post')
      to_add = self.QUESTION_MAXLEN - v.shape[0]
      add = np.zeros((to_add, self.WORD_MAXLEN))
      arr = np.vstack([v,add])
      q_c[i] = arr

    # context_char padding
    c_c = np.zeros((q_w.shape[0], self.CONTEXT_MAXLEN, self.WORD_MAXLEN), dtype = np.int32)

    for i, value in batch['char_tokenized_context'].iteritems():
      v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = self.WORD_MAXLEN, truncating = 'post')
      to_add = self.CONTEXT_MAXLEN - v.shape[0]
      add = np.zeros((to_add, self.WORD_MAXLEN))
      arr = np.vstack([v,add])
      c_c[i] = arr

    # one hot encode start and end
    if self.with_answer:
      y_start = tf.keras.utils.to_categorical(batch['start'].values, self.CONTEXT_MAXLEN)
      y_end = tf.keras.utils.to_categorical(batch['end'].values, self.CONTEXT_MAXLEN)

      # (inputs), (outputs), (id)
      return (q_w, c_w, q_c, c_c), (y_start, y_end), (id,)
    return (q_c, c_w, q_c, c_c), (id,)

In [45]:
tr_data = SQUAD_dataset(tr_df, batch_size = BATCH_SIZE, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN, word_maxlen = WORD_MAXLEN)
vl_data = SQUAD_dataset(vl_df, batch_size = BATCH_SIZE, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN, word_maxlen = WORD_MAXLEN)

In [46]:
# number of batches
print(len(tr_data))
len(vl_data)

6863


1839

In [47]:
tr_data

SQUAD_dataset : questions : (10, 25), contexts : (10, 400), char_questions : (10, 25, 15), char_contexts : (10, 400, 15), id : (10, 1)

In [49]:
tr_data.to_pickle('drive/MyDrive/NLP/BIDAF/utils/datasets/train_dataset.pkl')
vl_data.to_pickle('drive/MyDrive/NLP/BIDAF/utils/datasets/valid_dataset.pkl')

Now that the preprocessing is over we can preprocess a (mock) unseen dataset. It is basically the same that the one we have seen just before, but it does not contain the start and end span (text and answer_start fields)

In [None]:
# with_answer = False to parse a dataset with no answer
unseen_dataset = load_dataset(SQUAD_PATH, with_answer = False)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [None]:
s = unseen_dataset.shape[0]

In [None]:
unseen_dataset = clean_dataset(unseen_dataset, with_answer = False)
unseen_dataset = tokenize(unseen_dataset, tokenizer, char_tokenizer)
unseen_dataset = unseen_dataset[(unseen_dataset['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (unseen_dataset['tokenized_context'].str.len() <= CONTEXT_MAXLEN)].reset_index(drop = True)
print(f' we get rid of : {s - (unseen_dataset.shape[0])} samples')

 we get rid of : 496 samples


In [None]:
unseen_path = os.path.join(os.getcwd(), 'drive/MyDrive/NLP/BIDAF/utils/data/unseen_set.json')
df_to_json(unseen_dataset, unseen_path, with_answer = False)

dataset saved in /content/drive/MyDrive/NLP/BIDAF/utils/data/unseen_set.json


In [None]:
unseen_data = SQUAD_dataset(unseen_dataset, batch_size = BATCH_SIZE, question_maxlen = QUESTION_MAXLEN, context_maxlen = CONTEXT_MAXLEN, word_maxlen = WORD_MAXLEN, with_answer = False)

In [None]:
unseen_data

SQUAD_dataset : questions : (10, 25), contexts : (10, 400), char_questions : (10, 25, 15), char_contexts : (10, 400, 15), id : (10, 1)

In [None]:
unseen_data.to_pickle('drive/MyDrive/NLP/BIDAF/utils/datasets/unseen_dataset.pkl')