In order to run this notebook, first things you should do are :
* mount your drive endpoint
* go at the end and modify paths ( i don't know if numpy.save() also create missing folders )


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk import word_tokenize
nltk.download('punkt')
import gensim.downloader as gloader
from sklearn.model_selection import train_test_split
import re
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
EMBEDDING_SIZE = 300

def load_dataset(path, record_path = ['data', 'paragraphs', 'qas', 'answers'], verbose = True):

  if verbose:
      print("Reading the json file")
  # if the file encoding is not UTF8 an exception should be raised    
  file = json.loads(open(path).read())

  if verbose:
      print("[INFO] processing...")

  # parsing different level's in the json file
  js = pd.json_normalize(file , record_path )
  m = pd.json_normalize(file, record_path[:-1] )
  r = pd.json_normalize(file,record_path[:-2])
  t = pd.json_normalize(file,record_path[0])

  #combining it into single dataframe
  idx = np.repeat(r['context'].values, r.qas.str.len())
  ndx  = np.repeat(m['id'].values,m['answers'].str.len())
  m['context'] = idx
  js['q_idx'] = ndx
  main = pd.concat([ m[['id','question','context']].set_index('id'), js.set_index('q_idx')],1,sort = False).reset_index()
  main['c_id'] = main['context'].factorize()[0]
  if verbose:
      print(f"[INFO] there are {main.shape[0]} questions with single answer")
      print(f"[INFO] there are {main.groupby('c_id').sum().shape[0]} different contexts")
      print(f"[INFO] there are {len(t)} unrelated subjects")
      print("[INFO] Done")
  return main

def download_glove_model(embedding_dimension = 50):
  download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
  try:
    print('[INFO] downloading glove {}'.format(embedding_dimension))
    emb_model = gloader.load(download_path)
    print('[INFO] done !')
  except ValueError as e:
      print("Glove: 50, 100, 200, 300")
      raise e
  return emb_model

In [6]:
dataset_path = os.path.join(os.getcwd(),'drive/SQUAD_project/data/training_set.json')
squad_dataset = load_dataset(dataset_path)

NameError: ignored

In [None]:
squad_dataset.head()

Unnamed: 0,index,question,context,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,0


In [None]:
SAMPLES = squad_dataset.shape[0]

def preprocess_sentence(text):
  text = text.lower()
  #text = re.sub(r'(\.)',r' \1 ', text)
  text = text.strip()
  return text

def clean_dataset(dataset):

  _dataset = dataset.copy()

  cleaned_questions = _dataset['question'].apply(preprocess_sentence)
  cleaned_texts = _dataset['text'].apply(preprocess_sentence)

  # we process only different contexts and then we duplicate them
  unique_context = pd.Series(_dataset['context'].unique())
  count_c = _dataset.groupby('c_id').count()['text']
  cleaned_contexts = unique_context.apply(preprocess_sentence)

  _dataset['question'] = cleaned_questions
  _dataset['text'] = cleaned_texts
  _dataset['context'] = pd.Series(np.repeat(cleaned_contexts, count_c).tolist())

  return _dataset

In [None]:
squad_dataset = clean_dataset(squad_dataset)

In [None]:
def get_tokenizer(dataset, glove_model = None):

  tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token = 'UNK', filters = '')

  # we will only keep the 200 - 1 most frequent characters (otherwise oom issue)
  # others tokens are replaced by UNK token 
  # we keep 1 - 199 and 1 is UNK token (so we keep 198 tokens)
  char_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level = True, filters = '', oov_token = 'UNK', num_words = 200)

  if glove_model == None:
    glove_model = download_glove_model(EMBEDDING_SIZE)

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  sequences = glove_model.index2entity + tokenized_questions + tokenized_contexts

  del glove_model # we  don't need anymore the glove model

  tokenizer.fit_on_texts(sequences)
  char_tokenizer.fit_on_texts(dataset['question'].to_list() + contexts.to_list())

  return tokenizer, char_tokenizer


def update_tokenizer(dataset, tokenizer, char_tokenizer):

  tokenized_questions = dataset['question'].apply(word_tokenize).to_list()

  contexts = pd.Series(dataset['context'].unique())
  tokenized_contexts = contexts.apply(word_tokenize).to_list()

  sequences = tokenized_questions + tokenized_contexts
  tokenizer.fit_on_texts(sequences)

  char_tokenizer.fit_on_texts(dataset['question'].to_list() + contexts.to_list())

def get_start_end(row):

  context = row['context']
  answer = row['text']
  tok_answer = word_tokenize(answer)

  _start = context.find(answer)

  if _start == -1:
    # the answer is not in the context
    # maybe due to a typo
    row['start'] = -1
    row['end'] = -1
    return row

  lc = context[:_start]
  lc = word_tokenize(lc)

  start = len(lc)
  end = start + len(tok_answer)

  row['start'] = start
  row['end'] = end

  return row

def tokenize(dataset, tokenizer, char_tokenizer):

  _dataset = dataset.copy()

  tokenized_questions = _dataset['question'].apply(word_tokenize).to_list()
  tokenized_contexts = _dataset['context'].apply(word_tokenize).to_list()

  t_q = tokenizer.texts_to_sequences(tokenized_questions)
  t_c = tokenizer.texts_to_sequences(tokenized_contexts)

  c_q = []
  c_c = []

  for question, context in zip(tokenized_questions, tokenized_contexts):
    _q = char_tokenizer.texts_to_sequences(question)
    _c = char_tokenizer.texts_to_sequences(context)
    c_q.append(_q)
    c_c.append(_c)

  _dataset['tokenized_question'] = t_q
  _dataset['tokenized_context'] = t_c

  _dataset['char_tokenized_question'] = c_q
  _dataset['char_tokenized_context'] = c_c

  return _dataset

def split(dataset, test_size = 0.2, random_state = 42):

  # random_state for deterministic state

  tr, vl = train_test_split(dataset, test_size = test_size, random_state = random_state)
  tr.reset_index(drop = True, inplace = True)
  vl.reset_index(drop = True, inplace = True)

  return tr,vl

def convert(context , coord = None, tokenizer = None):
  if coord:
    start = coord[0]
    end = coord[1]
    if type(context) == str:
      context = word_tokenize(context)
      answer = context[start:end]
      return ' '.join(answer).strip()
    else:
      answer = ''
      for i in range(start, end):
        t = context[i]
        answer+= tokenizer.index_word[t] + ' '
      return answer.strip()
  else:
    if type(context) == str:
      return context
    else:
      c = ''
      for t in context:
        c += tokenizer.index_word[t] + ' '
      return c.strip()

In [None]:
tr_df, vl_df = split(squad_dataset)

In [None]:
tr_df.shape[0],vl_df.shape[0]

(70079, 17520)

Our vocabulary is based on the Glove vocabulary, and we add terms from the training set

In [None]:
tokenizer, char_tokenizer = get_tokenizer(tr_df)

[INFO] downloading glove 300
[INFO] done !


In [None]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429064


1263

We then update our vocabulary with terms from the validation set

In [None]:
update_tokenizer(vl_df, tokenizer, char_tokenizer)

In [None]:
print(len(tokenizer.word_index))
len(char_tokenizer.word_index)

429758


1265

In [None]:
# take a while
tr_df = tr_df.apply(get_start_end, axis = 1)
vl_df = vl_df.apply(get_start_end, axis = 1)

we get rid of samples where the answer doesn't match the context (maybe there is a typo in the answer or the context).  
To avoid to discard many samples, we could lemmatize / stem the text.   
Obviously, lemmatization is a better choice for our task, but if we want a really accurate lemmatization processing, we need to do POS tagging.

In [None]:
tr_df[tr_df['start'] == -1].shape[0], vl_df[vl_df['start'] == -1].shape[0]

(69, 15)

In [None]:
tr_df[tr_df['start'] == -1]

Unnamed: 0,index,question,context,answer_start,text,c_id,start,end
87,56de31984396321400ee2672,on what date was the 2013 human development re...,some countries were not included for various r...,92,"march 14, 2013",2185,-1,-1
3133,56e17f5de3433e1400422f8c,what field studies the placement of catalan in...,"in central catalan, unstressed vowels reduce t...",0,catalan sociolinguistics,3470,-1,-1
3983,56e1b97fcd28a01900c67ad8,what is the official regulating body of valen...,"valencian is classified as a western dialect, ...",168,the valencian academy of language,3488,-1,-1
6198,56e1b4decd28a01900c67a91,what language is the regulator meant to standa...,"in alghero, the iec has adapted its standard t...",103,catalan,3486,-1,-1
6994,56e1b738cd28a01900c67aae,where are the provinces of lleida and tarragona?,"in 2011, the aragonese government passed a dec...",94,western catalonia,3487,-1,-1
...,...,...,...,...,...,...,...,...
66889,572e8003c246551400ce425f,what did great britain gain in the west indies...,"many middle and small powers in europe, unlike...",113,some individual caribbean islands in the west ...,15282,-1,-1
66972,572e81f2cb0c0d14000f1206,"what is the precedent for the ""second hundred ...","the war was successful for great britain, whic...",446,reminiscent of the more famous and compact str...,15283,-1,-1
67376,572e8578c246551400ce42bd,who would sicily and savoy normally align with?,"realizing that war was imminent, prussia preem...",434,"sicily, and savoy, although sided with franco-...",15281,-1,-1
69867,56e180f5e3433e1400422f96,what do the dialects of catalan feature?,catalan sociolinguistics studies the situation...,56,uniformity,3471,-1,-1


In [None]:
vl_df[vl_df['start'] == -1]

Unnamed: 0,index,question,context,answer_start,text,c_id,start,end
171,56e18a90e3433e1400422fac,in what densely populated area is it spoken?,western catalan comprises the two dialects of ...,166,barcelona province,3474,-1,-1
1536,56e1a3cbe3433e1400423066,where is iec's standard used?,"standard catalan, virtually accepted by all sp...",3,the balearic islands,3484,-1,-1
3007,56e18710cd28a01900c679b9,what have a and e done in eastern dialects?,the dialects of the catalan language feature a...,162,merged,3472,-1,-1
4925,56e1b4decd28a01900c67a8e,where is the catalan speaking part of aragon?,"in alghero, the iec has adapted its standard t...",114,la franja,3486,-1,-1
5782,56e18bfbe3433e1400422fb5,how many stressed phonemes are there in catalan?,central catalan is considered the standard pro...,69,seven,3468,-1,-1
5897,56e18710cd28a01900c679b7,what is the major difference between the two b...,the dialects of the catalan language feature a...,118,treatment of unstressed a and e,3472,-1,-1
5937,56e18bfbe3433e1400422fb4,what is the vowel system of catalan?,western catalan comprises the two dialects of ...,50,vulgar latin,3468,-1,-1
6561,56e1b264e3433e14004230a6,where has the iec adapted its standard to the ...,the most notable difference between both stand...,3,alghero,3485,-1,-1
7378,572e81f2cb0c0d14000f1207,what was a later conflict that some considered...,"the war was successful for great britain, whic...",246,to later conflicts like the napoleonic wars,15283,-1,-1
11252,56e17b08cd28a01900c679af,where do you find dialectic vowel reductions?,catalan has inherited the typical vowel system...,176,section pronunciation,3469,-1,-1


In [None]:
# we get rid of samples where the answer doesn't match the context
tr_df = tr_df[tr_df['start'] != -1]
vl_df = vl_df[vl_df['start'] != -1]

In [None]:
tr_df = tokenize(tr_df, tokenizer, char_tokenizer)
vl_df = tokenize(vl_df, tokenizer, char_tokenizer)

In [None]:
tr_df.head()

Unnamed: 0,index,question,context,answer_start,text,c_id,start,end,tokenized_question,tokenized_context,char_tokenized_question,char_tokenized_context
0,572667e6708984140094c4f9,what team had dallas green managed in 1980?,"after over a dozen more subpar seasons, in 198...",154,phillies,8880,29,30,"[11, 309, 49, 11808, 646, 2132, 6, 2627, 9]","[61, 83, 10, 6737, 62, 70020, 1740, 3, 6, 3372...","[[20, 11, 5, 4], [4, 3, 5, 16], [11, 5, 13], [...","[[5, 17, 4, 3, 10], [8, 24, 3, 10], [5], [13, ..."
1,56dec2483277331400b4d712,which candidate withdrew from the presidential...,schwarzenegger's endorsement in the republican...,156,rudy giuliani,2311,23,25,"[27, 2789, 4161, 23, 2, 1534, 698, 6, 417, 4, ...","[1084, 19, 9106, 6, 2, 1467, 477, 4, 2, 420, 1...","[[20, 11, 6, 14, 11], [14, 5, 7, 13, 6, 13, 5,...","[[9, 14, 11, 20, 5, 10, 39, 3, 7, 3, 19, 19, 3..."
2,5726e5995951b619008f81bb,captive animals can distinguish co-inhabitats ...,it has been observed that well-fed predator an...,224,wild ones outside the area,9822,38,43,"[11888, 727, 65, 3733, 419169, 23, 11, 48, 136...","[30, 40, 59, 2316, 20, 63225, 4421, 727, 6, 10...","[[14, 5, 18, 4, 6, 24, 3], [5, 7, 6, 16, 5, 12...","[[6, 4], [11, 5, 9], [22, 3, 3, 7], [8, 22, 9,..."
3,5726486f708984140094c157,the results of which battle allowed the britis...,"after returning from egypt, napoleon engineere...",919,the battle of trafalgar,8418,158,162,"[2, 1324, 4, 27, 326, 495, 2, 132, 8, 6280, 15...","[61, 3986, 23, 598, 3, 545, 9789, 10, 2313, 6,...","[[4, 11, 3], [10, 3, 9, 15, 12, 4, 9], [8, 17]...","[[5, 17, 4, 3, 10], [10, 3, 4, 15, 10, 7, 6, 7..."
4,5730299db2c2fd14005689a7,how was vesey executed in 1822?,"by 1820, charleston's population had grown to ...",382,hanged,15719,74,75,"[44, 13, 25121, 2181, 6, 10202, 9]","[18, 9015, 3, 1909, 19, 104, 49, 2555, 8, 2106...","[[11, 8, 20], [20, 5, 9], [24, 3, 9, 3, 21], [...","[[22, 21], [28, 40, 31, 29], [23], [14, 11, 5,..."


In [None]:
print(tr_df['tokenized_question'].str.len().describe())
vl_df['tokenized_question'].str.len().describe()

count    70010.000000
mean        11.275532
std          3.715821
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         60.000000
Name: tokenized_question, dtype: float64


count    17505.000000
mean        11.335504
std          3.754207
min          1.000000
25%          9.000000
50%         11.000000
75%         13.000000
max         38.000000
Name: tokenized_question, dtype: float64

In [None]:
print(tr_df['tokenized_question'].str.len().quantile(0.99))
vl_df['tokenized_question'].str.len().quantile(0.99)

22.0


23.0

In [None]:
print(tr_df['tokenized_context'].str.len().describe())
vl_df['tokenized_context'].str.len().describe()

count    70010.000000
mean       137.824439
std         56.941382
min         22.000000
25%        102.000000
50%        127.000000
75%        164.000000
max        766.000000
Name: tokenized_context, dtype: float64


count    17505.000000
mean       137.211083
std         55.912622
min         22.000000
25%        102.000000
50%        126.000000
75%        163.000000
max        766.000000
Name: tokenized_context, dtype: float64

In [None]:
print(tr_df['tokenized_context'].str.len().quantile(0.99))
vl_df['tokenized_context'].str.len().quantile(0.99)

324.0


324.0

In [None]:
def len_words(dataset):
  count_q = []
  count_c = []

  for idx, row in dataset.iterrows():
    for w in row['char_tokenized_question']:
      l = len(w)
      count_q.append(l)
      
    for w in row['char_tokenized_context']:
      m = len(w)
      count_c.append(m)
  
  return pd.Series(count_q), pd.Series(count_c)

t_q,t_c = len_words(tr_df)
v_q,v_c = len_words(vl_df)

In [None]:
print(t_q.describe())
t_c.describe()

count    789400.000000
mean          4.447926
std           2.677579
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          30.000000
dtype: float64


count    9.649089e+06
mean     4.626070e+00
std      2.969605e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [None]:
print(v_q.describe())
v_c.describe()

count    198428.000000
mean          4.453232
std           2.686696
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          24.000000
dtype: float64


count    2.401880e+06
mean     4.629710e+00
std      2.972670e+00
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      7.000000e+00
max      3.700000e+01
dtype: float64

In [None]:
print(t_q.quantile(0.99))
t_c.quantile(0.99)

12.0


13.0

In [None]:
print(v_q.quantile(0.99))
v_c.quantile(0.99)

12.0


13.0

There are obviously some outliers. We are compeled to get rid of some samples because of memory issues.

We will get rid of contexts that have more than 325 words and questions that have more than 23 words.

We will set the length of a word to 13 characters

In [None]:
QUESTION_MAXLEN = 23
CONTEXT_MAXLEN = 325
WORD_MAXLEN = 13

In [None]:
tr_df.shape, vl_df.shape

((70010, 12), (17505, 12))

In [None]:
tr_df = tr_df[(tr_df['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (tr_df['tokenized_context'].str.len() <= CONTEXT_MAXLEN) & (tr_df['start'] <= CONTEXT_MAXLEN) & (tr_df['end'] <= CONTEXT_MAXLEN) ].reset_index(drop = True)
vl_df = vl_df[(vl_df['tokenized_question'].str.len() <= QUESTION_MAXLEN) & (vl_df['tokenized_context'].str.len() <= CONTEXT_MAXLEN) & (vl_df['start'] <= CONTEXT_MAXLEN) & (vl_df['end'] <= CONTEXT_MAXLEN) ].reset_index(drop = True)

In [None]:
tr_df.shape[0], vl_df.shape[0]

(68842, 17199)

In [None]:
 print(f' we get rid of : {SAMPLES - (tr_df.shape[0] + vl_df.shape[0])} samples')

 we get rid of : 1558 samples


In [None]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)
%mv tokenizer.pickle 'drive/MyDrive/NLP/data/' 

In [None]:
with open('char_tokenizer.pickle', 'wb') as handle:
    pickle.dump(char_tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)
%mv char_tokenizer.pickle 'drive/MyDrive/NLP/data/' 

In [None]:
tr_padded_questions = tf.keras.preprocessing.sequence.pad_sequences(tr_df['tokenized_question'], padding = 'post', maxlen = QUESTION_MAXLEN)
tr_padded_contexts = tf.keras.preprocessing.sequence.pad_sequences(tr_df['tokenized_context'], padding = 'post', maxlen = CONTEXT_MAXLEN)

In [None]:
np.save('drive/MyDrive/NLP/data/tr_padded_questions__{}.npy'.format(QUESTION_MAXLEN), tr_padded_questions)
np.save('drive/MyDrive/NLP/data/tr_padded_contexts__{}.npy'.format(CONTEXT_MAXLEN), tr_padded_contexts)

In [None]:
vl_padded_questions = tf.keras.preprocessing.sequence.pad_sequences(vl_df['tokenized_question'], padding = 'post', maxlen = QUESTION_MAXLEN)
vl_padded_contexts = tf.keras.preprocessing.sequence.pad_sequences(vl_df['tokenized_context'], padding = 'post', maxlen = CONTEXT_MAXLEN)

In [None]:
np.save('drive/MyDrive/NLP/data/vl_padded_questions__{}.npy'.format(QUESTION_MAXLEN), vl_padded_questions)
np.save('drive/MyDrive/NLP/data/vl_padded_contexts__{}.npy'.format(CONTEXT_MAXLEN), vl_padded_contexts)

In [None]:
pad_char_c = np.zeros((vl_df.shape[0], QUESTION_MAXLEN, WORD_MAXLEN), dtype = np.int32)

for i, value in vl_df['char_tokenized_question'].iteritems():
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_MAXLEN, truncating = 'post')
  to_add = QUESTION_MAXLEN - v.shape[0]
  add = np.zeros((to_add, WORD_MAXLEN))
  arr = np.vstack([v,add])
  pad_char_c[i] = arr

np.save('drive/MyDrive/NLP/data/vl_char_padded_questions__{}.npy'.format(WORD_MAXLEN), pad_char_c)

In [None]:
pad_char_c = np.zeros((tr_df.shape[0], QUESTION_MAXLEN, WORD_MAXLEN), dtype = np.int32)

for i, value in tr_df['char_tokenized_question'].iteritems():
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_MAXLEN, truncating = 'post')
  to_add = QUESTION_MAXLEN - v.shape[0]
  add = np.zeros((to_add, WORD_MAXLEN))
  arr = np.vstack([v,add])
  pad_char_c[i] = arr

np.save('drive/MyDrive/NLP/data/tr_char_padded_questions__{}.npy'.format(WORD_MAXLEN), pad_char_c)

In [None]:
del pad_char_c # to free memory

In [None]:
pad_char_q = np.zeros((vl_df.shape[0], CONTEXT_MAXLEN, WORD_MAXLEN), dtype = np.int32)

for i, value in vl_df['char_tokenized_context'].iteritems():
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_MAXLEN, truncating = 'post')
  to_add = CONTEXT_MAXLEN - v.shape[0]
  add = np.zeros((to_add, WORD_MAXLEN))
  arr = np.vstack([v,add])
  pad_char_q[i] = arr

np.save('drive/MyDrive/NLP/data/vl_char_padded_contexts__{}.npy'.format(WORD_MAXLEN), pad_char_q)

In [None]:
pad_char_q =  np.zeros((tr_df.shape[0], CONTEXT_MAXLEN, WORD_MAXLEN))

for i, value in tr_df['char_tokenized_context'].iteritems():
  v = tf.keras.preprocessing.sequence.pad_sequences(value, padding = 'post', maxlen = WORD_MAXLEN, truncating = 'post')
  to_add = CONTEXT_MAXLEN - v.shape[0]
  add = np.zeros((to_add, WORD_MAXLEN))
  arr = np.vstack([v,add])
  pad_char_q[i] = arr
  
np.save('drive/MyDrive/NLP/data/tr_char_padded_contexts__{}.npy'.format(WORD_MAXLEN), pad_char_q)

In [None]:
del pad_char_q # to free memory

In [None]:
num_classes = CONTEXT_MAXLEN
y_start_train = tf.keras.utils.to_categorical(tr_df['start'].values, num_classes)
y_end_train = tf.keras.utils.to_categorical(tr_df['end'].values, num_classes)

y_start_valid = tf.keras.utils.to_categorical(vl_df['start'].values, num_classes)
y_end_valid = tf.keras.utils.to_categorical(vl_df['end'].values, num_classes)

In [None]:
np.save('drive/MyDrive/NLP/data/tr_y_start.npy', y_start_train)
np.save('drive/MyDrive/NLP/data/tr_y_end.npy', y_end_train)

In [None]:
np.save('drive/MyDrive/NLP/data/vl_y_start.npy', y_start_valid)
np.save('drive/MyDrive/NLP/data/vl_y_end.npy', y_end_valid)

In fact we have created a character tokenizer but we won't work at the character level.  
Indeed, we run oom when we try to work at this level.

**EDIT** : it seems we can work at the character level if we significantly reduce `WORD_MAXLEN` (from 37 to 13) and only treat the first 199 tokens as real tokens and the others as UNK tokens