In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 7.8MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 17.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |█

# Importing the needed libraries

In [32]:
import pandas as pd
import numpy as np
import json

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import warnings
warnings.simplefilter('ignore')

import gc


In [4]:
MAX_LEN =  512
EPOCHS = 5
MODEL = 'distilroberta-base'
BATCH_SIZE = 16

# Data

### Download and Restructure Dataset

In [5]:
!wget https://data.deepai.org/squad1.1.zip

--2021-06-11 17:25:00--  https://data.deepai.org/squad1.1.zip
Resolving data.deepai.org (data.deepai.org)... 138.201.36.183
Connecting to data.deepai.org (data.deepai.org)|138.201.36.183|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9152254 (8.7M) [application/x-zip-compressed]
Saving to: ‘squad1.1.zip’


2021-06-11 17:25:02 (7.53 MB/s) - ‘squad1.1.zip’ saved [9152254/9152254]



In [6]:
!unzip squad1.1.zip

Archive:  squad1.1.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


In [7]:
with open('train-v1.1.json','rb') as f:
    train = json.load(f)['data']

with open('dev-v1.1.json','rb') as f:
    test = json.load(f)['data']

## Helper Functions

In [17]:
def preprocess_dataset(temp):
  context = [] 
  answer_start = [] 
  answer_end = []
  answer_text = [] 
  question = [] 

  ## 
  for i in temp:
    for z in i['paragraphs']:
      for ans in z['qas']:
        for k in ans['answers']:
          answer_start.append(k['answer_start'])
          answer_text.append(k['text'])
          answer_end.append(len(k['text']) + k['answer_start'])
          context.append(z['context'])
          question.append(ans['question'])

  return {'question':question,'context':context,'answer_text':answer_text,'answer_start':answer_start,'answer_end':answer_end}




In [18]:
def prepare_train_features(examples):
    
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation = 'only_second',
        max_length=MAX_LEN,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        is_split_into_words=False
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    
    for i, offsets in enumerate(offset_mapping):
        
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

      
        sequence_ids = tokenized_examples.sequence_ids(i)


        sample_index = sample_mapping[i]
        
        answers = examples["answer_start"][sample_index]
        text = examples["answer_text"][sample_index]

        
        # Start/end character index of the answer in the text.
        start_char = answers
        end_char = start_char + len(text) 
        

        # Start token index of the context text.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        
          
        
        # End token index of the context text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1 :
            token_end_index -= 1

        # Detect if the answer is out of the context (in which case this feature is labeled with the CLS index).
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)
        

    return tokenized_examples

In [19]:
train = preprocess_dataset(train)
test = preprocess_dataset(test)

In [20]:
train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)

In [67]:
train_df

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",Saint Bernadette Soubirous,515,541
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",a copper statue of Christ,188,213
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",the Main Building,279,296
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",a Marian place of prayer and reflection,381,420
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",a golden statue of the Virgin Mary,92,126
...,...,...,...,...,...
87594,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",Oregon,229,235
87595,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",Rangoon,414,421
87596,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",Minsk,476,481
87597,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",1975,199,203


In [68]:
test_df

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,Denver Broncos,177,191
3,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,Carolina Panthers,249,266
6,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"Santa Clara, California",403,426
7,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,Levi's Stadium,355,369
8,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,Levi's Stadium in the San Francisco Bay Area a...,355,427
...,...,...,...,...,...
34711,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",slug,274,278
34712,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",metric slug,267,278
34715,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",the metric slug,263,278
34716,What seldom used term of a unit of force equal...,"The pound-force has a metric counterpart, less...",kip,712,715


In [69]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87599 entries, 0 to 87598
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question      87599 non-null  string
 1   context       87599 non-null  string
 2   answer_text   87599 non-null  string
 3   answer_start  87599 non-null  int32 
 4   answer_end    87599 non-null  int32 
dtypes: int32(2), string(3)
memory usage: 2.7 MB


In [70]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18162 entries, 0 to 34721
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question      18162 non-null  string
 1   context       18162 non-null  string
 2   answer_text   18162 non-null  string
 3   answer_start  18162 non-null  int32 
 4   answer_end    18162 non-null  int32 
dtypes: int32(2), string(3)
memory usage: 709.5 KB


In [71]:
train_df = train_df.astype({'question':'string','context':'string','answer_text':'string','answer_start':'int32','answer_end':'int32'})
test_df = test_df.astype({'question':'string','context':'string','answer_text':'string','answer_start':'int32','answer_end':'int32'})

In [72]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87599 entries, 0 to 87598
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question      87599 non-null  string
 1   context       87599 non-null  string
 2   answer_text   87599 non-null  string
 3   answer_start  87599 non-null  int32 
 4   answer_end    87599 non-null  int32 
dtypes: int32(2), string(3)
memory usage: 2.7 MB


In [73]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18162 entries, 0 to 34721
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   question      18162 non-null  string
 1   context       18162 non-null  string
 2   answer_text   18162 non-null  string
 3   answer_start  18162 non-null  int32 
 4   answer_end    18162 non-null  int32 
dtypes: int32(2), string(3)
memory usage: 709.5 KB


In [74]:
train_df = train_df.sample(70000)

In [75]:
df_obj = train_df.select_dtypes(['string'])
train_df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

In [76]:
df_obj = test_df.select_dtypes(['string'])
test_df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

### Tokenize dataset

In [77]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Divide the train data to train and val

In [78]:
train,val = train_test_split(train_df,test_size=0.1)

In [79]:
train = train.to_dict('l')
val = val.to_dict('l')
test = test_df.to_dict('l')

In [80]:
tokenized_train = prepare_train_features(train)

In [81]:
tokenized_val = prepare_train_features(val)

In [82]:
tokenized_test = prepare_train_features(test)

In [83]:
del train,val,test,train_df,test_df
gc.collect()

50

## Model Creation

In [84]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids) 

In [85]:
train_dataset = SquadDataset(tokenized_train)

In [86]:
val_dataset = SquadDataset(tokenized_val)

In [87]:
test_dataset = SquadDataset(tokenized_test)

In [88]:
del tokenized_train,tokenized_val,tokenized_test
gc.collect()

302

In [89]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be 

In [90]:
from tqdm import tqdm

In [91]:
def train_fn (data_loader, model, optimizer, device, scheduler):
        
        model.train()
        
        final_loss = 0
        for data in tqdm(data_loader,total=len(data_loader)):
            for k,v in data.items():
                data[k] = v.to(device)
            optim.zero_grad()
            outputs = model(**data)
            loss = outputs[0]
            loss.backward()
            optimizer.step()            
            final_loss += loss.item()
    
        return final_loss / len(data_loader)

In [92]:
def eval_fn (data_loader, model, device):
        model.train()
        final_loss = 0
        for data in tqdm(data_loader,total=len(data_loader)):
            for k,v in data.items():
                data[k] = v.to(device)

            loss = model(**data)[0]
            loss.backward()
            final_loss += loss.item()
    
        return final_loss / len(data_loader)


In [None]:
tb = SummaryWriter()

device = torch.device('cuda')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)
num_train_steps = int(len(train_dataset) / BATCH_SIZE * EPOCHS)

scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=0, num_training_steps=num_train_steps
)

best_loss = np.inf
for epoch in range(EPOCHS):
    train_loss = train_fn(train_loader, model, optim, device, scheduler)
    tb.add_scalar('Traning Loss',train_loss,epoch,test_loss = eval_fn(val_loader, model, device))
    tb.add_scalar('Validation Loss',test_loss,epoch)
    
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(),'model.h5')
        best_loss = test_loss
    

print('Finished Training')
model.eval()
tb.close() 

100%|██████████| 3944/3944 [57:29<00:00,  1.14it/s]
  1%|          | 3/439 [00:02<06:10,  1.18it/s]

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./runs

In [None]:
del train_dataset

In [None]:
torch.save(model,'model.h5')

# Model Evaluation

In [8]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
def normalize(sentence):

    tokens =word_tokenize(sentence)
    new_words= [word for word in tokens if word. isalnum()]

    return new_words
  

In [10]:
def calc_f1 (predAns, trueAns):
  norm_pred = normalize(predAns)
  norm_trueAns = normalize(trueAns)
  lb = LabelEncoder()
  lb.fit(norm_trueAns)
  y_true = lb.transform(norm_trueAns).tolist()

  y_pred = [-1 if s not in lb.classes_ else lb.transform([s])[0] for s in norm_pred]

  if(len(y_true) > len(y_pred)):

    y_pred = y_pred + ([-1] * (len(y_true) - len(y_pred)))
  elif (len(y_true) < len(y_pred)):
    y_true = y_true + ([-2] * (len(y_pred) - len(y_true)))
  f1 = f1_score(y_true,y_pred,average='macro')
  return f1
  

In [11]:
def calc_EM (predAns, trueAns):
  norm_pred = normalize(predAns)
  norm_trueAns = normalize(trueAns)
  if norm_pred == norm_trueAns:
    return 1
  return 0

In [12]:
!gdown --id 1oaW7mUm_6Tr-GfaTE4oRzuAePAIYmHyV

Downloading...
From: https://drive.google.com/uc?id=1oaW7mUm_6Tr-GfaTE4oRzuAePAIYmHyV
To: /content/model.h5
326MB [00:05, 60.4MB/s]


In [13]:
import torch
qaModel = torch.load('/content/model.h5')

In [14]:
def answer_question(question, context,model):

    inputs = tokenizer(question.strip(), context.strip(),max_length=512,padding=True,truncation='only_second', add_special_tokens=True, return_tensors="pt",return_offsets_mapping=True)
    offset_mapping = inputs.pop('offset_mapping').tolist()[0]
    inputs = inputs.to('cuda')
    input_ids = inputs["input_ids"].tolist()[0]
    # print(inputs)
    outputs = model(**inputs) 
    answer_start_scores = outputs.start_logits 
    answer_end_scores = outputs.end_logits 
    answer_start = torch.argmax(answer_start_scores)
    # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1
    # Get the most likely end of answer with the argmax of the score
    predict_index = offset_mapping[answer_start:answer_end]
    # print('offset:' ,predict_index)
    # print(predict_index)
   


    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    # print(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    if (len(predict_index) == 0 ):
      start_end = [0,0]
    else:
      start_end = [predict_index[0][0] , predict_index[-1][-1]]


    return answer , start_end



In [24]:
test_df.drop_duplicates(inplace=True)

In [25]:
test_df

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,Denver Broncos,177,191
3,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,Carolina Panthers,249,266
6,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"Santa Clara, California",403,426
7,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,Levi's Stadium,355,369
8,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,Levi's Stadium in the San Francisco Bay Area a...,355,427
...,...,...,...,...,...
34711,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",slug,274,278
34712,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",metric slug,267,278
34715,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",the metric slug,263,278
34716,What seldom used term of a unit of force equal...,"The pound-force has a metric counterpart, less...",kip,712,715


In [33]:
f1_scores = []
em_scores = []
for i in range(20):
  question = test_df.iloc[i,0]
  context = test_df.iloc[i,1]
  true_start = test_df.iloc[i,3]
  true_end = test_df.iloc[i,4]
  true_answer = test_df.iloc[i,2]
  # print(true_start,true_end)
  answer,start_end = answer_question(question,context,qaModel)
  # f1 = calc_f1_score(true_start=true_start,true_end=true_end,pred_stat=start_end[0],pred_end=start_end[1])
  f1 = calc_f1(answer,true_answer)

  em = calc_EM(answer,true_answer)
  print('true answer: ',true_answer)
  print('F1 Score:' ,f1)
  print('Exact Match:',em)
  f1_scores.append(f1)
  em_scores.append(em)

true answer:  Denver Broncos
F1 Score: 1.0
Exact Match: 1
true answer:  Carolina Panthers
F1 Score: 0.0
Exact Match: 0
true answer:  Santa Clara, California
F1 Score: 0.0
Exact Match: 0
true answer:  Levi's Stadium
F1 Score: 0.5
Exact Match: 0
true answer:  Levi's Stadium in the San Francisco Bay Area at Santa Clara, California.
F1 Score: 1.0
Exact Match: 1
true answer:  Denver Broncos
F1 Score: 1.0
Exact Match: 1
true answer:  gold
F1 Score: 1.0
Exact Match: 1
true answer:  gold
F1 Score: 1.0
Exact Match: 1
true answer:  "golden anniversary"
F1 Score: 1.0
Exact Match: 1
true answer:  gold-themed
F1 Score: 0.0
Exact Match: 0
true answer:  "golden anniversary
F1 Score: 1.0
Exact Match: 1
true answer:  February 7, 2016
F1 Score: 1.0
Exact Match: 1
true answer:  February 7
F1 Score: 0.5
Exact Match: 0
true answer:  American Football Conference
F1 Score: 1.0
Exact Match: 1
true answer:  gold
F1 Score: 0.0
Exact Match: 0
true answer:  American Football Conference
F1 Score: 1.0
Exact Match: 

In [34]:
np.mean(f1_scores)

0.725

In [35]:
np.mean(em_scores)

0.65

In [38]:
f1_scores = []
em_scores = []
for i in range(len(test_df)-1):
  question = test_df.iloc[i,0]
  context = test_df.iloc[i,1]
  true_start = test_df.iloc[i,3]
  true_end = test_df.iloc[i,4]
  true_answer = test_df.iloc[i,2]
  # print(true_start,true_end)
  answer,start_end = answer_question(question,context,qaModel)
  # f1 = calc_f1_score(true_start=true_start,true_end=true_end,pred_stat=start_end[0],pred_end=start_end[1])
  f1 = calc_f1(answer,true_answer)

  em = calc_EM(answer,true_answer)
  f1_scores.append(f1)
  em_scores.append(em)

In [41]:
np.mean(em_scores)

0.47981939320522

In [65]:
scores =[]
for f in f1_scores:
  if ~np.isnan(f):
    scores.append(f)

In [66]:
np.mean(scores)

0.5108562907964285