# **INF8460 A20 Project: Fine-tuning for extractive question answering**

<br>

Equipe 8:


*   Cedric Sadeu 
*   Mamoudou Sacko 
*   Oumayma Messoussi 
<br>

---

<br>

In [1]:
!pip install sent2vec transformers pytorch-pretrained-bert -q

[K     |████████████████████████████████| 1.4MB 6.1MB/s 
[K     |████████████████████████████████| 133kB 45.6MB/s 
[K     |████████████████████████████████| 890kB 41.7MB/s 
[K     |████████████████████████████████| 2.9MB 43.8MB/s 
[K     |████████████████████████████████| 133kB 55.3MB/s 
[K     |████████████████████████████████| 71kB 9.7MB/s 
[K     |████████████████████████████████| 7.1MB 59.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: botocore 1.19.36 has requirement urllib3<1.27,>=1.25.4; python_version != "3.4", but you'll have urllib3 1.24.3 which is incompatible.[0m


**Mount Google Drive**


In [None]:

# Mount Google Drive inside the Colab VM and list the project folder.
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/My Drive/Colab Notebooks/INF8460/Project/'

# Add the project folder to sys.path so the .py modules stored there can be imported.
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/INF8460/Project/')


Mounted at /content/drive
bert.py		      data		inf8460_projet_A20_equipe8_clean.ipynb
correction_script.py  data_handling.py	output
create_embeddings.py  hub.py		__pycache__


**Import**

In [None]:
import torch
import hub

import importlib
# Re-execute the already imported hub module so its current source is used.
importlib.reload(hub)
# NOTE(review): the star import hides where names come from; helpers that are not
# defined in this notebook (e.g. load_data) are presumably provided here — confirm
# and prefer explicit imports.
from hub import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


**Load SQuAD 2.0 data**

In [None]:
import json
from pathlib import Path

def read_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    One entry is produced per (question, answer) pair, so a question with
    several answers appears several times and a question whose ``answers``
    list is empty contributes nothing.

    Args:
        path: Location of the JSON file (str or path-like).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists;
        each answer is the raw answer dict found in the file.
    """
    with open(Path(path), 'rb') as handle:
        squad_dict = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            paragraph_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                question_text = qa_entry['question']
                # Emit one training triple per gold answer of this question.
                for gold in qa_entry['answers']:
                    contexts.append(paragraph_text)
                    questions.append(question_text)
                    answers.append(gold)

    return contexts, questions, answers

# Read the SQuAD 2.0 train and dev files from the project's data folder on Drive.
# NOTE(review): the same six variable names are reassigned further down by the cell
# that calls format(...) on the custom data, so when the notebook is run top to
# bottom the SQuAD lists loaded here are overwritten before training.
train_contexts, train_questions, train_answers = read_squad('/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('/content/drive/My Drive/Colab Notebooks/INF8460/Project/data/dev-v2.0.json')



**Load and format custom data for training**

In [None]:
def format(questions, question_context_id, question_answer, contexts_id, contexts_text):
    """Pair every question with a passage and a SQuAD-style answer dict.

    Args:
        questions: Sequence of question strings (returned unchanged).
        question_context_id: For each question, the id of its passage, or NaN
            when the question has no associated passage.
        question_answer: For each question, the expected answer.
        contexts_id: List of passage ids, parallel to ``contexts_text``.
        contexts_text: List of passage texts.

    Returns:
        ``(contexts, questions, answers)`` where ``contexts[i]`` is the passage
        chosen for question ``i`` and ``answers[i]`` is a dict with the keys
        ``'text'`` (always a ``str``) and ``'answer_start'``. Questions without
        a passage receive a randomly drawn passage and an empty answer;
        questions whose answer does not occur verbatim in their passage also
        receive an empty answer.

    Raises:
        ValueError: If a non-NaN context id is absent from ``contexts_id``.
    """
    # Imported locally because the notebook only imports these modules in a
    # later cell than the one that calls this function.
    import math
    import random

    # One-off id -> position table; setdefault keeps the first occurrence,
    # matching what list.index would return, without a linear scan per question.
    position_by_id = {}
    for position, known_id in enumerate(contexts_id):
        position_by_id.setdefault(known_id, position)

    contexts = []
    answers = []
    unmatched = []

    for i, _ in enumerate(questions):
        context_id = question_context_id[i]
        if math.isnan(float(context_id)):
            # No passage for this question: use a random one with an empty answer.
            contexts.append(contexts_text[random.randint(0, len(contexts_id) - 1)])
            answers.append({'text': '', 'answer_start': 0})
            continue

        if context_id not in position_by_id:
            raise ValueError(f"{context_id!r} is not in contexts_id")
        context = contexts_text[position_by_id[context_id]]
        contexts.append(context)

        raw_answer = question_answer[i]
        # A missing (NaN) answer must not be searched for as the literal "nan".
        answer_missing = isinstance(raw_answer, float) and math.isnan(raw_answer)
        answer_text = str(raw_answer)

        if not answer_missing and answer_text in context:
            # Store the string form so downstream len()/slicing works even when
            # the raw answer is numeric.
            answers.append({'answer_start': context.index(answer_text), 'text': answer_text})
        else:
            unmatched.append(i)
            answers.append({'answer_start': 0, 'text': ''})

    if unmatched:
        print(f"{len(unmatched)} answer(s) not found in their context; question indices: {unmatched}")
    return contexts, questions, answers


In [None]:
from transformers import DistilBertTokenizerFast

# Load the corpus and the train/validation question sets through load_data.
# NOTE(review): load_data is not defined in this notebook (presumably it comes from
# hub); the second positional argument of the "corpus" call is assumed to be
# force_refresh, as in the two keyword calls below — confirm.
corpus_data = load_data("corpus", 0)
train_data = load_data("train", force_refresh = 0)
vals_data = load_data("validation", force_refresh = 0)


# Build (contexts, questions, answers) per split: elements 1, 2 and 3 of each split are
# passed as questions, question context ids and answers; corpus_data[0] and
# corpus_data[1] are passed as the passage ids and passage texts.
train_contexts, train_questions, train_answers = format(train_data[1], train_data[2], train_data[3], corpus_data[0], corpus_data[1])
val_contexts, val_questions, val_answers = format(vals_data[1], vals_data[2], vals_data[3], corpus_data[0], corpus_data[1])

**Fine-tuning with custom data**

In [None]:
# Adapted from the Hugging Face "Fine-tuning with custom datasets" guide:
# https://huggingface.co/transformers/custom_datasets.html


import torch
import math
import random

def add_end_idx(answers, contexts):
    """Add an ``'answer_end'`` character offset to every answer dict, in place.

    For each ``(answer, context)`` pair the gold text is looked for at the
    recorded ``'answer_start'``, then one and two characters earlier, because
    recorded offsets are sometimes off by a character or two. When a match is
    found, ``'answer_start'`` is corrected and ``'answer_end'`` is set to the
    exclusive end offset.

    When the gold text is empty or cannot be located at any of those offsets,
    the answer is given the placeholder span ``answer_start = 0``,
    ``answer_end = 1`` so that start never exceeds end.

    Args:
        answers: List of dicts with the keys ``'text'`` and ``'answer_start'``;
            mutated in place.
        contexts: List of passage strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if end_idx == start_idx:
            # Empty gold text: use the placeholder span.
            answer['answer_start'] = 0
            answer['answer_end'] = 1
            continue

        # Try the recorded offset first, then one and two characters earlier.
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            # A negative start would wrap around to the end of the string.
            if shifted_start >= 0 and context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break
        else:
            # Gold text not found near the recorded offset: reset BOTH ends so the
            # span stays well-formed instead of keeping a stale start offset.
            answer['answer_start'] = 0
            answer['answer_end'] = 1

def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans to token positions, in place.

    For answer ``i`` the token containing ``answer_start`` and the token
    containing the last answer character (``answer_end - 1``) are looked up
    with ``encodings.char_to_token``. The resulting lists are stored on
    ``encodings`` under ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(mapping)``; mutated in place.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``,
            parallel to the encoded examples.
        max_length: Position used when ``char_to_token`` returns ``None``
            (the character is not covered by any token of the example, e.g.
            after truncation). Defaults to the ``model_max_length`` of the
            notebook-level ``tokenizer``.

    Returns:
        None.
    """
    if max_length is None:
        # Resolved lazily so callers that pass max_length do not need the global.
        max_length = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer['answer_start'])
        end_token = encodings.char_to_token(i, answer['answer_end'] - 1)
        # None means the answer passage has been truncated away.
        start_positions.append(max_length if start_token is None else start_token)
        end_positions.append(max_length if end_token is None else end_token)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

class SquadDataset(torch.utils.data.Dataset):
    """Torch dataset view over tokenizer encodings.

    ``encodings`` must provide ``items()`` yielding ``(name, per-example
    values)`` pairs and an ``input_ids`` attribute whose length is the number
    of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is the number of encoded input_id rows.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one example by taking row `idx` of every encoded field.
        example = {}
        for field_name, field_values in self.encodings.items():
            example[field_name] = torch.tensor(field_values[idx])
        return example

**Loading pretrained model**

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, DistilBertTokenizerFast
import csv
import json
import io
import os
import torch

model_name = "deepset/electra-base-squad2"
# Run the pipeline on the first GPU when one is available and fall back to CPU (-1)
# otherwise, so this cell does not fail on a runtime without CUDA.
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name, device=0 if torch.cuda.is_available() else -1)

# The model fine-tuned below is the one held by the pipeline.
model = nlp.model
# NOTE(review): this tokenizer is loaded from 'distilbert-base-uncased', not from
# model_name, so it belongs to a different checkpoint than the ELECTRA model above.
# Confirm that the vocabularies and expected model inputs match, or load a fast
# tokenizer from model_name and update the training/inference cells together.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=635.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435618605.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=200.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




**Training**

In [None]:
# Add 'answer_end' to every answer dict (add_end_idx works in place).
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

# Tokenize each (context, question) pair; the context is passed as the first sequence.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# Store start/end token positions on the encodings (also done in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

# Wrap the encodings so a DataLoader can index them.
# NOTE(review): val_dataset is not used by any later cell visible in this notebook.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


from torch.utils.data import DataLoader
from transformers import AdamW

# Train on the GPU when available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

# NOTE(review): shuffling is not seeded, so runs are not reproducible as written.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

optim = AdamW(model.parameters(), lr=3e-5)


# Two passes over the training set, one optimizer step per batch.
# NOTE(review): the loss is never logged and no validation pass is run here.
for epoch in range(2):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # The first output is the loss because label positions were supplied.
        loss = outputs[0]
        loss.backward()
        optim.step()

# Switch back to evaluation mode before saving / inference.
model.eval()

import torch
import io
import os
output_path = "/content/drive/My Drive/Colab Notebooks/INF8460/Project/output"
# Create the folder when it is missing so the save cannot fail after training.
os.makedirs(output_path, exist_ok=True)
# Persist only the weights (state_dict) of the fine-tuned model.
torch.save(model.state_dict(), os.path.join(output_path, "model_electra_v2.pt"))

**Sample answer extraction on validation question**

In [None]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
question, text = "What is the oldest piece of Qutb Shahi architecture in Hyderabad?", "Qutb Shahi architecture of the 16th and early 17th centuries followed classical Persian architecture featuring domes and colossal arches. The oldest surviving Qutb Shahi structure in Hyderabad is the ruins of Golconda fort built in the 16th century. The Charminar, Mecca Masjid, Charkaman and Qutb Shahi tombs are other existing structures of this period. Among these the Charminar has become an icon of the city; located in the centre of old Hyderabad, it is a square structure with sides 20 m (66 ft) long and four grand arches each facing a road. At each corner stands a 56 m (184 ft)-high minaret. Most of the historical bazaars that still exist were constructed on the street north of Charminar towards Golconda fort. The Charminar, Qutb Shahi tombs and Golconda fort are considered to be monuments of national importance in India; in 2010 the Indian government proposed that the sites be listed for UNESCO World Heritage status.:11â€“18"

# Encode the pair with the question first and the passage second.
# NOTE(review): the training cell encodes (context, question), i.e. the opposite
# order — confirm which order is intended and use it in both places.
inputs = tokenizer(question, text, return_tensors='pt').to(device)

# Inference only: no gradients are needed and no label positions are passed,
# so no loss is computed.
with torch.no_grad():
    outputs = model(**inputs)
start_scores = outputs.start_logits
end_scores = outputs.end_logits

input_ids = inputs["input_ids"].tolist()[0]

# Most likely first answer token, and one past the most likely last answer token.
answer_start = torch.argmax(start_scores).item()
answer_end = torch.argmax(end_scores).item() + 1
# An empty slice (end <= start) yields an empty answer string.
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

print(answer_start)
print(answer_end)

print(f"Question: {question}")
print(f"Answer: {answer}")
