In [None]:
!pip install transformers
!pip install tensorflow_addons
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.8/di

In [None]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from typing import *

import os
import urllib.request
from tqdm import tqdm
from copy import deepcopy
import tensorflow as tf
import tensorflow_addons as tfa
import torch

from transformers import BertForQuestionAnswering, TFAutoModelForQuestionAnswering
from transformers import AutoTokenizer, BertTokenizer

In [None]:
class DownloadProgressBar(tqdm):
  """tqdm bar whose `update_to` can be handed to `urlretrieve` as its report hook."""

  def update_to(self, b=1, bsize=1, tsize=None):
    """Move the bar to `b * bsize` units, adopting `tsize` as the total when given."""
    if tsize is not None:
      self.total = tsize
    # update() takes an increment, so subtract what the bar has already counted.
    transferred = b * bsize
    self.update(transferred - self.n)

def download_url(url, output_path):
  """Fetch `url` into `output_path` while showing a progress bar labelled with the file name."""
  file_label = url.split('/')[-1]
  progress = DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=file_label)
  with progress as bar:
    urllib.request.urlretrieve(url, filename=output_path, reporthook=bar.update_to)

def download_data(data_path, url_path, suffix):
  """Download one CoQA split to `<data_path>/<suffix>.json` unless it is already there.

  Args:
    data_path: directory that holds the dataset files; created when missing.
    url_path: URL of the JSON file to fetch.
    suffix: split name, used as the file stem and in the progress message.
  """
  # exist_ok avoids the check-then-create race of a separate os.path.exists test.
  os.makedirs(data_path, exist_ok=True)

  target_path = os.path.join(data_path, f'{suffix}.json')
  if os.path.exists(target_path):
    return

  print(f"Downloading CoQA {suffix} data split... (it may take a while)")
  # Download under a temporary name and rename only on success, so an
  # interrupted transfer never leaves a truncated file that a later run would
  # mistake for a finished download.
  partial_path = target_path + '.part'
  download_url(url=url_path, output_path=partial_path)
  os.replace(partial_path, target_path)
  print("Download Completed!")


In [None]:
# CoQA publishes a train file and a dev file; the dev file is stored as the
# 'test' split here.
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

for split_suffix, split_url in (('train', train_url), ('test', test_url)):
  download_data(data_path='coqa', url_path=split_url, suffix=split_suffix)

In [None]:
# Read the training split from the same relative directory download_data wrote
# it to (instead of a hard-coded absolute path); the context manager closes the
# file handle as soon as the JSON is parsed.
with open(os.path.join('coqa', 'train.json'), encoding='utf-8') as train_file:
  train_data = json.load(train_file)

# One row per question (carrying its story metadata) and one row per answer,
# paired on the (id, turn_id) key both tables share.
qas = pd.json_normalize(train_data['data'], ['questions'], ['source', 'id', 'story'])
ans = pd.json_normalize(train_data['data'], ['answers'], ['id'])
train_df = pd.merge(qas, ans, on=['id', 'turn_id'])

In [None]:
# Lower-case the question text once, then pull out its leading word and its
# leading two whitespace-separated words.
questions_lower = train_df['input_text_x'].str.lower()
train_df['q_first_word'] = questions_lower.str.extract(r'(\w+)')
train_df['q_first_two_words'] = questions_lower.str.extract(r'^((?:\S+\s+){1}\S+).*')

In [None]:
# Keep only the rows whose answer text is something other than 'unknown'.
has_known_answer = train_df['input_text_y'].ne('unknown')
train_df = train_df[has_known_answer]

In [None]:
# Read the test split from the same relative directory download_data wrote it
# to (instead of a hard-coded absolute path); the context manager closes the
# file handle as soon as the JSON is parsed.
with open(os.path.join('coqa', 'test.json'), encoding='utf-8') as test_file:
  test_data = json.load(test_file)

# Same question/answer pairing as for the training split.
qas = pd.json_normalize(test_data['data'], ['questions'], ['source', 'id', 'story'])
ans = pd.json_normalize(test_data['data'], ['answers'], ['id'])
test_df = pd.merge(qas, ans, on=['id', 'turn_id'])
# Keep only the rows whose answer text is something other than 'unknown'.
test_df = test_df.loc[test_df['input_text_y'] != 'unknown']

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible across runs.
train, val = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)
train.head()

In [None]:
# Raw merge column -> name used by the rest of the notebook.
QA_COLUMN_NAMES = {'input_text_x': 'questions', 'input_text_y': 'answers', 'span_text': 'reasons'}
# Columns kept for modelling, in display order.
QA_COLUMNS = ['story', 'questions', 'answers', 'reasons', 'span_start']


def select_qa_columns(frame):
  """Return a new frame holding only the QA columns under their final names.

  `frame` itself is left untouched (no in-place rename on a derived frame).
  Renaming happens before selecting, and `rename` ignores keys that are not
  present, so running this again on an already-converted frame is a no-op
  instead of a KeyError.
  """
  return frame.rename(columns=QA_COLUMN_NAMES)[QA_COLUMNS]


train = select_qa_columns(train)
val = select_qa_columns(val)
test_df = select_qa_columns(test_df)
display(train.head(), val.head(), test_df.head())

In [None]:
# Load the pretrained `distilroberta-base` checkpoint through the TensorFlow
# question-answering auto class.
model = TFAutoModelForQuestionAnswering.from_pretrained('distilroberta-base')

In [None]:
# Tokenizer for the same `distilroberta-base` checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

In [None]:
# Passed to the tokenizer as `max_length`: cap on the size of each
# question + story window.
max_length = 400
# Passed to the tokenizer as `stride` together with return_overflowing_tokens
# (presumably the overlap between consecutive windows of a long story —
# confirm against the tokenizer documentation).
doc_stride = 200

In [None]:
# tokenizer(train["questions"][0], train["answers"][0])

In [None]:
# for i in range(len(train["questions"])):
#   if len(tokenizer(train["questions"][i],train["story"][i])["input_ids"]) > 400:
#     break


#Truncate otherwise too large

In [None]:
# len(tokenizer(questions, stories, max_length = max_length, truncation = "only_second")["input_ids"])

In [None]:
# tokenized_data = tokenizer(questions, stories, max_length = max_length, truncation = "only_second", return_overflowing_tokens=True, stride=doc_stride)

In [None]:
# [len(x) for x in tokenized_data["input_ids"]]

In [None]:
# for x in tokenized_data["input_ids"][:2]:
#   print(tokenizer.decode(x))

In [None]:
# tokenized_data = tokenizer(questions, stories, max_length=max_length,truncation="only_second",return_overflowing_tokens=True, return_offsets_mapping=True, stride=doc_stride)
# print(tokenized_data["offset_mapping"][0][:100])

In [None]:
# first_token_id = tokenized_data["input_ids"][0][1]
# offsets = tokenized_data["offset_mapping"][0][1]
# print(
#     tokenizer.convert_ids_to_tokens([first_token_id])[0],
#     questions[offsets[0] : offsets[1]],
# )

In [None]:
# sequence_ids = tokenized_data.sequence_ids()
# print(sequence_ids)

In [None]:
# #Probably wrong
# start_char = len(answers)
# end_char = start_char + len(answers[0])

# #Start token index of current span in the text
# token_start_index = 0
# while sequence_ids[token_start_index] != 1:
#   token_start_index += 1

# #End token index of current span in text
# token_end_index = len(tokenized_data["input_ids"][0]) - 1
# while sequence_ids[token_end_index] != 1:
#   token_end_index -=1

# #Detect if answer is out of span
# offsets = tokenized_data["offset_mapping"][0]
# if offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char:
#   while(token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char):
#     token_start_index += 1
#   start_position = token_start_index - 1
#   while offsets[token_end_index][1] >= end_char:
#     token_end_index -= 1
#   end_position = token_end_index +1
#   print(start_position, end_position)
# else:
#   print("Ask another question")

In [None]:
# print(tokenizer.decode(tokenized_data["input_ids"][0][start_position: end_position + 1]))
# print(answers[0])

In [None]:
# True when the tokenizer pads on the right, in which case the question is
# passed first and the story second. This must be a comparison: the chained
# assignment `a = b = "right"` overwrote tokenizer.padding_side and stored the
# string "right" in the flag.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def _find_span_tokens(offsets, sequence_ids, context_id, start_char, end_char):
  """Map a character span of the story onto token indices of one tokenized window.

  Returns `(start_token, end_token)`, or `None` when the window holds no story
  tokens or does not contain the whole span.
  """
  context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == context_id]
  if not context_tokens:
    return None
  first_token, last_token = context_tokens[0], context_tokens[-1]

  # The span must lie entirely between the first and last story token.
  if offsets[first_token][0] > start_char or offsets[last_token][1] < end_char:
    return None

  # Both scans stay inside the story tokens. Tokens outside the story (special
  # and padding tokens, which the tokenizer is expected to report with (0, 0)
  # offsets) would otherwise satisfy `<= start_char` and drag the start index
  # to the end of the padded sequence when the answer begins in the last token.
  start_token = first_token
  while start_token <= last_token and offsets[start_token][0] <= start_char:
    start_token += 1
  end_token = last_token
  while end_token >= first_token and offsets[end_token][1] >= end_char:
    end_token -= 1
  return start_token - 1, end_token + 1


def prepare_train_features(train=None):
  """Tokenize question/story pairs and attach answer start/end token labels.

  Uses the notebook globals `tokenizer`, `pad_on_right`, `max_length` and
  `doc_stride`. Long stories overflow into several windows; each window gets
  its own label pair, and windows that do not contain the answer span are
  labelled with the index of the CLS token.

  Args:
    train: DataFrame (or mapping of column name -> list, e.g. a batch from
      `Dataset.map(batched=True)`) with the columns `questions`, `story`,
      `answers`, `span_start` and, optionally, `reasons`. When omitted, the
      notebook-level `train` frame is looked up at call time.

  Returns:
    The tokenizer output with `start_positions` and `end_positions` added and
    `overflow_to_sample_mapping` / `offset_mapping` removed.
  """
  if train is None:
    # Resolved at call time rather than bound when the function is defined, so
    # the default always refers to the current `train` frame.
    train = globals()["train"]
  if not isinstance(train, pd.DataFrame):
    train = pd.DataFrame(dict(train))

  first_col, second_col = ("questions", "story") if pad_on_right else ("story", "questions")
  tokenized_data = tokenizer(
      train[first_col].to_list(),
      train[second_col].to_list(),
      truncation='only_second' if pad_on_right else 'only_first',
      max_length=max_length,
      stride=doc_stride,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding='max_length',
  )

  sample_mapping = tokenized_data.pop("overflow_to_sample_mapping")
  offset_mapping = tokenized_data.pop("offset_mapping")

  # Sequence id of the story inside each encoded pair.
  context_id = 1 if pad_on_right else 0
  # `span_start` is presumably the character offset of the supporting span
  # (`reasons`, i.e. CoQA's span_text) inside the story, so the span's end is
  # measured with that text; the free-form answer is only a fallback for frames
  # without a `reasons` column. NOTE(review): confirm against the CoQA schema.
  span_column = "reasons" if "reasons" in train.columns else "answers"

  tokenized_data["start_positions"] = []
  tokenized_data["end_positions"] = []

  for i, offsets in enumerate(offset_mapping):
    input_ids = tokenized_data["input_ids"][i]
    cls_index = input_ids.index(tokenizer.cls_token_id)
    sequence_ids = tokenized_data.sequence_ids(i)

    row = train.iloc[sample_mapping[i]]
    start_char = row["span_start"]

    span_tokens = None
    # Offset 0 is a legitimate answer position (the story can open with the
    # answer); only a missing or negative offset marks "no answer".
    if not pd.isna(start_char) and start_char >= 0:
      start_char = int(start_char)
      end_char = start_char + len(row[span_column])
      span_tokens = _find_span_tokens(offsets, sequence_ids, context_id, start_char, end_char)

    if span_tokens is None:
      span_tokens = (cls_index, cls_index)
    tokenized_data["start_positions"].append(span_tokens[0])
    tokenized_data["end_positions"].append(span_tokens[1])

  return tokenized_data

In [None]:
# Try the feature builder on the first five training rows.
tokenized_train = prepare_train_features(train.head(5))

In [None]:
# Show which container type prepare_train_features returned.
print(type(tokenized_train))

In [None]:
from datasets import Dataset

# Wrap the training frame in a `datasets.Dataset` so it can be mapped in batches.
dataset = Dataset.from_pandas(train)
# Peek at the first few offsets only; printing the whole column floods the
# notebook output.
print(dataset[:10]["span_start"])

In [None]:
# `prepare_train_features_d` is not defined anywhere in this notebook. Reuse
# prepare_train_features instead, rebuilding a DataFrame from each batch since
# that function reads its input through DataFrame accessors.
tokenized_train = dataset.map(
    lambda batch: prepare_train_features(pd.DataFrame(dict(batch))),
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
# Fine-tuning hyperparameters: learning rate, number of epochs, weight decay.
learning_rate, num_train_epochs, weight_decay = 2e-5, 2, 0.01
print(tokenized_train)

In [None]:
# Build the training input from the tokenized features: shuffled, 32 per batch.
train_set = model.prepare_tf_dataset(
    tokenized_train,
    shuffle=True,
    batch_size=32,
)

In [None]:
from transformers import create_optimizer

# The schedule advances once per optimizer step, i.e. once per batch, so count
# the batches in `train_set` rather than the rows in `train`; counting rows
# stretches the decay over far more steps than training ever performs.
total_train_steps = len(train_set) * num_train_epochs

optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
    # `weight_decay` was defined with the other hyperparameters but never used.
    weight_decay_rate=weight_decay,
)

In [None]:
# Compile with the scheduled optimizer; no explicit loss is passed here.
model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
    jit_compile=True,
)

In [None]:
# Validation data has to go through the same tokenization as the training
# data; the raw `val` DataFrame cannot be consumed by fit().
val_dataset = Dataset.from_pandas(val)
tokenized_val = val_dataset.map(
    lambda batch: prepare_train_features(pd.DataFrame(dict(batch))),
    batched=True,
    remove_columns=val_dataset.column_names,
)
val_set = model.prepare_tf_dataset(tokenized_val, shuffle=False, batch_size=32)

# Train on the batched `train_set`; `train.all()` is a boolean reduction of the
# DataFrame, not training data.
model.fit(
    train_set,
    validation_data=val_set,
    epochs=num_train_epochs,
)

In [None]:
def question_answer(question, text):
    """Print and return the model's extractive answer to `question` about `text`.

    Uses the notebook globals `tokenizer`, `model` and `max_length`.

    Args:
      question: the question string.
      text: the passage the answer is extracted from; it is truncated when the
        encoded pair would exceed `max_length`.

    Returns:
      The decoded answer span, or "Ask another question" when the predicted
      span is empty, inverted (end before start) or starts on the CLS token.
    """
    # `model` is loaded through a TensorFlow auto class, so it is fed TensorFlow
    # tensors. Only the keys the tokenizer itself produces are forwarded;
    # hand-built segment ids are not passed to the model.
    encoding = tokenizer(
        question,
        text,
        truncation="only_second",
        max_length=max_length,
        return_tensors="tf",
    )
    output = model(**encoding)

    # Batch of one: take row 0 of the ids and of both logit tensors.
    input_ids = encoding["input_ids"].numpy()[0].tolist()
    answer_start = int(output.start_logits.numpy()[0].argmax())
    answer_end = int(output.end_logits.numpy()[0].argmax())

    # Initialised up front so an inverted span can no longer leave `answer`
    # unbound.
    answer = ""
    starts_on_cls = input_ids[answer_start] == tokenizer.cls_token_id
    if answer_end >= answer_start and not starts_on_cls:
        # Let the tokenizer rebuild the text; manual "##" joining only applies
        # to WordPiece vocabularies.
        answer = tokenizer.decode(
            input_ids[answer_start:answer_end + 1], skip_special_tokens=True
        ).strip()

    if not answer:
        answer = "Ask another question"

    print("\nPredicted answer:\n{}".format(answer.capitalize()))
    return answer

In [None]:
# `questions`, `stories` and `answers` are never defined in this notebook, so
# the examples are read from the held-out test frame instead. Only a few rows
# are shown to keep the cell output readable.
NUM_EXAMPLES_TO_SHOW = 5
for _, example in test_df.head(NUM_EXAMPLES_TO_SHOW).iterrows():
  question_answer(example['questions'], example['story'])

  print("Original answer:\n", example['answers'])