In the CoQA dataset, answers are stored together with their spans, but these spans are character-based. To train our model, we need to convert them to their token-based counterparts (based on our tokenizer).

# Intro

In [1]:
!pip install nlp
!pip install transformers
!pip install datasets

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.0 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 54.8 MB/s 
Installing collected packages: xxhash, nlp
Successfully installed nlp-0.4.0 xxhash-2.0.2
Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 55.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K  

In [2]:
## IMPORTS
import json

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import nlp

from transformers import LongformerTokenizerFast

In [3]:
# Length (in tokens) that every encoded (question, story) pair is padded to.
max_length = 1024

# A *fast* tokenizer is needed because the span conversion relies on
# `char_to_token` to map character offsets onto token indices.
tokenizer = LongformerTokenizerFast.from_pretrained(
    'allenai/longformer-base-4096'
)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [4]:
def get_correct_alignement(example):
    """Return the corrected, whitespace-trimmed character span of the answer.

    Some gold spans are off by one or two characters relative to the story
    text. The recorded offsets are tried as-is, then shifted left by one and
    by two characters; the first placement whose slice of the story equals
    ``span_text`` is used. Leading and trailing spaces/newlines are then
    excluded from the span.

    Args:
        example: Mapping with the keys ``"story"``, ``"span_text"``,
            ``"span_start"`` and ``"span_end"`` (character offsets, end
            exclusive).

    Returns:
        Tuple ``(start_idx, end_idx)`` of character offsets into the story,
        end exclusive, with ``start_idx < end_idx``.

    Raises:
        ValueError: If the span text cannot be located at any of the tried
            offsets, or if the span is empty once whitespace is trimmed.
    """
    context = example["story"]
    gold_text = example["span_text"]
    start_idx = example["span_start"]
    end_idx = example["span_end"]

    for shift in (0, 1, 2):
        cand_start, cand_end = start_idx - shift, end_idx - shift
        # A negative start would be read as "count from the end of the
        # string" by slicing and could produce a bogus match, so skip it.
        if cand_start >= 0 and context[cand_start:cand_end] == gold_text:
            start_idx, end_idx = cand_start, cand_end
            break
    else:
        raise ValueError(
            "span_text not found in story at offsets "
            f"{start_idx}:{end_idx} (also tried shifts of -1 and -2)"
        )

    # Slicing tolerates an end offset past the story; indexing does not.
    end_idx = min(end_idx, len(context))

    # Trim only spaces and newlines, and never step outside the span itself.
    whitespace = (" ", "\n")
    while start_idx < end_idx and context[start_idx] in whitespace:
        start_idx += 1
    while end_idx > start_idx and context[end_idx - 1] in whitespace:
        end_idx -= 1

    if start_idx >= end_idx:
        raise ValueError("answer span is empty after trimming whitespace")

    return start_idx, end_idx

# Tokenize our training dataset
def convert_to_features(example):
    """Encode one (question, story) pair and locate the answer span in tokens.

    Args:
        example: Mapping with the keys ``"question"`` and ``"story"`` plus the
            span keys read by ``get_correct_alignement``.

    Returns:
        Tuple ``(start_positions, end_positions, attention_mask, input_ids)``.
        The two positions are token indices (end inclusive) into the encoded
        pair; the mask and ids come from the pair encoding padded to
        ``max_length``.

    Raises:
        ValueError: If the character span cannot be aligned to the story
            (propagated from ``get_correct_alignement``) or if a span boundary
            does not map onto any token.
    """
    # Tokenize the question and the story together as one pair of inputs.
    # `padding`/`truncation` replace the deprecated `pad_to_max_length` flag
    # and make explicit the truncation that `max_length` alone used to trigger.
    input_pairs = [example['question'], example['story']]
    encodings = tokenizer.encode_plus(
        input_pairs,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # The story is also encoded on its own (no padding or truncation) so that
    # character offsets can be mapped onto story-relative token indices.
    context_encodings = tokenizer.encode_plus(example['story'])

    # Map the character span onto tokens using the fast tokenizer's alignment
    # methods; `end_idx` is exclusive, hence the -1 for the last character.
    start_idx, end_idx = get_correct_alignement(example)
    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx - 1)

    # `char_to_token` returns None when a character belongs to no token.
    # Raise ValueError so that callers which already skip unalignable
    # examples handle this case too, instead of failing on `None + int`.
    if start_positions_context is None or end_positions_context is None:
        raise ValueError(
            f"could not map character span {start_idx}:{end_idx} to tokens"
        )

    # The pair is encoded as <s> question</s></s> context</s>, while the story
    # alone is encoded as <s> context</s>. Shifting a story-relative index by
    # (index of the first sep token + 1) therefore gives its position in the
    # pair encoding (+1 because there are two sep tokens and one leading <s>).
    last_sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id) + 1
    start_positions = start_positions_context + last_sep_idx
    end_positions = end_positions_context + last_sep_idx

    # NOTE(review): the positions are not checked against `max_length`; when
    # the story gets truncated the answer may lie beyond the end of
    # `input_ids` - confirm how downstream training handles such examples.
    return start_positions, end_positions, encodings['attention_mask'], encodings['input_ids']

# Data

In [13]:
# Load the flattened CoQA frame from Google Drive (one row per question turn,
# judging by the preview in the next cell).
# NOTE(review): the file name says `val` while the variable is called
# `df_train` - confirm which split is intended.
# read_pickle can execute arbitrary code; only load files you trust.
df_train = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_val_df.pkl")

In [14]:
# Show one row to check which columns are available.
df_train.head(1)

Unnamed: 0,index,name,filename,id,source,story,turn_id,question,input_text,span_text,span_start,span_end,bad_turn
0,0,mc160.test.41,mc160.test.41,3dr23u6we5exclen4th8uq9rb42tel,mctest,"Once upon a time, in a barn near a farm house,...",1,What color was Cotton?,white,a little white kitten named Cotton,59,93,False


In [15]:
# Number of rows before tokenization.
len(df_train)

7983

Some answer spans have leading or trailing whitespace. `get_correct_alignement` trims it while the examples are converted below.

In [16]:
samples_train_list = []
# y counts the examples skipped because their span could not be aligned;
# x is not updated anywhere in this cell and is kept only so the name exists.
x, y = 0, 0

# Passing `total` lets the progress bar show real progress: `iterrows()` is a
# generator, so tqdm cannot infer the length by itself.
for index, item in tqdm(df_train.iterrows(), total=len(df_train)):
    try:
        span_start_token, span_end_token, attention_mask, input_ids = convert_to_features(item)
    except ValueError:
        # The gold span could not be aligned with the story; skip the example.
        y += 1
        continue

    # Original columns plus the token-level labels and the model inputs.
    samples_train_list.append({
        "index": index,
        "name": item["name"],
        "filename": item["filename"],
        "id": item["id"],
        "source": item["source"],
        "story": item["story"],
        "turn_id": item["turn_id"],
        "question": item["question"],
        "input_text": item["input_text"],
        "span_text": item["span_text"],
        "span_start": item["span_start"],
        "span_end": item["span_end"],
        "start_positions": span_start_token,
        "end_positions": span_end_token,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "bad_turn": item["bad_turn"],
    })

0it [00:00, ?it/s]



In [17]:
# (examples converted, examples skipped because a ValueError was raised)
len(samples_train_list), y

(7918, 65)

In [18]:
# One row per successfully converted example, built once from the record list.
df_train_tokenized = pd.DataFrame(samples_train_list)

In [25]:
# Persist the full tokenized frame (original columns plus labels and inputs).
# NOTE(review): the execution count (In [25]) shows this cell was last run
# after the cells that follow it - re-run the notebook top to bottom to
# confirm the saved file matches the frame built in the cell before this one.
df_train_tokenized.to_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_val_df_tokenized.pkl")

In [19]:
# Build a slimmed-down copy of the tokenized data that keeps only the
# identifiers, the labels and the model inputs.
samples_train_list = []

for index, item in tqdm(df_train_tokenized.iterrows(), total=len(df_train_tokenized)):
    # Read every value from the current row. The previous version used the
    # loop variables left over from the tokenization cell, which gave every
    # record the labels and inputs of the last example processed there.
    d = {
        # Row label of `df_train_tokenized`, as before.
        # NOTE(review): the original dataset index lives in item["index"] -
        # confirm which of the two is wanted here.
        "index": index,
        "id": item["id"],
        "turn_id": item["turn_id"],
        "start_positions": item["start_positions"],
        "end_positions": item["end_positions"],
        "input_ids": item["input_ids"],
        "attention_mask": item["attention_mask"],
    }
    samples_train_list.append(d)


0it [00:00, ?it/s]

In [20]:
# Frame holding only the reduced records collected in the previous cell.
df_train_tokenized_reduced = pd.DataFrame(samples_train_list)

In [21]:
# Persist the reduced frame; the `_1024` suffix presumably refers to the
# `max_length` used for padding - keep the two in sync if that value changes.
df_train_tokenized_reduced.to_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_val_df_tokenized_reduced_1024.pkl")