In the CoQA dataset, answers are stored together with their spans, but these spans are character-based. To train our model, we need to convert them to their token-based counterparts (based on our tokenizer).

# Intro

In [1]:
# Install third-party packages into the runtime; `nlp` and `transformers` are imported below.
# NOTE(review): `datasets` is not imported in the cells visible here - confirm it is still needed.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install nlp
!pip install transformers
!pip install datasets

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 29.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 12.3 MB/s eta 0:00:01[K     |▋                               | 30 kB 9.7 MB/s eta 0:00:01[K     |▉                               | 40 kB 8.6 MB/s eta 0:00:01[K     |█                               | 51 kB 5.0 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.2 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.4 MB/s eta 0:00:01[K     |█▋                              | 81 kB 6.0 MB/s eta 0:00:01[K     |█▉                              | 92 kB 4.7 MB/s eta 0:00:01[K     |██                              | 102 kB 5.2 MB/s eta 0:00:01[K     |██▏                             | 112 kB 5.2 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.2 MB/s eta 0:00:01[K     |██▋                             | 133 kB 5.2 MB/s eta 0:00:01[K     |██▊

In [2]:
## IMPORTS
import json

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import nlp

from transformers import LongformerTokenizerFast

In [8]:
# Fixed length (in tokens) that every question/story pair is encoded to.
max_length = 1024

# Fast tokenizer: its encodings expose char_to_token(), which the span
# conversion below relies on.
tokenizer = LongformerTokenizerFast.from_pretrained(
    "allenai/longformer-base-4096"
)

In [9]:
def get_correct_alignement(example):
    """Return the corrected, whitespace-trimmed character span of an answer.

    The annotated offsets may be shifted by one or two characters relative to
    the story text. The offsets are therefore tried as given, then moved left
    by one and by two characters, until ``story[start:end]`` equals
    ``span_text``. Leading and trailing spaces/newlines are then trimmed.

    Args:
        example: Mapping with the keys ``"story"`` (context text),
            ``"span_text"`` (gold answer span), ``"span_start"`` and
            ``"span_end"`` (character offsets into the story, end exclusive).

    Returns:
        Tuple ``(start_idx, end_idx)`` such that ``story[start_idx:end_idx]``
        is the gold span without surrounding spaces/newlines.

    Raises:
        ValueError: If the span cannot be aligned with the story, or if it is
            empty / consists only of spaces and newlines.
    """
    context = example["story"]
    gold_text = example["span_text"]
    start_idx = example["span_start"]
    end_idx = example["span_end"]

    for shift in (0, 1, 2):
        shifted_start = start_idx - shift
        # A negative start would silently be interpreted as "count from the
        # end of the story" by the slice and could match by accident.
        if shifted_start < 0:
            continue
        if context[shifted_start:end_idx - shift] == gold_text:
            start_idx = shifted_start
            # Slices clip at the end of the story; derive the end from the
            # matched text so it can never point past the last character.
            end_idx = shifted_start + len(gold_text)
            break
    else:
        raise ValueError(
            "answer span [%s, %s) does not match span_text in the story"
            % (example["span_start"], example["span_end"])
        )

    # Trim surrounding spaces/newlines, but never walk outside the span.
    while start_idx < end_idx and context[start_idx] in (" ", "\n"):
        start_idx += 1

    while end_idx > start_idx and context[end_idx - 1] in (" ", "\n"):
        end_idx -= 1

    if start_idx >= end_idx:
        raise ValueError("answer span is empty or contains only whitespace")

    return start_idx, end_idx

# Tokenize our training dataset
def convert_to_features(example):
    """Tokenize one question/story pair and locate the answer span in tokens.

    Args:
        example: Mapping with the keys ``"question"`` and ``"story"`` plus the
            span keys read by ``get_correct_alignement``.

    Returns:
        Tuple ``(start_positions, end_positions, attention_mask, input_ids)``.
        The positions are token indices of the first and last answer token
        inside the encoded pair; the mask and ids come from the pair encoding,
        which is padded/truncated to ``max_length``.

    Raises:
        ValueError: If the answer span cannot be aligned with the story (raised
            by ``get_correct_alignement``), if a span character does not map to
            any story token, or if no separator token is found.
    """
    # Tokenize question and story as a pair of inputs. Padding and truncation
    # are requested explicitly: this is what the deprecated
    # `pad_to_max_length=True` plus a bare `max_length` resolved to anyway.
    input_pairs = [example['question'], example['story']]
    encodings = tokenizer.encode_plus(
        input_pairs,
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    context_encodings = tokenizer.encode_plus(example['story'])

    # Map the character span to token indices within the story-only encoding,
    # using the fast tokenizer's alignment helpers.
    start_idx, end_idx = get_correct_alignement(example)
    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx - 1)
    if start_positions_context is None or end_positions_context is None:
        # Without this check the additions below fail with a TypeError.
        raise ValueError("answer span does not map to any token of the story")

    # The pair is encoded as <s> question</s></s> story</s>, while the
    # story-only encoding is <s> story</s>. Adding the index of the first
    # separator + 1 (there are two separators in a row) therefore moves a
    # story-only token index to its position inside the pair encoding.
    last_sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id) + 1
    start_positions = start_positions_context + last_sep_idx
    end_positions = end_positions_context + last_sep_idx

    # NOTE(review): the pair encoding is truncated to `max_length`, but the
    # positions are not checked against it. An answer located in the truncated
    # part of the story yields positions outside `input_ids`; confirm how
    # downstream code is expected to treat such samples.
    return start_positions, end_positions, encodings['attention_mask'], encodings['input_ids']

# Data

In [10]:
# Load the flattened CoQA training DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
# NOTE(review): absolute path is hard-coded; it has to exist in the runtime running this cell.
df_train = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_train_df.pkl")

In [11]:
# Show the first row to inspect the available columns.
df_train.head(1)

Unnamed: 0,index,name,filename,id,source,story,turn_id,question,input_text,span_text,span_start,span_end,bad_turn
0,0,Vatican_Library.txt,Vatican_Library.txt,3zotghdk5ibi9cex97fepx7jetpso7,wikipedia,"The Vatican Apostolic Library (), more commonl...",1,When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179,False


In [12]:
# Number of rows before conversion (compare with the converted/skipped counts further down).
len(df_train)

108647

Some answers have leading or trailing whitespace. We remove it here.

In [13]:
# Convert every training row into a record that carries the original columns
# plus the token-level answer positions and the encoded model inputs.
samples_train_list = []
y = 0  # rows skipped because convert_to_features raised ValueError

# `total` is passed explicitly because iterrows() is a generator without a
# length, so tqdm could not show progress otherwise.
for index, item in tqdm(df_train.iterrows(), total=len(df_train)):
    # Only the conversion is guarded, so that a ValueError raised anywhere
    # else is not silently counted as a skipped row.
    try:
        span_start_token, span_end_token, attention_mask, input_ids = convert_to_features(item)
    except ValueError:
        y += 1
        continue

    samples_train_list.append({
        "index": index,
        "name": item["name"],
        "filename": item["filename"],
        "id": item["id"],
        "source": item["source"],
        "story": item["story"],
        "turn_id": item["turn_id"],
        "question": item["question"],
        "input_text": item["input_text"],
        "span_text": item["span_text"],
        "span_start": item["span_start"],
        "span_end": item["span_end"],
        "start_positions": span_start_token,
        "end_positions": span_end_token,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "bad_turn": item["bad_turn"],
    })

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [14]:
# (number of converted rows, number of rows skipped on ValueError)
len(samples_train_list), y

(107286, 1361)

In [15]:
# Build a DataFrame with one row per converted record.
df_train_tokenized = pd.DataFrame(samples_train_list)

In [None]:
# Persist the full tokenized DataFrame (original columns + token positions + model inputs).
# NOTE(review): absolute path is hard-coded; it has to exist in the runtime running this cell.
df_train_tokenized.to_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_train_df_tokenized.pkl")

In [16]:
# Build a reduced copy of the tokenized data that keeps only the identifiers
# and the model inputs/labels.
samples_train_list = []

for index, item in tqdm(df_train_tokenized.iterrows(), total=len(df_train_tokenized)):
    # Every value must be read from the current row. Reusing variables left
    # over from the conversion loop would copy the last converted sample's
    # positions and inputs into every record.
    samples_train_list.append({
        "index": index,
        "id": item["id"],
        "turn_id": item["turn_id"],
        "start_positions": item["start_positions"],
        "end_positions": item["end_positions"],
        "input_ids": item["input_ids"],
        "attention_mask": item["attention_mask"],
    })


0it [00:00, ?it/s]

In [17]:
# Build a DataFrame with one row per reduced record.
df_train_tokenized_reduced = pd.DataFrame(samples_train_list)

In [18]:
# Persist the reduced DataFrame.
# NOTE(review): the `_1024` suffix presumably mirrors `max_length` - keep both in sync.
df_train_tokenized_reduced.to_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_train_df_tokenized_reduced_1024.pkl")