In CoQA dataset answers are saved with their spans but these spans are character-based, but to train our model we need to convert these spans to their token-based counterpart (based on our tokenizer). 

To preprocess train split change the split variable to "train" and for validation change it to "val"

# Intro

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install nlp
!pip install transformers
!pip install datasets
!pip install wandb

In [3]:
## IMPORTS
import json

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

import nlp

from transformers import LongformerTokenizerFast

In [4]:
tokenizer = LongformerTokenizerFast.from_pretrained('bert-base-uncased')

max_length = 512

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'LongformerTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# GLOBAL VARIABLES

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

UNKNOWN = "unknown"
DATASET_TRAIN_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_train_df.pkl"
DATASET_TEST_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_val_df.pkl"
ANSWERS_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/answers/{file_name}"

SEED = 7

max_length = 512

In [6]:
df_train_raw = pd.read_pickle(DATASET_TRAIN_PATH)
df_val_raw = pd.read_pickle(DATASET_TEST_PATH)

In [8]:
dataset_train_raw = datasets.Dataset.from_pandas(df_train_raw)
dataset_val_raw = datasets.Dataset.from_pandas(df_val_raw)

In [9]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["story"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
tokenized_coqa_train = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

#OLD

In [None]:
def get_correct_alignement(example):
    """ Some original examples in SQuAD have indices wrong by 1 or 2 character. We test and fix this here. """
    context = example["story"]
    gold_text = example["span_text"]
    start_idx = example["span_start"]
    end_idx = example["span_end"]
    if context[start_idx:end_idx] == gold_text:
        start_idx, end_idx = start_idx, end_idx       # When the gold label position is good
    elif context[start_idx-1:end_idx-1] == gold_text:
        start_idx, end_idx = start_idx-1, end_idx-1   # When the gold label is off by one character
    elif context[start_idx-2:end_idx-2] == gold_text:
        start_idx, end_idx = start_idx-2, end_idx-2   # When the gold label is off by two character
    else:
        raise ValueError()

    while context[start_idx]==" " or context[start_idx]=="\n":
        start_idx += 1

    while context[end_idx-1]==" " or context[end_idx-1]=="\n":
        end_idx -= 1

    return start_idx, end_idx

# Tokenize our training dataset
def convert_to_features(example):
    # Tokenize contexts and questions (as pairs of inputs)
    input_pairs = [example['question'], example['story']]
    encodings = tokenizer.encode_plus(input_pairs, pad_to_max_length=True, max_length=max_length)
    context_encodings = tokenizer.encode_plus(example['story'])

    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methodes.
    # this will give us the position of answer span in the context text
    start_idx, end_idx = get_correct_alignement(example)
    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx-1)
    
    # here we will compute the start and end position of the answer in the whole example
    # as the example is encoded like this <s> question</s></s> context</s>
    # and we know the postion of the answer in the context
    # we can just find out the index of the sep token and then add that to position + 1 (+1 because there are two sep tokens)
    # this will give us the position of the answer span in whole example 
    last_sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id) + 1
    start_positions = start_positions_context + last_sep_idx 
    end_positions = end_positions_context + last_sep_idx

    # if end_positions > 512:
    #   start_positions, end_positions = 0, 0

    # encodings.update({'start_positions': start_positions,
    #                   'end_positions': end_positions,
    #                   'attention_mask': encodings['attention_mask']})
    # return encodings
    return start_positions, end_positions, encodings['attention_mask'], encodings['input_ids']

# Data

In [None]:
df_train = pd.read_pickle(f"/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_{split}_df.pkl")

In [None]:
df_train.head(1)

Unnamed: 0,index,name,filename,id,source,story,turn_id,question,input_text,span_text,span_start,span_end,bad_turn
0,0,Vatican_Library.txt,Vatican_Library.txt,3zotghdk5ibi9cex97fepx7jetpso7,wikipedia,"The Vatican Apostolic Library (), more commonl...",1,When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179,False


In [None]:
len(df_train)

108647

Some answers have trailing or leading white space. we remove those here.

In [None]:
s = 0
for index, item in tqdm(df_train.iterrows()):
  a = len(item["question"].split())
  b = len(item["input_text"].split())
  s += int((a+b > 512))

print(s)

0it [00:00, ?it/s]

0


In [19]:
samples_train_list = []
x , y = 0 , 0

for index, item in tqdm(df_train.iterrows()):
    # if item["span_text"][0] == " " or item["span_text"][-1] == " ":
        # TODO: change convert_to_features to handle this case (returns None ex.: df_train.iloc[7])
        # x += 1
        # print("HJERE")
        # continue

    a = len(item["question"].split())
    b = len(item["input_text"].split())
    if a + b > max_length:
      continue

    try:
        span_start_token, span_end_token, attention_mask, input_ids = convert_to_features(item)
        d = {
            "index": index,
            "name": item["name"],
            "filename": item["filename"],
            "id": item["id"],
            "source": item["source"],
            "story": item["story"],
            "turn_id": item["turn_id"],
            "question": item["question"],
            "input_text": item["input_text"],
            "span_text": item["span_text"],
            "span_start": item["span_start"],
            "span_end": item["span_end"],
            "start_positions": span_start_token,
            "end_positions": span_end_token,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "bad_turn": item["bad_turn"]
        }
        samples_train_list.append(d)
    except ValueError as e:
        y += 1
        pass
        raise e


0it [00:00, ?it/s]

In [None]:
len(samples_train_list), y

(0, 108647)

In [None]:
df_train_tokenized = pd.DataFrame(samples_train_list)

In [None]:
df_train_tokenized.to_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_val_df_tokenized.pkl")

In [None]:
samples_train_list = []

for index, item in tqdm(df_train_tokenized.iterrows()):
    d = {
        "index": index,
        "id": item["id"],
        "turn_id": item["turn_id"],
        "start_positions": span_start_token,
        "end_positions": span_end_token,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
    }
    samples_train_list.append(d)


0it [00:00, ?it/s]

In [None]:
df_train_tokenized_reduced = pd.DataFrame(samples_train_list)

In [None]:
df_train_tokenized_reduced.to_pickle("/content/drive/MyDrive/Colab Notebooks/Seminar/dataset/coqa_flat_val_df_tokenized_reduced_1024.pkl")