In [1]:
from data import read_squad_examples

# Load the SQuAD v1.1 training file through the project's reader.
# The path uses forward slashes, which Python's file APIs accept on Windows
# as well as on POSIX systems, so the cell is not tied to one platform.
train_examples = read_squad_examples(
    input_file='squad/v1.1/train-v1.1.json',
    is_training=True,
    version_2_with_negative=False)

In [2]:
# Display the example stored at index 1 of train_examples.
train_examples[1]

qas_id: 5733be284776f4190066117f, question_text: What is in front of the Notre Dame Main Building?, doc_tokens: [Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.], start_position: 32, end_position: 36

## Test the dataset preprocessing from scratch

In [3]:
import json
from tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)

In [4]:
# Read the raw SQuAD JSON file and keep only the value under its "data" key.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding; the path uses forward slashes so the
# cell runs on both Windows and POSIX systems.
with open('squad/v1.1/train-v1.1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

In [5]:
def is_whitespace(c):
    """Tell whether the single character ``c`` counts as whitespace here.

    Returns True for a space, tab, carriage return, newline, or the
    character with code point 0x202F; returns False for any other single
    character.
    """
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

In [6]:
examples = []
# Only the first entry of `data` is visited. The inner loop still walks every
# paragraph of that entry, so once it finishes, `paragraph`, `paragraph_text`
# and the freshly reset accumulators belong to the entry's LAST paragraph.
for entry in data[:1]:
    for paragraph in entry["paragraphs"]:
        paragraph_text = paragraph["context"]
        # Accumulators filled by the character loop in the next cell.
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True

In [7]:
# Split the paragraph on whitespace one character at a time, recording for
# every character the index of the token it belongs to. A whitespace character
# is mapped to the token before it (or to -1 if no token has been seen yet).
for c in paragraph_text:
    starts_new_token = prev_is_whitespace
    prev_is_whitespace = is_whitespace(c)
    if not prev_is_whitespace:
        if starts_new_token:
            # First character after whitespace opens a new token.
            doc_tokens.append(c)
        else:
            # Otherwise the character extends the current token.
            doc_tokens[-1] += c
    char_to_word_offset.append(len(doc_tokens) - 1)



In [8]:
# Take the first question/answer record of the current paragraph and reset the
# per-question state; the loop is left immediately after that first record.
for qa in paragraph["qas"]:
    qas_id = qa["id"]
    question_text = qa["question"]
    start_position = end_position = orig_answer_text = None
    is_impossible = False
    break

In [9]:
# Show the question text captured from the selected QA record.
question_text

'Which Secretary of State attended Notre Dame?'

In [10]:
# Show the answer records (character offset and text) of the selected QA record.
qa["answers"]

[{'answer_start': 185, 'text': 'Condoleezza Rice'}]

In [11]:
# Named flags instead of literal True/False conditions; the values mirror the
# arguments passed to read_squad_examples in the first cell.
is_training = True
version_2_with_negative = False

if is_training:
    if version_2_with_negative:
        is_impossible = qa["is_impossible"]
    if (len(qa["answers"]) != 1) and (not is_impossible):
        raise ValueError(
            "For training, each question should have exactly 1 answer.")
    if not is_impossible:
        answer = qa["answers"][0]
        orig_answer_text = answer["text"]
        # Character offset of the answer inside the paragraph text.
        answer_offset = answer["answer_start"]
        answer_length = len(orig_answer_text)
        # Convert the first and last answer characters to whitespace-token indices.
        start_position = char_to_word_offset[answer_offset]
        end_position = char_to_word_offset[answer_offset + answer_length - 1]

In [12]:
# Text rebuilt from the whitespace tokens covering the answer span (inclusive).
answer_tokens = doc_tokens[start_position:end_position + 1]
actual_text = " ".join(answer_tokens)
# The annotated answer text, re-joined after whitespace_tokenize.
cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text))

In [13]:
# Answer as rebuilt from whole whitespace tokens of the paragraph.
actual_text

'Condoleezza Rice.'

In [14]:
# Answer as annotated in the dataset, for comparison with actual_text.
cleaned_answer_text

'Condoleezza Rice'

In [15]:
# Build the BERT tokenizer from the local vocabulary file. The path uses
# forward slashes so it resolves on both Windows and POSIX systems.
tokenizer = BertTokenizer('./vocab/vocab', do_lower_case=True, max_len=512)  # for bert large
# Tokenize the question of the first training example only: the loop exits
# after its first iteration, leaving `example_index` and `example` bound to it.
for example_index, example in enumerate(train_examples):
    query_tokens = tokenizer.tokenize(example.question_text)
    break

In [16]:
# Cap the question at 60 sub-tokens.
# NOTE(review): the convert_examples_to_features call later in this notebook
# passes max_query_length=64 — confirm which limit is intended.
if len(query_tokens) > 60:
    query_tokens = query_tokens[:60]

# tok_to_orig_index[k]: index of the whitespace token that sub-token k came from.
tok_to_orig_index = []
# orig_to_tok_index[i]: position in all_doc_tokens where the sub-tokens of
# whitespace token i begin.
orig_to_tok_index = []
all_doc_tokens = []
for i, token in enumerate(example.doc_tokens):
    orig_to_tok_index.append(len(all_doc_tokens))
    sub_tokens = tokenizer.tokenize(token)
    tok_to_orig_index.extend([i] * len(sub_tokens))
    all_doc_tokens.extend(sub_tokens)
    

In [17]:
from data import _improve_answer_span

is_training = True
# Answer span in sub-token indices; both ends stay None unless training.
tok_start_position = tok_end_position = None
if is_training:
    if example.is_impossible:
        tok_start_position = tok_end_position = -1
    else:
        tok_start_position = orig_to_tok_index[example.start_position]
        last_word_index = len(example.doc_tokens) - 1
        if example.end_position >= last_word_index:
            # The answer ends on the final word, so it ends on the final sub-token.
            tok_end_position = len(all_doc_tokens) - 1
        else:
            # End on the sub-token just before the next word's first sub-token.
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        # Let the project helper adjust the span against the annotated answer text.
        tok_start_position, tok_end_position = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
            example.orig_answer_text)


In [18]:
# Number of document sub-tokens that fit in one window: 512 positions minus
# the query tokens and 3 more slots. The 3 presumably covers the "[CLS]" token
# and the two "[SEP]" tokens that later cells add around the query and document.
max_tokens_for_doc = 512 - len(query_tokens) - 3

In [19]:
# Number of sub-tokens in the tokenized document.
len(all_doc_tokens)

158

In [20]:
# NOTE(review): this assigns the name to itself and has no effect; it looks
# like a leftover from experimenting with a modified token list.
all_doc_tokens = all_doc_tokens

In [21]:
import collections

# Documents can be longer than the maximum sequence length. They are covered
# with a sliding window: chunks of up to `max_tokens_for_doc` sub-tokens, each
# starting a fixed stride after the previous one.
# NOTE(review): the stride is the literal 126 here, while the
# convert_examples_to_features call later in this notebook passes
# doc_stride=128 — confirm which value is intended.
_DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
    "DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
    # Take whatever remains, capped at the per-window budget.
    length = min(len(all_doc_tokens) - start_offset, max_tokens_for_doc)
    doc_spans.append(_DocSpan(start=start_offset, length=length))
    if start_offset + length == len(all_doc_tokens):
        # This window reaches the end of the document.
        break
    start_offset += min(length, 126)


In [22]:
# Bind `doc_span_index` / `doc_span` to the first span and stop there.
for doc_span_index, doc_span in enumerate(doc_spans):
    break

### Add `[CLS]` and `[SEP]` around the query tokens

In [32]:

# Per-feature lookup tables, filled when the document tokens are appended.
token_to_orig_map = {}
token_is_max_context = {}
# Sequence so far: "[CLS]", the query sub-tokens, then "[SEP]" — all in segment 0.
tokens = ["[CLS]", *query_tokens, "[SEP]"]
segment_ids = [0] * len(tokens)

### A token can appear in multiple document spans

Because of the sliding window, a single token can appear in multiple document spans. E.g.

    Doc: the man went to the store and bought a gallon of milk
    Span A: the man went to the
    Span B: to the store and bought
    Span C: and bought a gallon of

In [33]:
# tok_to_orig_index holds, for each sub-token, the index of the original
# whitespace token; orig_to_tok_index holds one entry per whitespace token.
# Print the four lengths first, then the token lists themselves.
for sequence in (tok_to_orig_index, orig_to_tok_index,
                 all_doc_tokens, example.doc_tokens):
    print(len(sequence))
print(example.doc_tokens)
print(all_doc_tokens)

158
124
158
124
['Architecturally,', 'the', 'school', 'has', 'a', 'Catholic', 'character.', 'Atop', 'the', 'Main', "Building's", 'gold', 'dome', 'is', 'a', 'golden', 'statue', 'of', 'the', 'Virgin', 'Mary.', 'Immediately', 'in', 'front', 'of', 'the', 'Main', 'Building', 'and', 'facing', 'it,', 'is', 'a', 'copper', 'statue', 'of', 'Christ', 'with', 'arms', 'upraised', 'with', 'the', 'legend', '"Venite', 'Ad', 'Me', 'Omnes".', 'Next', 'to', 'the', 'Main', 'Building', 'is', 'the', 'Basilica', 'of', 'the', 'Sacred', 'Heart.', 'Immediately', 'behind', 'the', 'basilica', 'is', 'the', 'Grotto,', 'a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection.', 'It', 'is', 'a', 'replica', 'of', 'the', 'grotto', 'at', 'Lourdes,', 'France', 'where', 'the', 'Virgin', 'Mary', 'reputedly', 'appeared', 'to', 'Saint', 'Bernadette', 'Soubirous', 'in', '1858.', 'At', 'the', 'end', 'of', 'the', 'main', 'drive', '(and', 'in', 'a', 'direct', 'line', 'that', 'connects', 'through', '3', 'statues', 'and', 'the',

In [34]:
# Length (in sub-tokens) of the selected document span.
doc_span.length

158

In [35]:
from data import _check_is_max_context

# Append the sub-tokens of the selected span after the query part. Each one is
# recorded under the position it will occupy in `tokens`.
# Note: this cell appends to `tokens` and `segment_ids`, so running it twice
# without re-running the query cell duplicates the document part.
for split_token_index in range(doc_span.start, doc_span.start + doc_span.length):
    position = len(tokens)
    token_to_orig_map[position] = tok_to_orig_index[split_token_index]
    is_max_context = _check_is_max_context(
        doc_spans, doc_span_index, split_token_index)
    token_is_max_context[position] = is_max_context
    tokens.append(all_doc_tokens[split_token_index])
    segment_ids.append(1)

# Close the document part and map every token to its vocabulary id.
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)


In [38]:
# Print the sizes of the assembled sequences and lookup tables, one per line.
for sized in (tokens, token_is_max_context, token_to_orig_map,
              tok_to_orig_index, segment_ids):
    print(len(sized))

176
158
158
158
176


In [54]:
# The mask is 1 for real tokens and 0 for padding tokens; only real tokens
# are attended to.
input_mask = [1] * len(input_ids)
max_seq_length = 512
# Zero-pad all three sequences by the amount input_ids is short of the
# maximum sequence length (nothing is added if it is already long enough).
padding = [0] * (max_seq_length - len(input_ids))
input_ids.extend(padding)
input_mask.extend(padding)
segment_ids.extend(padding)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length

start_position = None
end_position = None
if is_training:
    if example.is_impossible:
        start_position = 0
        end_position = 0
    else:
        # If the selected span does not fully contain the annotated answer,
        # both positions are set to 0.
        doc_start = doc_span.start
        doc_end = doc_span.start + doc_span.length - 1
        out_of_span = (tok_start_position < doc_start or
                       tok_end_position > doc_end)
        if out_of_span:
            start_position = 0
            end_position = 0
        else:
            # Shift by the tokens that precede the document: "[CLS]", the
            # query, and the first "[SEP]".
            doc_offset = len(query_tokens) + 2
            start_position = tok_start_position - doc_start + doc_offset
            end_position = tok_end_position - doc_start + doc_offset


In [None]:
# Bundle everything computed above into one InputFeatures record and append it
# to `features`.
# NOTE(review): `features`, `InputFeatures` and `unique_id` are not defined in
# any visible cell of this notebook, and this cell has no execution count —
# they need to be defined/imported before it can run.
features.append(
    InputFeatures(
        unique_id=unique_id,
        example_index=example_index,
        doc_span_index=doc_span_index,
        tokens=tokens,
        token_to_orig_map=token_to_orig_map,
        token_is_max_context=token_is_max_context,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        start_position=start_position,
        end_position=end_position,
        is_impossible=example.is_impossible))

In [55]:
# Re-create the tokenizer and run the project's own feature conversion over
# all training examples. The vocabulary path uses forward slashes so it
# resolves on both Windows and POSIX systems.
tokenizer = BertTokenizer('./vocab/vocab', do_lower_case=True, max_len=512)  # for bert large
from data import convert_examples_to_features
train_features = convert_examples_to_features(
    examples=train_examples,
    tokenizer=tokenizer,
    max_seq_length=512,
    doc_stride=128,
    max_query_length=64,
    is_training=True)

In [6]:
# Display the input ids of the feature stored at index 1 of train_features.
train_features[1].input_ids

[101,
 2054,
 2003,
 1999,
 2392,
 1997,
 1996,
 10289,
 8214,
 2364,
 2311,
 1029,
 102,
 6549,
 2135,
 1010,
 1996,
 2082,
 2038,
 1037,
 3234,
 2839,
 1012,
 10234,
 1996,
 2364,
 2311,
 1005,
 1055,
 2751,
 8514,
 2003,
 1037,
 3585,
 6231,
 1997,
 1996,
 6261,
 2984,
 1012,
 3202,
 1999,
 2392,
 1997,
 1996,
 2364,
 2311,
 1998,
 5307,
 2009,
 1010,
 2003,
 1037,
 6967,
 6231,
 1997,
 4828,
 2007,
 2608,
 2039,
 14995,
 6924,
 2007,
 1996,
 5722,
 1000,
 2310,
 3490,
 2618,
 4748,
 2033,
 18168,
 5267,
 1000,
 1012,
 2279,
 2000,
 1996,
 2364,
 2311,
 2003,
 1996,
 13546,
 1997,
 1996,
 6730,
 2540,
 1012,
 3202,
 2369,
 1996,
 13546,
 2003,
 1996,
 24665,
 23052,
 1010,
 1037,
 14042,
 2173,
 1997,
 7083,
 1998,
 9185,
 1012,
 2009,
 2003,
 1037,
 15059,
 1997,
 1996,
 24665,
 23052,
 2012,
 10223,
 26371,
 1010,
 2605,
 2073,
 1996,
 6261,
 2984,
 22353,
 2135,
 2596,
 2000,
 3002,
 16595,
 9648,
 4674,
 2061,
 12083,
 9711,
 2271,
 1999,
 8517,
 1012,
 2012,
 1996,
 2203,
 1997