In [1]:
import torch
from transformers import DistilBertForQuestionAnswering
from transformers import DistilBertTokenizer

In [2]:
# Load the question-answering model weights from the
# 'distilbert-base-uncased-distilled-squad' checkpoint.
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad'
)

In [3]:
# Load the tokenizer from the same checkpoint name as the model so that the
# token IDs it produces match the vocabulary the model was loaded with.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased-distilled-squad'
)

In [4]:
# First example: the question to ask, and the passage to search for its answer.
question = "How many parameters does BERT-large have?"

# The passage is split across lines for readability only; adjacent string
# literals are concatenated into one single-line string.
answer_text = (
    "BERT-large is really big... it has 24-layers and an embedding size of "
    "1,024, for a total of 340M parameters! Altogether it is 1.34GB, so "
    "expect it to take a couple minutes to download to your Colab instance."
)

In [5]:
# Encode the question and the passage together as a single text pair, giving
# one flat list of token IDs covering both segments.
input_ids = tokenizer.encode(text=question, text_pair=answer_text)

# Report how long the combined sequence is.
print('The input has a total of {:} tokens.'.format(len(input_ids)))

The input has a total of 70 tokens.


In [6]:
# BERT only needs the token IDs, but for the purpose of inspecting the
# tokenizer's behavior, let's also get the token strings and display them.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# For each token and its ID...
# The loop variable is called `token_id` rather than `id`: loop variables stay
# bound after the cell finishes, and `id` would shadow the builtin `id()` for
# every later cell in this kernel session.
for token, token_id in zip(tokens, input_ids):

    # The [SEP] token gets a blank line before and after it so that the
    # boundary between question and passage stands out in the listing.
    is_separator = token_id == tokenizer.sep_token_id

    if is_separator:
        print('')

    # Print the token string and its ID in two columns.
    print('{:<12} {:>6,}'.format(token, token_id))

    if is_separator:
        print('')
    

[CLS]           101
how           2,129
many          2,116
parameters   11,709
does          2,515
bert         14,324
-             1,011
large         2,312
have          2,031
?             1,029

[SEP]           102

bert         14,324
-             1,011
large         2,312
is            2,003
really        2,428
big           2,502
.             1,012
.             1,012
.             1,012
it            2,009
has           2,038
24            2,484
-             1,011
layers        9,014
and           1,998
an            2,019
em            7,861
##bed         8,270
##ding        4,667
size          2,946
of            1,997
1             1,015
,             1,010
02            6,185
##4           2,549
,             1,010
for           2,005
a             1,037
total         2,561
of            1,997
340          16,029
##m           2,213
parameters   11,709
!               999
altogether   10,462
it            2,009
is            2,003
1             1,015
.             1,01

In [7]:
# Tokenize the question/passage pair into PyTorch tensors for the model.
inputs = tokenizer(question, answer_text, return_tensors='pt')

# Example span labels. They are only consumed by the commented-out call
# below, which passes them to the model; plain inference does not use them.
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

# outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
# Inference only: run the forward pass under no_grad so autograd does not
# record a graph for logits that are never back-propagated through.
with torch.no_grad():
    outputs = model(**inputs)

# Per-token scores for "the answer starts here" / "the answer ends here".
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [8]:
# Pick the highest-scoring start and end positions. argmax returns 0-dim
# tensors, so convert them to plain Python ints: the slicing here and the
# index arithmetic in later cells then operate on ordinary integers.
# The two argmaxes are independent, so nothing forces answer_end to be at or
# after answer_start; if it is not, the slice below is simply empty.
answer_start = torch.argmax(start_scores).item()
answer_end = torch.argmax(end_scores).item()

# Combine the tokens in the answer and print it out.
answer = ' '.join(tokens[answer_start:answer_end + 1])

print('Answer: "' + answer + '"')

Answer: "340 ##m"


In [9]:
# Rebuild a readable answer from the selected word pieces: the first token is
# taken as-is, and every later token is either glued onto the previous one
# (when it is a '##' continuation piece) or appended after a space.
pieces = [tokens[answer_start]]

for piece in tokens[answer_start + 1:answer_end + 1]:
    if piece.startswith('##'):
        # Continuation of the previous word: drop the '##' marker.
        pieces.append(piece[2:])
    else:
        # A new word: separate it from what came before.
        pieces.append(' ' + piece)

answer = ''.join(pieces)

print('Answer: "' + answer + '"')

Answer: "340m"


In [19]:
# Second example: a new question and passage, replacing the earlier pair.
question = "How did you process picture"

# The passage starts with a newline character (kept explicitly as the first
# literal); the remaining text is one line, split here for readability only.
answer_text = (
    "\n"
    "We took the dirt layer off, then we took the varnish layer off, and that "
    "allowed us to see the quality of the paint below: not only the colors, "
    "but the look of the paint. You can start seeing its age, the cracks, the "
    "abrasion pattern that you see in the early Netherlandish pictures, she "
    "explained."
)

In [20]:
# Re-encode the new question/passage pair into a flat list of token IDs.
input_ids = tokenizer.encode(text=question, text_pair=answer_text)

In [21]:
# Refresh the token strings so they line up with the new input_ids; the
# answer-extraction cell indexes into this list.
tokens = tokenizer.convert_ids_to_tokens(ids=input_ids)

In [22]:
# Tokenize the question/passage pair into PyTorch tensors for the model.
inputs = tokenizer(question, answer_text, return_tensors='pt')

# outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
# Inference only: run the forward pass under no_grad so autograd does not
# record a graph for logits that are never back-propagated through.
with torch.no_grad():
    outputs = model(**inputs)

# Per-token scores for "the answer starts here" / "the answer ends here".
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [23]:
# Pick the highest-scoring start and end positions. argmax returns 0-dim
# tensors, so convert them to plain Python ints before using them as list
# indices in the slice below.
# The two argmaxes are independent, so nothing forces answer_end to be at or
# after answer_start; if it is not, the slice below is simply empty.
answer_start = torch.argmax(start_scores).item()
answer_end = torch.argmax(end_scores).item()

# Combine the tokens in the answer and print it out.
answer = ' '.join(tokens[answer_start:answer_end + 1])

print('Answer: "' + answer + '"')

Answer: "var ##nish layer off , and that allowed us to see the quality of the paint below : not only the colors , but the look of the paint . you can start seeing its age , the cracks , the ab ##ras ##ion pattern that you see in the early net ##her ##land ##ish pictures"


{'input_ids': tensor([[  101,  2129,  2106,  2017,  2832,  3861,   102,  2057,  2165,  1996,
          6900,  6741,  2125,  1010,  2059,  2057,  2165,  1996, 13075, 24014,
          6741,  2125,  1010,  1998,  2008,  3039,  2149,  2000,  2156,  1996,
          3737,  1997,  1996,  6773,  2917,  1024,  2025,  2069,  1996,  6087,
          1010,  2021,  1996,  2298,  1997,  1996,  6773,  1012,  2017,  2064,
          2707,  3773,  2049,  2287,  1010,  1996, 15288,  1010,  1996, 11113,
          8180,  3258,  5418,  2008,  2017,  2156,  1999,  1996,  2220,  5658,
          5886,  3122,  4509,  4620,  1010,  2016,  4541,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}