# NLP Course

Please see the [Hugging Face NLP Course page](https://huggingface.co/learn/nlp-course/chapter0/1?fw=pt).

## 6. The 🤗 Tokenizers library

#### Assembling a corpus


> The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
> You can avoid this prompt in future by passing the argument `trust_remote_code=True`.
>
> Do you wish to run the custom code? [y/N]


In [1]:
from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
raw_datasets = load_dataset("code_search_net", "python", trust_remote_code=True)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [2]:
raw_datasets["train"]

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [3]:
print(raw_datasets["train"][123456]["whole_func_string"])

def _compress_for_distribute(max_vol, plan, **kwargs):
    """
    Combines as many dispenses as can fit within the maximum volume
    """
    source = None
    new_source = None
    a_vol = 0
    temp_dispenses = []
    new_transfer_plan = []
    disposal_vol = kwargs.get('disposal_vol', 0)
    max_vol = max_vol - disposal_vol

    def _append_dispenses():
        nonlocal a_vol, temp_dispenses, new_transfer_plan, source
        if not temp_dispenses:
            return
        added_volume = 0
        if len(temp_dispenses) > 1:
            added_volume = disposal_vol
        new_transfer_plan.append({
            'aspirate': {
                'location': source,
                'volume': a_vol + added_volume
            }
        })
        for d in temp_dispenses:
            new_transfer_plan.append({
                'dispense': {
                    'location': d['location'],
                    'volume': d['volume']
                }
            })
        a_vol = 0
        temp

> Using a Python generator, we can avoid Python loading anything into memory
> until it’s actually necessary. To create such a generator, 
> <span style="background-color:#33ffff;">you just to need to replace the brackets with parentheses</span>

In [4]:
gen = (i for i in range(10))
print(list(gen))
print(list(gen))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


In [5]:
def get_training_corpus():
    return (
        raw_datasets["train"][i : i + 1000]["whole_func_string"]
        for i in range(0, len(raw_datasets["train"]), 1000)
    )


training_corpus = get_training_corpus()

> You can also define your generator inside a for loop by using the yield statement...
> which will produce the exact same generator as before,
> <span style="background-color:#33ffff">but allows you to use more complex logic than you can in a list comprehension</span>

#### Training a new tokenizer

In [6]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [7]:
type(old_tokenizer)

transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast

In [8]:
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
print(tokens)

['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [9]:
old_tokenizer.vocab_size

50257

##### Please read

> Note that AutoTokenizer.train_new_from_iterator() only works if the tokenizer you are using is a “fast” tokenizer. 

* [API documentation for `Tokenizer.train_new_from_iterator`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizerFast.train_new_from_iterator)
* StackOverflow question from alvas: [How to add new tokens to an existing Huggingface AutoTokenizer?](https://stackoverflow.com/questions/76198051/how-to-add-new-tokens-to-an-existing-huggingface-tokenizer)

In [10]:
%%time
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)




CPU times: user 5min 53s, sys: 9.82 s, total: 6min 2s
Wall time: 1min 40s


In [11]:
tokens = tokenizer.tokenize(example)
print(tokens)

['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [12]:
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

27
36


In [13]:
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """

print(tokenizer.tokenize(example))

['class', 'ĠLinear', 'Layer', '():', 'ĊĠĠĠ', 'Ġdef', 'Ġ__', 'init', '__(', 'self', ',', 'Ġinput', '_', 'size', ',', 'Ġoutput', '_', 'size', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'randn', '(', 'input', '_', 'size', ',', 'Ġoutput', '_', 'size', ')', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'bias', 'Ġ=', 'Ġtorch', '.', 'zeros', '(', 'output', '_', 'size', ')', 'ĊĊĠĠĠ', 'Ġdef', 'Ġ__', 'call', '__(', 'self', ',', 'Ġx', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'bias', 'ĊĠĠĠĠ']


In [14]:
tokenizer.save_pretrained("code-search-net-tokenizer")

('code-search-net-tokenizer/tokenizer_config.json',
 'code-search-net-tokenizer/special_tokens_map.json',
 'code-search-net-tokenizer/vocab.json',
 'code-search-net-tokenizer/merges.txt',
 'code-search-net-tokenizer/added_tokens.json',
 'code-search-net-tokenizer/tokenizer.json')

In [15]:
!ls -la code-search-net-tokenizer

total 4864
drwxr-xr-x 2 se_olliphant se_olliphant    4096 Feb  8 06:02 .
drwxr-xr-x 5 se_olliphant se_olliphant    4096 Feb 11 07:30 ..
-rw-r--r-- 1 se_olliphant se_olliphant  466894 Feb 11 07:30 merges.txt
-rw-r--r-- 1 se_olliphant se_olliphant      99 Feb 11 07:30 special_tokens_map.json
-rw-r--r-- 1 se_olliphant se_olliphant 3673415 Feb 11 07:30 tokenizer.json
-rw-r--r-- 1 se_olliphant se_olliphant     471 Feb 11 07:30 tokenizer_config.json
-rw-r--r-- 1 se_olliphant se_olliphant  822037 Feb 11 07:30 vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


,,,

...

In [16]:
# Replace "huggingface-course" below with your actual namespace to use your own tokenizer
tokenizer = AutoTokenizer.from_pretrained("code-search-net-tokenizer")

#### Fast tokenizers’ special powers

> <span style="background-color:#33FFFF"><b>Slow</b> tokenizers</span> are those written in Python inside the 🤗 Transformers library<p/>
> <span style="background-color:#AFFF33">the <b>fast</b> versions</span> are the ones provided by 🤗 Tokenizers, which are written in Rust

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
print(f"tokenizer.is_fast ? {tokenizer.is_fast}")

tokenizer.is_fast ? True


In [18]:
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)

print(f"example: {example}")
print(type(encoding))
print()

print(f"encoding.is_fast ? {encoding.is_fast}\n")
print(f"tokens:\n{encoding.tokens()}\n")
print(f"word IDs:\n{encoding.word_ids()}")

example: My name is Sylvain and I work at Hugging Face in Brooklyn.
<class 'transformers.tokenization_utils_base.BatchEncoding'>

encoding.is_fast ? True

tokens:
['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']

word IDs:
[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]


...

In [19]:
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

check_this = "81s"

encoding_bert = tokenizer(check_this)
encoding_roberta = tokenizer_roberta(check_this)

print("bert-base-cased")
print(f"tokens: {encoding_bert.tokens()}")
print(f"word IDs: {encoding_bert.word_ids()}")

print()

print("roberta-based")
print(f"tokens: {encoding_roberta.tokens()}")
print(f"word IDs: {encoding_roberta.word_ids()}")

bert-base-cased
tokens: ['[CLS]', '81', '##s', '[SEP]']
word IDs: [None, 0, 0, None]

roberta-based
tokens: ['<s>', '81', 's', '</s>']
word IDs: [None, 0, 1, None]


...

In [20]:
start, end = encoding.word_to_chars(3)
example[start:end]

'Sylvain'

...

##### There is no `sentence_ids` method on the return encodings from `tokenizer(input)`!

In [21]:
sentence_1 = "I am the son and heir of a shyness that is criminally vulgar. I am the son and the heir of nothing in particular. Really, I am."
sentence_2 = "If it's not love, then it's the Bomb that'll bring us together."

encoding_sentence_1 = tokenizer(sentence_1)
encoding_sentence_2 = tokenizer(sentence_2)

In [22]:
print(dir(encoding_sentence_1))

['_MutableMapping__marker', '__abstractmethods__', '__class__', '__class_getitem__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__or__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_encodings', '_n_sequences', 'char_to_token', 'char_to_word', 'clear', 'convert_to_tensors', 'copy', 'data', 'encodings', 'fromkeys', 'get', 'is_fast', 'items', 'keys', 'n_sequences', 'pop', 'popitem', 'sequence_ids', 'setdefault', 'to', 'token_to_chars', 'token_to_sequence', 'token_to_word', 'tokens', 'update', 'values', 'word_ids', 'word_to_chars', 'word_to_toke

...

#### Inside the token-classification pipeline

> First, let’s grab a token classification pipeline so we can get some results to compare manually. The model used by default is [`dbmdz/bert-large-cased-finetuned-conll03-english`](https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english); it performs NER on sentences


In [23]:
from transformers import pipeline

token_classifier = pipeline("token-classification")

for ntt in token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn."):
    print(ntt)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
hugging

{'entity': 'I-PER', 'score': 0.99938285, 'index': 4, 'word': 'S', 'start': 11, 'end': 12}
{'entity': 'I-PER', 'score': 0.99815494, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14}
{'entity': 'I-PER', 'score': 0.9959072, 'index': 6, 'word': '##va', 'start': 14, 'end': 16}
{'entity': 'I-PER', 'score': 0.99923277, 'index': 7, 'word': '##in', 'start': 16, 'end': 18}
{'entity': 'I-ORG', 'score': 0.9738931, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35}
{'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40}
{'entity': 'I-ORG', 'score': 0.9887976, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45}
{'entity': 'I-LOC', 'score': 0.9932106, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}


In [24]:
from transformers import pipeline

token_classifier = pipeline("token-classification", aggregation_strategy="simple")
#                                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

for ntt in token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn."):
    print(ntt)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18}
{'entity_group': 'ORG', 'score': 0.9796019, 'word': 'Hugging Face', 'start': 33, 'end': 45}
{'entity_group': 'LOC', 'score': 0.9932106, 'word': 'Brooklyn', 'start': 49, 'end': 57}


##### From inputs to predictions

... let's try doing the same w/out using `pipeline`

In [25]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

print(model)
print()
print(model.config)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [26]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"tokenizer.is_fast ? {tokenizer.is_fast}")

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="pt")

outputs = model(**inputs)

tokenizer.is_fast ? True


In [27]:
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

torch.Size([1, 19])
torch.Size([1, 19, 9])


##### NOTE

* 1st `dim` is batch (index)
* 2nd `dim` is sequence (length)
* 3rd `dim` is logits (labels)

We use Torch's [`torch.nn.functional.softmax`](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html#torch-nn-functional-softmax) to convert the logits to probabilities, and then `argmax` to get the final NER prediction.

In [28]:
import torch

probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = outputs.logits.argmax(dim=-1)[0].tolist()

In [29]:
predictions_labels = [
    model.config.id2label[p]
    for p in predictions
]

In [30]:
for tok, pred, label in zip(inputs.tokens(), predictions, predictions_labels):
    print(tok, pred, label)

[CLS] 0 O
My 0 O
name 0 O
is 0 O
S 4 I-PER
##yl 4 I-PER
##va 4 I-PER
##in 4 I-PER
and 0 O
I 0 O
work 0 O
at 0 O
Hu 6 I-ORG
##gging 6 I-ORG
Face 6 I-ORG
in 0 O
Brooklyn 8 I-LOC
. 0 O
[SEP] 0 O


In [31]:
results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "O":
        results.append(
            {"entity": label, "score": probabilities[idx][pred], "word": tokens[idx]}
        )

results

[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S'},
 {'entity': 'I-PER', 'score': 0.9981548190116882, 'word': '##yl'},
 {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va'},
 {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in'},
 {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Hu'},
 {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging'},
 {'entity': 'I-ORG', 'score': 0.9887974858283997, 'word': 'Face'},
 {'entity': 'I-LOC', 'score': 0.99321049451828, 'word': 'Brooklyn'}]

In [32]:
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
#                                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^

inputs_with_offsets["offset_mapping"]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 12),
 (12, 14),
 (14, 16),
 (16, 18),
 (19, 22),
 (23, 24),
 (25, 29),
 (30, 32),
 (33, 35),
 (35, 40),
 (41, 45),
 (46, 48),
 (49, 57),
 (57, 58),
 (0, 0)]

In [33]:
example[12:14]

'yl'

In [34]:
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "O":
        start, end = offsets[idx]
        results.append(
            {
                "entity": label,
                "score": probabilities[idx][pred],
                "word": tokens[idx],
                "start": start,
                "end": end,
            }
        )

results

[{'entity': 'I-PER',
  'score': 0.9993828535079956,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.9981548190116882,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.995907187461853,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.9992327690124512,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931059837341,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.9761149883270264,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887974858283997,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [35]:
example[33:45]

'Hugging Face'

In [36]:
import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label != "O":
        # Remove the B- or I-
        label = label[2:]
        start, _ = offsets[idx]

        # Grab all the tokens labeled with I-label
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]] == f"I-{label}"
        ):
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]
            idx += 1

        # The score is the mean of all the scores of the tokens in that grouped entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {
                "entity_group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end,
            }
        )
    idx += 1

results

[{'entity_group': 'PER',
  'score': 0.998169407248497,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796018600463867,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.99321049451828,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

#### Fast tokenizers in the QA pipeline

##### QA using `pipeline`

In [37]:
from transformers import pipeline

question = "Which deep learning libraries back 🤗 Transformers?"

context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

question_answerer = pipeline("question-answering")
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


{'score': 0.9802603125572205,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [38]:
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
#                                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# ... see now how the answer is almost at the very end of this
#     long, long context ...?

question_answerer(question=question, context=long_context)

{'score': 0.9714871048927307,
 'start': 1892,
 'end': 1919,
 'answer': 'Jax, PyTorch and TensorFlow'}

##### Using a model for question answering

... where we do things the hard way.

> The checkpoint used by default for the question-answering pipeline is [`distilbert-base-cased-distilled-squad`](https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad)

> Models for question answering work a little differently from the models we’ve seen up to now. Using the picture above as an example, the model has been trained to predict the index of the token starting the answer (here 21) and the index of the token where the answer ends (here 24). This is why those models don’t return one tensor of logits but two: one for the logits corresponding to the start token of the answer, and one for the logits corresponding to the end token of the answer.

In [39]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

##### For QA, we have logits on both the start of span with the answer; and the end of the span with the answer.

... or we need to use the `[CLS]` token for indicating an impossible answer.

In [40]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits

print(f"num. of tokens? {len(inputs.tokens())}")
print(start_logits.shape, end_logits.shape)

num. of tokens? 67
torch.Size([1, 67]) torch.Size([1, 67])


> To convert those logits into probabilities, we will apply a softmax function — but before that, we need to make sure we mask the indices that are not part of the context. Our input is `[CLS]` question [SEP] context `[SEP]`, so we need to mask the tokens of the question as well as the `[SEP]` token. We’ll keep the `[CLS]` token, however, as some models use it to indicate that the answer is not in the context.

##### To clarify

* We want to calculate the probabilities for the `start` and `end` tokens using <u>only the context</u> and not the question.
* We do that by setting the probabilities on the tokens in the question, as well as the `[SEP]` BERT tokens to 0
* Some models use `[CLS]` for indicating an impossible answer (answer could not be found in the context), so we need to allow that through for calculating probabilities.
* So, we need to create a mask of `1`s (or `True` on the positions for the tokens in the question and those `[SEP]` tokens
* We can then set the logits on those tokens to some large, negative numbers, since $Softmax(x_{i}) = \frac{exp(x_{i})}{\sum_j{exp(x_{j})}}$ and the exponent of a large, negative number $x_{i}$ yields 0.

We will use the `sequence_ids` values for building up a mask which we will use for setting the probabilities for the tokens in the question (and BERT special tokens) to 0. 

`sequence_ids` values are:
* `None` for `[CLS]`, `[SEP]` special BERT tokens
* `0` for the question
* `1` for the context

In [41]:
import torch

sequence_ids = inputs.sequence_ids()
#print(inputs.sequence_ids())
#print()

list(zip(inputs.tokens(), sequence_ids))

[('[CLS]', None),
 ('Which', 0),
 ('deep', 0),
 ('learning', 0),
 ('libraries', 0),
 ('back', 0),
 ('[UNK]', 0),
 ('Transformers', 0),
 ('?', 0),
 ('[SEP]', None),
 ('[UNK]', 1),
 ('Transformers', 1),
 ('is', 1),
 ('backed', 1),
 ('by', 1),
 ('the', 1),
 ('three', 1),
 ('most', 1),
 ('popular', 1),
 ('deep', 1),
 ('learning', 1),
 ('libraries', 1),
 ('—', 1),
 ('Jax', 1),
 (',', 1),
 ('P', 1),
 ('##y', 1),
 ('##T', 1),
 ('##or', 1),
 ('##ch', 1),
 (',', 1),
 ('and', 1),
 ('Ten', 1),
 ('##sor', 1),
 ('##F', 1),
 ('##low', 1),
 ('—', 1),
 ('with', 1),
 ('a', 1),
 ('sea', 1),
 ('##m', 1),
 ('##less', 1),
 ('integration', 1),
 ('between', 1),
 ('them', 1),
 ('.', 1),
 ('It', 1),
 ("'", 1),
 ('s', 1),
 ('straightforward', 1),
 ('to', 1),
 ('train', 1),
 ('your', 1),
 ('models', 1),
 ('with', 1),
 ('one', 1),
 ('before', 1),
 ('loading', 1),
 ('them', 1),
 ('for', 1),
 ('in', 1),
 ('##ference', 1),
 ('with', 1),
 ('the', 1),
 ('other', 1),
 ('.', 1),
 ('[SEP]', None)]

In [42]:
# Mask indicating the positions of the question tokens and [SEP]
mask = [i != 1 for i in sequence_ids]
#print(mask)

# Unmask the [CLS] token
mask[0] = False
#print(mask)
#print()

# N.B., the mask needs to be the same shape
#       as the tensor of output logits
#mask = torch.tensor(mask)[None]
mask = torch.tensor(mask).unsqueeze(dim=0)
#mask = torch.tensor(mask)
print(mask.shape)

torch.Size([1, 67])


In [43]:
start_logits[mask] = -10000
end_logits[mask] = -10000

In [44]:
start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)[0]
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)[0]

##### Create a matrix of scores

`start_probabilities[:, None] * end_probabilities[None, :]`, or `unsqueeze`ing the tensors will create a $n \times n$ tensor (matrix) of scores.

In [45]:
scores = start_probabilities[:, None] * end_probabilities[None, :]
scores.shape

torch.Size([67, 67])

[API documentation for `torch.triu`](https://pytorch.org/docs/stable/generated/torch.triu.html#torch-triu)

In [46]:
scores = torch.triu(scores)
scores

tensor([[9.4340e-13, 0.0000e+00, 0.0000e+00,  ..., 1.1023e-12, 1.6345e-12,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 9.1136e-14, 1.3514e-13,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.2744e-13,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], grad_fn=<TriuBackward0>)

> Now we just have to get the index of the maximum. Since PyTorch will return the index in the flattened tensor, we need to use the floor division // and modulus % operations to get the `start_index` and `end_index`

##### Clarification: `//` and `%` operations to get start and end???

That explanation above is somewhat lacking...

From ['s response to Get indices of the max of a 2D Tensor question at discuss.pytorch.org](https://discuss.pytorch.org/t/get-indices-of-the-max-of-a-2d-tensor/82150/5):

> Since `argmax()` gives you the index in a flattened tensor, you can infer the position in your 2D tensor from size of the last dimension.
> E.g. if `argmax()` returns 10 and you’ve got 4 columns, you know it’s on row 2, column 2.
> You can use Python's (built-in function) [`divmod`](https://docs.python.org/3/library/functions.html#divmod) for this

In [47]:
max_index = scores.argmax().item()
print(type(max_index))

start_index = max_index // scores.shape[1]
end_index   = max_index %  scores.shape[1]
#start_index, end_index = divmod(max_index, scores.shape[1])

print(f"start_index: {start_index}")
print(f"end_index: {end_index}")

print(scores[start_index, end_index])

<class 'int'>
start_index: 23
end_index: 35
tensor(0.9803, grad_fn=<SelectBackward0>)


##### Slight detour for extra credit

> ✏️ Try it out! Compute the start and end indices for the five most likely answers.

Please see API documentation for [`torch.topk`](https://pytorch.org/docs/stable/generated/torch.Tensor.topk.html)

In [48]:
values, indices = torch.topk(scores.flatten(), 5)
start_and_end = [
    divmod(max_index, scores.shape[1])
    for max_index in indices.tolist()
]
start_and_end

[(23, 35), (23, 36), (16, 35), (23, 29), (25, 35)]

...

In [49]:
inputs_with_offsets = tokenizer(question, context, return_offsets_mapping=True)
print(inputs_with_offsets)

{'input_ids': [101, 5979, 1996, 3776, 9818, 1171, 100, 25267, 136, 102, 100, 25267, 1110, 5534, 1118, 1103, 1210, 1211, 1927, 1996, 3776, 9818, 783, 13612, 117, 153, 1183, 1942, 1766, 1732, 117, 1105, 5157, 21484, 2271, 6737, 783, 1114, 170, 2343, 1306, 2008, 9111, 1206, 1172, 119, 1135, 112, 188, 21546, 1106, 2669, 1240, 3584, 1114, 1141, 1196, 10745, 1172, 1111, 1107, 16792, 1114, 1103, 1168, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 5), (6, 10), (11, 19), (20, 29), (30, 34), (35, 36), (37, 49), (49, 50), (0, 0), (1, 2), (3, 15), (16, 18), (19, 25), (26, 28), (29, 32), (33, 38), (39, 43), (44, 51), (52, 56), (57, 65), (66, 75), (76, 77), (78, 81), (81, 82), (83, 84), (84, 85), (85, 86), (86, 88), (88, 90), (90, 91), (92, 95), (96, 99), (99, 102), (102, 103), (103, 10

In [50]:
offsets = inputs_with_offsets["offset_mapping"]

start_char, _ = offsets[start_index]
_, end_char = offsets[end_index]
answer = context[start_char:end_char]

In [53]:
result = {
    "answer": answer,
    "start": start_char,
    "end": end_char,
    "score": scores[start_index, end_index],
}
result

{'answer': 'Jax, PyTorch, and TensorFlow',
 'start': 78,
 'end': 106,
 'score': tensor(0.9803, grad_fn=<SelectBackward0>)}

...

In [52]:
question_answerer(question=question, context=context, top_k=5)

[{'score': 0.9802603125572205,
  'start': 78,
  'end': 106,
  'answer': 'Jax, PyTorch, and TensorFlow'},
 {'score': 0.008247793652117252,
  'start': 78,
  'end': 108,
  'answer': 'Jax, PyTorch, and TensorFlow —'},
 {'score': 0.0013676970265805721,
  'start': 78,
  'end': 90,
  'answer': 'Jax, PyTorch'},
 {'score': 0.00038108686567284167,
  'start': 83,
  'end': 106,
  'answer': 'PyTorch, and TensorFlow'},
 {'score': 0.00021684505918528885,
  'start': 96,
  'end': 106,
  'answer': 'TensorFlow'}]

#### Handling long contexts