# What's inside a Pipeline 2 (A different example)

In [1]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.23.1-py3-none-any.whl (5.3 MB)
Collecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from transformers import pipeline

In [3]:
ner = pipeline("ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [4]:
input = ["I live in Colombo from 2010 & my name is John", "I am Josh"]

In [5]:
ner(input)

[[{'entity': 'I-LOC',
   'score': 0.9994998,
   'index': 4,
   'word': 'Colombo',
   'start': 10,
   'end': 17},
  {'entity': 'I-PER',
   'score': 0.99862194,
   'index': 11,
   'word': 'John',
   'start': 41,
   'end': 45}],
 [{'entity': 'I-PER',
   'score': 0.9892971,
   'index': 3,
   'word': 'Josh',
   'start': 5,
   'end': 9}]]

## Let's build this from scratch

In [6]:
checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"

In [7]:
from transformers import AutoTokenizer
tokz = AutoTokenizer.from_pretrained(checkpoint)

In [8]:
tokens = tokz(input, padding=True, truncation=True, return_tensors="pt")
tokens

{'input_ids': tensor([[  101,   146,  1686,  1107, 17245,  1121,  1333,   111,  1139,  1271,
          1110,  1287,   102],
        [  101,   146,  1821,  5868,   102,     0,     0,     0,     0,     0,
             0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [9]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

In [10]:
outputs = model(**tokens)

In [11]:
outputs.logits.shape

torch.Size([2, 13, 9])

In [12]:
result = outputs.logits.argmax(dim=2)
result

tensor([[0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 4, 0],
        [0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [13]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [14]:
result2 = [r[r != 0] for r in result]
result2

[tensor([8, 4]), tensor([4])]

In [15]:
[[model.config.id2label[i.item()] for i in r] for r in result2]

[['I-LOC', 'I-PER'], ['I-PER']]