In [1]:
import transformers

# Limit the transformers library's logging to errors only, so that its
# informational and warning messages do not clutter the cell outputs.
transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


## 06.02. Running the standard NER pipeline

In [2]:
from transformers import pipeline

# Sample sentence mentioning people, a location, an organisation, a date and
# an amount. Adjacent string literals are concatenated, so the text is one
# single line with exactly one space after "August.".
input_text = (
    "Sam went to California on the 23rd of August. "
    "There, he visited Google headquarters with John Smith and bought a cap for $23"
)

# Name the checkpoint explicitly instead of relying on the library's implicit
# default for the "ner" task: the default may change between transformers
# releases, which would silently change the results shown below. This is the
# checkpoint reported by the model configuration printed later in the notebook.
basic_ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
)

# Without an aggregation strategy the pipeline reports one entry per tagged token.
basic_ner(input_text)


Downloading: 100%|██████████████████████████████████████████████████████████████████████████████| 998/998 [00:00<00:00, 155kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████| 1.33G/1.33G [00:43<00:00, 30.4MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████| 60.0/60.0 [00:00<00:00, 35.1kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████| 213k/213k [00:00<00:00, 810kB/s]


[{'entity': 'I-PER',
  'score': 0.99887806,
  'index': 1,
  'word': 'Sam',
  'start': 0,
  'end': 3},
 {'entity': 'I-LOC',
  'score': 0.99972683,
  'index': 4,
  'word': 'California',
  'start': 12,
  'end': 22},
 {'entity': 'I-ORG',
  'score': 0.9960085,
  'index': 15,
  'word': 'Google',
  'start': 64,
  'end': 70},
 {'entity': 'I-PER',
  'score': 0.99891376,
  'index': 18,
  'word': 'John',
  'start': 89,
  'end': 93},
 {'entity': 'I-PER',
  'score': 0.99921584,
  'index': 19,
  'word': 'Smith',
  'start': 94,
  'end': 99}]

## 06.03. Understanding the model architecture

In [3]:
# Print the module tree of the model wrapped by the pipeline
# (embeddings, encoder layers, and so on).
print(basic_ner.model)


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

## 06.04. Reviewing the model configuration

In [5]:
# Print the model's configuration: the checkpoint name, the label <-> id
# mappings and the network hyperparameters.
print(basic_ner.model.config)


BertConfig {
  "_name_or_path": "dbmdz/bert-large-cased-finetuned-conll03-english",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_s

## 06.05. Using a custom model and tokenizer

In [6]:
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

# Hugging Face Hub checkpoint shared by the tokenizer and the model. Naming it
# once keeps the two from_pretrained() calls in sync.
NER_CHECKPOINT = "Jean-Baptiste/camembert-ner-with-dates"

# from_pt is a model-weights option and has no meaning for a tokenizer,
# so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)

# from_pt=True asks transformers to build the TensorFlow model from the
# checkpoint's PyTorch weights.
model = TFAutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT,
                                                          from_pt=True)

# Show the id -> label mapping to see which entity types this model predicts.
print(model.config.id2label)

Downloading: 100%|██████████████████████████████████████████████████████████████████████████████| 423/423 [00:00<00:00, 251kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████| 970/970 [00:00<00:00, 607kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████| 811k/811k [00:00<00:00, 5.26MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████| 210/210 [00:00<00:00, 112kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████████████| 440M/440M [00:15<00:00, 29.1MB/s]


{0: 'O', 1: 'I-LOC', 2: 'I-PER', 3: 'I-MISC', 4: 'I-ORG', 5: 'I-DATE'}


In [7]:
# Build an NER pipeline around the custom model and tokenizer.
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (e.g. a multi-word name as a single entry) rather than one entry per token.
enhanced_ner = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Run the prediction on the same sample sentence used with the standard pipeline.
enhanced_ner(input_text)

[{'entity_group': 'PER',
  'score': 0.9776213,
  'word': 'Sam',
  'start': 0,
  'end': 3},
 {'entity_group': 'LOC',
  'score': 0.9936407,
  'word': 'California',
  'start': 11,
  'end': 22},
 {'entity_group': 'DATE',
  'score': 0.92355955,
  'word': 'August',
  'start': 37,
  'end': 44},
 {'entity_group': 'ORG',
  'score': 0.5721681,
  'word': 'Google',
  'start': 63,
  'end': 70},
 {'entity_group': 'PER',
  'score': 0.9938346,
  'word': 'John Smith',
  'start': 88,
  'end': 99},
 {'entity_group': 'DATE',
  'score': 0.64064246,
  'word': '23',
  'start': 122,
  'end': 124}]