In [1]:
# !pip install transformers

In [2]:
# conll 2003
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_train.pkl
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_test.pkl

O arquivo “ner_train.pkl” já existe, não será baixado.

O arquivo “ner_test.pkl” já existe, não será baixado.



In [3]:
from transformers import pipeline

import pickle

In [4]:
ner = pipeline("ner", aggregation_strategy='simple', device=0)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
with open("ner_train.pkl", "rb") as f:
    corpus_train = pickle.load(f)
    
with open("ner_test.pkl", "rb") as f:
    corpus_test = pickle.load(f)
    

In [6]:
corpus_test[0]

[('CRICKET', 'O'),
 ('-', 'O'),
 ('LEICESTERSHIRE', 'B-ORG'),
 ('TAKE', 'O'),
 ('OVER', 'O'),
 ('AT', 'O'),
 ('TOP', 'O'),
 ('AFTER', 'O'),
 ('INNINGS', 'O'),
 ('VICTORY', 'O'),
 ('.', 'O')]

In [7]:
inputs = []
targets = []

for sentence_tag_pairs in corpus_test:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)


In [8]:
def showNER(inputs, tags):
    assert len(inputs) == len(tags)
    for i, t in zip(inputs, tags):
        print(i, " --> ",t)
        
showNER(inputs[9], targets[9])

He  -->  O
was  -->  O
well  -->  O
backed  -->  O
by  -->  O
England  -->  B-LOC
hopeful  -->  O
Mark  -->  B-PER
Butcher  -->  I-PER
who  -->  O
made  -->  O
70  -->  O
as  -->  O
Surrey  -->  B-ORG
closed  -->  O
on  -->  O
429  -->  O
for  -->  O
seven  -->  O
,  -->  O
a  -->  O
lead  -->  O
of  -->  O
234  -->  O
.  -->  O


In [9]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenizer = TreebankWordDetokenizer()

In [10]:
# Instructors words about this:
# Why do we need this? Because the transformers pipeline works with raw text, not tokenized text
#
# But why can't we simple use " ".join()?
# For reasons such as: if we have a period, there sould be a space after, but not before the period mark.
# For hipens, there sould be no spaces. For regular words, there should be a space
# So, there are a lot of specific rules. We will use TreebankWordDetokenizer because it know the rules!
# So, TreebankWordDetokenizer can construct our sentece correctly starting with the tokens.
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [11]:
targets[9]

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [12]:
ner(detokenizer.detokenize(inputs[9]))

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [13]:
def compute_prediction(tokens, input_, ner_result):
    # map hugging face ner result to list of tags for later performance assessment
    # tokens is the original tokenized sentence
    # input_ is the detokenized string

    predicted_tags = []
    state = "O"  # keep track of state, so if O --> B, if B --> I, if I --> I
    current_index = 0
    for token in tokens:
        # find the token in the input_ (should be at or near the start)
        index = input_.find(token)
        assert index >= 0
        current_index += index  # where we are currently pointing to

        # print(token, current_index) # debug

        # check if this index belongs to an entity and assign label
        tag = "O"
        for entity in ner_result:
            if current_index >= entity["start"] and current_index < entity["end"]:
                # then this token belongs to an entity
                if state == "O":
                    state = "B"
                else:
                    state = "I"
                tag = f"{state}-{entity['entity_group']}"
                break
        if tag == "O":
            # reset the state
            state = "O"
        predicted_tags.append(tag)

        # remove the token from input_
        input_ = input_[index + len(token) :]

        # update current_index
        current_index += len(token)

    # sanity check
    # print("len(predicted_tags)", len(predicted_tags))
    # print("len(tokens)", len(tokens))
    assert len(predicted_tags) == len(tokens)
    return predicted_tags


In [14]:
input_ = detokenizer.detokenize(inputs[9])
ner_result = ner(input_)
ptags = compute_prediction(inputs[9], input_, ner_result)

In [15]:
ptags

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [16]:
showNER(inputs[9], ptags)

He  -->  O
was  -->  O
well  -->  O
backed  -->  O
by  -->  O
England  -->  B-LOC
hopeful  -->  O
Mark  -->  B-PER
Butcher  -->  I-PER
who  -->  O
made  -->  O
70  -->  O
as  -->  O
Surrey  -->  B-ORG
closed  -->  O
on  -->  O
429  -->  O
for  -->  O
seven  -->  O
,  -->  O
a  -->  O
lead  -->  O
of  -->  O
234  -->  O
.  -->  O


In [17]:
from sklearn.metrics import accuracy_score, f1_score

In [18]:
accuracy_score(targets[9], ptags)

1.0

In [19]:
detok_inputs = []
for tokens in inputs:
    text = detokenizer.detokenize(tokens)
    detok_inputs.append(text)
    
detok_inputs[:2]

['CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY.',
 'West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship.']

In [20]:
ner_results = ner(detok_inputs)

In [21]:
predictions = []
for tokens, text, ner_result in zip(inputs, detok_inputs, ner_results):
    pred = compute_prediction(tokens, text, ner_result)
    predictions.append(pred)
    

print(predictions)

[['O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O'], ['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-LOC', 'O']

In [22]:
def flatten(list_of_lists):
     return [item for sublist in list_of_lists for item in sublist]


In [23]:
flat_predictions = flatten(predictions)
flat_targets = flatten(targets)

In [24]:
flat_predictions[:20]

['O',
 'O',
 'B-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'I-MISC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O']

In [25]:
flat_targets[:20]

['O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'I-MISC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O']

In [26]:
# Compute overal accuracy
accuracy_score(flat_targets, flat_predictions)

0.9916563354782848

In [27]:
# Compute overal f1_score
f1_score(flat_targets, flat_predictions, average='macro')

0.95403328229255

In [28]:
# Exercise: compare this results with other models you studied (in other courses), such as RNNs