<a href="https://colab.research.google.com/github/arthurziegler/transformers-for-NLP/blob/main/notebooks/Pipeline_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from transformers import pipeline

In [5]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default, so results stay reproducible if the library's default model changes.
# These are the values the pipeline previously reported falling back to.
ner = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    revision="f2482bf",
    aggregation_strategy="simple",  # merge word pieces into whole-entity groups
    device=0,  # first GPU; NOTE(review): presumably needs device=-1 on a CPU-only runtime — confirm
)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [6]:
import pickle

In [7]:
# Download the pickled train/test NER corpora.
# -nc (no-clobber) skips a file that is already present locally.
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_train.pkl
!wget -nc https://lazyprogrammer.me/course_files/nlp/ner_test.pkl

File ‘ner_train.pkl’ already there; not retrieving.

File ‘ner_test.pkl’ already there; not retrieving.



In [8]:
# Load the two pickled corpora fetched by the wget cell.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this on files from a source you trust.
with open('ner_train.pkl', 'rb') as f:
    corpus_train = pickle.load(f)

with open('ner_test.pkl', 'rb') as f:
    corpus_test = pickle.load(f)

In [63]:
# Number of sentences in the train and test corpora, in that order.
print(len(corpus_train), len(corpus_test))

12733 2970


In [62]:
# Number of sentences in the test corpus.
len(corpus_test)

2970

In [64]:
# Peek at the first two test sentences: each is a list of (token, tag) pairs.
corpus_test[:2]

[[('CRICKET', 'O'),
  ('-', 'O'),
  ('LEICESTERSHIRE', 'B-ORG'),
  ('TAKE', 'O'),
  ('OVER', 'O'),
  ('AT', 'O'),
  ('TOP', 'O'),
  ('AFTER', 'O'),
  ('INNINGS', 'O'),
  ('VICTORY', 'O'),
  ('.', 'O')],
 [('West', 'B-MISC'),
  ('Indian', 'I-MISC'),
  ('all-rounder', 'O'),
  ('Phil', 'B-PER'),
  ('Simmons', 'I-PER'),
  ('took', 'O'),
  ('four', 'O'),
  ('for', 'O'),
  ('38', 'O'),
  ('on', 'O'),
  ('Friday', 'O'),
  ('as', 'O'),
  ('Leicestershire', 'B-ORG'),
  ('beat', 'O'),
  ('Somerset', 'B-ORG'),
  ('by', 'O'),
  ('an', 'O'),
  ('innings', 'O'),
  ('and', 'O'),
  ('39', 'O'),
  ('runs', 'O'),
  ('in', 'O'),
  ('two', 'O'),
  ('days', 'O'),
  ('to', 'O'),
  ('take', 'O'),
  ('over', 'O'),
  ('at', 'O'),
  ('the', 'O'),
  ('head', 'O'),
  ('of', 'O'),
  ('the', 'O'),
  ('county', 'O'),
  ('championship', 'O'),
  ('.', 'O')]]

In [10]:
# Split every test sentence into two parallel lists:
#   inputs[i]  -> the words of sentence i
#   targets[i] -> the gold tag of each of those words
inputs = [[word for word, _ in sentence] for sentence in corpus_test]
targets = [[label for _, label in sentence] for sentence in corpus_test]

In [11]:
# Tokens of the test sentence at index 9, used as the running example.
inputs[9]

['He',
 'was',
 'well',
 'backed',
 'by',
 'England',
 'hopeful',
 'Mark',
 'Butcher',
 'who',
 'made',
 '70',
 'as',
 'Surrey',
 'closed',
 'on',
 '429',
 'for',
 'seven',
 ',',
 'a',
 'lead',
 'of',
 '234',
 '.']

In [12]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [13]:
# Detokenizer used to join token lists back into plain-text sentences.
detokenizer = TreebankWordDetokenizer()

In [14]:
# Rebuild the example sentence as one string; punctuation is re-attached to the preceding word.
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [15]:
# Gold tags for the example sentence, one per token.
targets[9]

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [16]:
# Run the pipeline on the detokenized example. Each returned entity carries its
# group, score, surface text and start/end character offsets into the string.
ner(detokenizer.detokenize(inputs[9]))

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [35]:
def compute_prediction(tokens, input_, ner_result):
    """Map Hugging Face NER pipeline output to one B-/I-/O tag per token.

    Each token is located in ``input_`` by scanning left to right. The
    character offset where the token starts is then compared with the
    ``start``/``end`` span of every entity in ``ner_result``; the first entity
    whose span contains that offset determines the tag.

    Args:
        tokens: the original tokenized sentence (list of words).
        input_: the detokenized string (the phrase) that was fed to the pipeline.
        ner_result: the output of the Hugging Face pipeline for ``input_``: a
            list of dicts providing ``'entity_group'``, ``'start'`` and
            ``'end'`` keys.

    Returns:
        A list with one tag per token. Tokens outside every entity get ``'O'``.
        The first token inside an entity span gets ``'B-<group>'`` and any
        following tokens inside that same span get ``'I-<group>'``.

    Raises:
        AssertionError: if a token cannot be found in the not-yet-consumed part
            of ``input_``.
    """
    predicted_tags = []
    previous_entity = None  # position in ner_result of the span covering the previous token
    cursor = 0  # absolute offset in input_ just past the previous token

    for token in tokens:
        # Search only from the cursor onwards: if the same word occurs more
        # than once, it must match its next occurrence, not the first one.
        start = input_.find(token, cursor)
        assert start >= 0, f"token {token!r} not found in input at or after offset {cursor}"

        # Check whether this offset belongs to an entity and assign the label.
        tag = 'O'
        current_entity = None
        for position, entity in enumerate(ner_result):
            if entity['start'] <= start < entity['end']:
                # Continue with I- only while we stay inside the *same* span.
                # Remembering which span the previous token was in (not merely
                # that it was in one) lets an entity that directly follows
                # another one start with B- as it should.
                prefix = 'I' if position == previous_entity else 'B'
                tag = f"{prefix}-{entity['entity_group']}"
                current_entity = position
                break
        predicted_tags.append(tag)

        previous_entity = current_entity
        cursor = start + len(token)

    return predicted_tags

In [36]:
# End-to-end check on the example sentence (index 9).
input_= detokenizer.detokenize(inputs[9])  # the phrase that we are tagging
ner_result = ner(input_)  # output of the NER pipeline for that phrase
ptags = compute_prediction(inputs[9], input_, ner_result)  # takes the list of words, the phrase and the NER output

In [37]:
# Predicted tags for the example sentence, one per token.
ptags

['O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [38]:
from sklearn.metrics import accuracy_score, f1_score

In [39]:
# Fraction of tokens in the example sentence whose predicted tag equals the gold tag.
accuracy_score(targets[9], ptags)

1.0

In [40]:
# Print gold tag and predicted tag side by side, one token per line.
for gold_and_predicted in zip(targets[9], ptags):
    print(*gold_and_predicted)

O O
O O
O O
O O
O O
B-LOC B-LOC
O O
B-PER B-PER
I-PER I-PER
O O
O O
O O
O O
B-ORG B-ORG
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O


In [41]:
# Turn every tokenized test sentence back into a plain string for the NER model.
detok_inputs = [detokenizer.detokenize(sentence_tokens) for sentence_tokens in inputs]

In [43]:
# Inspect the first ten detokenized sentences.
detok_inputs[:10]

['CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY.',
 'West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship.',
 'Their stay on top, though, may be short-lived as title rivals Essex, Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire.',
 'After bowling Somerset out for 83 on the opening morning at Grace Road, Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83.',
 'Trailing by 213, Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174.',
 'Essex, however, look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire at Headingley.',
 "Hussain, considered sur

In [44]:
# Run the NER pipeline over all detokenized test sentences in a single call.
# The result is consumed below as one list of entities per input sentence.
ner_results = ner(detok_inputs)

In [45]:
# Convert each sentence's pipeline output into a per-token tag list.
predictions = [
    compute_prediction(sentence_tokens, sentence_text, sentence_entities)
    for sentence_tokens, sentence_text, sentence_entities
    in zip(inputs, detok_inputs, ner_results)
]

In [48]:
# Predicted tags for the first two test sentences.
predictions[:2]

[['O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-MISC',
  'I-MISC',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [51]:
def flatten(list_of_lists):
    """Concatenate the items of every inner sequence into one flat list.

    The order is preserved: all items of the first inner sequence come first,
    then all items of the second, and so on. Only one level of nesting is
    removed.
    """
    merged = []
    for inner in list_of_lists:
        merged.extend(inner)
    return merged

In [52]:
# Collapse the per-sentence tag lists into two flat lists so they can be scored token by token.
flat_predictions = flatten(predictions)
flat_targets = flatten(targets)

In [54]:
# Sanity check: the two flat lists should contain the same number of tags.
print(len(flat_predictions), len(flat_targets))

50817 50817


In [55]:
# Token-level accuracy over the whole test set ('O' tokens are counted too).
accuracy_score(flat_targets, flat_predictions)

0.9916563354782848

In [56]:
# Macro-averaged F1: the unweighted mean of the per-tag F1 scores.
f1_score(flat_targets, flat_predictions, average='macro')

0.95403328229255