In [9]:
import torch
# import json 
import glob
import tqdm
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [2]:
model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading: 100%|██████████| 829/829 [00:00<00:00, 81.0kB/s]
Downloading: 100%|██████████| 433M/433M [00:24<00:00, 17.7MB/s] 
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 715kB/s] 
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 988B/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 37.4kB/s]
Downloading: 100%|██████████| 59.0/59.0 [00:00<00:00, 14.6kB/s]


In [3]:
nlp = pipeline('ner', model=model, tokenizer=tokenizer)

In [13]:
# small test
text = """The oversized snowflakes fell softly and silently, settling among the pines like a picturesque Christmas scene.
By the roadside, spectators in heavy winter coats watched team cars and motorbikes struggle up one of Liege-Bastogne-Liege's countless climbs, tyres spinning in the slush as they pursued one man on a bike.
It was April 1980 and Bernard Hinault, almost unrecognisable beneath a big red balaclava, slewed doggedly on, further into the lead, somehow remaining balanced on the two wheels beneath him.
He was under such physical strain that he would do himself permanent damage. Pushing his body to its very limit, he raced through the Ardennes in search of victory in the race known as 'La Doyenne' - the old lady.
So bad were the conditions that several of cycling's best riders collected their number from organisers and then never lined up.
After just 70km of the 244km one-day race, 110 of the 174 entrants were already holed up in a hotel by the finish line. Only 21 completed the course. Hinault suffered frostbite.
Rarely do you see such attrition in cycling, but Liege-Bastogne-Liege, which celebrates its 130th birthday on Sunday, has been making and breaking the toughest competitors for years.
Hinault was 25. He had already won the Tour de France twice and would go on to win it a further three times, an icon of his sport in the making. His total of five Tour victories remains a joint record.
But this was a different challenge - a long way from the searing heat and sunflowers of summer.
One of the five prestigious 'Monument' one-day races in cycling, Liege-Bastogne-Liege is celebrated by many for being the very antithesis of the Tour.
In the hills of east and south Belgium the peloton is stretched through thick, damp forest, over short, sharp climbs and across tricky, part-cobbled sections before landing back where it all began in Liege.
"[The race is] already hard, it's long, and when I won it was in very tough conditions, especially the snow," says Hinault, now aged 67.
"Yes, I considered quitting if the weather conditions persisted. We started having difficulties. It's difficult in Liege-Bastogne-Liege."
Hinault's account of one of his greatest triumphs is characteristically taciturn. Tough conditions is a severe understatement. And in the racing he didn't have it all his own way, either.
With around 91km to go, approaching the 500m Stockeu climb, Rudy Pevenage was two minutes 15 seconds ahead of Hinault and a small chasing group.
Pevenage was one of the hard men of the spring classics. He was a Belgian with a big lead, in conditions many locals would feel only a Belgian could master.
But even he did not finish a race that truly separated the men from the legends. 'Neige-Bastogne-Neige,' as it would be dubbed.
On the next climb, a 500m ascent of the Haute Levee, Hinault and a small number of fellow pursuers caught up with Pevenage. Then Hinault launched his attack, bright red balaclava and thick blue gloves disappearing into the distance as his stunning acceleration left everybody behind.
There were still 80km to go.
"""
ner_results = nlp(text, show_tokens=False)
print(ner_results)

[{'word': 'Lie', 'score': 0.6306365132331848, 'entity': 'B-ORG', 'index': 44, 'start': 214, 'end': 217}, {'word': '##ge', 'score': 0.27894917130470276, 'entity': 'I-ORG', 'index': 45, 'start': 217, 'end': 219}, {'word': 'Ba', 'score': 0.7687879800796509, 'entity': 'B-ORG', 'index': 47, 'start': 220, 'end': 222}, {'word': '##sto', 'score': 0.48457035422325134, 'entity': 'I-ORG', 'index': 48, 'start': 222, 'end': 225}, {'word': '##gne', 'score': 0.7429560422897339, 'entity': 'I-ORG', 'index': 49, 'start': 225, 'end': 228}, {'word': 'Lie', 'score': 0.4995739459991455, 'entity': 'B-ORG', 'index': 51, 'start': 229, 'end': 232}, {'word': 'Bernard', 'score': 0.9997014403343201, 'entity': 'B-PER', 'index': 80, 'start': 340, 'end': 347}, {'word': 'Hi', 'score': 0.9997100234031677, 'entity': 'I-PER', 'index': 81, 'start': 348, 'end': 350}, {'word': '##na', 'score': 0.9567268490791321, 'entity': 'I-PER', 'index': 82, 'start': 350, 'end': 352}, {'word': '##ult', 'score': 0.9372967481613159, 'entit

In [16]:
# convert ner_results to dataframe
df = pd.DataFrame(ner_results)
df.head()

Unnamed: 0,word,score,entity,index,start,end
0,Lie,0.630637,B-ORG,44,214,217
1,##ge,0.278949,I-ORG,45,217,219
2,Ba,0.768788,B-ORG,47,220,222
3,##sto,0.48457,I-ORG,48,222,225
4,##gne,0.742956,I-ORG,49,225,228


In [60]:
# get the ORG entities
locs = df[df['entity'].str.contains('-ORG')].copy()
orgs = orgs[['word', 'entity']]

# get the index of each B-ORG
begin_index = orgs[orgs['entity'] == 'B-ORG'].index.tolist()
# print(begin_index)
# create a list to hold the words
org_words = []

for i in range(len(begin_index)):
    if i < len(begin_index) - 1:
        word_parts_list = orgs.loc[begin_index[i]: begin_index[i+1]-1, 'word'].tolist()
        word = [''.join([x.strip('##') for x in word_parts_list])]
        org_words.append(word[0])
    else:
        break
print(org_words)
    

['Liege', 'Bastogne']


In [63]:
# get the ORG entities
locs = df[df['entity'].str.contains('-LOC')].copy()
locs = locs[['word', 'entity']]
print(locs)

# get the index of each B-ORG
begin_index = locs[locs['entity'] == 'B-LOC'].index.tolist()
# print(begin_index)
# create a list to hold the words
loc_words = []

for i in range(len(begin_index)):
    if i < len(begin_index) - 1:
        word_parts_list = locs.loc[begin_index[i]: begin_index[i+1]-1, 'word'].tolist()
        word = [''.join([x.strip('##') for x in word_parts_list])]
        loc_words.append(word[0])
    else:
        break
print(loc_words)

       word entity
10        A  B-LOC
11   ##rden  B-LOC
12    ##nes  I-LOC
19      Lie  B-LOC
21       Ba  B-LOC
22    ##sto  I-LOC
23    ##gne  I-LOC
24      Lie  B-LOC
25     ##ge  I-LOC
34      Lie  B-LOC
35     ##ge  B-LOC
36       Ba  B-LOC
37    ##sto  I-LOC
38    ##gne  I-LOC
39      Lie  B-LOC
40     ##ge  I-LOC
41  Belgium  B-LOC
['A', 'rdennes', 'Lie', 'Bastogne', 'Liege', 'Lie', 'ge', 'Bastogne', 'Liege']


In [64]:
# get the ORG entities
pers = df[df['entity'].str.contains('-PER')].copy()
pers = pers[['word', 'entity']]
print(pers)

# get the index of each B-ORG
begin_index = pers[pers['entity'] == 'B-PER'].index.tolist()
# print(begin_index)
# create a list to hold the words
per_words = []

for i in range(len(begin_index)):
    if i < len(begin_index) - 1:
        word_parts_list = pers.loc[begin_index[i]: begin_index[i+1]-1, 'word'].tolist()
        word = [''.join([x.strip('##') for x in word_parts_list])]
        per_words.append(word[0])
    else:
        break
print(per_words)

       word entity
6   Bernard  B-PER
7        Hi  I-PER
8      ##na  I-PER
9     ##ult  I-PER
17       Hi  B-PER
18     ##na  I-PER
26       Hi  B-PER
27     ##na  I-PER
28    ##ult  I-PER
['BernardHinault', 'Hina']


In [65]:
# get the ORG entities
misc = df[df['entity'].str.contains('-MISC')].copy()
misc = misc[['word', 'entity']]
print(misc)

# get the index of each B-ORG
begin_index = misc[misc['entity'] == 'B-MISC'].index.tolist()
# print(begin_index)
# create a list to hold the words
misc_words = []

for i in range(len(begin_index)):
    if i < len(begin_index) - 1:
        word_parts_list = misc.loc[begin_index[i]: begin_index[i+1]-1, 'word'].tolist()
        word = [''.join([x.strip('##') for x in word_parts_list])]
        misc_words.append(word[0])
    else:
        break
print(misc_words)

        word  entity
13        La  B-MISC
14        Do  I-MISC
15      ##ye  I-MISC
16     ##nne  I-MISC
29      Tour  B-MISC
30        de  I-MISC
31    France  I-MISC
32      Tour  B-MISC
33  Monument  B-MISC
['LaDoyenne', 'TourdeFrance', 'Tour']


In [66]:
# save the model 
model.save_pretrained("../src/models/dslim/bert-base-NER")
# # save the tokenizer
tokenizer.save_pretrained("../src/models/dslim/bert-base-NER")


('../src/models/dslim/bert-base-NER\\tokenizer_config.json',
 '../src/models/dslim/bert-base-NER\\special_tokens_map.json',
 '../src/models/dslim/bert-base-NER\\vocab.txt',
 '../src/models/dslim/bert-base-NER\\added_tokens.json',
 '../src/models/dslim/bert-base-NER\\tokenizer.json')