In [3]:
import torch
import json 
import glob
# import tqdm
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Choose the compute device up front: GPU when CUDA is available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the token-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [8]:
# NER pipeline built from the model, tokenizer and device defined in the cells above.
# aggregation_strategy: "first" works better than "simple" or the default.
nlp = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
    device=device,
)

In [9]:
# Small smoke test: run the NER pipeline on a multi-paragraph sample passage
# containing person names, place names and a hyphenated race name.
text = """The oversized snowflakes fell softly and silently, settling among the pines like a picturesque Christmas scene.
By the roadside, spectators in heavy winter coats watched team cars and motorbikes struggle up one of Liege-Bastogne-Liege's countless climbs, tyres spinning in the slush as they pursued one man on a bike.
It was April 1980 and Bernard Hinault, almost unrecognisable beneath a big red balaclava, slewed doggedly on, further into the lead, somehow remaining balanced on the two wheels beneath him.
He was under such physical strain that he would do himself permanent damage. Pushing his body to its very limit, he raced through the Ardennes in search of victory in the race known as 'La Doyenne' - the old lady.
So bad were the conditions that several of cycling's best riders collected their number from organisers and then never lined up.
After just 70km of the 244km one-day race, 110 of the 174 entrants were already holed up in a hotel by the finish line. Only 21 completed the course. Hinault suffered frostbite.
Rarely do you see such attrition in cycling, but Liege-Bastogne-Liege, which celebrates its 130th birthday on Sunday, has been making and breaking the toughest competitors for years.
Hinault was 25. He had already won the Tour de France twice and would go on to win it a further three times, an icon of his sport in the making. His total of five Tour victories remains a joint record.
But this was a different challenge - a long way from the searing heat and sunflowers of summer.
One of the five prestigious 'Monument' one-day races in cycling, Liege-Bastogne-Liege is celebrated by many for being the very antithesis of the Tour.
In the hills of east and south Belgium the peloton is stretched through thick, damp forest, over short, sharp climbs and across tricky, part-cobbled sections before landing back where it all began in Liege.
"[The race is] already hard, it's long, and when I won it was in very tough conditions, especially the snow," says Hinault, now aged 67.
"Yes, I considered quitting if the weather conditions persisted. We started having difficulties. It's difficult in Liege-Bastogne-Liege."
Hinault's account of one of his greatest triumphs is characteristically taciturn. Tough conditions is a severe understatement. And in the racing he didn't have it all his own way, either.
With around 91km to go, approaching the 500m Stockeu climb, Rudy Pevenage was two minutes 15 seconds ahead of Hinault and a small chasing group.
Pevenage was one of the hard men of the spring classics. He was a Belgian with a big lead, in conditions many locals would feel only a Belgian could master.
But even he did not finish a race that truly separated the men from the legends. 'Neige-Bastogne-Neige,' as it would be dubbed.
On the next climb, a 500m ascent of the Haute Levee, Hinault and a small number of fellow pursuers caught up with Pevenage. Then Hinault launched his attack, bright red balaclava and thick blue gloves disappearing into the distance as his stunning acceleration left everybody behind.
There were still 80km to go.
"""
# Run the pipeline over the whole passage in one call. As the printed output
# shows, each result is a dict with 'entity_group', 'score', 'word', 'start'
# and 'end' keys.
ner_results = nlp(text)
# print() emits the raw list of dicts as plain text.
print(ner_results)

[{'entity_group': 'ORG', 'score': 0.6306363, 'word': 'Liege', 'start': 214, 'end': 219}, {'entity_group': 'ORG', 'score': 0.768787, 'word': 'Bastogne', 'start': 220, 'end': 228}, {'entity_group': 'ORG', 'score': 0.49957347, 'word': 'Liege', 'start': 229, 'end': 234}, {'entity_group': 'PER', 'score': 0.9997057, 'word': 'Bernard Hinault', 'start': 340, 'end': 355}, {'entity_group': 'LOC', 'score': 0.63819337, 'word': 'Ardennes', 'start': 643, 'end': 651}, {'entity_group': 'MISC', 'score': 0.9159621, 'word': 'La Doyenne', 'start': 695, 'end': 705}, {'entity_group': 'PER', 'score': 0.99575186, 'word': 'Hinault', 'start': 1002, 'end': 1009}, {'entity_group': 'LOC', 'score': 0.54830253, 'word': 'Liege', 'start': 1079, 'end': 1084}, {'entity_group': 'LOC', 'score': 0.8442254, 'word': 'Bastogne', 'start': 1085, 'end': 1093}, {'entity_group': 'LOC', 'score': 0.91898054, 'word': 'Liege', 'start': 1094, 'end': 1099}, {'entity_group': 'PER', 'score': 0.9993431, 'word': 'Hinault', 'start': 1213, 'e

In [10]:
# Tabulate the entity dicts returned by the pipeline: one row per entity,
# one column per dict key.
df = pd.DataFrame(data=ner_results)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,entity_group,score,word,start,end
0,ORG,0.630636,Liege,214,219
1,ORG,0.768787,Bastogne,220,228
2,ORG,0.499573,Liege,229,234
3,PER,0.999706,Bernard Hinault,340,355
4,LOC,0.638193,Ardennes,643,651


In [10]:
# Save the model and the tokenizer into the same local directory.
ner_save_dir = "../models/dslim/bert-base-NER"
model.save_pretrained(ner_save_dir)
# The tokenizer call stays last so its return value (the tuple of written
# file paths) is what the cell displays.
tokenizer.save_pretrained(ner_save_dir)


('../models/dslim/bert-base-NER\\tokenizer_config.json',
 '../models/dslim/bert-base-NER\\special_tokens_map.json',
 '../models/dslim/bert-base-NER\\vocab.txt',
 '../models/dslim/bert-base-NER\\added_tokens.json',
 '../models/dslim/bert-base-NER\\tokenizer.json')