# Named Entity Recognition

## Instalação, importação e pré-processamento

In [None]:
!python -m spacy download en_core_web_lg
#https://spacy.io/models/en

In [1]:
import en_core_web_lg
import pandas as pd
import re
import random
import tqdm
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding
import warnings
from ipywidgets import FloatProgress
from IPython.display import display

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
food_df = pd.read_csv("/content/gdrive/My Drive/UPF/Trabalho IA/food.csv")
articles = pd.read_csv("/content/gdrive/My Drive/UPF/Trabalho IA/articles_ner.csv")

print(food_df.shape)
print(articles.shape)

(354565, 5)
(11992, 1)


In [3]:
food_templates = [
    "I ate my {sub_to_tag}",
    "I'm eating a {sub_to_tag}",
    "I just ate a {sub_to_tag}",
    "I only ate the {sub_to_tag}",
    "I'm done eating a {sub_to_tag}",
    "I've already eaten a {sub_to_tag}",
    "I just finished my {sub_to_tag}",
    "When I was having lunch I ate a {sub_to_tag}",
    "I had a {sub_to_tag} and a {sub_to_tag} today",
    "I ate a {sub_to_tag} and a {sub_to_tag} for lunch",
    "I made a {sub_to_tag} and {sub_to_tag} for lunch",
    "I ate {sub_to_tag} and {sub_to_tag}",
    "today I ate a {sub_to_tag} and a {sub_to_tag} for lunch",
    "I had {sub_to_tag} with my husband last night",
    "I brought you some {sub_to_tag} on my birthday",
    "I made {sub_to_tag} for yesterday's dinner",
    "last night, a {sub_to_tag} was sent to me with {sub_to_tag}",
    "I had {sub_to_tag} yesterday and I'd like to eat it anyway",
    "I ate a couple of {sub_to_tag} last night",
    "I had some {sub_to_tag} at dinner last night",
    "Last night, I ordered some {sub_to_tag}",
    "I made a {sub_to_tag} last night",
    "I had a bowl of {sub_to_tag} with {sub_to_tag} and I wanted to go to the mall today",
    "I brought a basket of {sub_to_tag} for breakfast this morning",
    "I had a bowl of {sub_to_tag}",
    "I ate a {sub_to_tag} with {sub_to_tag} in the morning",
    "I made a bowl of {sub_to_tag} for my breakfast",
    "There's {sub_to_tag} for breakfast in the bowl this morning",
    "This morning, I made a bowl of {sub_to_tag}",
    "I decided to have some {sub_to_tag} as a little bonus",
    "I decided to enjoy some {sub_to_tag}",
    "I've decided to have some {sub_to_tag} for dessert",
    "I had a {sub_to_tag}, a {sub_to_tag} and {sub_to_tag} at home",
    "I took a {sub_to_tag}, {sub_to_tag} and {sub_to_tag} on the weekend",
    "I ate a {sub_to_tag} with {sub_to_tag} and {sub_to_tag} just now",
    "Last night, I ate an {sub_to_tag} with {sub_to_tag} and {sub_to_tag}",
    "I tasted some {sub_to_tag}, {sub_to_tag} and {sub_to_tag} at the office",
    "There's a basket of {sub_to_tag}, {sub_to_tag} and {sub_to_tag} that I consumed",
    "I devoured a {sub_to_tag}, {sub_to_tag} and {sub_to_tag}",
    "I've already had a bag of {sub_to_tag}, {sub_to_tag} and {sub_to_tag} from the fridge"
]

In [4]:
foods = food_df[food_df["description"].str.contains("[^a-zA-Z ]") == False]["description"].apply(lambda food: food.lower())
foods = foods[foods.str.split().apply(len) <= 3].drop_duplicates()
foods.size

37596

In [5]:
len(food_templates)

40

In [6]:
#deixar na mesma proporção os dados

list_entities = []

for _ in range(2000):
  for qnt_ent in range(1,4):
    sentence = food_templates[random.randint(0,len(food_templates)-1)]

    while len(re.findall('{sub_to_tag}', sentence)) != qnt_ent:
      sentence = food_templates[random.randint(0,len(food_templates)-1)]
      continue
    matches = re.findall('{sub_to_tag}', sentence)

    entities = []
    for match in matches:
      food = foods.iloc[random.randint(0,len(foods)-1)]
      sentence = sentence.replace(match, food, 1)
      m = re.search(food, sentence).span()
      entities.append((m[0], m[1], 'FOOD'))
    list_entities.append((sentence, {'entities':entities}))

In [7]:
len(list_entities)

6000

In [8]:
list_entities[0]

('I ate a couple of spicy chicken nuggets last night',
 {'entities': [(18, 39, 'FOOD')]})

In [9]:
inicio_tag = list_entities[0][1]['entities'][0][0]
fim_tag = list_entities[0][1]['entities'][0][1]
print("Índice do início da tag {} e fim {} da tag food".format(inicio_tag, fim_tag))
list_entities[0][0][inicio_tag:fim_tag]

Índice do início da tag 18 e fim 39 da tag food


'spicy chicken nuggets'

In [10]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer
scorer = Scorer()

def scoreNER(nlp, TEST_DATA):
  random.shuffle(TEST_DATA)
  for text, annot in TEST_DATA:
      doc_gold_text = nlp.make_doc(text)
      gold = GoldParse(doc_gold_text, entities=annot['entities'])
      pred_value = nlp(text)
      scorer.score(pred_value, gold)
  print("Scores: ",scorer.scores['ents_per_type'])

## Modelo 1

In [166]:
model_spacy = 'en_core_web_lg'
nlp = spacy.load(model_spacy) 

In [167]:
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:ner = nlp.get_pipe("ner")

In [168]:
TRAIN_DATA = list_entities[:int(0.75*len(list_entities))]
TEST_DATA = list_entities[int(0.75*len(list_entities)):]

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
      ner.add_label(ent[2])

In [169]:
len(TRAIN_DATA)

4500

In [170]:
len(TEST_DATA)

1500

In [172]:
epochs = 30

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):
    if model_spacy is None:
        nlp.begin_training()
    for itn in tqdm.tqdm(range(epochs)):
        random.shuffle(TRAIN_DATA)
        losses = {}
        size = compounding(4.0, 32.0, 1.001)
        batches = minibatch(TRAIN_DATA, size=size)
        for batch in batches:
            texts, annotations = zip(*batch)
            try:
              nlp.update(texts,annotations,drop=0.3,losses=losses)
            except:
              #print("ERRO:", text, annotations)
              continue
        print(itn," Losses", losses, end="\n")
        scoreNER(nlp, TEST_DATA)

  0%|          | 0/30 [00:00<?, ?it/s]

0  Losses {'ner': 30093.79447613284}


  3%|▎         | 1/30 [00:35<17:23, 35.99s/it]

Scores:  {'FOOD': {'p': 98.38220424671385, 'r': 97.3, 'f': 97.83810960281548}}
1  Losses {'ner': 26930.220629180316}


  7%|▋         | 2/30 [01:15<17:51, 38.27s/it]

Scores:  {'FOOD': {'p': 98.75546585940128, 'r': 97.86666666666667, 'f': 98.30905742507953}}
2  Losses {'ner': 26362.011418125883}


 10%|█         | 3/30 [02:01<18:38, 41.42s/it]

Scores:  {'FOOD': {'p': 98.93866607083007, 'r': 98.4, 'f': 98.66859784970197}}
3  Losses {'ner': 26356.193910353875}


 13%|█▎        | 4/30 [02:49<19:09, 44.20s/it]

Scores:  {'FOOD': {'p': 98.97148591019315, 'r': 98.63333333333333, 'f': 98.80212028882674}}
4  Losses {'ner': 26361.671791522385}


 17%|█▋        | 5/30 [03:37<18:59, 45.56s/it]

Scores:  {'FOOD': {'p': 99.07091771940378, 'r': 98.81333333333333, 'f': 98.94195787857548}}
5  Losses {'ner': 26199.277287281875}


 20%|██        | 6/30 [04:25<18:32, 46.34s/it]

Scores:  {'FOOD': {'p': 99.13154818237489, 'r': 98.92777777777778, 'f': 99.0295581569947}}
6  Losses {'ner': 26284.831726262455}


 23%|██▎       | 7/30 [05:12<17:54, 46.72s/it]

Scores:  {'FOOD': {'p': 99.24618320610688, 'r': 99.05714285714285, 'f': 99.15157292659677}}
7  Losses {'ner': 25885.97740693041}


 27%|██▋       | 8/30 [06:00<17:13, 46.97s/it]

Scores:  {'FOOD': {'p': 99.3279906503047, 'r': 99.15416666666667, 'f': 99.24100254389256}}
8  Losses {'ner': 25940.549682877143}


 30%|███       | 9/30 [06:47<16:27, 47.01s/it]

Scores:  {'FOOD': {'p': 99.39902062620567, 'r': 99.23703703703704, 'f': 99.31796278449106}}
9  Losses {'ner': 26018.762007813464}


 33%|███▎      | 10/30 [07:34<15:40, 47.03s/it]

Scores:  {'FOOD': {'p': 99.45247554502053, 'r': 99.29666666666667, 'f': 99.37451003285908}}
10  Losses {'ner': 25870.374924503238}


 37%|███▋      | 11/30 [08:26<15:24, 48.67s/it]

Scores:  {'FOOD': {'p': 99.49018298789186, 'r': 99.34848484848486, 'f': 99.4192834291086}}
11  Losses {'ner': 26048.68241537668}


 40%|████      | 12/30 [09:13<14:24, 48.04s/it]

Scores:  {'FOOD': {'p': 99.52716045948877, 'r': 99.39722222222223, 'f': 99.46214890275597}}
12  Losses {'ner': 25996.093058214}


 43%|████▎     | 13/30 [10:00<13:29, 47.59s/it]

Scores:  {'FOOD': {'p': 99.54815024004519, 'r': 99.42307692307692, 'f': 99.4855742710164}}
13  Losses {'ner': 26021.383717063924}


 47%|████▋     | 14/30 [10:46<12:36, 47.29s/it]

Scores:  {'FOOD': {'p': 99.56375598941571, 'r': 99.44285714285715, 'f': 99.50326984240432}}
14  Losses {'ner': 26045.926315679084}


 50%|█████     | 15/30 [11:33<11:45, 47.07s/it]

Scores:  {'FOOD': {'p': 99.5884132775652, 'r': 99.47333333333333, 'f': 99.53084004091252}}
15  Losses {'ner': 25819.267373969753}


 53%|█████▎    | 16/30 [12:19<10:56, 46.89s/it]

Scores:  {'FOOD': {'p': 99.60996975701323, 'r': 99.49583333333332, 'f': 99.55286883110115}}
16  Losses {'ner': 25874.6600422709}


 57%|█████▋    | 17/30 [13:08<10:18, 47.56s/it]

Scores:  {'FOOD': {'p': 99.62900439698493, 'r': 99.51960784313727, 'f': 99.57427607313821}}
17  Losses {'ner': 25871.822857204126}


 60%|██████    | 18/30 [13:55<09:27, 47.32s/it]

Scores:  {'FOOD': {'p': 99.640361121925, 'r': 99.53518518518518, 'f': 99.58774538413792}}
18  Losses {'ner': 25830.972538574133}


 63%|██████▎   | 19/30 [14:42<08:38, 47.18s/it]

Scores:  {'FOOD': {'p': 99.65755228913123, 'r': 99.5578947368421, 'f': 99.60769858613517}}
19  Losses {'ner': 25769.186318012944}


 67%|██████▋   | 20/30 [15:29<07:51, 47.18s/it]

Scores:  {'FOOD': {'p': 99.664681447375, 'r': 99.57000000000001, 'f': 99.61731822615741}}
20  Losses {'ner': 25683.07876392507}


 70%|███████   | 21/30 [16:16<07:04, 47.21s/it]

Scores:  {'FOOD': {'p': 99.68065838390899, 'r': 99.58888888888889, 'f': 99.6347525051214}}
21  Losses {'ner': 25646.252095404863}


 73%|███████▎  | 22/30 [17:04<06:17, 47.24s/it]

Scores:  {'FOOD': {'p': 99.69063253514506, 'r': 99.60151515151516, 'f': 99.64605391803913}}
22  Losses {'ner': 25713.45350954862}


 77%|███████▋  | 23/30 [17:51<05:30, 47.27s/it]

Scores:  {'FOOD': {'p': 99.70118510567313, 'r': 99.61304347826086, 'f': 99.65709480277513}}
23  Losses {'ner': 25956.044627837662}


 80%|████████  | 24/30 [18:40<04:46, 47.75s/it]

Scores:  {'FOOD': {'p': 99.71225221718703, 'r': 99.62638888888888, 'f': 99.66930206060943}}
24  Losses {'ner': 25792.726832060187}


 83%|████████▎ | 25/30 [19:28<03:59, 47.96s/it]

Scores:  {'FOOD': {'p': 99.71976086260275, 'r': 99.63466666666667, 'f': 99.67719560345748}}
25  Losses {'ner': 25812.834538070456}


 87%|████████▋ | 26/30 [20:18<03:13, 48.47s/it]

Scores:  {'FOOD': {'p': 99.72926156412395, 'r': 99.64615384615385, 'f': 99.68769038381376}}
26  Losses {'ner': 25772.441950808054}


 90%|█████████ | 27/30 [21:09<02:27, 49.17s/it]

Scores:  {'FOOD': {'p': 99.73558375446356, 'r': 99.6530864197531, 'f': 99.69431802041585}}
27  Losses {'ner': 25666.00444838591}


 93%|█████████▎| 28/30 [22:01<01:39, 49.95s/it]

Scores:  {'FOOD': {'p': 99.74384330358738, 'r': 99.66428571428571, 'f': 99.70404863844509}}
28  Losses {'ner': 25601.437996828463}


 97%|█████████▋| 29/30 [22:53<00:50, 50.67s/it]

Scores:  {'FOOD': {'p': 99.75038247846042, 'r': 99.6735632183908, 'f': 99.71195805281344}}
29  Losses {'ner': 25698.824056032114}


100%|██████████| 30/30 [23:45<00:00, 47.53s/it]

Scores:  {'FOOD': {'p': 99.75759462705155, 'r': 99.68222222222222, 'f': 99.71989418224663}}





In [173]:
nlp.to_disk('/content/gdrive/My Drive/UPF/Trabalho IA/model_ner')
#nlp = spacy.load('/content/gdrive/My Drive/UPF/Trabalho IA/model_ner')

In [174]:
f = FloatProgress(min=0, max=len(TEST_DATA))
display(f)

count = 0
label_correct = 0
total_prediction = 0

print("Total:", len(TEST_DATA))
for text, annotations in (TEST_DATA):
    doc = nlp(text)
    f.value += 1

    predicted = [(ent.text, ent.label_) for ent in doc.ents]
    for p in predicted:
      total_prediction +=1
      if p[1] == 'FOOD' and p[0] in foods.iloc:
        label_correct +=1

    if count < 20:
        displacy.render(nlp(text), jupyter=True, style='ent')
    count += 1

print("------------\n\nTotal de {} acertos de um total de {}".format(label_correct, total_prediction ))

FloatProgress(value=0.0, max=1500.0)

Total: 1500


------------

Total de 2995 acertos de um total de 2997


In [175]:
#Eu comi um hambúrguer com batatas fritas no almoço hoje
displacy.render(nlp("I had a hamburger and chips for lunch today."), style="ent", jupyter=True)
#Decidi tomar um sorvete de chocolate como uma recompensa para mim
displacy.render(nlp("I decided to have chocolate ice cream as a little treat for myself."), style="ent", jupyter=True)
#eu gosto de sorvete com cobertura de chocolate
displacy.render(nlp("eu gosto de sorvete com cobertura de chocolate"), style="ent", jupyter=True)
#ontem fui na UPF e comi uma pizza. No intervalo da aula, comprei um pastel
displacy.render(nlp('yesterday I went to the UPF and had a pizza. In between class, I bought a pastel'), style="ent", jupyter=True)
#Fui no supermercado e comprei massa para fazer macarrão
displacy.render(nlp("I went to the supermarket and bought pasta to make pasta"), style="ent", jupyter=True)

## Modelo 2

In [11]:
model_spacy = 'en_core_web_lg'
nlp = spacy.load(model_spacy)

In [12]:
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
  ner = nlp.get_pipe("ner")

In [13]:
print(articles["Article"].shape)

(11992,)


In [14]:
revision_texts = []

for doc in nlp.pipe(articles["Article"][:4000], disable=["tagger", "ner"]):
    for sentence in doc.sents:
        revision_texts.append(" ".join(re.split("\s+", sentence.text)))

In [15]:
print(revision_texts[1])
articles["Article"].iloc[0]

And in that sense, this year shows little sign of ending on Dec. 31.


'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this administration’s ineffective foreign policy that has left America weaker in the eyes of the world.” Other GOP leaders sounded much the same theme. ”[We have] been urging President Obama for years to take strong action to deter Russia’s worldwide aggression, including its   operations,” wrote Rep. Devin Nunes,  . chairman of the House Intelligence Committee. ”Now with just a few weeks left in office, the president has suddenly decided that some stronger measures are indeed warranted.” Appearing 

In [16]:
list_texts_and_entities = []

for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    if len(doc.ents) > 0:
        list_texts_and_entities.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [17]:
print(len(list_texts_and_entities))
list_texts_and_entities[0]

112460


('In the Washington of 2016, even when the policy can be bipartisan, the politics cannot.',
 {'entities': [(7, 17, 'GPE'), (21, 25, 'DATE')]})

In [18]:
list_texts_and_entities = random.sample(list_texts_and_entities, 15000)

In [19]:
len(list_texts_and_entities)

15000

In [20]:
random.shuffle(list_entities)
random.shuffle(list_texts_and_entities)

list_entities_TRAIN = list_entities[0:int(len(list_entities)*0.75)]
list_entities_TEST = list_entities[int(len(list_entities)*0.75):]

TRAIN_DATA = list_entities_TRAIN + list_texts_and_entities[0:int(len(list_texts_and_entities)*0.75)]
TEST_DATA = list_entities_TEST + list_texts_and_entities[int(len(list_texts_and_entities)*0.75):]

In [21]:
random.shuffle(TRAIN_DATA)

In [22]:
len(TRAIN_DATA)

15750

In [23]:
len(TEST_DATA)

5250

In [24]:
ner = nlp.get_pipe("ner")
ner.add_label("FOOD")

pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] #other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

epochs = 20
optimizer = nlp.resume_training()
with nlp.disable_pipes(*other_pipes):
    sizes = compounding(4.0, 32.0, 1.001)
    
    for epoch in range(epochs):
        examples = TRAIN_DATA
        random.shuffle(examples)
        batches = minibatch(examples, size=sizes)
        losses = {}
        
        for batch in batches:
            texts, annotations = zip(*batch)
            try:
              nlp.update(texts, annotations, sgd=optimizer, drop=0.3, losses=losses)
            except:continue

        print(epoch," Losses", losses, end="\n")
        random.shuffle(TEST_DATA)
        scoreNER(nlp, TEST_DATA[:500])

0  Losses {'ner': 160526.73461563885}
Scores:  {'MONEY': {'p': 100.0, 'r': 100.0, 'f': 100.0}, 'ORG': {'p': 85.64593301435407, 'r': 80.99547511312217, 'f': 83.25581395348837}, 'PERSON': {'p': 81.25, 'r': 94.41340782122904, 'f': 87.33850129198966}, 'GPE': {'p': 89.05109489051095, 'r': 91.72932330827066, 'f': 90.37037037037037}, 'FAC': {'p': 33.33333333333333, 'r': 11.11111111111111, 'f': 16.666666666666664}, 'TIME': {'p': 80.0, 'r': 66.66666666666666, 'f': 72.72727272727272}, 'LOC': {'p': 100.0, 'r': 75.0, 'f': 85.71428571428571}, 'FOOD': {'p': 94.13793103448276, 'r': 91.0, 'f': 92.54237288135593}, 'DATE': {'p': 88.49557522123894, 'r': 83.33333333333334, 'f': 85.83690987124464}, 'ORDINAL': {'p': 85.71428571428571, 'r': 100.0, 'f': 92.3076923076923}, 'WORK_OF_ART': {'p': 100.0, 'r': 20.0, 'f': 33.333333333333336}, 'NORP': {'p': 88.0, 'r': 97.77777777777777, 'f': 92.63157894736842}, 'CARDINAL': {'p': 85.0, 'r': 94.44444444444444, 'f': 89.47368421052632}, 'PERCENT': {'p': 100.0, 'r': 100.0

In [28]:
f = FloatProgress(min=0, max=len(TEST_DATA))
display(f)

count = 0
label_correct = 0
total_prediction = 0

print("Total:", len(TEST_DATA))
for text, annotations in (TEST_DATA):
    doc = nlp(text)
    f.value += 1

    predicted = [(ent.text,ent.label_) for ent in doc.ents]

    for p in predicted:
      if p[1] == 'FOOD':
        total_prediction +=1
      if p[1] == 'FOOD' and p[0] in foods.iloc:
        label_correct +=1

    if count < 50:
        displacy.render(nlp(text), jupyter=True, style='ent')
    count += 1

print("------------\n\nTotal de {} acertos de um total de {}".format(label_correct, total_prediction ))

FloatProgress(value=0.0, max=5250.0)

Total: 5250


  "__main__", mod_spec)


------------

Total de 2947 acertos de um total de 2970


In [26]:
displacy.render(nlp("I had a hamburger and chips for lunch today."), style="ent", jupyter=True)
#displacy.render(nlp("I decided to have chocolate ice cream as a little treat for myself."), style="ent", jupyter=True)
displacy.render(nlp("I ordered basmati rice, leaf spinach and cheese from Tesco yesterday"), style="ent", jupyter=True)
displacy.render(nlp("i like ice cream with cream. At night I like pasta"), style="ent", jupyter=True)
#ontem fui na UPF e comi uma pizza. No intervalo da aula, comprei um pastel
displacy.render(nlp('yesterday I went to the UPF and had a pizza. In between class, I bought a pastel'), style="ent", jupyter=True)
#Fui no supermercado e comprei massa para fazer macarrão
displacy.render(nlp("I went to the supermarket and bought pasta to make pasta"), style="ent", jupyter=True)

In [27]:
#nlp.meta["name"] = "ner_food"
nlp.to_disk("/content/gdrive/My Drive/UPF/Trabalho IA/model_ner3")