# **NAMED ENTITY RECOGNITION (NER)**

In [1]:
%pip install -q -r requirements.txt
!python -m spacy download es_core_news_sm
!python -m spacy download nl_core_news_sm

Note: you may need to restart the kernel to use updated packages.
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
     --------------------------------------- 0.0/12.9 MB 640.0 kB/s eta 0:00:21
     - -------------------------------------- 0.4/12.9 MB 4.4 MB/s eta 0:00:03
     ---- ----------------------------------- 1.3/12.9 MB 10.2 MB/s eta 0:00:02
     -------- ------------------------------- 2.7/12.9 MB 15.9 MB/s eta 0:00:01
     ------------ --------------------------- 4.0/12.9 MB 21.0 MB/s eta 0:00:01
     -------------- ------------------------- 4.5/12.9 MB 22.2 MB/s eta 0:00:01
     -------------- ------------------------- 4.5/12.9 MB 22.2 MB/s eta 0:00:01
     --------------- ------------------------ 5.0/12.9 MB 16.0 MB/s eta 0:00:01
     ----------------- ---------------------

In [2]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
nltk.download('conll2002')
from nltk.corpus import conll2002

# Spanish
train_esp = conll2002.iob_sents('esp.train') # Train
val_esp = conll2002.iob_sents('esp.testa') # Val
test_esp = conll2002.iob_sents('esp.testb') # Test
# Dutch
train_ned = conll2002.iob_sents('ned.train') # Train
val_ned = conll2002.iob_sents('ned.testa') # Val
test_ned = conll2002.iob_sents('ned.testb') # Test

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [4]:
from mycrftagger_class import MyCRFTagger

In [5]:
tagger_esp = MyCRFTagger(verbose=True, language="esp")
tagger_ned = MyCRFTagger(verbose=True, language="ned")

In [6]:
def get_tuples(X : list) -> list:
    new = []
    for s in X:
        t = []
        for w in s:
            t.append((w[0], w[2]))
        new.append(t)
    return new

In [7]:
train_esp_tuples = get_tuples(train_esp)
train_ned_tuples = get_tuples(train_ned)

In [8]:
test_esp_tuples = get_tuples(test_esp)
test_ned_tuples = get_tuples(test_ned)

In [9]:
train_ned_tuples[0]

[('De', 'O'),
 ('tekst', 'O'),
 ('van', 'O'),
 ('het', 'O'),
 ('arrest', 'O'),
 ('is', 'O'),
 ('nog', 'O'),
 ('niet', 'O'),
 ('schriftelijk', 'O'),
 ('beschikbaar', 'O'),
 ('maar', 'O'),
 ('het', 'O'),
 ('bericht', 'O'),
 ('werd', 'O'),
 ('alvast', 'O'),
 ('bekendgemaakt', 'O'),
 ('door', 'O'),
 ('een', 'O'),
 ('communicatiebureau', 'O'),
 ('dat', 'O'),
 ('Floralux', 'B-ORG'),
 ('inhuurde', 'O'),
 ('.', 'O')]

In [10]:
tagger_esp.train(train_esp_tuples, "./esp_model.mdl")

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 139071
Seconds required: 0.554

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 272777.481758
Feature norm: 1.000000
Error norm: 247755.629116
Active features: 139071
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 0.325

***** Iteration #2 *****
Loss: 221759.322364
Feature norm: 3.859125
Error norm: 80744.364647
Active features: 139071
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.159

***** Iteration #3 *****
Loss: 116117.792106
Feature norm: 2.938989
Error norm: 49164.234518
Active features: 139071
Line search trials: 3
Line search step: 2.476164
Secon

In [14]:
tagger_ned.train(train_ned_tuples, "./ned_model.mdl")

In [None]:
tagger_ned.accuracy(test_ned_tuples)

Codi del profe

In [11]:
tagger_esp = MyCRFTagger(verbose=True, language="esp")


In [12]:
tagger_esp.set_model_file("./esp_model.mdl")

In [13]:

tagger_esp.accuracy(test_esp_tuples)

0.9709894630625037

In [13]:
test_data_es_tolist = [[token for token, _ in sentence] for sentence in test_esp_tuples]
test_data_nl_tolist = [[token for token, _ in sentence] for sentence in test_ned_tuples]


from typing import List, Tuple, Set, Any

def decode_entities(phrase: List[Tuple[Any, str]]) -> Set[Tuple[int, int, str]]:
    first_idx = -1
    current_entity = None
    
    result = set()
    for i, (token, label) in enumerate(phrase):
        if label[0] == "B" or label == "O":
            if current_entity:
                result.add((first_idx, i, current_entity))
                current_entity = None
            if label[0] == "B":
                first_idx = i
                current_entity = label[2:]
    if current_entity:
        result.add((first_idx, len(phrase), current_entity))
    return result



tagged_es = []
for sentence in test_data_es_tolist:
    tagged_sentence = tagger_esp.tag(sentence)
    tagged_es.append(tagged_sentence)
    #decoded_es.append(decode_entities(tagged_sentence))

tagged_nl = []
for sentence in test_data_nl_tolist:
    tagged_sentence = tagger_ned.tag(sentence)
    tagged_nl.append(tagged_sentence)
    #decoded_nl.append(decode_entities(tagged_sentence))
    
    

    
def evaluate(gold: List[List[Tuple[Any, str]]], predicted: List[List[Tuple[Any, str]]]) -> Tuple[int, int, int]:
    tp = 0
    fn = 0
    fp = 0
    tot = 0
    for gold_sentence, predicted_sentence in zip(gold, predicted):
        #gold_sentence, predicted_sentence = eliminate_rest(gold_sentence), eliminate_rest(predicted_sentence)
        gold_entities = decode_entities(gold_sentence)
        predicted_entities = decode_entities(predicted_sentence)
        tp += len(gold_entities.intersection(predicted_entities))
        fn += len(gold_entities.difference(predicted_entities))
        fp += len(predicted_entities.difference(gold_entities))
        tot += len(gold_entities.union(predicted_entities))
        '''
        if gold_entities != predicted_entities:
            print("GOLD sentence: ", gold_sentence)
            print("PRED sentence: ", predicted_sentence)
            for i in range(len(gold_sentence)):
                if gold_sentence[i][1] != predicted_sentence[i][1]:
                    print(f"ERROR {i} --- Gold: {gold_sentence[i]} Predicted: {predicted_sentence[i]}")
        #'''
    print("TP: ", tp)
    print("FN: ", fn)
    print("FP: ", fp)
    print("TOT", tot)
    
    return tp, fn, fp



tp_es, fn_es, fp_es = evaluate(test_esp_tuples, tagged_es)
tp_nl, fn_nl, fp_nl = evaluate(test_ned_tuples, tagged_nl)

tp_es, fn_es, fp_es = tp_es / (tp_es + fn_es), tp_es / (tp_es + fp_es), 2 * tp_es / (2 * tp_es + fn_es + fp_es)
tp_nl, fn_nl, fp_nl = tp_nl / (tp_nl + fn_nl), tp_nl / (tp_nl + fp_nl), 2 * tp_nl / (2 * tp_nl + fn_nl + fp_nl)

tp_es_f1 = 2 * tp_es / (2 * tp_es + fn_es + fp_es)
tp_nl_f1 = 2 * tp_nl / (2 * tp_nl + fn_nl + fp_nl)

recall_es = tp_es / (tp_es + fn_es)
precision_es = tp_es / (tp_es + fp_es)

recall_nl = tp_nl / (tp_nl + fn_nl)
precision_nl = tp_nl / (tp_nl + fp_nl)

TP:  2519
FN:  1039
FP:  879
TOT 4437
TP:  2449
FN:  1492
FP:  1045
TOT 4986


In [14]:
2519 / 4437

0.5677259409510931

In [15]:
2449 / 4986

0.49117529081427996

In [16]:
precision_es

0.4943149516770893

In [17]:
precision_nl

0.485408369785206

In [18]:
tagged_text_esp = tagger_esp.tag(words_esp)
tagged_text_ned = tagger_ned.tag(words_ned)

NameError: name 'words_esp' is not defined