# **NAMED ENTITY RECOGNITION (NER)**

In [1]:
%pip install -q -r requirements.txt
!python -m spacy download es_core_news_sm
!python -m spacy download nl_core_news_sm

Note: you may need to restart the kernel to use updated packages.
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
     ---------------------------------------- 0.0/12.9 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.9 MB 1.3 MB/s eta 0:00:11
      --------------------------------------- 0.2/12.9 MB 3.4 MB/s eta 0:00:04
     - -------------------------------------- 0.6/12.9 MB 5.8 MB/s eta 0:00:03
     --- ------------------------------------ 1.0/12.9 MB 7.3 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/12.9 MB 7.3 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/12.9 MB 7.3 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/12.9 MB 7.3 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/12.9 MB 7.3 MB/s eta 0:00:02
     --- ------------------------------------ 1.0/1

In [2]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Fetch the CoNLL-2002 corpus through NLTK and expose its corpus reader.
nltk.download('conll2002')
from nltk.corpus import conll2002

# Spanish splits. The sample shown further down (train_esp[2]) confirms each
# sentence is a list of (token, POS tag, IOB entity tag) triples.
train_esp = conll2002.iob_sents('esp.train') # Training split
val_esp = conll2002.iob_sents('esp.testa') # Validation split
test_esp = conll2002.iob_sents('esp.testb') # Test split
# Dutch splits, read with the same iob_sents() call.
train_ned = conll2002.iob_sents('ned.train') # Training split
val_ned = conll2002.iob_sents('ned.testa') # Validation split
test_ned = conll2002.iob_sents('ned.testb') # Test split

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [14]:
# Display one raw Spanish training sentence to check the (token, POS, IOB tag) layout.
train_esp[2]

[('El', 'DA', 'O'),
 ('Abogado', 'NC', 'B-PER'),
 ('General', 'AQ', 'I-PER'),
 ('del', 'SP', 'I-PER'),
 ('Estado', 'NC', 'I-PER'),
 (',', 'Fc', 'O'),
 ('Daryl', 'VMI', 'B-PER'),
 ('Williams', 'NC', 'I-PER'),
 (',', 'Fc', 'O'),
 ('subrayó', 'VMI', 'O'),
 ('hoy', 'RG', 'O'),
 ('la', 'DA', 'O'),
 ('necesidad', 'NC', 'O'),
 ('de', 'SP', 'O'),
 ('tomar', 'VMN', 'O'),
 ('medidas', 'NC', 'O'),
 ('para', 'SP', 'O'),
 ('proteger', 'VMN', 'O'),
 ('al', 'SP', 'O'),
 ('sistema', 'NC', 'O'),
 ('judicial', 'AQ', 'O'),
 ('australiano', 'AQ', 'O'),
 ('frente', 'RG', 'O'),
 ('a', 'SP', 'O'),
 ('una', 'DI', 'O'),
 ('página', 'NC', 'O'),
 ('de', 'SP', 'O'),
 ('internet', 'NC', 'O'),
 ('que', 'PR', 'O'),
 ('imposibilita', 'VMI', 'O'),
 ('el', 'DA', 'O'),
 ('cumplimiento', 'NC', 'O'),
 ('de', 'SP', 'O'),
 ('los', 'DA', 'O'),
 ('principios', 'NC', 'O'),
 ('básicos', 'AQ', 'O'),
 ('de', 'SP', 'O'),
 ('la', 'DA', 'O'),
 ('Ley', 'NC', 'B-MISC'),
 ('.', 'Fp', 'O')]

In [4]:
from mycrftagger_class import MyCRFTagger

In [5]:
# One tagger per language, built from the project-local MyCRFTagger class.
# NOTE(review): what `language` and `verbose` change inside MyCRFTagger is not
# visible in this notebook — confirm in mycrftagger_class.
tagger_esp = MyCRFTagger(verbose=True, language="esp")
tagger_ned = MyCRFTagger(verbose=True, language="ned")

In [6]:
def get_tuples(X : list, method: str = 'bio') -> list[list[tuple[str, str]]]:
    """
    Convert sentences of (token, POS, tag) items into (token, tag) pairs,
    optionally re-encoding the entity tags under a different scheme.

    Parameters
    ----------
    X : list
        Dataset: a sequence of sentences. Each sentence is a sequence of items
        whose index 0 is the token and index 2 is its BIO tag; index 1 is ignored.
    method : str
        Tagging scheme (case-insensitive). Options: 'bio', 'biow', 'io'. Default: 'bio'.

        - 'bio'  : tags are copied unchanged.
        - 'biow' : a 'B-X' tag that is NOT followed by an 'I-' tag (i.e. a
          single-token entity, including one at the end of the sentence or one
          followed directly by another 'B-' tag) becomes 'W-X'.
        - 'io'   : every 'B-X' tag becomes 'I-X'.

    Returns
    -------
    list
        One list per input sentence, each containing (token, tag) tuples.

    Raises
    ------
    AssertionError
        If `method` is not one of the supported schemes.
    """
    # Normalise once instead of lower-casing for every token.
    scheme = method.lower()
    assert scheme in ['bio', 'biow', 'io'], "Method not valid, options: 'bio', 'biow', 'io'"

    new_X = []
    for sentence in X:
        tuple_sentence = []
        for idx, word in enumerate(sentence):
            token, tag = word[0], word[2]

            if scheme == 'biow' and tag.startswith('B'):
                # The entity has length 1 unless the next token continues it.
                # Checking for "next is I-" (rather than "next is O") also covers
                # a B-tag at the end of the sentence and two adjacent B-tags.
                has_next = (idx + 1) < len(sentence)
                continues = has_next and sentence[idx + 1][2].startswith('I')
                if not continues:
                    tag = f'W-{tag[2:]}'

            elif scheme == 'io' and tag.startswith('B'):
                # The IO scheme has no explicit begin marker.
                tag = f'I-{tag[2:]}'

            tuple_sentence.append((token, tag))

        new_X.append(tuple_sentence)

    return new_X

In [7]:
# Drop the POS column from the training splits; the default 'bio' scheme leaves tags unchanged.
train_esp_tuples = get_tuples(train_esp)
train_ned_tuples = get_tuples(train_ned)

In [8]:
# Same (token, tag) conversion for the test splits, used later as the gold standard.
test_esp_tuples = get_tuples(test_esp)
test_ned_tuples = get_tuples(test_ned)

In [9]:
# Display the first converted Dutch sentence to confirm the (token, tag) pair layout.
train_ned_tuples[0]

[('De', 'O'),
 ('tekst', 'O'),
 ('van', 'O'),
 ('het', 'O'),
 ('arrest', 'O'),
 ('is', 'O'),
 ('nog', 'O'),
 ('niet', 'O'),
 ('schriftelijk', 'O'),
 ('beschikbaar', 'O'),
 ('maar', 'O'),
 ('het', 'O'),
 ('bericht', 'O'),
 ('werd', 'O'),
 ('alvast', 'O'),
 ('bekendgemaakt', 'O'),
 ('door', 'O'),
 ('een', 'O'),
 ('communicatiebureau', 'O'),
 ('dat', 'O'),
 ('Floralux', 'B-ORG'),
 ('inhuurde', 'O'),
 ('.', 'O')]

In [10]:
# Train the Spanish tagger on the (token, tag) sentences; the second argument is the model file path.
tagger_esp.train(train_esp_tuples, "./esp_model.mdl")

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 139080
Seconds required: 0.588

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 272783.710241
Feature norm: 1.000000
Error norm: 247761.647729
Active features: 139080
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 0.334

***** Iteration #2 *****
Loss: 215908.150388
Feature norm: 3.874776
Error norm: 86174.121669
Active features: 139080
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.169

***** Iteration #3 *****
Loss: 99610.216443
Feature norm: 3.051262
Error norm: 26038.453125
Active features: 139080
Line search trials: 3
Line search step: 2.806804
Second

In [11]:
# Train the Dutch tagger on the (token, tag) sentences; the second argument is the model file path.
tagger_ned.train(train_ned_tuples, "./ned_model.mdl")

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 135415
Seconds required: 1.497

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 204583.042442
Feature norm: 1.000000
Error norm: 196761.398124
Active features: 135415
Line search trials: 1
Line search step: 0.000004
Seconds required for this iteration: 0.798

***** Iteration #2 *****
Loss: 139533.075188
Feature norm: 4.144392
Error norm: 49704.264401
Active features: 135415
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.510

***** Iteration #3 *****
Loss: 73671.713396
Feature norm: 3.100113
Error norm: 44898.182983
Active features: 135415
Line search trials: 2
Line search step: 5.000000
Second

In [12]:
# Score the Dutch tagger against the gold test tags.
# NOTE(review): presumably a per-token accuracy — confirm in MyCRFTagger.
tagger_ned.accuracy(test_ned_tuples)

0.9772050816696914

Codi del profe

In [13]:
# Rebind tagger_esp to a fresh instance; the object trained earlier is no longer
# reachable through this name.
tagger_esp = MyCRFTagger(verbose=True, language="esp")


In [14]:
# Point the fresh tagger at the same path that was passed to tagger_esp.train(...) earlier.
tagger_esp.set_model_file("./esp_model.mdl")

In [15]:

# Score the Spanish tagger against the gold test tags.
# NOTE(review): presumably a per-token accuracy — confirm in MyCRFTagger.
tagger_esp.accuracy(test_esp_tuples)

0.9709118428967846

In [16]:
# Strip the gold tags so that only the raw tokens of each test sentence are passed to the taggers.
test_data_es_tolist = [[token for token, _ in sentence] for sentence in test_esp_tuples]
test_data_nl_tolist = [[token for token, _ in sentence] for sentence in test_ned_tuples]


from typing import List, Tuple, Set, Any

def decode_entities(phrase: List[Tuple[Any, str]]) -> Set[Tuple[int, int, str]]:
    """
    Extract entity spans from a BIO-tagged sentence.

    Parameters
    ----------
    phrase : list of (token, label)
        Tagged sentence. Only the label is inspected.

    Returns
    -------
    set of (start, end, type)
        One triple per entity: `start` is the index of its 'B-' token, `end` is
        the exclusive index where it stops, `type` is the label text after the
        two-character prefix.

    Notes
    -----
    An entity is opened only by a label starting with 'B' and closed by the next
    'B…' label, an 'O' label, or the end of the sentence. Any other label
    (e.g. 'I-…') neither opens nor closes a span, whatever its type.
    """
    spans = set()
    span_start = -1
    span_type = None

    for position, (_, tag) in enumerate(phrase):
        opens_entity = tag[0] == "B"
        if not opens_entity and tag != "O":
            # Continuation-style labels leave the open span untouched.
            continue

        # A 'B' or 'O' label terminates whatever span is currently open.
        if span_type:
            spans.add((span_start, position, span_type))
            span_type = None

        if opens_entity:
            span_start, span_type = position, tag[2:]

    # Flush a span that runs to the end of the sentence.
    if span_type:
        spans.add((span_start, len(phrase), span_type))

    return spans



# Tag every Spanish test sentence, keeping corpus order so that tagged_es[i]
# can be compared with test_esp_tuples[i].
tagged_es = []
for sentence in test_data_es_tolist:
    tagged_sentence = tagger_esp.tag(sentence)
    tagged_es.append(tagged_sentence)

# Same for Dutch, using the Dutch tagger.
tagged_nl = []
for sentence in test_data_nl_tolist:
    tagged_sentence = tagger_ned.tag(sentence)
    tagged_nl.append(tagged_sentence)
    
    


In [21]:
def evaluate(
    gold: List[List[Tuple[Any, str]]],
    predicted: List[List[Tuple[Any, str]]],
    verbose: bool = True,
) -> Tuple[int, int, int, int]:
    """
    Count entity-level matches between gold and predicted taggings.

    Entities are extracted per sentence with `decode_entities`; two entities
    match only when start, end and type are all equal.

    Parameters
    ----------
    gold : list of sentences
        Reference sentences as lists of (token, label) pairs.
    predicted : list of sentences
        Predicted sentences in the same order and layout as `gold`.
        Sentences are paired positionally; if the two lists differ in length
        the surplus sentences are ignored.
    verbose : bool
        When True (default), print every sentence whose entity set differs
        from the gold one, followed by the token positions whose labels differ.
        Pass False to obtain only the counts.

    Returns
    -------
    tuple of int
        (tp, fn, fp, tot): entities found in both, only in gold, only in the
        prediction, and in the union of both, summed over all sentences.
    """
    tp = 0
    fn = 0
    fp = 0
    tot = 0
    for idx, (gold_sentence, predicted_sentence) in enumerate(zip(gold, predicted)):
        gold_entities = decode_entities(gold_sentence)
        predicted_entities = decode_entities(predicted_sentence)
        tp += len(gold_entities & predicted_entities)
        fn += len(gold_entities - predicted_entities)
        fp += len(predicted_entities - gold_entities)
        tot += len(gold_entities | predicted_entities)

        if verbose and gold_entities != predicted_entities:
            print("Sentence index:", idx)
            print("GOLD sentence: ", gold_sentence)
            print("PRED sentence: ", predicted_sentence)
            # Report the individual token positions whose labels disagree.
            for i, (gold_token, predicted_token) in enumerate(zip(gold_sentence, predicted_sentence)):
                if gold_token[1] != predicted_token[1]:
                    print(f"ERROR {i} --- Gold: {gold_token} Predicted: {predicted_token}")
            print()

    return tp, fn, fp, tot

# Entity-level counts per language: true positives, false negatives, false positives,
# and the size of the union of gold and predicted entities.
tp_es, fn_es, fp_es, tot_es = evaluate(test_esp_tuples, tagged_es)
tp_nl, fn_nl, fp_nl, tot_nl = evaluate(test_ned_tuples, tagged_nl)

Sentence index: 2
GOLD sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('Internet', 'B-MISC'), ('aumentaron', 'O'), ('en', 'O'), ('España', 'B-LOC'), ('un', 'O'), ('300', 'O'), ('por', 'O'), ('ciento', 'O'), ('en', 'O'), ('el', 'O'), ('primer', 'O'), ('trimestre', 'O'), ('de', 'O'), ('este', 'O'), ('año', 'O'), ('con', 'O'), ('respecto', 'O'), ('al', 'O'), ('mismo', 'O'), ('período', 'O'), ('de', 'O'), ('1999', 'O'), (',', 'O'), ('aseguró', 'O'), ('hoy', 'O'), ('Iñigo', 'B-PER'), ('García', 'I-PER'), ('Aranda', 'I-PER'), (',', 'O'), ('responsable', 'O'), ('de', 'O'), ('comunicación', 'O'), ('de', 'O'), ('Savia', 'B-ORG'), ('Amadeus', 'I-ORG'), ('.', 'O')]
PRED sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('I

In [20]:
def calculate_precision_recall_f1(tp, fn, fp):
    """
    Compute precision, recall and F1 from entity-level counts.

    Parameters
    ----------
    tp : int
        True positives.
    fn : int
        False negatives.
    fp : int
        False positives.

    Returns
    -------
    tuple of float
        (precision, recall, f1_score). A metric whose denominator is zero
        (no predictions, no gold entities, or precision + recall == 0) is
        reported as 0.0 instead of raising ZeroDivisionError.
    """
    predicted_total = tp + fp
    gold_total = tp + fn

    precision = tp / predicted_total if predicted_total else 0.0
    recall = tp / gold_total if gold_total else 0.0

    pr_sum = precision + recall
    f1_score = 2 * (precision * recall) / pr_sum if pr_sum else 0.0
    return precision, recall, f1_score

def _print_scores(heading, precision, recall, f1):
    """Print one language's heading followed by its three entity-level scores."""
    print(heading)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)


# Entity-level scores derived from the counts returned by evaluate().
precision_es, recall_es, f1_es = calculate_precision_recall_f1(tp_es, fn_es, fp_es)
precision_nl, recall_nl, f1_nl = calculate_precision_recall_f1(tp_nl, fn_nl, fp_nl)

_print_scores("Spanish:", precision_es, recall_es, f1_es)
print()
_print_scores("Dutch:", precision_nl, recall_nl, f1_nl)


Spanish:
Precision: 0.7896089066514416
Recall: 0.7774030354131535
F1-score: 0.7834584336496245

Dutch:
Precision: 0.7820069204152249
Recall: 0.7454960669880741
F1-score: 0.7633151467913745


In [None]:
def calculate_accuracy(tp, fn, fp, tot=None):
    """
    Compute accuracy from entity-level counts.

    Parameters
    ----------
    tp : int
        True positives.
    fn : int
        False negatives.
    fp : int
        False positives.
    tot : int, optional
        Total number of items considered. When omitted it defaults to
        tp + fn + fp, i.e. no true negatives.

    Returns
    -------
    float
        (tp + tn) / tot with tn = tot - tp - fn - fp, or 0.0 when `tot` is zero
        (nothing to score) instead of raising ZeroDivisionError.
    """
    if tot is None:
        tot = tp + fn + fp
    if tot == 0:
        return 0.0
    # Whatever is not a TP, FN or FP is counted as a true negative.
    return (tp + (tot - tp - fn - fp)) / tot

# calculate_accuracy takes the total as its fourth argument; pass the union
# counts returned by evaluate() (omitting them raised TypeError).
accuracy_es = calculate_accuracy(tp_es, fn_es, fp_es, tot_es)
accuracy_nl = calculate_accuracy(tp_nl, fn_nl, fp_nl, tot_nl)
