In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns
import spacy
import svgling
from nltk.corpus import conll2002

from complete_class import CompleteNER

In [2]:
# Make sure the CoNLL-2002 corpus is available locally (no-op when it is
# already up to date). `conll2002` is imported in the notebook's import cell;
# NLTK corpus objects are loaded lazily, so importing before downloading is safe.
nltk.download('conll2002')

# Spanish: sentences read through the IOB reader, one variable per split file.
train_esp = conll2002.iob_sents('esp.train')  # training split
val_esp = conll2002.iob_sents('esp.testa')    # validation (development) split
test_esp = conll2002.iob_sents('esp.testb')   # test split

# Dutch: same three split files as above.
train_ned = conll2002.iob_sents('ned.train')  # training split
val_ned = conll2002.iob_sents('ned.testa')    # validation (development) split
test_ned = conll2002.iob_sents('ned.testb')   # test split

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [4]:
# Build the Spanish NER object from the three CoNLL-2002 splits loaded above.
# NOTE(review): `postag=True` presumably enables use of the corpus POS tags —
# confirm against complete_class.CompleteNER.
spanish = CompleteNER(
    train_esp,
    val_esp,
    test_esp,
    language="esp",
    postag=True,
)

In [8]:
# Feature switches handed to `spanish.train(..., features_opt=features)`.
# Each key maps to a boolean flag; presumably True enables the corresponding
# feature in CompleteNER — confirm the exact meaning of each key in
# complete_class.
# Every key appears exactly once: a repeated key in a dict literal is silently
# overwritten by the last occurrence, which hides conflicting settings.
features = {
    'CAPITALIZATION': True,
    'HAS_UPPER': True,
    'HAS_NUM': True,
    'PUNCTUATION': True,
    'SUF': True,
    'PRE': True,
    'WORD': True,
    'LEN': True,
    'NEXT': True,
    'POS': True,
    'LEMMA': True,
    'CITY': True,
    'COMPANY': True,
    'CELEBRITY': True,
    'RESEARCH_ORGANIZATION': False,
    'NAME': True,
    'SURNAME': True,
    'PREV': True,
    'NUMBER': True,
    'GENDER': True,
    'PERSON': False,
    'PRONTYPE': False,
    'DEP': False,
    'HEAD_DISTANCE': False,
    'HEAD': False,
}

In [9]:
# Train with the feature switches defined in `features`; with verbose=True the
# call prints the optimizer's per-iteration training log.
# NOTE(review): `file` is presumably the path the trained model is written to —
# confirm in complete_class.
spanish.train(
    verbose=True,
    file="spanish.mdl",
    features_opt=features,
)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 139667
Seconds required: 0.640

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 289402.091375
Feature norm: 1.000000
Error norm: 239322.982809
Active features: 139667
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 0.387

***** Iteration #2 *****
Loss: 218065.917223
Feature norm: 4.321902
Error norm: 76173.000911
Active features: 139667
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.172

***** Iteration #3 *****
Loss: 100745.287373
Feature norm: 3.363817
Error norm: 22015.014658
Active features: 139667
Line search trials: 3
Line search step: 2.824055
Secon

In [10]:
# Run the model's test routine; with verb=True it prints GOLD vs. PRED
# sentences, and the call's return value (a tuple ending in a 5x5 array) is
# displayed as the cell output.
# NOTE(review): the meaning of each tuple element is not visible here —
# presumably evaluation scores followed by a confusion matrix; confirm in
# complete_class.
spanish.test(verb=True)

Sentence index: 2
GOLD sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('Internet', 'B-MISC'), ('aumentaron', 'O'), ('en', 'O'), ('España', 'B-LOC'), ('un', 'O'), ('300', 'O'), ('por', 'O'), ('ciento', 'O'), ('en', 'O'), ('el', 'O'), ('primer', 'O'), ('trimestre', 'O'), ('de', 'O'), ('este', 'O'), ('año', 'O'), ('con', 'O'), ('respecto', 'O'), ('al', 'O'), ('mismo', 'O'), ('período', 'O'), ('de', 'O'), ('1999', 'O'), (',', 'O'), ('aseguró', 'O'), ('hoy', 'O'), ('Iñigo', 'B-PER'), ('García', 'I-PER'), ('Aranda', 'I-PER'), (',', 'O'), ('responsable', 'O'), ('de', 'O'), ('comunicación', 'O'), ('de', 'O'), ('Savia', 'B-ORG'), ('Amadeus', 'I-ORG'), ('.', 'O')]
PRED sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('I

(0.78951865565366,
 0.7790893760539629,
 0.7842693450275852,
 1504,
 0.9706595773581976,
 array([[   0.,    0.,    0.,    0.,    0.],
        [  67., 1043.,   27.,  189.,   83.],
        [ 169.,   59.,  528.,  124.,   16.],
        [  66.,  146.,  126., 2073.,   93.],
        [  13.,   51.,    9.,   33., 1263.]]))

In [4]:
# Load a previously saved model from disk instead of retraining.
# NOTE(review): the training cell in this notebook passes file="spanish.mdl",
# while this cell loads "model.mdl". If `file` is the save path, "model.mdl"
# is not produced by this notebook and a fresh Restart & Run All may fail
# here — confirm which file is intended.
spanish.load_from_file("model.mdl")

In [5]:
# Run the model's validation routine; it prints GOLD vs. PRED sentences.
# NOTE(review): presumably evaluates on the validation split (`val_esp`)
# passed to the constructor — confirm in complete_class.
spanish.validation()

Sentence index: 4
GOLD sentence:  [('"', 'O'), ('Telefónica', 'B-ORG'), ('asumió', 'O'), ('un', 'O'), ('compromiso', 'O'), ('con', 'O'), ('Brasil', 'B-LOC'), (',', 'O'), ('y', 'O'), ('en', 'O'), ('especial', 'O'), ('con', 'O'), ('Sao', 'B-LOC'), ('Paulo', 'I-LOC'), ('en', 'O'), ('1998', 'O'), ('(', 'O'), ('año', 'O'), ('de', 'O'), ('privatización', 'O'), ('del', 'O'), ('sistema', 'O'), ('Telebras', 'B-MISC'), (')', 'O'), ('y', 'O'), ('estamos', 'O'), ('aquí', 'O'), ('para', 'O'), ('prestar', 'O'), ('cuentas', 'O'), ('"', 'O'), (',', 'O'), ('dijo', 'O'), ('Ferreira', 'B-PER'), ('en', 'O'), ('el', 'O'), ('acto', 'O'), ('de', 'O'), ('instalación', 'O'), ('de', 'O'), ('la', 'O'), ('línea', 'O'), ('número', 'O'), ('tres', 'O'), ('millones', 'O'), ('de', 'O'), ('la', 'O'), ('gestión', 'O'), ('de', 'O'), ('Telefónica', 'B-ORG'), ('.', 'O')]
PRED sentence:  [('"', 'O'), ('Telefónica', 'B-ORG'), ('asumió', 'O'), ('un', 'O'), ('compromiso', 'O'), ('con', 'O'), ('Brasil', 'B-ORG'), (',', 'O'), ('