In [1]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt
from complete_class import CompleteNER




In [2]:
# Make sure the CoNLL-2002 corpus is available locally before importing its reader.
nltk.download('conll2002')
from nltk.corpus import conll2002

# Spanish splits, in order: train, validation (testa), test (testb).
train_esp, val_esp, test_esp = (
    conll2002.iob_sents(fileid)
    for fileid in ('esp.train', 'esp.testa', 'esp.testb')
)

# Dutch splits, in order: train, validation (testa), test (testb).
train_ned, val_ned, test_ned = (
    conll2002.iob_sents(fileid)
    for fileid in ('ned.train', 'ned.testa', 'ned.testb')
)

[nltk_data] Downloading package conll2002 to C:\Users\Cai Selvas
[nltk_data]     Sala\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [3]:
# Spanish NER wrapper built from the three Spanish CoNLL-2002 splits.
spanish = CompleteNER(
    train_esp,
    val_esp,
    test_esp,
    language="esp",
    postag=True,
)

In [4]:
# Feature switches passed as `feature_opt` to CompleteNER.train().
# Each key names a feature; the boolean says whether it is enabled.
#
# The original literal listed 'NEXT' twice (True, then False). Python keeps the
# last value for a repeated key, so the effective setting was False. The
# duplicate is removed here and the effective value is kept at the key's
# original position, so the resulting dict is unchanged.
# NOTE(review): 'PREV', '2PREV' and '2NEXT' are all enabled, so 'NEXT' may have
# been meant to be True - confirm the intended value.
features = {
    'CAPITALIZATION': True,
    'HAS_UPPER': True,
    'HAS_NUM': True,
    'PUNCTUATION': True,
    'SUF': True,
    'PRE': True,
    '2NEXT': True,
    '2PREV': True,
    'WORD': True,
    'LEN': True,
    'NEXT': False,
    'POS': True,
    'LEMMA': False,
    'CITY': False,
    'COMPANY': False,
    'CELEBRITY': False,
    'RESEARCH_ORGANIZATION': False,
    'NAME': False,
    'SURNAME': False,
    'PREV': True,
    'NUMBER': False,
    'GENDER': False,
    'PERSON': False,
    'PRONTYPE': False,
    'DEP': False,
    'HEAD_DISTANCE': False,
    'HEAD': False,
}

In [5]:
# Override the switches in `features` in place: every feature is turned on
# except the ones named in `exclude`, which are turned off.
# Only values of existing keys are reassigned, so iterating the dict is safe.
exclude = ['HEAD', 'HEAD_DISTANCE']
for feature_name in features:
	features[feature_name] = feature_name not in exclude

In [6]:
# Train the Spanish model with the feature switches configured above.
# The recorded output below is the CRF trainer's log (L-BFGS iterations).
spanish.train(
    verbose=True,
    file="./models/spanish.mdl",
    feature_opt=features,
    use_regex=False,
    custom_postag=False,
)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 203895
Seconds required: 0.892

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 272674.578808
Feature norm: 1.000000
Error norm: 241909.721169
Active features: 203895
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 0.622

***** Iteration #2 *****
Loss: 177573.073826
Feature norm: 3.585529
Error norm: 81394.795693
Active features: 203895
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.317

***** Iteration #3 *****
Loss: 134574.870024
Feature norm: 3.325992
Error norm: 69017.930234
Active features: 203895
Line search trials: 1
Line search step: 1.000000
Secon

In [7]:
# Run the Spanish model's test routine. With verb=True the recorded output
# prints a GOLD vs. PRED tagging for a sample sentence. The returned tuple
# (floats, an int and a 5x5 array in the output below) is shown as the cell result.
# NOTE(review): presumably precision/recall/F1, a count, accuracy and a
# confusion matrix - confirm against CompleteNER.test.
spanish.test(verb=True)

Sentence index: 2
GOLD sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('Internet', 'B-MISC'), ('aumentaron', 'O'), ('en', 'O'), ('España', 'B-LOC'), ('un', 'O'), ('300', 'O'), ('por', 'O'), ('ciento', 'O'), ('en', 'O'), ('el', 'O'), ('primer', 'O'), ('trimestre', 'O'), ('de', 'O'), ('este', 'O'), ('año', 'O'), ('con', 'O'), ('respecto', 'O'), ('al', 'O'), ('mismo', 'O'), ('período', 'O'), ('de', 'O'), ('1999', 'O'), (',', 'O'), ('aseguró', 'O'), ('hoy', 'O'), ('Iñigo', 'B-PER'), ('García', 'I-PER'), ('Aranda', 'I-PER'), (',', 'O'), ('responsable', 'O'), ('de', 'O'), ('comunicación', 'O'), ('de', 'O'), ('Savia', 'B-ORG'), ('Amadeus', 'I-ORG'), ('.', 'O')]
PRED sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('I

(0.8029092983456931,
 0.7911748173130972,
 0.7969988674971688,
 1399,
 0.9726971067083228,
 array([[   0.,    0.,    0.,    0.,    0.],
        [  58., 1080.,   26.,  179.,   66.],
        [ 177.,   48.,  530.,  108.,   33.],
        [  68.,  136.,   98., 2121.,   81.],
        [  10.,   34.,    6.,   48., 1271.]]))

In [None]:
# Reload the Spanish model from the path given as `file=` to spanish.train().
# The previous path, "./models/model.mdl", is not produced by any cell in this
# notebook, so the cell depended on a file left over from outside the notebook.
# NOTE(review): if a separately trained "./models/model.mdl" was intended,
# restore that path and document where the file comes from.
spanish.load_from_file("./models/spanish.mdl")

In [None]:
# Run the Spanish model's validation routine.
# NOTE(review): presumably evaluates on the `val_esp` split passed to the
# constructor - confirm against CompleteNER.validation.
spanish.validation()

In [None]:
# Dutch NER wrapper built from the three Dutch CoNLL-2002 splits.
nederlands = CompleteNER(
    train_ned,
    val_ned,
    test_ned,
    language="ned",
    postag=True,
)

In [None]:
# Train the Dutch model with the same feature switches used for Spanish.
# `use_regex` and `custom_postag` are not passed here, so the method's own
# defaults apply.
nederlands.train(
    verbose=True,
    file="./models/nederlands.mdl",
    feature_opt=features,
)

In [None]:
# Run the Dutch model's test routine with verbose output. The return value is
# displayed as the cell result.
# NOTE(review): presumably the same metrics tuple as the Spanish run - confirm.
nederlands.test(verb=True)