In [13]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt
from complete_class import CompleteNER




In [14]:
# Fetch the CoNLL-2002 corpus through NLTK (the download call reports
# "already up-to-date" when the package is present locally).
nltk.download('conll2002')
from nltk.corpus import conll2002

# Each split is read with iob_sents(); the sample output further down shows
# sentences as lists of tagged tokens.
# File naming in the corpus: '<lang>.train' = train, '<lang>.testa' = validation,
# '<lang>.testb' = test.

# Spanish: train / validation / test
train_esp, val_esp, test_esp = (
    conll2002.iob_sents('esp.train'),
    conll2002.iob_sents('esp.testa'),
    conll2002.iob_sents('esp.testb'),
)

# Dutch: train / validation / test
train_ned, val_ned, test_ned = (
    conll2002.iob_sents('ned.train'),
    conll2002.iob_sents('ned.testa'),
    conll2002.iob_sents('ned.testb'),
)

[nltk_data] Downloading package conll2002 to C:\Users\Cai Selvas
[nltk_data]     Sala\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [15]:
# Build the Spanish NER pipeline from the three Spanish splits
# (train, validation, test), with POS tags enabled via postag=True.
spanish = CompleteNER(
    train_esp,
    val_esp,
    test_esp,
    language="esp",
    postag=True,
)

In [4]:
# Feature switches handed to CompleteNER.train(feature_opt=...).
# Each key names a feature; True enables it, False disables it.
#
# Every key must appear exactly once: a repeated key in a dict literal is not
# an error in Python, the last value silently wins. 'NEXT' is therefore listed
# a single time, carrying the value that was actually in effect (False).
features = {
    'CAPITALIZATION': True,
    'HAS_UPPER': True,
    'HAS_NUM': True,
    'PUNCTUATION': True,
    'SUF': True,
    'PRE': True,
    '2NEXT': True,
    '2PREV': True,
    'WORD': True,
    'LEN': True,
    'NEXT': False,
    'POS': True,
    'LEMMA': False,
    'CITY': False,
    'COMPANY': False,
    'CELEBRITY': False,
    'RESEARCH_ORGANIZATION': False,
    'NAME': False,
    'SURNAME': False,
    'PREV': True,
    'NUMBER': False,
    'GENDER': False,
    'PERSON': False,
    'PRONTYPE': False,
    'DEP': False,
    'HEAD_DISTANCE': False,
    'HEAD': False,
}

In [5]:
# Turn on every feature except the ones named in `exclude`.
# This overrides whatever values `features` held before: after this cell a key
# is False if and only if it is listed in `exclude`.
exclude = ['HEAD', 'HEAD_DISTANCE']
# The comprehension is fully built before update() runs, so the dict is
# modified in place (same object, same key order) without being iterated
# while it changes.
features.update({name: name not in exclude for name in features})

In [6]:
# Train the Spanish model with the selected feature switches.
# verbose=True produces the CRFsuite feature-generation / L-BFGS log recorded
# below this cell.
# NOTE(review): `file` is presumably the path the trained model is written to —
# confirm against CompleteNER.train.
spanish.train(
    verbose=True,
    file="spanish.mdl",
    feature_opt=features,
    use_regex=True,
)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 140044
Seconds required: 0.615

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 273714.251774
Feature norm: 1.000000
Error norm: 241945.826694
Active features: 140044
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 0.559

***** Iteration #2 *****
Loss: 178943.470857
Feature norm: 3.633287
Error norm: 81358.797836
Active features: 140044
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.283

***** Iteration #3 *****
Loss: 135057.519475
Feature norm: 3.368116
Error norm: 69218.471399
Active features: 140044
Line search trials: 1
Line search step: 1.000000
Secon

In [7]:
# Evaluate the Spanish model. With verb=True the recorded output first prints
# one sentence (its index, the GOLD tags and the PRED tags); the cell then
# displays the returned 6-tuple: five numbers followed by a 5x5 NumPy array.
# NOTE(review): the tuple looks like (precision, recall, F1, <count>, accuracy,
# confusion matrix) — confirm against CompleteNER.test.
spanish.test(verb=True)

Sentence index: 2
GOLD sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('Internet', 'B-MISC'), ('aumentaron', 'O'), ('en', 'O'), ('España', 'B-LOC'), ('un', 'O'), ('300', 'O'), ('por', 'O'), ('ciento', 'O'), ('en', 'O'), ('el', 'O'), ('primer', 'O'), ('trimestre', 'O'), ('de', 'O'), ('este', 'O'), ('año', 'O'), ('con', 'O'), ('respecto', 'O'), ('al', 'O'), ('mismo', 'O'), ('período', 'O'), ('de', 'O'), ('1999', 'O'), (',', 'O'), ('aseguró', 'O'), ('hoy', 'O'), ('Iñigo', 'B-PER'), ('García', 'I-PER'), ('Aranda', 'I-PER'), (',', 'O'), ('responsable', 'O'), ('de', 'O'), ('comunicación', 'O'), ('de', 'O'), ('Savia', 'B-ORG'), ('Amadeus', 'I-ORG'), ('.', 'O')]
PRED sentence:  [('Las', 'O'), ('reservas', 'O'), ('"', 'O'), ('on', 'O'), ('line', 'O'), ('"', 'O'), ('de', 'O'), ('billetes', 'O'), ('aéreos', 'O'), ('a', 'O'), ('través', 'O'), ('de', 'O'), ('I

(0.7962328767123288,
 0.7841483979763912,
 0.790144435004248,
 1454,
 0.9716298294296858,
 array([[   0.,    0.,    0.,    0.,    0.],
        [  61., 1064.,   24.,  185.,   75.],
        [ 162.,   36.,  534.,  136.,   28.],
        [  72.,  149.,   91., 2110.,   82.],
        [  10.,   48.,    6.,   48., 1257.]]))

In [None]:
# Reload the Spanish model from disk. The file name matches the one passed to
# spanish.train(file="spanish.mdl"); no cell in this notebook produces a
# "model.mdl", so loading that name would depend on a file left over from
# elsewhere.
# NOTE(review): assumes train(file=...) writes the model to that path — confirm.
spanish.load_from_file("spanish.mdl")

In [None]:
# Run the validation step of the Spanish pipeline (this cell has no recorded
# execution count or output in the saved notebook).
# NOTE(review): presumably evaluates on the validation split (val_esp) given to
# the constructor — confirm against CompleteNER.validation.
spanish.validation()

In [17]:
# Build the Dutch NER pipeline from the three Dutch splits
# (train, validation, test), with POS tags enabled via postag=True.
nederlands = CompleteNER(
    train_ned,
    val_ned,
    test_ned,
    language="ned",
    postag=True,
)

In [18]:
# Train the Dutch model with the same feature switches used for Spanish.
# verbose=True produces the CRFsuite training log recorded below this cell.
# NOTE(review): unlike the Spanish run, use_regex is not passed here, so
# CompleteNER.train's default applies — confirm this is intentional.
nederlands.train(
    verbose=True,
    file="nederlands.mdl",
    feature_opt=features,
)

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 76902
Seconds required: 0.857

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 253325.912889
Feature norm: 1.000000
Error norm: 178345.793506
Active features: 76902
Line search trials: 1
Line search step: 0.000005
Seconds required for this iteration: 0.810

***** Iteration #2 *****
Loss: 124837.126943
Feature norm: 4.377126
Error norm: 42234.376403
Active features: 76902
Line search trials: 2
Line search step: 0.342150
Seconds required for this iteration: 0.862

***** Iteration #3 *****
Loss: 84503.744114
Feature norm: 3.528568
Error norm: 43042.612786
Active features: 76902
Line search trials: 2
Line search step: 5.000000
Seconds re

In [19]:
# Evaluate the Dutch model. With verb=True the recorded output first prints
# one sentence (its index, the GOLD tags and the PRED tags); the cell then
# displays the returned 6-tuple: five numbers followed by a 5x5 NumPy array.
# NOTE(review): the tuple looks like (precision, recall, F1, <count>, accuracy,
# confusion matrix) — confirm against CompleteNER.test.
nederlands.test(verb=True)

Sentence index: 27
GOLD sentence:  [('Fifa-voorzitter', 'B-MISC'), ('Sepp', 'B-PER'), ('Blatter', 'I-PER'), (',', 'O'), ('die', 'O'), ('het', 'O'), ('topoverleg', 'O'), ('van', 'O'), ('de', 'O'), ('voetbalbonzen', 'O'), ('in', 'O'), ('Zürich', 'B-LOC'), ('gisteren', 'O'), ('voorzat', 'O'), (',', 'O'), ('zei', 'O'), ('ook', 'O'), ('dat', 'O'), ('de', 'O'), ('werkgroep', 'O'), ('gaat', 'O'), ('aandringen', 'O'), ('op', 'O'), ('een', 'O'), ('verbod', 'O'), ('van', 'O'), ('de', 'O'), ('aan-', 'O'), ('en', 'O'), ('verkoop', 'O'), ('van', 'O'), ('spelers', 'O'), ('onder', 'O'), ('de', 'O'), ('achttien', 'O'), ('jaar', 'O'), ('.', 'O')]
PRED sentence:  [('Fifa-voorzitter', 'B-MISC'), ('Sepp', 'I-MISC'), ('Blatter', 'I-MISC'), (',', 'O'), ('die', 'O'), ('het', 'O'), ('topoverleg', 'O'), ('van', 'O'), ('de', 'O'), ('voetbalbonzen', 'O'), ('in', 'O'), ('Zürich', 'B-LOC'), ('gisteren', 'O'), ('voorzat', 'O'), (',', 'O'), ('zei', 'O'), ('ook', 'O'), ('dat', 'O'), ('de', 'O'), ('werkgroep', 'O'), (

(0.7693741677762983,
 0.7330626744481096,
 0.7507796257796258,
 1646,
 0.9761016333938294,
 array([[   0.,    0.,    0.,    0.,    0.],
        [  37.,  623.,   40.,   36.,   87.],
        [ 190.,   58., 1036.,  168.,  145.],
        [  96.,   56.,  171.,  940.,  170.],
        [  49.,   26.,   61.,   42., 1727.]]))