# **BEST TAGSET (BIO, IO, BIOW)**

## **Initialization and setup**

In [2]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from complete_class import CompleteNER

In [3]:
# Make sure the CoNLL-2002 corpus is available locally before importing its reader.
nltk.download('conll2002')
from nltk.corpus import conll2002

# Every split is read as IOB-tagged sentences.
# File naming: "train" -> training set, "testa" -> validation set, "testb" -> test set.

# Spanish
train_esp, val_esp, test_esp = map(
    conll2002.iob_sents,
    ('esp.train', 'esp.testa', 'esp.testb'),
)
# Dutch
train_ned, val_ned, test_ned = map(
    conll2002.iob_sents,
    ('ned.train', 'ned.testa', 'ned.testb'),
)

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


## **Find the best tagset method**

In [4]:
# Tagset encodings to compare; each value is passed to CompleteNER as `method`.
methods = [
    "bio",
    "io",
    "biow",
]

In [5]:
# Empty table that will hold one row of validation metrics per (tagset, language) pair.
results_header = ["Model", "Precision", "Recall", "F1", "Errors", "Accuracy"]
results = pd.DataFrame(columns=results_header)

In [23]:
# Train and validate one Spanish and one Dutch model per tagset encoding and
# collect the validation metrics in `results`.
#
# Changes with respect to the previous version of this cell:
#   * Iterates over ALL entries of `methods`. The former `methods[1:]` skipped
#     "bio", so a fresh Restart & Run All produced a table without the BIO rows.
#   * Rows are accumulated in a plain list and the DataFrame is built once, instead
#     of calling `pd.concat` inside the loop starting from an empty frame.
#   * `results` is rebuilt from scratch, so re-running the cell no longer appends
#     duplicate rows. The resulting index is a unique 0..N-1 range.
result_columns = ["Model", "Precision", "Recall", "F1", "Errors", "Accuracy"]
records = []

for m in methods:
    spanish = CompleteNER(train_esp, val_esp, test_esp, language="esp", postag=True, method=m)
    nederlands = CompleteNER(train_ned, val_ned, test_ned, language="ned", postag=True, method=m)

    # Each trained model is stored in its own file, keyed by language and tagset.
    spanish.train(verbose=False, file=f"./models/spanish_{m}.mdl")
    nederlands.train(verbose=False, file=f"./models/nederlands_{m}.mdl")

    # validation() returns six values; the last one (matrix_*) is not stored in the
    # table but is still bound in case later cells use it.
    precision_es, recall_es, f1_es, err_es, default_acc_es, matrix_es = spanish.validation()
    precision_ned, recall_ned, f1_ned, err_ned, default_acc_ned, matrix_ned = nederlands.validation()

    records.append([f"{m}_esp", precision_es, recall_es, f1_es, err_es, default_acc_es])
    records.append([f"{m}_ned", precision_ned, recall_ned, f1_ned, err_ned, default_acc_ned])

results = pd.DataFrame(records, columns=result_columns)

Sentence index: 4
GOLD sentence:  [('"', 'O'), ('Telefónica', 'B-ORG'), ('asumió', 'O'), ('un', 'O'), ('compromiso', 'O'), ('con', 'O'), ('Brasil', 'B-LOC'), (',', 'O'), ('y', 'O'), ('en', 'O'), ('especial', 'O'), ('con', 'O'), ('Sao', 'B-LOC'), ('Paulo', 'I-LOC'), ('en', 'O'), ('1998', 'O'), ('(', 'O'), ('año', 'O'), ('de', 'O'), ('privatización', 'O'), ('del', 'O'), ('sistema', 'O'), ('Telebras', 'B-MISC'), (')', 'O'), ('y', 'O'), ('estamos', 'O'), ('aquí', 'O'), ('para', 'O'), ('prestar', 'O'), ('cuentas', 'O'), ('"', 'O'), (',', 'O'), ('dijo', 'O'), ('Ferreira', 'B-PER'), ('en', 'O'), ('el', 'O'), ('acto', 'O'), ('de', 'O'), ('instalación', 'O'), ('de', 'O'), ('la', 'O'), ('línea', 'O'), ('número', 'O'), ('tres', 'O'), ('millones', 'O'), ('de', 'O'), ('la', 'O'), ('gestión', 'O'), ('de', 'O'), ('Telefónica', 'B-ORG'), ('.', 'O')]
PRED sentence:  [('"', 'O'), ('Telefónica', 'B-ORG'), ('asumió', 'O'), ('un', 'O'), ('compromiso', 'O'), ('con', 'O'), ('Brasil', 'B-ORG'), (',', 'O'), ('

In [24]:
# Persist the tagset comparison table to disk (the DataFrame index is written too).
results.to_csv(path_or_buf="./data/results/tagset_results.csv")