# **BEST TAGSET (BIO, IO, BIOW)**

## **Initialization and setup**

In [None]:
import numpy as np
import spacy
import nltk
import svgling
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from complete_class import CompleteNER

In [None]:
nltk.download('conll2002')
from nltk.corpus import conll2002

# Spanish
train_esp = conll2002.iob_sents('esp.train') # Train
val_esp = conll2002.iob_sents('esp.testa') # Val
test_esp = conll2002.iob_sents('esp.testb') # Test
# Dutch
train_ned = conll2002.iob_sents('ned.train') # Train
val_ned = conll2002.iob_sents('ned.testa') # Val
test_ned = conll2002.iob_sents('ned.testb') # Test

## **Find the best tagset method**

In [None]:
methods = ["bio", "io", "biow"]

In [None]:
results = pd.DataFrame(columns=["Model", "Precision", "Recall", "F1", "Errors", "Accuracy"])

In [None]:
for m in methods:
    spanish = CompleteNER(train_esp, val_esp, test_esp, language="esp", postag=True, method=m)
    nederlands = CompleteNER(train_ned, val_ned, test_ned, language="ned", postag=True, method=m)
    spanish.train(verbose=False, file=f"./models/spanish_{m}.mdl")
    nederlands.train(verbose=False, file=f"./models/nederlands_{m}.mdl")
    precision_es, recall_es, f1_es, err_es, default_acc_es, matrix_es = spanish.validation()
    precision_ned, recall_ned, f1_ned, err_ned, default_acc_ned, matrix_ned = nederlands.validation()
    

    iteration = pd.DataFrame([[f"{m}_esp", precision_es, recall_es, f1_es, err_es, default_acc_es], [f"{m}_ned", precision_ned, recall_ned, f1_ned, err_ned, default_acc_ned]],columns=["Model", "Precision", "Recall", "F1", "Errors", "Accuracy"])
    results.append(iteration)

In [None]:
results.to_csv("./data/results/tagset_results.csv")