# Create training and testing data set

In [None]:
import pandas as pd
df =pd.read_pickle("results.pkl")

In [None]:
# Randomly select training samples
idx = np.random.choice(np.arange(0, len(df)), 50, replace = False)
entry = ""
for x in df["info"][idx]:
    if "Ha sido" in x:
        entry += x["Ha sido"] + "\n"

In [None]:
text_file = open("training.txt", "w", encoding = 'utf-8-sig')
text_file.write(entry)
text_file.close()

In [None]:
# Randomly select test samples
test_idx = np.random.choice(np.arange(0, len(df)), 20, replace = False)
test_entry = ""
for x in df["info"][test_idx]:
    if "Ha sido" in x:
        test_entry += x["Ha sido"] + "\n"

In [None]:
# Save test examples
text_file = open("testing.txt", "w", encoding = 'utf-8-sig')
text_file.write(test_entry)
text_file.close()

# Train NER Model

In [3]:
import json
f = open('training_data.json',encoding = 'utf-8-sig')
TRAIN_DATA = json.load(f)["annotations"]
f.close()

f = open('testing_data.json',encoding = 'utf-8-sig')
TEST_DATA = json.load(f)["annotations"]
f.close()

In [4]:
import pandas as pd 
import numpy as np

# Load Colombian place names
colombia = pd.read_csv("cities_colombia.csv") 
cities = colombia["city"]
admin = np.unique(colombia["admin_name"])
CITY_DATA = []

count = 0
for cat in [cities, admin]:
    for c in cat:
        CITY_DATA.append([count, count+len(c), "LOCATION"])
        count += len(c) +2 

TRAIN_DATA.append([". ".join(cities)+". ".join(admin), {"entities": CITY_DATA}])

In [5]:
# Load pre-existing spacy model
import spacy
nlp=spacy.load('es_core_news_sm')

# Getting the pipeline component
ner=nlp.get_pipe("ner")

In [6]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [7]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training.example import Example

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        for text, annotations in batch: 
          # create Example
          doc = nlp.make_doc(text)
          example = Example.from_dict(doc, annotations)
          nlp.update(
                      [example],
                      drop=0.5,  # dropout - make it harder to memorise data
                      losses=losses,
                  )

        print("Losses", losses)



Losses {'ner': 74.13038352693327}




Losses {'ner': 103.27470514803142}




Losses {'ner': 161.0964677625912}




Losses {'ner': 221.8387080301709}




Losses {'ner': 263.3995325505962}




Losses {'ner': 354.0834080931115}




Losses {'ner': 401.5971980270979}




Losses {'ner': 426.3347081593489}




Losses {'ner': 480.9563099125525}




Losses {'ner': 2240.04863960148}




Losses {'ner': 2310.9481841421402}




Losses {'ner': 2320.96095894738}
Losses {'ner': 40.331373956218556}
Losses {'ner': 61.27408111931091}
Losses {'ner': 110.50664598573661}
Losses {'ner': 149.35784806862483}
Losses {'ner': 174.59886979893088}
Losses {'ner': 2041.0381451698756}
Losses {'ner': 2087.6615864683567}
Losses {'ner': 2129.1629114835514}
Losses {'ner': 2210.577697477465}
Losses {'ner': 2231.832712306198}
Losses {'ner': 2319.470374720072}
Losses {'ner': 2354.392519126106}
Losses {'ner': 102.2839061929449}
Losses {'ner': 182.39654663962185}
Losses {'ner': 235.24371408935951}
Losses {'ner': 263.5313970069901}
Losses {'ner': 310.59212433232165}
Losses {'ner': 333.66992401617625}
Losses {'ner': 404.31244599277653}
Losses {'ner': 442.07834153224337}
Losses {'ner': 1647.1869552565124}
Losses {'ner': 1674.6444939933067}
Losses {'ner': 1734.3091572889919}
Losses {'ner': 1751.2443522738417}
Losses {'ner': 77.10587251015664}
Losses {'ner': 106.96399363718936}
Losses {'ner': 139.72079336714108}
Losses {'ner': 205.12558001314

In [8]:
from spacy.scorer import Scorer
from spacy.tokens import Doc

def evaluate(ner_model, examples):
    scorer = Scorer()
    example = []
    for input_, annot in examples:
        pred = ner_model(input_)
        #print(pred,annot)
        temp = Example.from_dict(pred, annot)
        example.append(temp)
    scores = scorer.score(example)
    return scores

results = evaluate(nlp, TEST_DATA)
print(results)



{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.4830508474576271, 'ents_r': 0.6263736263736264, 'ents_f': 0.5454545454545454, 'ents_per_type': {'POSITION': {'p': 0.5348837209302325, 'r': 0.5609756097560976, 'f': 0.5476190476190476}, 'LOCATION': {'p': 0.43243243243243246, 'r': 0.8421052631578947, 'f': 0.5714285714285715}, 'ORGANISATION': {'p': 0.4594594594594595, 'r': 0.5666666666666667, 'f': 0.5074626865671642}, 'DATE': {'p': 1.0, 'r': 1.0, 'f': 1.0}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}




In [9]:
# Save the  model to directory
output_dir = Path('/content/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to \content
