<h1>Named Entity Recognition (NER) for job data: spaCy model training</h1>
<h3>Adel Rahmani</h3>

<hr style="height:5px;border:none;color:#333;background-color:#333;" />

<div style="background-color:#FBEFFB;">
<hr style="height:5px;border:none;color:#333;background-color:#333;" />
<h3>Licence</h3>
<p>Copyright (C) 2022  Adel Rahmani

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.</p>
<hr style="height:5px;border:none;color:#333;background-color:#333;" />
</div>

<div style="background-color:#F2FBEF;">
<h2><font color=#04B404>Abstract</font></h2>
This notebook and its associated code use NER-annotated data to train and evaluate a spaCy machine learning model.
</div>
<hr>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import swifter
import string
import regex
import ast
import json
import time 
import copy

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import colorcet as cc
import operator

import spacy
from spacy.util import minibatch, compounding, filter_spans
from spacy.matcher import Matcher, PhraseMatcher
from spacy.training.example import Example
from pathlib import Path
from spacy.tokens import DocBin, Span
from spacy.pipeline import EntityRuler
from spacy import displacy
from spacy.language import Language

from sklearn.model_selection import train_test_split
from itertools import chain
from datetime import datetime
from pathlib import Path
from tqdm.notebook import tqdm_notebook

import warnings
warnings.simplefilter('ignore')

from ner_utils import save_spacy_ner_data_to_disk, load_spacy_data_from_csv, compute_elapsed_time, timedelta2str

from IPython.display import display_html, HTML, display

SPACY_MODEL_TYPES = {m: f'en_core_web_{m}' for m in ['sm', 'md', 'lg', 'trf']}

----
# Load the annotated data



In [3]:
DATA_HASH = 'fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1'
SPACY_ANNOT_MODEL = 'sm'
SPACY_ANNOT_TYPE  = 'docs'

In [4]:
DATA_DIR = Path(f'experiments/data_{DATA_HASH}/')

In [5]:
available = list(DATA_DIR.glob('*'))
available

[PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/Adzuna_job_data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1_sents.parq'),
 PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/Adzuna_job_data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1_docs.parq'),
 PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/spacy_ner_component_cased.json'),
 PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/Adzuna_job_data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1_docs_split_Company_RS1_0_8_8064_2323.parq'),
 PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/Adzuna_job_data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1_docs_split_Title_RS1_0_8_8137_2250.parq'),
 PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM')]

In [6]:
ANNOT_PATH = DATA_DIR/f'annotations_spacy_{SPACY_ANNOT_MODEL.upper()}'
ANNOT_PATH

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM')

In [7]:
# Locate the compressed annotation file for the requested annotation type.
# Candidates are sorted so the choice is deterministic if several files match,
# and a missing file is reported explicitly instead of as a bare IndexError.
_annot_pattern = f'*{SPACY_ANNOT_TYPE.lower()}*.gz'
_annot_candidates = sorted(ANNOT_PATH.glob(_annot_pattern))
if not _annot_candidates:
    raise FileNotFoundError(f"No annotation file matching '{_annot_pattern}' found in {ANNOT_PATH}")
ANNOT_DATA = _annot_candidates[0]
ANNOT_DATA

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/ner_annotation_data_Adzuna_OCCUP_GPE_EMPLOYER_ORG_ANY_10387_docs_cased.csv.gz')

In [8]:
REGEX_CASE = 'uncased' if 'uncased' in ANNOT_DATA.name else 'cased'
REGEX_CASE

'cased'

In [9]:
!ls {ANNOT_DATA.as_posix().replace('annotation_data', 'annotation_metadata').replace('.csv.gz', '.parq')}

experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/ner_annotation_metadata_Adzuna_OCCUP_GPE_EMPLOYER_ORG_ANY_10387_docs_cased.parq


In [10]:
ANNOT_META_DATA = Path(ANNOT_DATA.as_posix().replace('annotation_data', 'annotation_metadata').replace('.csv.gz', '.parq'))
ANNOT_META_DATA

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/ner_annotation_metadata_Adzuna_OCCUP_GPE_EMPLOYER_ORG_ANY_10387_docs_cased.parq')

In [11]:
!ls {ANNOT_PATH}

ner_annotation_data_Adzuna_OCCUP_GPE_EMPLOYER_ORG_ANY_10387_docs_cased.csv.gz
ner_annotation_metadata_Adzuna_OCCUP_GPE_EMPLOYER_ORG_ANY_10387_docs_cased.parq


In [12]:
annotated_data = load_spacy_data_from_csv(ANNOT_DATA)
print(f"Loaded {len(annotated_data)} annotated documents...")

Loaded 10387 annotated documents...


In [13]:
annotated_data[0]

('Are you a qualified/newly qualified teacher looking for supply work in and around South Yorkshire? Do you want a new challenge with varied work, flexibility and great rates of pay? If so, Vision for Education can help We are currently looking for enthusiastic and dedicated KS and KS Primary Teachers for a number of schools across the area. Candidates must have an enthusiasm for teaching, a good knowledge of the national curriculum and excellent classroom and behaviour management skills. It is essential that you hold a valid, recognised teaching qualification and ideally you will have 6 weeks recent experience of teaching in the UK. We must also be able to contact your past school to obtain a reference. Vision for Education was started in by a group of like minded individuals with a desire for providing a quality service to customers. Our promise is to serve the education community, be it Teachers, Schools or Students alike to the highest possible standards. If you are looking for a p

In [14]:
ANNOT_META_DATA

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/ner_annotation_metadata_Adzuna_OCCUP_GPE_EMPLOYER_ORG_ANY_10387_docs_cased.parq')

In [15]:
metadata = pd.read_parquet(ANNOT_META_DATA)

In [16]:
metadata.head()

Unnamed: 0,doc_id,text,text_length,EMPLOYER,GPE,OCCUP,ORG,entity_count,entity_per_char
0,36757414,Are you a qualified/newly qualified teacher lo...,1421,3,2,1,0,6,0.004222
1,44452524,Chef De Partie Rustic Italian AA Rosette resta...,2263,2,0,11,4,17,0.007512
2,45057486,Manager Recruiting a Manager for a newly acqui...,1647,2,1,7,3,13,0.007893
3,54238303,Commis Chef We are recruiting for a bright com...,2333,2,0,1,1,4,0.001715
4,55408076,Job Title: Manufacturing Engineer Salary: circ...,1598,1,0,4,10,15,0.009387


In [17]:
# metadata.sort_values('entity_per_char', ascending=False).head(12)

In [18]:
# np.log10(metadata.text.str.split().str.len()).plot.hist(bins=25, ec='w')

---
## Split the data into train/test sets

__Note__: Because the annotations are generated with regular expressions, a purely random split can leak information between the train and test sets. To mitigate this issue to some extent, we use the precomputed split (see notebook 2).

In [19]:
def compute_random_split(annotated_data, train_size=0.8, random_state=0):
    """Randomly split the annotated documents into train and test sets.

    The documents and their positional indices are split in a single
    ``train_test_split`` call, so the returned index arrays are guaranteed to
    line up with the returned data.

    Parameters
    ----------
    annotated_data : sequence
        The annotated documents to split.
    train_size : float or int
        Passed to ``train_test_split`` as ``train_size``.
    random_state : int
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(split_info, TRAIN_DATA, TEST_DATA, TRAIN_DATA_IDX, TEST_DATA_IDX)``
        where ``split_info`` is a string tag describing the split (it embeds
        the module-level ``SPACY_ANNOT_TYPE``) and the ``*_IDX`` arrays hold
        the positions of the selected documents in ``annotated_data``.
    """
    n_docs = len(annotated_data)
    split_info = f"TOT_{n_docs}_{SPACY_ANNOT_TYPE}_TRSIZE_{train_size}_RS{random_state}"

    # Splitting both arrays in one call applies the same shuffle to each of them.
    TRAIN_DATA, TEST_DATA, TRAIN_DATA_IDX, TEST_DATA_IDX = train_test_split(
        annotated_data,
        np.arange(n_docs),
        train_size=train_size,
        random_state=random_state,
    )

    print(f"Split the {len(annotated_data)} annotated documents into {len(TRAIN_DATA)} documents for training and {len(TEST_DATA)} for testing.")

    return split_info, TRAIN_DATA, TEST_DATA, TRAIN_DATA_IDX, TEST_DATA_IDX

In [20]:
def split_annotations_into_train_and_test(annotated_data=None, DATA_DIR=DATA_DIR, precomputed=None, train_size=0.8, random_state=0):
    """Split the annotated documents into train and test sets.

    Without ``precomputed`` the split is random (see ``compute_random_split``).
    With ``precomputed`` the split is read from a parquet file in ``DATA_DIR``;
    in that case the ``train_size`` and ``random_state`` arguments are ignored
    and replaced by the values encoded in the file name
    (``..._RS<seed>_..._<n_train>_<n_test>.parq``).

    Parameters
    ----------
    annotated_data : sequence
        The annotated documents. Required despite the ``None`` default.
        For a precomputed split, rows of the split file are matched to
        documents by position.
    DATA_DIR : Path
        Directory containing the precomputed split file.
    precomputed : str, optional
        File name of the precomputed split (must have a boolean-like
        ``training`` column).
    train_size, random_state
        Only used for the random split.

    Returns
    -------
    tuple
        ``(split_info, TRAIN_DATA, TEST_DATA, TRAIN_DATA_IDX, TEST_DATA_IDX)``.

    Raises
    ------
    ValueError
        If ``annotated_data`` is not provided.
    FileNotFoundError
        If the precomputed split file does not exist.
    AssertionError
        If the row counts in the split file disagree with its file name.
    """
    if annotated_data is None:
        raise ValueError('annotated_data must be provided!')

    if precomputed is None:
        return compute_random_split(annotated_data, train_size, random_state)

    split_file = DATA_DIR/precomputed

    if not split_file.is_file():
        # The original message referenced an undefined name and raised NameError.
        raise FileNotFoundError(f'File {split_file} not found!')

    # Recover the seed and the expected set sizes from the file name.
    random_state = split_file.name.split('RS')[1].split('_')[0]
    train_size, test_size = [int(_) for _ in split_file.name.split('.')[0].split('_')[-2:]]

    split_table = pd.read_parquet(split_file)
    TRAIN_DATA_IDX = np.where(split_table.training==True)[0]
    TEST_DATA_IDX  = np.where(split_table.training==False)[0]
    assert len(TRAIN_DATA_IDX) == train_size, \
        f'Expected {train_size} training rows in {split_file.name}, found {len(TRAIN_DATA_IDX)}'
    assert len(TEST_DATA_IDX) == test_size, \
        f'Expected {test_size} test rows in {split_file.name}, found {len(TEST_DATA_IDX)}'

    split_info = f"TOT_{len(annotated_data)}_{SPACY_ANNOT_TYPE}_TRSIZE_{train_size}_RS{random_state}_precomp"

    # Select the records directly instead of round-tripping through a NumPy
    # object array; each record is returned as a list, as before.
    TRAIN_DATA = [list(annotated_data[i]) for i in TRAIN_DATA_IDX]
    TEST_DATA = [list(annotated_data[i]) for i in TEST_DATA_IDX]

    print(f"Split the {len(annotated_data)} annotated documents into {len(TRAIN_DATA)} documents for training and {len(TEST_DATA)} for testing.")

    return split_info, TRAIN_DATA, TEST_DATA, TRAIN_DATA_IDX, TEST_DATA_IDX

In [21]:
# File name (inside DATA_DIR) of the precomputed train/test split to use.
SPLIT_PARAMS = 'Adzuna_job_data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1_docs_split_Title_RS1_0_8_8137_2250.parq'
    
# NOTE: because `precomputed` is given, the `train_size` and `random_state`
# arguments below are ignored; the function replaces them with the values
# parsed from the split file name.
split_info, TRAIN_DATA, TEST_DATA, TRAIN_DATA_IDX, TEST_DATA_IDX = split_annotations_into_train_and_test(
    annotated_data, 
    DATA_DIR=DATA_DIR, 
    precomputed=SPLIT_PARAMS, 
    train_size=0.8, 
    random_state=0)


Split the 10387 annotated documents into 8137 documents for training and 2250 for testing.


In [22]:
TRAIN_DATA[0]

['Are you a qualified/newly qualified teacher looking for supply work in and around South Yorkshire? Do you want a new challenge with varied work, flexibility and great rates of pay? If so, Vision for Education can help We are currently looking for enthusiastic and dedicated KS and KS Primary Teachers for a number of schools across the area. Candidates must have an enthusiasm for teaching, a good knowledge of the national curriculum and excellent classroom and behaviour management skills. It is essential that you hold a valid, recognised teaching qualification and ideally you will have 6 weeks recent experience of teaching in the UK. We must also be able to contact your past school to obtain a reference. Vision for Education was started in by a group of like minded individuals with a desire for providing a quality service to customers. Our promise is to serve the education community, be it Teachers, Schools or Students alike to the highest possible standards. If you are looking for a p

In [23]:
annotated_data[TRAIN_DATA_IDX[0]]

('Are you a qualified/newly qualified teacher looking for supply work in and around South Yorkshire? Do you want a new challenge with varied work, flexibility and great rates of pay? If so, Vision for Education can help We are currently looking for enthusiastic and dedicated KS and KS Primary Teachers for a number of schools across the area. Candidates must have an enthusiasm for teaching, a good knowledge of the national curriculum and excellent classroom and behaviour management skills. It is essential that you hold a valid, recognised teaching qualification and ideally you will have 6 weeks recent experience of teaching in the UK. We must also be able to contact your past school to obtain a reference. Vision for Education was started in by a group of like minded individuals with a desire for providing a quality service to customers. Our promise is to serve the education community, be it Teachers, Schools or Students alike to the highest possible standards. If you are looking for a p

In [24]:
# Flag every document as train (True) or test (False).
# The column position is looked up by name, so the cell stays correct even if
# 'train' already exists and is not the last column of the frame.
# Assumes the rows of `metadata` are in the same order as `annotated_data` -- TODO confirm.
metadata['train'] = True
metadata.iloc[TEST_DATA_IDX, metadata.columns.get_loc('train')] = False

In [25]:
metadata.sample(10, random_state=1)

Unnamed: 0,doc_id,text,text_length,EMPLOYER,GPE,OCCUP,ORG,entity_count,entity_per_char,train
5156,71855929,Our client a Nuclear MOD site in Berkshire is ...,3246,1,1,2,28,32,0.009858,True
7577,69638911,Mclaren Absolute Taste Inflight Chef de Partie...,2921,5,5,2,3,15,0.005135,True
4181,71295298,URGENT English Teacher Required for contract p...,545,1,2,1,5,9,0.016514,False
1206,68691549,Recruitment Consultant Contract IT and Technic...,1622,1,1,8,5,15,0.009248,False
5332,72127029,Senior Project Manager London k k Benefits MMS...,1675,1,0,5,6,12,0.007164,True
1011,68676778,"Internal Communications Manager, k, Oxfordshir...",2321,1,3,1,4,9,0.003878,True
9358,66887987,Sellick Partnership is working on an exclusive...,1076,1,2,2,0,5,0.004647,True
6711,68592018,Project Manager Automotive South Yorkshire Sal...,2020,2,0,1,4,7,0.003465,False
6895,68694243,Our client is an expanding firm and is offerin...,2095,8,7,0,7,22,0.010501,True
4555,71557798,We are looking for a Part Time Administrator t...,671,2,0,1,4,7,0.010432,True


----
# Create the spaCy training config file and train the model

In [26]:
model_type = 'sm'

In [27]:
RUN_PATH = ANNOT_PATH/f'spacy_{model_type}_{split_info}_{REGEX_CASE}'
RUN_PATH

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased')

In [28]:
SPACY_DATA_PATH = RUN_PATH/'spacy_data'
SPACY_TRAIN_PATH = RUN_PATH/'spacy_model'

In [29]:
SPACY_TRAIN_PATH.mkdir(parents=True, exist_ok=True)
SPACY_TRAIN_PATH

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model')

In [30]:
SPACY_CONFIG_FILE = SPACY_TRAIN_PATH/f'config_{model_type}.cfg'

In [31]:
def create_config(model_name: str, component_to_update: str, output_path: Path, disable=None,
                 batch_size=1500, max_epochs=5, patience=1000, eval_freq=200):
    """Create and save a spaCy training config that updates one component of a pretrained pipeline.

    Adapted from
    https://github.com/explosion/projects/tree/v3/pipelines/ner_demo_update

    Parameters
    ----------
    model_name : str
        Name of the pretrained pipeline to load and to source components from.
    component_to_update : str
        Name of the component to train; every other component is frozen.
    output_path : Path
        Base path of the config file. The run settings are appended to the
        file name before its extension
        (``_bs<batch_size>_me<max_epochs>_p<patience>[_DISABLE_<names>]``).
        Note that ``eval_freq`` is not encoded in the file name.
    disable : list of str, optional
        Pipeline components to disable when loading the model.
    batch_size, max_epochs, patience, eval_freq
        Written to ``nlp.batch_size``, ``training.max_epochs``,
        ``training.patience`` and ``training.eval_frequency`` respectively.

    Returns
    -------
    tuple
        ``(output_path, nlp)``: the path the config was written to and the
        loaded pipeline.
    """
    if disable is None:
        disable = []
    elif isinstance(disable, str):
        # a single component name; avoid joining its characters below
        disable = [disable]

    nlp = spacy.load(model_name, disable=disable)

    # create a new config as a copy of the loaded pipeline's config
    config = nlp.config.copy()

    # revert most training settings to the current defaults
    default_config = spacy.blank(nlp.lang).config
    config["corpora"] = default_config["corpora"]
    config["training"]["logger"] = default_config["training"]["logger"]

    # copy tokenizer and vocab settings from the base model, which includes
    # lookups (lexeme_norm) and vectors, so they don't need to be copied or
    # initialized separately
    config["initialize"]["before_init"] = {
        "@callbacks": "spacy.copy_from_base_model.v1",
        "tokenizer": model_name,
        "vocab": model_name,
    }
    config["initialize"]["lookups"] = None
    config["initialize"]["vectors"] = None

    config["nlp"]["batch_size"] = batch_size
    config["training"]["max_epochs"] = max_epochs
    config["training"]["patience"] = patience
    config["training"]["eval_frequency"] = eval_freq

    # source all components from the loaded pipeline and freeze all except the
    # component to update; replace the listener for the component that is
    # being updated so that it can be updated independently
    config["training"]["frozen_components"] = []
    for pipe_name in nlp.component_names:
        if pipe_name != component_to_update:
            config["components"][pipe_name] = {"source": model_name}
            config["training"]["frozen_components"].append(pipe_name)
        else:
            config["components"][pipe_name] = {
                "source": model_name,
                "replace_listeners": ["model.tok2vec"],
            }

    # `disable` is always a list at this point, so test for emptiness rather
    # than None; otherwise an empty list leaves a dangling "_DISABLE_" suffix.
    disable_flag = f'_DISABLE_{"_".join(disable)}' if disable else ''
    run_tag = f'_bs{batch_size}_me{max_epochs}_p{patience}{disable_flag}'

    # Insert the tag before the extension of the file name only, leaving any
    # '.cfg' occurring in parent directory names untouched.
    output_path = Path(output_path)
    output_path = output_path.with_name(f'{output_path.stem}{run_tag}{output_path.suffix}')

    # save the config
    config.to_disk(output_path)
    return output_path, nlp

In [32]:
CONFIG, model = create_config(
    model_name=SPACY_MODEL_TYPES[model_type], 
    component_to_update='ner', 
    disable=['parser','tagger','lemmatizer'],
    batch_size = 1000,
    eval_freq = 500,
    output_path=SPACY_CONFIG_FILE
)

CONFIG

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model/config_sm_bs1000_me5_p1000_DISABLE_parser_tagger_lemmatizer.cfg')

In [33]:
model.pipe_names

['tok2vec', 'senter', 'attribute_ruler', 'ner']

### Write the train/test data to disk

In [34]:
def build_spacy_ner_data(DATA, nlp=None, path=None, filename=None):
    '''
    Convert the annotated data into the DocBin spaCy format and save it to disk.
    https://stackoverflow.com/questions/67407433/using-spacy-3-0-to-convert-data-from-old-spacy-v2-format-to-the-brand-new-spacy

    Parameters
    ----------
    DATA : sequence of (text, annotations, extra) records
        ``annotations["entities"]`` must hold ``(start, end, label)`` character
        offsets. The third element of each record is ignored.
    nlp : spaCy pipeline
        Used to tokenise the texts. Only required when the data file has to
        be built (not when it already exists on disk).
    path : Path or str
        Output directory; created if needed.
    filename : str
        Base file name; ``_size_<len(DATA)>.spacy`` is appended.

    Returns
    -------
    Path or None
        Path of the ``.spacy`` file, or ``None`` if ``path`` or ``filename``
        is missing.

    Notes
    -----
    An existing file with the same name is reused as-is. The name depends only
    on ``filename`` and ``len(DATA)``, so delete the file to force a rebuild
    after changing the annotations.
    '''
    # Check for missing arguments first; the type handling below would
    # otherwise make these guards unreachable.
    if path is None:
        print('You must supply a path!')
        return

    if filename is None:
        print('You must supply a filename!')
        return

    path = Path(path)
    filename = filename+f'_size_{len(DATA)}.spacy'
    target = path/filename

    path.mkdir(parents=True, exist_ok=True)

    # check whether the file already exists
    if target.exists():
        print('Data file already exists... Skipping.')
        return target

    if nlp is None:
        raise ValueError('You must supply a spaCy pipeline (nlp) to build the data!')

    db = DocBin() # create a DocBin object
    n_skipped = 0

    for text, annot, _ in tqdm_notebook(DATA): # data in old spacy format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # the character offsets do not cover any complete token
                n_skipped += 1
            else:
                ents.append(span)
        doc.ents = filter_spans(ents) # label the text with the (non-overlapping) ents
        db.add(doc)

    if n_skipped:
        print(f"Skipped {n_skipped} entity spans that could not be aligned to token boundaries.")

    db.to_disk(target) # save the docbin object
    print(f"Saved data to {target}")

    return target

In [35]:
training_file = build_spacy_ner_data(TRAIN_DATA, model, path=SPACY_DATA_PATH, filename=f'train_annot_{split_info}')
testing_file  = build_spacy_ner_data(TEST_DATA,  model, path=SPACY_DATA_PATH, filename=f'test_annot_{split_info}')

  0%|          | 0/8137 [00:00<?, ?it/s]

Saved data to experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_data/train_annot_TOT_10387_docs_TRSIZE_8137_RS1_precomp_size_8137.spacy


  0%|          | 0/2250 [00:00<?, ?it/s]

Saved data to experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_data/test_annot_TOT_10387_docs_TRSIZE_8137_RS1_precomp_size_2250.spacy


In [36]:
# CONFIG = MANUAL_CONFIG_FILE

OUTPUT = SPACY_TRAIN_PATH/f"output_{CONFIG.stem}"
OUTPUT 

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model/output_config_sm_bs1000_me5_p1000_DISABLE_parser_tagger_lemmatizer')

In [37]:
%%time 
!python -m spacy train {CONFIG} --output {OUTPUT} --paths.train  ./{training_file.as_posix()} --paths.dev ./{testing_file.as_posix()}

[38;5;2m✔ Created output directory:
experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model/output_config_sm_bs1000_me5_p1000_DISABLE_parser_tagger_lemmatizer[0m
[38;5;4mℹ Saving to output directory:
experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model/output_config_sm_bs1000_me5_p1000_DISABLE_parser_tagger_lemmatizer[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-11-12 11:06:06,479] [INFO] Set up nlp object from config
[2022-11-12 11:06:06,491] [INFO] Pipeline: ['tok2vec', 'senter', 'attribute_ruler', 'ner']
[2022-11-12 11:06:06,491] [INFO] Resuming training for: ['ner']
[2022-11-12 11:06:06,501] [INFO] Copying tokenizer from: en_core_web_sm
[2022-11-12 11:06:07,050] [INFO] Copying vocab from: en_core_web_sm
[2022-11-12 11:06:07,156] [INFO] Created vocabulary
[2022-11-12 11:06:07,158] [INFO] Finished initializi

In [38]:
list(OUTPUT.glob('*'))

[PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model/output_config_sm_bs1000_me5_p1000_DISABLE_parser_tagger_lemmatizer/model-last'),
 PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_model/output_config_sm_bs1000_me5_p1000_DISABLE_parser_tagger_lemmatizer/model-best')]

In [39]:
training_file

PosixPath('experiments/data_fba836ee1bdf4fda32004145ffe1eeb8d3c6b5f1/annotations_spacy_SM/spacy_sm_TOT_10387_docs_TRSIZE_8137_RS1_precomp_cased/spacy_data/train_annot_TOT_10387_docs_TRSIZE_8137_RS1_precomp_size_8137.spacy')

In [40]:
MODEL = OUTPUT/'model-best'

In [41]:
ner_model = spacy.load(MODEL) 

In [42]:
!python -m spacy evaluate {MODEL} ./{training_file}
!python -m spacy evaluate {MODEL} ./{testing_file}

[38;5;4mℹ Using CPU[0m
[1m

TOK      100.00
TAG      -     
POS      -     
MORPH    -     
LEMMA    -     
NER P    96.03 
NER R    96.31 
NER F    96.17 
SENT P   -     
SENT R   -     
SENT F   -     
SPEED    20006 

[1m

               P       R       F
EMPLOYER   99.18   99.26   99.22
GPE        96.78   97.14   96.96
OCCUP      97.16   97.57   97.37
ORG        94.17   94.43   94.30

[38;5;4mℹ Using CPU[0m
[1m

TOK      100.00
TAG      -     
POS      -     
MORPH    -     
LEMMA    -     
NER P    87.77 
NER R    87.33 
NER F    87.55 
SENT P   -     
SENT R   -     
SENT F   -     
SPEED    20390 

[1m

               P       R       F
OCCUP      93.64   92.49   93.06
EMPLOYER   97.41   94.83   96.10
ORG        81.05   81.54   81.29
GPE        89.24   88.72   88.98



---
### Checking the trained model

In [43]:
# Background colour used by displaCy for each entity label.
colors = {
    "EMPLOYER": "#DB81E2", 
    "OCCUP": "#AEE8EC", 
    "GPE": "#ECE3AE", 
    "ORG": "#ECBEAE", 
}
# Only the labels listed under "ents" are passed to displaCy for highlighting;
# "ORG" has a colour above but is not listed here.
options = {"ents": ["EMPLOYER", "OCCUP", "GPE"], "colors":colors}

In [44]:
# Fixed seed so the same test documents are shown on every run.
random.seed(0)
sampled_records = random.choices(TEST_DATA, k=5)

for text, _annot, _extra in sampled_records:
    base_doc = model(text)          # pretrained pipeline loaded by create_config
    trained_doc = ner_model(text)   # pipeline trained in this notebook

    # Nothing to compare when the trained model finds no entity.
    if not trained_doc.ents:
        continue

    display(HTML('<h3>Default model:</h3>'))
    displacy.render(base_doc, style='ent', options=options)
    display(HTML('<h3>Machine learning model:</h3>'))
    displacy.render(trained_doc, style='ent', options=options)
    print('\n\n')


























In [45]:
pd.DataFrame(ner_model.meta['performance']['ents_per_type']).T

Unnamed: 0,p,r,f
OCCUP,0.936351,0.924932,0.930607
EMPLOYER,0.974099,0.948331,0.961042
ORG,0.810458,0.81537,0.812907
GPE,0.892361,0.887227,0.889786
