### Custom NER Model

In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline: used below only to tokenise the annotated texts.
nlp = spacy.blank("en") 
# Container that collects the annotated Docs before they are written to disk.
db = DocBin()

In [2]:
import json
# Load the annotated examples. The context manager guarantees the file handle
# is closed as soon as parsing finishes, and the explicit encoding makes the
# read independent of the platform's default locale.
with open('./annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [3]:
# Convert the character-offset annotations into spaCy's binary training format.
# The DocBin is (re)created in this cell so that running the cell again
# rebuilds the corpus from scratch instead of appending every document a
# second time to the existing container.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenise only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no span can be formed from them.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # tqdm.write keeps the message from garbling the progress bar, and
            # the offsets/text identify which annotation needs to be fixed.
            tqdm.write(f"Skipping entity {label!r} at [{start}:{end}]: {text[start:end]!r}")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Persist the corpus where the training command expects to find it.
db.to_disk("./training_data.spacy")

100%|██████████| 904/904 [00:00<00:00, 14143.99it/s]


In [4]:
# Generate a training config (config.cfg) for an English pipeline that
# contains only an NER component, optimised for efficiency.
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [5]:
# Train with the generated config; results are saved under the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# evaluation columns reported during training are computed on data the model
# was fitted on and will overstate performance on unseen text. Hold out a
# separate dev set before trusting these scores.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     56.60   16.55    9.92   49.72    0.17
  2     200        108.90   2410.56   95.16   95.19   95.13    0.95
  5     400         55.21    597.13   95.49   95.52   95.46    0.95
  8     600         68.00    679.99   96.27   96.19   96.35    0.96
 13     800        105.90    761.19   96.95   97.11   96.79    0.97
 18    1000        123.31    838.38   97.01   96.96   97.07    0.97
 24    1200        160.96    963.26   97.34   97.61   97.07    0.97
 32    1400        583.05   1140.72   97.45   97.61   97.29    0.97
 42    1600        420.03   1150.75   97.54   97.41   97.

In [6]:
# Load the trained pipeline from ./model-best — presumably the best-scoring
# checkpoint saved by the training run (which used --output ./); confirm the
# directory exists before running this cell.
nlp_ner = spacy.load("./model-best")

In [9]:
# Run the trained pipeline on one ad-hoc sentence as a quick sanity check.
doc = nlp_ner("Can you please proofread my article on NLP, Alicia?")

In [10]:
# Highlight the entities predicted for `doc` inline in the notebook output.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [38]:
import pandas as pd

In [39]:
# Load the sentences that will be run through the NER model and preview them.
df = pd.read_csv('./tasks.csv')
df.head()

Unnamed: 0,sentence
0,"Sarah, please review the report"
1,"John, update the presentation slides"
2,"Michael, complete the coding tasks"
3,"Emily, prepare the meeting agenda"
4,"David, proofread the document"


In [40]:
# Apply the NER model to the sentence column and separate out the TASK and
# ASSIGNEE entities into their own columns.
def _entity_texts(doc, label):
    """Return the text of every entity in `doc` whose label equals `label`."""
    return [ent.text for ent in doc.ents if ent.label_ == label]

# nlp.pipe streams the texts through the pipeline in batches, which avoids
# the per-row call overhead of Series.apply(nlp_ner).
df['doc'] = list(nlp_ner.pipe(df['sentence']))
df['task'] = df['doc'].apply(_entity_texts, label='TASK')
df['assignee'] = df['doc'].apply(_entity_texts, label='ASSIGNEE')
df.head()

Unnamed: 0,sentence,doc,task,assignee
0,"Sarah, please review the report","(Sarah, ,, please, review, the, report)",[please review the report],[Sarah]
1,"John, update the presentation slides","(John, ,, update, the, presentation, slides)",[update the presentation slides],[John]
2,"Michael, complete the coding tasks","(Michael, ,, complete, the, coding, tasks)",[complete the coding tasks],[Michael]
3,"Emily, prepare the meeting agenda","(Emily, ,, prepare, the, meeting, agenda)",[prepare the meeting agenda],[Emily]
4,"David, proofread the document","(David, ,, proofread, the, document)",[proofread the document],[David]


In [41]:
# Drop the intermediate doc column now that the entity texts are extracted.
# errors='ignore' keeps the cell re-runnable: a second execution (when 'doc'
# is already gone) is a no-op instead of raising KeyError.
df = df.drop(columns=['doc'], errors='ignore')

In [42]:
# Preview the result: each sentence with its extracted task and assignee lists.
df.head()

Unnamed: 0,sentence,task,assignee
0,"Sarah, please review the report",[please review the report],[Sarah]
1,"John, update the presentation slides",[update the presentation slides],[John]
2,"Michael, complete the coding tasks",[complete the coding tasks],[Michael]
3,"Emily, prepare the meeting agenda",[prepare the meeting agenda],[Emily]
4,"David, proofread the document",[proofread the document],[David]
