In [1]:
import spacy
import json
import random
from spacy.util import minibatch, compounding
from spacy.training import Example
# from spacy.lang.en import English
# from spacy.pipeline import EntityRuler

In [2]:
#### How to use spacy to generate a set of rules to perform rules based NER
#### Generate training data using these rules to train spacy NER model

In [3]:
def load_data(file):

    with open(file, "r",encoding="utf-8") as f:
        data = json.load(f)
    return(data)

def save_data(file,data):
    with open(file, "w",encoding="utf-8") as f:
        json.dump(data, f, indent = 4)

        
# format for training data > Train_data = [(text, {"entities":[(start,end,label)]})]

def test_model(model,text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities = entities + [(ent.start_char, ent.end_char, ent.label_)]
    if len(entities) > 0:
        results = [text,{"entities": entities}]
#         print(results)
    return(results)


In [4]:
##model hp_ner is created in other notebook - Spacy Basics - ML NER
nlp = spacy.load("hp_ner")

In [5]:
TRAIN_DATA = []
with open("hpchapter1.txt","r",encoding="utf-8") as f:
    text = f.read()
    chapters = text.split("CHAPTER")[1:]
    for chapter in chapters:
        chapter_num, chapter_title = chapter.split("\n\n")[0:2]
        segments = chapter.split("\n\n")[2:]
        results = list()
        for segment in segments:
            segment = segment.replace("\n"," ").strip()
            result = test_model(nlp,segment)
            if result != []:
                results = results + [result]
        TRAIN_DATA = TRAIN_DATA + results
print(len(TRAIN_DATA))


2213


In [6]:
TRAIN_DATA[0]

["Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.",
 {'entities': [(0, 20, 'PERSON')]}]

In [7]:
# save_data("train_data.json",TRAIN_DATA)
# for a , b in TRAIN_DATA:
#     print(b)

TRAIN_DATA[4]

['At half past eight, Mr. Dursley picked up his briefcase, pecked Mrs. Dursley on the cheek, and tried to kiss Dudley good-bye but missed, because Dudley was now having a tantrum and throwing his cereal at the walls. "Little tyke," chortled Mr. Dursley as he left the house. He got into his car and backed out of number four\'s drive.',
 {'entities': [(20, 31, 'PERSON'),
   (64, 76, 'PERSON'),
   (109, 115, 'PERSON'),
   (145, 151, 'PERSON'),
   (239, 250, 'PERSON')]}]

In [8]:
### create a function that will be used to train the model 
def train_spacy(data, iterations):
    TRAIN_DATA = data
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    
    for a , annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(other_pipes):
        optimizer = nlp.begin_training()
        
        for itn in range(iterations):
            print("starting iteration: " + str(itn))
            
            for itn in range(iterations):
                random.shuffle(TRAIN_DATA)
                losses = {}
                
                for batch in minibatch(TRAIN_DATA, size=32):
                    for text, annotations in batch:
                        doc = nlp.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp.update([example], drop=0.35, sgd=optimizer, losses=losses)
                    print(losses)
    return(nlp)

In [None]:
nlp = train_spacy(TRAIN_DATA, 5)

starting iteration: 0
{'ner': 311.06647329897913}
{'ner': 365.3206098590797}
{'ner': 425.65270751929035}
{'ner': 472.74556510500804}
{'ner': 509.61692973343145}
{'ner': 551.2967360899007}
{'ner': 712.6195166166949}
{'ner': 810.9931888709239}
{'ner': 834.3236052853463}
{'ner': 875.1349024206924}
{'ner': 890.0719482854414}
{'ner': 914.5992676839661}
{'ner': 934.5225750293074}
{'ner': 943.2757144973274}
{'ner': 966.8027177131436}
{'ner': 986.9147381081314}
{'ner': 1005.3841109780252}
{'ner': 1029.9781325862168}
{'ner': 1038.6760124368063}
{'ner': 1049.7697546182653}
{'ner': 1061.2838144667467}
{'ner': 1081.694462889853}
{'ner': 1107.653521446937}
{'ner': 1134.5046140806419}
{'ner': 1146.3095087901731}
{'ner': 1155.5324603537626}
{'ner': 1169.735105362097}
{'ner': 1181.0151557567171}
{'ner': 1189.6326872613877}
{'ner': 1204.0446800645016}
{'ner': 1224.4260889219465}
{'ner': 1229.840808779256}
{'ner': 1236.1780638515143}
{'ner': 1245.1376751831795}
{'ner': 1260.7631441014858}
{'ner': 1270.8

{'ner': 123.16386975548926}
{'ner': 127.47331427507592}
{'ner': 127.47333407012468}
{'ner': 127.49883175428032}
{'ner': 127.49883574918715}
{'ner': 127.4988892614504}
{'ner': 129.4987986229317}
{'ner': 129.49879914106762}
{'ner': 129.49894024236784}
{'ner': 129.49894046114443}
{'ner': 133.53978156989606}
{'ner': 135.09405572517952}
{'ner': 137.26861706459405}
{'ner': 137.6123257003991}
{'ner': 143.61232582168242}
{'ner': 145.94920557629774}
{'ner': 147.93305297560906}
{'ner': 147.95723864519255}
{'ner': 147.95736065435207}
{'ner': 147.95737070812282}
{'ner': 147.9595468902667}
{'ner': 147.9608392469699}
{'ner': 147.96784325836393}
{'ner': 153.32881961235557}
{'ner': 153.32883079484972}
{'ner': 153.32883079780316}
{'ner': 153.32894305841495}
{'ner': 162.28091029269592}
{'ner': 166.0523044287922}
{'ner': 168.24894237053576}
{'ner': 175.65995613490654}
{'ner': 177.4419627027692}
{'ner': 181.0920635144851}
{'ner': 181.0931487880146}
{'ner': 181.09314892534653}
{'ner': 186.9932655835627}
{'

{'ner': 32.833454426138864}
{'ner': 32.83345454094105}
{'ner': 32.833454544277366}
{'ner': 33.2945488985261}
{'ner': 33.294548898647044}
{'ner': 33.31482657785179}
{'ner': 36.3931921421981}
{'ner': 39.95573651834978}
{'ner': 40.00951411701771}
{'ner': 45.98992944993002}
{'ner': 47.989699554873376}
{'ner': 49.81060045218736}
{'ner': 49.81060045225356}
{'ner': 53.81060051188393}
{'ner': 53.8134939358591}
{'ner': 57.01327845022234}
{'ner': 57.71398987154814}
{'ner': 57.71399793364751}
{'ner': 57.713997987963616}
{'ner': 57.713997990518735}
{'ner': 61.6399797108354}
{'ner': 67.39124861022508}
{'ner': 74.8293819607304}
{'ner': 81.18831841031984}
{'ner': 87.88201271755923}
{'ner': 89.62028578800549}
{'ner': 91.50277532646851}
{'ner': 91.5027807849151}
{'ner': 92.53549987885489}
{'ner': 92.53550077095653}
{'ner': 92.66993145252013}
{'ner': 93.29136906953862}
{'ner': 93.29136916760726}
{'ner': 94.36468598679083}
{'ner': 94.36575098489102}
{'ner': 6.132181853006103e-06}
{'ner': 0.00112473409880

{'ner': 48.49082221828299}
{'ner': 48.49082222078035}
{'ner': 48.490822415738755}
{'ner': 48.49082241908635}
{'ner': 48.49082326328842}
{'ner': 48.49082327140643}
{'ner': 48.49082327161352}
{'ner': 48.49082327492317}
{'ner': 48.490823574882015}
{'ner': 48.490823697406924}
{'ner': 48.490823742022954}
{'ner': 48.490823750022706}
{'ner': 48.4908237558852}
{'ner': 48.490823765945315}
{'ner': 48.49082376621778}
{'ner': 48.490823768432065}
{'ner': 48.49082376963553}
{'ner': 48.4908237697878}
{'ner': 2.4431772578710042e-08}
{'ner': 0.0021129961163190145}
{'ner': 0.003631946079959388}
{'ner': 0.0036319463919293605}
{'ner': 0.003631990175557475}
{'ner': 0.003634092708114351}
{'ner': 2.0036364254064534}
{'ner': 5.981361938583647}
{'ner': 13.983748202163554}
{'ner': 13.983748229120684}
{'ner': 14.503304931502024}
{'ner': 14.504361769337303}
{'ner': 17.387944266684933}
{'ner': 17.389431409620723}
{'ner': 17.389782552023146}
{'ner': 21.3897824334099}
{'ner': 22.439843000138257}
{'ner': 24.439789814

{'ner': 29.37015840573386}
{'ner': 29.37015842102581}
{'ner': 29.37015842102581}
{'ner': 1.395725471197993e-06}
{'ner': 1.396590753440091e-06}
{'ner': 3.992650468932222}
{'ner': 3.992788132322023}
{'ner': 3.9927881530680662}
{'ner': 5.992303224820753}
{'ner': 6.324169013925161}
{'ner': 6.324179267359474}
{'ner': 6.324179267433976}
{'ner': 6.324179267437346}
{'ner': 6.324179268407138}
{'ner': 7.101733661228702}
{'ner': 7.10174214904553}
{'ner': 9.526414752740457}
{'ner': 11.432313220756212}
{'ner': 11.512079636528691}
{'ner': 13.504446509371716}
{'ner': 13.60236761472192}
{'ner': 13.60242757048926}
{'ner': 13.60242787457923}
{'ner': 13.602428036845934}
{'ner': 13.602428064060113}
{'ner': 15.596137868257784}
{'ner': 15.596179108887736}
{'ner': 15.59934470972379}
{'ner': 15.908472294172084}
{'ner': 15.910573509052242}
{'ner': 15.91057350935994}
{'ner': 15.910573584631706}
{'ner': 15.929684986966274}
{'ner': 15.92968498696836}
{'ner': 15.929684986968491}
{'ner': 15.929684986968493}
{'ner':

{'ner': 0.002650300351095803}
{'ner': 1.9681958643246185}
{'ner': 2.3291056839521094}
{'ner': 2.3291056839521116}
{'ner': 2.3291167783766595}
{'ner': 2.32911677837799}
{'ner': 2.3291167783779914}
{'ner': 2.3291167784317564}
{'ner': 2.3291167784954565}
{'ner': 2.3291167906031656}
{'ner': 2.3291167906031833}
{'ner': 2.338393299134876}
{'ner': 4.108630652012538}
{'ner': 4.1086306520449645}
{'ner': 4.108630652045095}
{'ner': 4.108630652215398}
{'ner': 4.109483421948237}
{'ner': 4.304282815636149}
{'ner': 7.978243306959506}
{'ner': 7.978243306981256}
{'ner': 9.956869081365177}
{'ner': 17.379053800310302}
{'ner': 17.38065011302903}
{'ner': 17.38065011303215}
{'ner': 17.78584618370323}
{'ner': 17.793747103917166}
{'ner': 17.793747103919706}
{'ner': 17.894023127322367}
{'ner': 19.893839070100153}
{'ner': 19.893839070100153}
{'ner': 19.89383907035736}
{'ner': 19.89383907035736}
{'ner': 19.893840726275403}
{'ner': 19.89974135886546}
{'ner': 21.890936799445306}
{'ner': 21.890936799445353}
{'ner':

In [None]:
nlp.to_disk("hp_ner_model")