In [9]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [2]:
text = '''Mr Dursley was the director of a firm called Grunnings which made drills He was a big beefy man with hardly any neck although he did have a very large mustache Mrs Dursley was thin and blonde and had nearly twice the usual amount of neck which came in very useful as she spent so much of her time craning over garden fences spying on the neighbors The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere'''

In [3]:
nlp = spacy.load("en_core_web_lg")
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Mr Dursley PERSON
Grunnings ORG
Mrs Dursley PERSON
Dursleys PERSON
Dudley PERSON


In [7]:
text = '''Today is Nov 18th 2019.'''
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Nov 18th 2019 DATE


In [8]:
#### How to use spacy to generate a set of rules to perform rules based NER
#### Generate training data using these rules to train spacy NER model

In [32]:
def load_data(file):

    with open(file, "r",encoding="utf-8") as f:
        data = json.load(f)
    return(data)


def generate_better_characters(file):
    hp_chars = load_data(file)
    new_characs = []
    for items in hp_chars:
        new_characs = new_characs + [items]
    print(len(new_characs))
    for items in hp_chars:
        items = items.replace("the","").replace("The","").replace("and","").replace("And","")
        items = items.replace("(","").replace(")","")
        names = items.split()
        for name in names:
            name = name.strip()
            name = name.split(",")
            new_characs = new_characs + name
    print(len(new_characs))
    final_chars = []
    new_characs = list(set(new_characs))
    print(len(new_characs))
    titles = ['Dr.', 'Mr.', 'Mrs.', 'Professor','Ms.','Miss', 'Aunt','Uncle','Mr. and Mrs.']
    for char in new_characs:
        if "" != char:
            final_chars = final_chars + new_characs
            for title in titles:
                titled_char = title + " " + char
                final_chars = final_chars + [ titled_char]
    final_chars = list(set(final_chars) - set([''])) 
    print(len(final_chars))
    final_chars.sort()
    return(final_chars)
generate_better_characters("hp_characters.json")

207
609
506
5049


['Abbott',
 'Aberforth',
 'Aberforth Dumbledore',
 'Alastor',
 'Alastor (Mad-Eye) Moody',
 'Albert',
 'Albert Runcorn',
 'Albus',
 'Albus Dumbledore',
 'Albus Severus Potter',
 'Alecto',
 'Alecto Carrow',
 'Alice',
 'Alice Longbottom',
 'Alicia',
 'Alicia Spinnet',
 'Amelia',
 'Amelia Bones',
 'Amos',
 'Amos Diggory',
 'Amycus',
 'Amycus Carrow',
 'Andromeda Tonks',
 'Angelina',
 'Angelina Johnson',
 'Anthony',
 'Anthony Goldstein',
 'Antioch',
 'Antioch, Cadmus, and Ignotus Peverell',
 'Antonin',
 'Antonin Dolohov',
 'Arabella',
 'Arabella Figg',
 'Aragog',
 'Argus',
 'Argus Filch',
 'Ariana',
 'Ariana Dumbledore',
 'Arthur',
 'Arthur Weasley',
 'Augusta',
 'Augusta Longbottom',
 'Augustus',
 'Augustus Rookwood',
 'Aunt',
 'Aunt Abbott',
 'Aunt Aberforth',
 'Aunt Aberforth Dumbledore',
 'Aunt Alastor',
 'Aunt Alastor (Mad-Eye) Moody',
 'Aunt Albert',
 'Aunt Albert Runcorn',
 'Aunt Albus',
 'Aunt Albus Dumbledore',
 'Aunt Albus Severus Potter',
 'Aunt Alecto',
 'Aunt Alecto Carrow',
 '

In [96]:
def create_training_data(file, type):
    data = generate_better_characters(file)
    patterns = []
    for item in data:
        pattern = {
            "label": type,
            "pattern":item
        }
        patterns = patterns + [pattern]
    return(patterns)

def generate_rules(patterns):
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
#     uler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.to_disk("hp_ner")

    
patterns = create_training_data("hp_characters.json", "PERSON")
generate_rules(patterns)

207
609
506
5049


In [97]:
nlp = spacy.load("hp_ner")

In [98]:
def test_model(model,text):
    doc = nlp(text)
    result = []
    for ent in doc.ents:
        result = result + [ent]
    return(result)

with open("hpchapter1.txt","r",encoding="utf-8") as f:
    text = f.read()
    le_data = {}
    chapters = text.split("CHAPTER")[1:]
    for chapter in chapters:
        chapter_num, chapter_title = chapter.split("\n\n")[0:2]
        segments = chapter.split("\n\n")[2:]
        hits = list()
        for segment in segments:
            segment = segment.replace("\n"," ").strip()
            results = test_model(nlp,segment)
            for result in  results:
                hits = hits + [result]
        le_data[chapter_num] = list(set(hits))

print(le_data)
            

{' ONE': [Dumbledore, Sirius Black, James Potter, Mrs. Dursley, Professor McGonagall, Mrs. Dursley, Voldemort, Dudley, Petunia, Professor McGonagall, Mr. Dursley, Professor McGonagall, Dudley, Mrs. Dursley, Hagrid, Professor McGonagall, Professor McGonagall, Harry, Lily, Ted, Pomfrey, Dumbledore, Dudley, Dumbledore, Dudley, Professor McGonagall, Dumbledore, Mr. Dursley, Mrs. Dursley, Mr. Dursley, Dumbledore, Harry, Hagrid, Professor McGonagall, Voldemort, Dumbledore, Mr. Dursley, Dumbledore, Albus Dumbledore, Harry, Mr. Dursley, Dudley, Mr. Dursley, Harry, Mrs. Dursley, Dumbledore, Harry, Professor McGonagall, Lily, Professor Dumbledore, Mr. Dursley, Mr. and Mrs. Dursley, Mrs. Dursley, Voldemort, Mrs. Dursley, Mr. Dursley, Hagrid, Professor McGonagall, Dumbledore, Mr. Dursley, Mrs. Dursley, Hagrid, Petunia, Dumbledore, Professor McGonagall, Professor McGonagall, James, Hagrid, Mr. Dursley, Dumbledore, Dumbledore, Albus, Professor McGonagall, Professor McGonagall, Dumbledore, Mr. Dursle