# Custom Training Set for HP

In [10]:
import spacy
import regex as re
from pprint import pprint as pp
import json
from spacy.lang.en import English


In [2]:
def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

In [3]:
def generate_better_characters(file):
    data = load_data(file)
    print (len(data))
    new_characters = []
    for item in data:
        new_characters.append(item)
    for item in data:
        item = item.replace("The", "").replace("the", "").replace("and", "").replace("And", "")
        names = item.split(" ")
        for name in names:
            name = name.strip()
            new_characters.append(name)
        if "(" in item:
            names = item.split("(")
            for name in names:
                name = name.replace(")", "").strip()
                new_characters.append(name)
        if "," in item:
            names = item.split(",")
            for name in names:
                name = name.replace("and", "").strip()
                if " " in name:
                    new_names = name.split()
                    for x in new_names:
                        x = x.strip()
                        new_characters.append(x)
                new_characters.append(name)
    print (len(new_characters))
    final_characters = []
    titles = ["Dr.", "Professor", "Mr.", "Mrs.", "Ms.", "Miss", "Aunt", "Uncle", "Mr. and Mrs."]
    for character in new_characters:
        if "" != character:
            final_characters.append(character)
            for title in titles:
                titled_char = f"{title} {character}"
                final_characters.append(titled_char)


    print (len(final_characters))
    final_characters = list(set(final_characters))
    print (len(final_characters))
    final_characters.sort()
    return (final_characters)

In [4]:
def create_patterns(file, label):
    data = generate_better_characters(file)
    patterns = []
    for item in data:
        pattern = {
            "label":label,
            "pattern": item
        }
        patterns.append(pattern)
    return patterns

In [7]:
patterns = create_patterns("./data/hp_characters.json", "PERSON")

207
622
6160
5119


In [18]:
# let's read our original text
with open("./data/hp.txt", "r") as f:
    text = f.read()

In [80]:
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

chapters = text.split("CHAPTER")[1:]
corpus = []
for chapter in chapters:# split chapter number and the chapter title (demarcated by "\n\n")
    segments = chapter.split("\n\n")[2:]
    for segment in segments:
        doc = nlp(segment)
        sentences = [sentence.text for sentence in doc.sents]
        corpus.extend(sentences)

In [84]:
# let's load the NLP model and test out our saved Entity Ruler model
nlp = spacy.load("hp_ner")

In [102]:
TRAIN_DATA = []

for sentence in corpus:
    doc = nlp(sentence)
    # intialize entities list for this sentence
    entities = []
    # extract entities
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        # append on the list of entities for this sentence
        entities.append([ent.start_char, ent.end_char, ent.label_])
    
    # append the sentence entities to our TRAIN_DATA
    TRAIN_DATA.append([sentence, {"entities": entities}])      

Mr. and Mrs. Dursley 0 20 PERSON
Mr. Dursley 0 11 PERSON
Mrs. Dursley 0 12 PERSON
Dudley 36 42 PERSON
Mrs. 0 4 PERSON
Potter 5 11 PERSON
Mrs. Dursley 16 28 PERSON
Mrs. Dursley 87 99 PERSON
Dudley 80 86 PERSON
Mr. and Mrs. Dursley 5 25 PERSON
Mr. Dursley 0 11 PERSON
Mrs. Dursley 70 82 PERSON
Dudley 133 139 PERSON
Mr. Dursley 20 31 PERSON
Mrs. 64 68 PERSON
Dursley 69 76 PERSON
Dudley 109 115 PERSON
Dudley 145 151 PERSON
Mr. Dursley 23 34 PERSON
Mr. Dursley 14 25 PERSON
Mr. Dursley 0 11 PERSON
Mr. Dursley 3 14 PERSON
Mr. Dursley 0 11 PERSON
Mr. Dursley 0 11 PERSON
Mr. Dursley 0 11 PERSON
Mr. Dursley 19 30 PERSON
Dursley 50 57 PERSON
Mr. Dursley 0 11 PERSON
Mr. Dursley 0 11 PERSON
Harry 64 69 PERSON
Mr. Dursley 0 11 PERSON
Potter 0 6 PERSON
Potter 45 51 PERSON
Harry 73 78 PERSON
Harry 63 68 PERSON
Mrs. Dursley 31 43 PERSON
Mr. Dursley 28 39 PERSON
Mr. Dursley 23 34 PERSON
Mr. Dursley 0 11 PERSON
Mr. Dursley 5 16 PERSON
Mr. Dursley 0 11 PERSON
Mrs. Dursley 0 12 PERSON
Mrs. 35 39 PERSON
Dudl

In [110]:
from sklearn.model_selection import train_test_split

In [115]:
TRAIN, VALID = train_test_split(TRAIN_DATA, test_size=.3, random_state=158)

In [92]:
def save_data(file, data):
    with open(file, "w") as f:
        json.dump(data, f, indent=4)

In [116]:
save_data("./data/HP_chars_bc_train.json", TRAIN)
save_data("./data/HP_chars_bc_valid.json", VALID)

In [117]:
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

In [121]:
def convert_spacy_data(lang: str, dataset: list):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                msg = f"""
                Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' 
                does not align with token boundaries:\n\n{repr(text)}\n
                """
                warnings.warn(msg)
            else:
                ents.append(span)
        # ents = filter_spans(ents)
        doc.ents = ents
        db.add(doc)
    return db

In [122]:
training_set = convert_spacy_data("en", TRAIN)
training_set.to_disk("./data/HP_training_set_bc.spacy")

100%|███████████████████████████████████████████████████████████████████████████████████| 4696/4696 [00:02<00:00, 1993.58it/s]


In [123]:
valid_set = convert_spacy_data("en", VALID)
valid_set.to_disk("./data/HP_valid_set_bc.spacy")

100%|███████████████████████████████████████████████████████████████████████████████████| 2013/2013 [00:01<00:00, 1980.70it/s]


In [129]:
!python -m spacy init fill-config base_config.cfg config_2.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_2.cfg
You can now add your data and train your pipeline:
python -m spacy train config_2.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [128]:
spacy.require_gpu()

    Found GPU0 Quadro K5200 which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    


True

In [130]:
!python -m spacy train config_2.cfg --paths.train ./data/HP_training_set_bc.spacy --paths.dev ./data/HP_valid_set_bc.spacy --output ./models/output --gpu-id 0

[38;5;4mℹ Saving to output directory: models/output[0m
[38;5;4mℹ Using GPU: 0[0m
    Found GPU%d %s which is of cuda capability %d.%d.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is %d.%d.
    
[1m
Traceback (most recent call last):
  File "/home/bengsoon/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/bengsoon/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/bengsoon/anaconda3/lib/python3.9/site-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/home/bengsoon/anaconda3/lib/python3.9/site-packages/spacy/cli/_util.py", line 71, in setup_cli
    command(prog_name=COMMAND)
  File "/home/bengsoon/anaconda3/lib/python3.9/site-packages/click/core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
  File "/home/bengsoon/anaconda3/lib/python3.9/si