In [1]:
pip install -U spacy

Note: you may need to restart the kernel to use updated packages.


In [10]:
import spacy
import json
import random

In [12]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The deserialised Python object.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed

In [13]:
def save_data(file, data):
    """Write ``data`` to ``file`` as UTF-8 encoded JSON with a 4-space indent.

    Args:
        file: Destination path; an existing file is overwritten.
        data: JSON-serialisable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [18]:
# Load the spaCy pipeline identified by "commodities_ner"; later cells use it
# through the global name `nlp`.
nlp = spacy.load("commodities_ner")

In [34]:
def test_model(model, text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    if len(entities) > 0:
        results = [text, {"entities": entities}]
        return results

In [36]:
# Accumulator for the [text, {"entities": [...]}] records returned by test_model.
TRAINING_DATA = []

In [41]:
# Start from an empty list so re-running this cell does not append duplicates.
TRAINING_DATA = []

# Read with an explicit encoding, matching the JSON helpers in this notebook.
with open("data/commodities/ft-commodities-articles.txt", encoding="utf-8") as f:
    text = f.read()

articles = text.split("\n")
for article in articles:
    if not article.strip():
        # Blank lines (e.g. a trailing newline at end of file) have no
        # "|||" separator and would make the unpacking below fail.
        continue
    # Each line is "<uuid>|||<body>"; split once so the body is kept whole.
    articleUuid, articleBodyText = article.split("|||", 1)
    # Body segments are separated by "||"; keep those where the model finds
    # at least one entity (test_model returns None otherwise).
    segments = articleBodyText.split("||")
    for segment in segments:
        results = test_model(nlp, segment)
        if results is not None:
            TRAINING_DATA.append(results)

print(TRAINING_DATA[0])
print(len(TRAINING_DATA))

['How did we get here? Back in the early 2000s, when the dotcom bubble burst, many companies were left with nothing of value except their patents, which were then bought up by financial companies or larger tech entities that then tried to milk some cash from them. At the same time, the ecosystem of software suppliers that served the burgeoning commercial internet and smartphone markets began to broaden.', {'entities': [(237, 241, 'COMMODITY')]}]
696


In [42]:
# Persist the collected records as JSON via save_data (written with indent=4).
save_data("data/commodities/commodities_training_data.json", TRAINING_DATA)

In [46]:
def train_spacy(data, iterations):
    """Train a blank English pipeline containing an NER component.

    NOTE(review): this uses the spaCy v2 training API (``create_pipe``,
    ``begin_training``, ``update(texts, annotations)``); confirm the installed
    spaCy version matches.

    Args:
        data: Sequence of ``(text, annotations)`` pairs, where ``annotations``
            is a dict whose ``"entities"`` value is a list of items with the
            label at index 2 (e.g. ``(start, end, label)``).
        iterations: Number of passes to make over ``data``.

    Returns:
        The trained pipeline object.
    """
    # Work on a shallow copy so shuffling does not reorder the caller's list.
    examples = list(data)
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already provides the component.
        ner = nlp.get_pipe("ner")
    # Register every label that occurs in the training annotations.
    for _, annotations in examples:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # Pass the pipe names themselves (unpacked), not the literal string
    # "other_pipes", so only the NER component is updated during training.
    with nlp.disable_pipes(*other_pipes):
        optimiser = nlp.begin_training()
        for iteration in range(iterations):
            print("Starting iteration " + str(iteration))
            # Randomise the order each pass so the model does not learn the
            # sequence of examples.
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],
                    [annotations],
                    drop=0.2,  # Dropout to prevent overfitting
                    sgd=optimiser,
                    losses=losses,
                )
            print(losses)
    return nlp

In [None]:
# Train for 30 iterations on the collected records; this rebinds the global
# `nlp` to the pipeline returned by train_spacy.
nlp = train_spacy(TRAINING_DATA, 30)
# Write the returned pipeline to the path "commodities_ner_model".
nlp.to_disk("commodities_ner_model")