In [10]:
!python -m spacy download pl_core_news_md

Collecting pl-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_md-3.8.0/pl_core_news_md-3.8.0-py3-none-any.whl (49.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m31m55.9 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pl-core-news-md
Successfully installed pl-core-news-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pl_core_news_md')


In [3]:
import json
import random

import spacy
from spacy.util import minibatch
from spacy.training.example import Example

In [15]:
def load_spacy_data(path):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    tupled_data = []
    for entry in data:
        entities = [tuple(ent) for ent in entry[1]["entities"]]
        tupled_data.append((entry[0], {"entities": entities}))
    return tupled_data

# train_data = load_spacy_data("../corpus/themodders_forum_monster_sentences_small/monster_small_train.json")
# test_data = load_spacy_data("../corpus/themodders_forum_monster_sentences_small/monster_small_test.json")
train_data = load_spacy_data("../corpus/themodders_forum_monster_sentences_big/monster_big_no_overlap_train.json")
test_data = load_spacy_data("../corpus/themodders_forum_monster_sentences_big/monster_big_no_overlap_test.json")

In [16]:
nlp = spacy.load('pl_core_news_md')

if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

# Add any labels that appear in the annotations
for _, annotations in train_data:
    for ent in annotations['entities']:
        if ent[2] not in ner.labels:
            ner.add_label(ent[2])

In [17]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    epochs = 20
    for epoch in range(epochs):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=64)
        for batch in batches:
            examples = []
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                examples.append(example)
            nlp.update(examples, drop=0.5, losses=losses)
        print(f'Epoch {epoch + 1}, Losses: {losses}')



Epoch 1, Losses: {'ner': np.float32(25370.086)}
Epoch 2, Losses: {'ner': np.float32(3212.8362)}
Epoch 3, Losses: {'ner': np.float32(2857.9634)}
Epoch 4, Losses: {'ner': np.float32(2436.3489)}
Epoch 5, Losses: {'ner': np.float32(1966.661)}
Epoch 6, Losses: {'ner': np.float32(1713.335)}
Epoch 7, Losses: {'ner': np.float32(1461.3326)}
Epoch 8, Losses: {'ner': np.float32(1261.7994)}
Epoch 9, Losses: {'ner': np.float32(1136.4562)}
Epoch 10, Losses: {'ner': np.float32(861.63806)}
Epoch 11, Losses: {'ner': np.float32(768.39575)}
Epoch 12, Losses: {'ner': np.float32(618.00964)}
Epoch 13, Losses: {'ner': np.float32(520.9365)}
Epoch 14, Losses: {'ner': np.float32(453.6163)}
Epoch 15, Losses: {'ner': np.float32(411.7076)}
Epoch 16, Losses: {'ner': np.float32(348.5293)}
Epoch 17, Losses: {'ner': np.float32(290.60196)}
Epoch 18, Losses: {'ner': np.float32(302.4146)}
Epoch 19, Losses: {'ner': np.float32(253.97888)}
Epoch 20, Losses: {'ner': np.float32(244.34789)}


In [40]:
nlp.to_disk('gmonsters_ner_0.1')