In [7]:
import json
from tqdm import tqdm
import pandas as pd
import os
import spacy
from spacy.tokens import DocBin

In [12]:
# Load the labelled examples. JSON is UTF-8 by specification, so the encoding
# is passed explicitly instead of relying on the platform default (which is
# not UTF-8 everywhere and can corrupt non-ASCII ingredient text).
with open("sample_data/annotations.json", encoding="utf-8") as infile:
    training_data = json.load(infile)

# Echo the loaded structure as the cell output.
training_data

{'classes': ['INGRED', 'AMOUNT', 'UNIT', 'PREP'],
 'annotations': [['520 g (1 Ib 2 02) tin of sweetcorn, drained, ',
   {'entities': [[0, 3, 'AMOUNT'],
     [4, 5, 'UNIT'],
     [7, 8, 'AMOUNT'],
     [9, 11, 'UNIT'],
     [18, 34, 'INGRED'],
     [36, 43, 'PREP']]}],
  ['3 tbsp vegetable oil, ',
   {'entities': [[0, 1, 'AMOUNT'], [2, 6, 'UNIT'], [7, 21, 'INGRED']]}],
  ['pinch of asafoetida, ', {'entities': [[0, 5, 'PREP'], [9, 19, 'INGRED']]}],
  ['1 heaped tsp black mustard seeds, ',
   {'entities': [[0, 1, 'AMOUNT'], [2, 12, 'UNIT'], [13, 32, 'INGRED']]}],
  ['1 tsp ground turmeric, ',
   {'entities': [[0, 1, 'AMOUNT'], [2, 5, 'UNIT'], [6, 21, 'INGRED']]}],
  ['salt, to taste, ', {'entities': [[0, 4, 'INGRED'], [6, 14, 'PREP']]}],
  ['1 tbsp finely chopped coriander (cilantro), ',
   {'entities': [[0, 1, 'AMOUNT'],
     [2, 6, 'UNIT'],
     [7, 21, 'PREP'],
     [22, 31, 'INGRED'],
     [33, 41, 'INGRED']]}],
  ['1 tbsp roasted peanuts, crushed, ',
   {'entities': [[0, 1, 'AMOUNT']

Load an existing model and create a new `DocBin`

In [10]:
# Pretrained English pipeline; in this notebook it is only used through
# nlp.make_doc(...) to tokenize the annotated texts.
nlp = spacy.load("en_core_web_sm")
# Container that collects the annotated Doc objects before they are written to disk.
db = DocBin()

Fill the `DocBin` with the labelled-entity training data

In [17]:
# Fill the DocBin with one Doc per annotated example.
#
# The DocBin is (re)created here so the cell is idempotent: re-running it
# rebuilds the collection instead of appending every document a second time.
db = DocBin()
for entry in tqdm(training_data["annotations"]):
    # Each entry is [text, {"entities": [[start, end, label], ...]}].
    text, annotation = entry[0], entry[1]
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for entity in annotation["entities"]:
        start, end, label = entity[0], entity[1], entity[2]
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no whole token lies inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report exactly which annotation was dropped so it can be fixed
            # in the source data.
            print(
                f"Skipping entity {label} [{start}:{end}] "
                f"{text[start:end]!r} in {text!r}"
            )
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

100%|██████████| 10/10 [00:00<00:00, 2175.36it/s]


Save training data

In [18]:
# Serialize the collected docs to disk.
db.to_disk("sample_data/annotations.spacy")

In [19]:
# load trained model
nlp = spacy.load("output/model-best")

In [48]:
# Run the loaded pipeline on a sample ingredient string.
doc = nlp("400ml vegetable stock")

In [49]:
# Show each token's text next to the entity label assigned to it.
[(tok.text, tok.ent_type_) for tok in doc]

[('400ml', 'INGRED'), ('vegetable', 'INGRED'), ('stock', 'INGRED')]