In [1]:
import typer
import warnings
from pathlib import Path
import json

import spacy
from spacy.tokens import DocBin

Load the labelled training data from JSON

In [2]:
# Read the labelled examples that the classifier will be trained on.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform default (which is not UTF-8 on every system).
with open("sample_data/topic_training.json", encoding="utf-8") as infile:
    training_data = json.load(infile)

Convert the data into spaCy's binary training format (`DocBin`)

In [3]:
# `make_doc` only tokenizes; the pipeline's other components are not run here.
nlp = spacy.load("en_core_web_sm")


def _to_training_doc(record):
    """Tokenize one record's "text" and attach its "topic" as the Doc's categories."""
    training_doc = nlp.make_doc(record["text"])
    # NOTE(review): `Doc.cats` is a {label: score} mapping — confirm that each
    # record's "topic" value already has that shape.
    training_doc.cats = record["topic"]
    return training_doc


# Gather every annotated Doc into one DocBin and serialize it for `spacy train`.
db = DocBin(docs=[_to_training_doc(record) for record in training_data])
db.to_disk("topic_training.spacy")

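Run the following commands in the terminal to fill in the training config and train the text classifier. Note that the same file is passed as both the training and the dev set, so the scores reported during training are not a held-out evaluation.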

```bash
python -m spacy init fill-config textcat_base_config.cfg textcat-config.cfg
python -m spacy train textcat-config.cfg --output ./textcat_output --paths.train ./topic_training.spacy --paths.dev ./topic_training.spacy
```

Test model

In [25]:
# Load the best checkpoint produced by training. The training command in this
# notebook writes to `./textcat_output`, so the model is read from that same
# directory; loading from a different folder fails on a fresh run.
nlp = spacy.load("textcat_output/model-best")

In [28]:
# Read the texts to classify, one per line.
# Whitespace-only lines are skipped: splitting the whole file on "\n" would
# otherwise yield an empty string for a trailing newline (and for every blank
# line), and an empty text cannot be meaningfully classified.
with open("output.txt", "r") as infile:
    texts = [line.rstrip("\n") for line in infile if line.strip()]

In [30]:
def _top_labels(scores):
    """Return every label whose score equals the highest score in `scores`.

    The result is a list because all labels tied for the maximum are kept.
    """
    best = max(scores.values())
    return [label for label, score in scores.items() if score == best]


# Classify each text and record its winning label(s) alongside the full scores.
cats = []
for line in texts:
    scores = nlp(line).cats
    cats.append({"text": line, "topic": _top_labels(scores), "scores": scores})

In [31]:
# Display the collected predictions: one dict per text with its "text",
# winning "topic" label(s) and the full per-label "scores".
cats

[{'text': 'SERVES 4',
  'topic': ['other'],
  'scores': {'title': 0.011118074879050255,
   'blurb': 0.011039288714528084,
   'ingredient': 0.017033729702234268,
   'method': 0.011209753341972828,
   'other': 0.9495991468429565}},
 {'text': '520 g (1 Ib 2 02) tin of sweetcorn, drained',
  'topic': ['ingredient'],
  'scores': {'title': 0.0005198395228944719,
   'blurb': 0.0007484618690796196,
   'ingredient': 0.9969903230667114,
   'method': 0.001095274114049971,
   'other': 0.0006461279117502272}},
 {'text': '3 tbsp vegetable oil',
  'topic': ['ingredient'],
  'scores': {'title': 0.005252712871879339,
   'blurb': 0.005238130688667297,
   'ingredient': 0.9756233096122742,
   'method': 0.007316912990063429,
   'other': 0.006569028832018375}},
 {'text': 'pinch of asafoetida',
  'topic': ['ingredient'],
  'scores': {'title': 0.006961580831557512,
   'blurb': 0.006963755935430527,
   'ingredient': 0.9679685235023499,
   'method': 0.009352678433060646,
   'other': 0.008753418922424316}},
 {'t

In [29]:
# Spot-check the classifier on a single sentence and show its per-label scores.
sample_sentence = "Serve the chowder in bowls, with the halved eggs on top and more corainder scattered over."
nlp(sample_sentence).cats

{'title': 0.11102817952632904,
 'blurb': 0.11597983539104462,
 'ingredient': 0.12232184410095215,
 'method': 0.560245156288147,
 'other': 0.09042499214410782}