In [13]:
import spacy
from spacy.tokens import DocBin
import numpy as np

## STEPS TO TRAIN THE MODEL WITH CLI
1. prepare data in the following format: [(text, label), ...]
2. convert data to the DocBin class as shown below
3. create base.cfg
    1. create base config: 
        1. manual way: 
            1. go to https://spacy.io/usage/training#quickstart
            2. copy generated config to your base.cfg file at the root of the project
        2. automatic way (see https://spacy.io/api/cli#init):
            1. `python -m spacy init config base.cfg --lang en --pipeline textcat_multilabel --optimize efficiency --force`
    2. manually define in the base.cfg (or specify later with train command): 
        * train = "data/train.spacy" 
        * dev = "data/valid.spacy"
4. create a config.cfg file
    * run in a terminal to automatically create the config file: `python3 -m spacy init fill-config ./base.cfg ./config.cfg`
5. train a new model and store it in the ./output folder.
    * run in terminal to train model: `python3 -m spacy train config.cfg --output ./output --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy`
6. test trained model and tune if needed by modifying hyperparameters in the config.cfg and repeating step 5.

### 1. DUMMY DATA PREPARATION

In [7]:
! mkdir ./data
trainpath = "./data/train.spacy"
validpath = "./data/valid.spacy"

In [8]:
# Tiny hand-written dataset, only meant to exercise the API end to end.
# Every sample is a (text, label) tuple; each label is one of `labels`.
traindata = [
    # HELP
    ("would you please help me", "HELP"),
    ("give me some hints", "HELP"),
    ("how should I proceed", "HELP"),
    ("I don't understand", "HELP"),
    # YES
    ("yeap", "YES"),
    ("ok", "YES"),
    ("fine", "YES"),
    ("sure", "YES"),
    ("perfect", "YES"),
    # NO
    ("no", "NO"),
    ("cancel", "NO"),
    ("disagree", "NO"),
]
validdata = [
    # HELP
    ("please help", "HELP"),
    ("hints", "HELP"),
    ("help", "HELP"),
    ("I don't get it", "HELP"),
    # YES
    ("let's do it", "YES"),
    ("right", "YES"),
    ("well, ok", "YES"),
    ("ok, fine", "YES"),
    # NO
    ("nah", "NO"),
    ("I don't want it", "NO"),
    ("no no no", "NO"),
    ("stop it", "NO"),
]
labels = ["HELP", "YES", "NO"]

### 2. CONVERT DATA TO DocBin

In [9]:
def makedocs(data: list, labels: list, package):
    """
    Run every text through the pipeline and mark, for each known label,
    whether it is the sample's label (1) or not (0) in ``doc.cats``.

    :param data: (text, label) tuples
    :param labels: all labels that should appear as keys in ``doc.cats``
    :param package: spacy language package used to process the texts
    :return: list of processed docs with ``cats`` filled in
    """
    annotated = []
    # as_tuples=True makes pipe() hand back each doc together with its label
    for parsed, gold in package.pipe(data, as_tuples=True):
        parsed.cats.update({name: int(gold == name) for name in labels})
        annotated.append(parsed)
    return annotated

In [10]:
def datatodocbin(data: list, labels: list, path: str, package):
    """
    Turn labelled texts into docs via ``makedocs``, pack them into a DocBin
    and write that DocBin to ``path``.

    :param data: (text, label) tuples
    :param labels: list of existing labels
    :param path: file the DocBin is written to
    :param package: spacy language package
    """
    DocBin(docs=makedocs(data, labels, package)).to_disk(path)

In [11]:
# download the small English pipeline package; it is loaded further down via spacy.load("en_core_web_sm")
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 5.2 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
# load the English pipeline once, then write the train and validation splits as DocBin files
engpackage = spacy.load("en_core_web_sm")
datatodocbin(traindata, labels, trainpath, engpackage)
datatodocbin(validdata, labels, validpath, engpackage)

### 3. CREATE base.cfg (details: https://spacy.io/api/cli#init)

In [15]:
# generate base.cfg for an English textcat_multilabel pipeline optimized for efficiency
# (--force overwrites an existing base.cfg, so the cell can be re-run)
! python -m spacy init config base.cfg --lang en --pipeline textcat_multilabel --optimize efficiency --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat_multilabel
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
base.cfg
You can now add your data and train your pipeline:
python -m spacy train base.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### 4. CREATE config.cfg (details: https://spacy.io/api/cli#init-fill-config)

In [16]:
# write config.cfg from base.cfg, auto-filling any settings the base config leaves out
! python3 -m spacy init fill-config ./base.cfg ./config.cfg

[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### 5. TRAIN NEW MODEL

In [18]:
! mkdir ./output

In [19]:
# train with config.cfg; the --paths.* options point training at the DocBin files created above,
# and the resulting model is saved under ./output
! python3 -m spacy train config.cfg --output ./output --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-03-14 16:05:18,435] [INFO] Set up nlp object from config
[2022-03-14 16:05:18,443] [INFO] Pipeline: ['textcat_multilabel']
[2022-03-14 16:05:18,445] [INFO] Created vocabulary
[2022-03-14 16:05:18,445] [INFO] Finished initializing nlp object
[2022-03-14 16:05:18,460] [INFO] Initialized pipeline components: ['textcat_multilabel']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.01       82.29    0.82
200     200           0.71       82.29    0.82
400     400           0.37       83.33    0.83
600     600           0.23       83.33    0.83
800     800           0.15       83.33    0.83
1000    1000           0.11       83.33    0.83
1200    1200           0.08       83.33    0.83
1400    1400           0.06  

### 6. TEST TRAINED MODEL

In [27]:
def test_model(model, data):
    """
    manual classification of valid data
    
    :param model: trained spacy model
    :param data: tuples of text with labels
    """
    for text in data:
        doc = model(text[0])
        print("CLASSIFICATION:", doc.cats)
        print("GROUND TRUTH", text)
        print("\n")

In [28]:
# load the "model-last" pipeline from the ./output folder and inspect its predictions on the validation samples
# NOTE: working with several models at once would need a wrapper API of our own
# (unless spaCy already offers one - TODO confirm)
nlp = spacy.load("output/model-last")
test_model(nlp, validdata)

CLASSIFICATION: {'HELP': 0.6775532364845276, 'YES': 0.23055101931095123, 'NO': 0.1631542295217514}
GROUND TRUTH ('please help', 'HELP')


CLASSIFICATION: {'HELP': 0.5253580212593079, 'YES': 0.3474675714969635, 'NO': 0.24426689743995667}
GROUND TRUTH ('hints', 'HELP')


CLASSIFICATION: {'HELP': 0.47562965750694275, 'YES': 0.38746094703674316, 'NO': 0.2715665102005005}
GROUND TRUTH ('help', 'HELP')


CLASSIFICATION: {'HELP': 0.893428385257721, 'YES': 0.08140502125024796, 'NO': 0.06446904689073563}
GROUND TRUTH ("I don't get it", 'HELP')


CLASSIFICATION: {'HELP': 0.5208970904350281, 'YES': 0.3509201407432556, 'NO': 0.24651892483234406}
GROUND TRUTH ("let's do it", 'YES')


CLASSIFICATION: {'HELP': 0.28137174248695374, 'YES': 0.5718020796775818, 'NO': 0.4161897897720337}
GROUND TRUTH ('right', 'YES')


CLASSIFICATION: {'HELP': 0.072362519800663, 'YES': 0.9025143980979919, 'NO': 0.09393086284399033}
GROUND TRUTH ('well, ok', 'YES')


CLASSIFICATION: {'HELP': 0.015303709544241428, 'YES': 0.