In [17]:
import spacy
from spacy.tokens import DocBin

In [19]:
# toy intent-classification samples used to exercise the spaCy training API;
# every entry is a (text, label) pair
trainengdata = [
    ("would you please help me", "HELP"),
    ("give me some hints", "HELP"),
    ("how should I proceed", "HELP"),
    ("yeap", "YES"),
    ("ok", "YES"),
    ("fine", "YES"),
    ("sure", "YES"),
    ("perfect", "YES"),
    ("no", "NO"),
    ("cancel", "NO"),
    ("disagree", "NO"),
]
validengdata = [
    ("please help", "HELP"),
    ("hints", "HELP"),
    ("help", "HELP"),
    ("I don't get it", "HELP"),
    ("let's do it", "YES"),
    ("right", "YES"),
    ("well, ok", "YES"),
    ("ok, fine", "YES"),
    ("nah", "NO"),
    ("I don't want it", "NO"),
    ("no no no", "NO"),
    ("stop it", "NO"),
]
# the full label set; each doc gets one category entry per label
englabels = ["HELP", "YES", "NO"]

In [24]:
# load the English "en_core_web_sm" pipeline; makedocs() below reads this global `nlp`
# to turn the raw texts into spaCy docs
nlp = spacy.load("en_core_web_sm")

In [25]:
def makedocs(data: list, labels: list):
    """
    Turn labelled texts into spaCy docs that carry category annotations.

    Relies on the module-level ``nlp`` pipeline to process the texts.

    :param data: (text, label) tuples
    :param labels: every label that should get an entry in ``doc.cats``
    :return: list of spaCy docs; in each doc's ``cats`` the matching label is 1, all other labels are 0
    """
    annotated = []
    # as_tuples=True makes nlp.pipe yield (doc, context) pairs; the context is the label
    for parsed, gold in nlp.pipe(data, as_tuples=True):
        parsed.cats.update({name: int(gold == name) for name in labels})
        annotated.append(parsed)
    return annotated

In [26]:
def datatodocbin(data: list, labels: list, path: str):
    """
    Convert labelled texts into spaCy docs and serialize them to a DocBin file.

    :param data: tuples of text with labels
    :param labels: list of existing labels
    :param path: where to store docbin; missing parent folders are created first
    """
    from pathlib import Path  # function-scope import keeps this cell self-contained

    target = Path(path)
    # DocBin.to_disk writes straight to the target file and does not create
    # directories, so a missing folder (e.g. "data/" on a fresh checkout) would fail
    target.parent.mkdir(parents=True, exist_ok=True)

    docs = makedocs(data, labels)
    docbin = DocBin(docs=docs)
    docbin.to_disk(target)

In [27]:
# serialize both splits as .spacy DocBin files (the train / dev inputs for the training config)
datatodocbin(path='data/train.spacy', labels=englabels, data=trainengdata)
datatodocbin(path='data/valid.spacy', labels=englabels, data=validengdata)

In [None]:
# STEPS TO TRAIN THE MODEL
# 1.
#   to create base_config.cfg go to 
#       https://spacy.io/usage/training#quickstart
#   choose parameters: language=english, components=textcat, Text Classification - exclusive categories = unchecked, 
#                      hardware etc. and copy generated config to your base_config.cfg file at the root of the project
#   in the [paths] section of base_config.cfg set:
#           train = "data/train.spacy"
#           dev = "data/valid.spacy"
# 2.
#   in terminal run to automatically create a config file:
#   python3 -m spacy init fill-config ./base_config.cfg ./config.cfg
# 3.
#   create folder output in the root directory.
#   run in terminal to train model: 
#       python3 -m spacy train config.cfg --output ./output

In [28]:
# load the last trained model
# NOTE: this rebinds the global `nlp`, replacing the "en_core_web_sm" pipeline loaded earlier;
# "output/model-last" is expected to come from the `spacy train ... --output ./output` step above
nlp = spacy.load("output/model-last")



In [40]:
# manual spot check: show the predicted category scores next to each labelled validation sample
for text in validengdata:
    sentence = text[0]  # each sample is a (sentence, label) tuple; the model only sees the sentence
    doc = nlp(sentence)
    print("CLASSIFICATION:", doc.cats)
    print("GROUND TRUTH", text)
    print("\n")

CLASSIFICATION: {'HELP': 0.6788396835327148, 'YES': 0.2385694533586502, 'NO': 0.1696544885635376}
GROUND TRUTH ('please help', 'HELP')


CLASSIFICATION: {'HELP': 0.5321220755577087, 'YES': 0.3553994596004486, 'NO': 0.25086450576782227}
GROUND TRUTH ('hints', 'HELP')


CLASSIFICATION: {'HELP': 0.4852232336997986, 'YES': 0.39396926760673523, 'NO': 0.2773498594760895}
GROUND TRUTH ('help', 'HELP')


CLASSIFICATION: {'HELP': 0.5279896855354309, 'YES': 0.358663409948349, 'NO': 0.25301623344421387}
GROUND TRUTH ("I don't get it", 'HELP')


CLASSIFICATION: {'HELP': 0.29594287276268005, 'YES': 0.5742524266242981, 'NO': 0.4189189076423645}
GROUND TRUTH ("let's do it", 'YES')


CLASSIFICATION: {'HELP': 0.29594287276268005, 'YES': 0.5742524266242981, 'NO': 0.4189189076423645}
GROUND TRUTH ('right', 'YES')


CLASSIFICATION: {'HELP': 0.0812438428401947, 'YES': 0.8936100602149963, 'NO': 0.10380345582962036}
GROUND TRUTH ('well, ok', 'YES')


CLASSIFICATION: {'HELP': 0.018263142555952072, 'YES': 0.98

In [None]:
# TO TUNE THE MODEL
# look into config.cfg file and tune hyperparameters