In [2]:
from pathlib import Path

import spacy
from spacy.tokens import DocBin

In [1]:
# Tiny hand-written intent dataset used only to exercise the spaCy API.
# Every sample is a (text, label) tuple; labels come from `englabels`.
trainengdata = [
    # HELP
    ("would you please help me", "HELP"),
    ("give me some hints", "HELP"),
    ("how should I proceed", "HELP"),
    ("I don't understand", "HELP"),
    # YES
    ("yeap", "YES"),
    ("ok", "YES"),
    ("fine", "YES"),
    ("sure", "YES"),
    ("perfect", "YES"),
    # NO
    ("no", "NO"),
    ("cancel", "NO"),
    ("disagree", "NO"),
]
validengdata = [
    # HELP
    ("please help", "HELP"),
    ("hints", "HELP"),
    ("help", "HELP"),
    ("I don't get it", "HELP"),
    # YES
    ("let's do it", "YES"),
    ("right", "YES"),
    ("well, ok", "YES"),
    ("ok, fine", "YES"),
    # NO
    ("nah", "NO"),
    ("I don't want it", "NO"),
    ("no no no", "NO"),
    ("stop it", "NO"),
]
englabels = ["HELP", "YES", "NO"]

In [3]:
# load the pretrained English pipeline "en_core_web_sm";
# makedocs() below reads this global `nlp` to turn raw texts into docs
nlp = spacy.load("en_core_web_sm")

In [4]:
def makedocs(data: list, labels: list, model=None):
    """
    Turn labelled texts into spaCy docs carrying text-classification targets.

    For every doc, each label in ``labels`` is written into ``doc.cats``:
    1 for the label attached to the text, 0 for all the others.

    :param data: (text, label) tuples
    :param labels: list of existing labels
    :param model: pipeline used to create the docs (any object exposing
        ``pipe(data, as_tuples=True)``); defaults to the notebook-level ``nlp``
    :return: list of spacy docs with ``cats`` filled in
    """
    # Resolve the pipeline at call time. Passing it explicitly keeps the result
    # independent of whatever the global `nlp` is bound to when the cell is
    # (re-)run, since that name is reassigned further down in the notebook.
    pipeline = nlp if model is None else model
    docs = []
    # as_tuples=True makes pipe() yield (doc, context) pairs, so each label
    # travels alongside the doc created from its text.
    for doc, label in pipeline.pipe(data, as_tuples=True):
        for key in labels:
            doc.cats[key] = int(label == key)
        docs.append(doc)
    return docs

In [5]:
def datatodocbin(data: list, labels: list, path: str):
    """
    Convert labelled texts to spaCy docs and serialize them as a DocBin file.

    :param data: tuples of text with labels
    :param labels: list of existing labels
    :param path: where to store docbin; missing parent folders are created
    """
    target = Path(path)
    # DocBin.to_disk writes straight to the given file and does not create
    # folders, so make sure the parent directory (e.g. "data/") exists first.
    target.parent.mkdir(parents=True, exist_ok=True)
    docs = makedocs(data, labels)
    docbin = DocBin(docs=docs)
    docbin.to_disk(target)

In [6]:
# serialize both splits as DocBin files (the training config points at these paths)
for split_data, split_path in (
    (trainengdata, 'data/train.spacy'),
    (validengdata, 'data/valid.spacy'),
):
    datatodocbin(data=split_data, labels=englabels, path=split_path)

## STEPS TO TRAIN THE MODEL
1. create base_config.cfg
    1. go to https://spacy.io/usage/training#quickstart
    2. select parameters: 
        * language=english
        * components=textcat
        * Text Classification - exclusive categories = unchecked, 
        * hardware etc. 
    3. copy generated config to your base_config.cfg file at the root of the project
    4. in the `[paths]` section of base_config.cfg, set:
        * train = "data/train.spacy"
        * dev = "data/valid.spacy"
2. create a config.cfg file
    * run this in a terminal to fill in the remaining defaults and generate the full config: `python3 -m spacy init fill-config ./base_config.cfg ./config.cfg`
3. create a folder named `output` in the root directory.
    * then run this in a terminal to train the model: `python3 -m spacy train config.cfg --output ./output`

In [8]:
def test_model(model):
    """
    manual classification of valid data
    
    :param model: trained spacy model
    """
    for text in validengdata:
        doc = model(text[0])
        print("CLASSIFICATION:", doc.cats)
        print("GROUND TRUTH", text)
        print("\n")

In [9]:
# test model
# loads the pipeline stored under output/model-last (presumably written by the
# `spacy train ... --output ./output` step above) and prints its predictions
# NOTE(review): this rebinds the global `nlp`, replacing the "en_core_web_sm"
# pipeline loaded earlier; makedocs() reads that global, so re-running the
# docbin cells after this one would build docs with the trained model instead
nlp = spacy.load("output/model-last")
test_model(model=nlp)

CLASSIFICATION: {'HELP': 0.6775532364845276, 'YES': 0.23055101931095123, 'NO': 0.1631542295217514}
GROUND TRUTH ('please help', 'HELP')


CLASSIFICATION: {'HELP': 0.5253580212593079, 'YES': 0.3474675714969635, 'NO': 0.24426689743995667}
GROUND TRUTH ('hints', 'HELP')


CLASSIFICATION: {'HELP': 0.47562965750694275, 'YES': 0.38746094703674316, 'NO': 0.2715665102005005}
GROUND TRUTH ('help', 'HELP')


CLASSIFICATION: {'HELP': 0.893428385257721, 'YES': 0.08140502125024796, 'NO': 0.06446904689073563}
GROUND TRUTH ("I don't get it", 'HELP')


CLASSIFICATION: {'HELP': 0.5208970904350281, 'YES': 0.3509201407432556, 'NO': 0.24651892483234406}
GROUND TRUTH ("let's do it", 'YES')


CLASSIFICATION: {'HELP': 0.28137174248695374, 'YES': 0.5718020796775818, 'NO': 0.4161897897720337}
GROUND TRUTH ('right', 'YES')


CLASSIFICATION: {'HELP': 0.072362519800663, 'YES': 0.9025143980979919, 'NO': 0.09393086284399033}
GROUND TRUTH ('well, ok', 'YES')


CLASSIFICATION: {'HELP': 0.015303709544241428, 'YES': 0.

In [None]:
# TO TUNE THE MODEL
# open config.cfg, adjust the hyperparameters there, and train again

In [13]:
# spaCy does not seem to offer a ready-made way to compare several trained
# pipelines side by side (unless I am missing something), so each one is
# loaded separately and put through the same manual check
model1 = spacy.load("output/model-last")
model2 = spacy.load("output/basemodel")
for banner, candidate in (("MODEL 1\n", model1), ("MODEL 2\n", model2)):
    print(banner)
    test_model(candidate)

MODEL 1

CLASSIFICATION: {'HELP': 0.6775532364845276, 'YES': 0.23055101931095123, 'NO': 0.1631542295217514}
GROUND TRUTH ('please help', 'HELP')


CLASSIFICATION: {'HELP': 0.5253580212593079, 'YES': 0.3474675714969635, 'NO': 0.24426689743995667}
GROUND TRUTH ('hints', 'HELP')


CLASSIFICATION: {'HELP': 0.47562965750694275, 'YES': 0.38746094703674316, 'NO': 0.2715665102005005}
GROUND TRUTH ('help', 'HELP')


CLASSIFICATION: {'HELP': 0.893428385257721, 'YES': 0.08140502125024796, 'NO': 0.06446904689073563}
GROUND TRUTH ("I don't get it", 'HELP')


CLASSIFICATION: {'HELP': 0.5208970904350281, 'YES': 0.3509201407432556, 'NO': 0.24651892483234406}
GROUND TRUTH ("let's do it", 'YES')


CLASSIFICATION: {'HELP': 0.28137174248695374, 'YES': 0.5718020796775818, 'NO': 0.4161897897720337}
GROUND TRUTH ('right', 'YES')


CLASSIFICATION: {'HELP': 0.072362519800663, 'YES': 0.9025143980979919, 'NO': 0.09393086284399033}
GROUND TRUTH ('well, ok', 'YES')


CLASSIFICATION: {'HELP': 0.015303709544241428, 