In [18]:
import random
from pathlib import Path
import thinc.extra.datasets
import pandas as pd
import spacy
from spacy.util import minibatch, compounding

In [19]:

def train(train_data, val_data, model=None, output_dir=None, n_iter=10, init_tok2vec=None):
    '''
    train_data: training data, in the form of (train_texts [string], train_labels [{'class': bool}])
    val_data: validation data, in the form of (val_texts [string], val_labels [{'class': bool}])
    model: Spacy model name, default to None for blank model.
    output_dir: directory to save trained model. Default to None to not save model.
    n_iter: number of iterations. Default to 10.
    '''
    
    
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print(f"Loaded model '{model}'")
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")


    train_texts, train_cats = train_data
    dev_texts, dev_cats = val_data
    
    # add label to text classifier
    labels = train_cats[0].keys()
    for label in labels:
        textcat.add_label(label)
    
    data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    # ie:[( text,{'cats': {'LABEl_1': True, 'LABEL_2': False}}), ... ]

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(data)
            batches = minibatch(data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
        
    
    return nlp


def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}



In [21]:
def load_data(split=0.8):
    df = pd.read_csv('data.csv')
    cats = ['hypertension']
    data = []
    for index, row in df.iterrows():
        text = row.text
        label = {}
        for cat in cats:
            cat_label = (row[cat] == 'Yes' or row[cat] == 'Maybe')
            label[cat] = cat_label
        data.append((text,label))
    random.shuffle(data)
    texts,labels = zip(*data)
    split = int(len(data)*split)
    train_data = (texts[:split], labels[:split])
    test_data = (texts[split:], labels[split:])
    return train_data, test_data

In [22]:
train_data, test_data = load_data()

In [27]:
nlp = train(train_data, test_data)

Created blank 'en' model
Training the model...
LOSS 	  P  	  R  	  F  
11.948	0.464	1.000	0.634
3.235	0.464	1.000	0.634
1.308	0.464	1.000	0.634
0.653	0.464	1.000	0.634
0.372	0.464	1.000	0.634
0.233	0.464	1.000	0.634
0.156	0.464	1.000	0.634
0.111	0.464	1.000	0.634
0.080	0.464	1.000	0.634
0.065	0.464	1.000	0.634


In [34]:
# test the trained model
doc = nlp("I dont know")
print(doc, doc.cats)

I dont know {'hypertension': 1.0}
