In [None]:
import random
from pathlib import Path

import pandas as pd
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, compounding

In [None]:

def train(train_data, test_data, model=None, output_dir=None, n_iter=10, init_tok2vec=None):
    '''
    Train a spaCy text classifier, printing loss/precision/recall/F-score after every iteration.

    train_data: training examples, an iterable of (text, {'cats': {label: bool}}) tuples.
        It is copied before shuffling, so the caller's object is never reordered.
    test_data: evaluation examples in the same format as train_data.
    model: name or path handed to spacy.load. Default to None for a blank 'en' model.
    output_dir: directory to save the trained model (created, including parents, if missing).
        Default to None to not save the model.
    n_iter: number of passes over the training data. Default to 10.
    init_tok2vec: optional path (str or Path) to serialized tok2vec weights that are loaded
        into the text classifier before training. Default to None.

    Returns the trained nlp object.
    Raises ValueError if train_data is empty.
    '''
    # Private copies: the shuffle below must not reorder the caller's data, and
    # evaluate() needs to index/iterate test_data once per iteration.
    train_data = list(train_data)
    test_data = list(test_data)
    if not train_data:
        # Fail before loading a (potentially large) model or touching the filesystem.
        raise ValueError("train_data must contain at least one example")

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print(f"Loaded model '{model}'")
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": False, "architecture": "ensemble"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # Register every label seen anywhere in the training data (not just the ones on
    # the first example); dict.fromkeys de-duplicates while keeping first-seen order.
    labels = dict.fromkeys(
        label for _, annotations in train_data for label in annotations['cats']
    )
    for label in labels:
        textcat.add_label(label)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            # Accept plain strings as well as Path objects.
            with Path(init_tok2vec).open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        # One batch-size generator shared by all iterations, so batches keep growing
        # across epochs instead of restarting at 4.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)

            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the held-out test data using the averaged parameters
                scores = evaluate(nlp.tokenizer, textcat, test_data)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    if output_dir is not None:
        # Persist the averaged parameters, matching what was evaluated above.
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp


def evaluate(tokenizer, textcat, test_data):
    """Score ``textcat`` against ``test_data`` and return precision, recall and F-score.

    tokenizer: callable turning a text into a doc that ``textcat.pipe`` accepts.
    textcat: object whose ``pipe(docs)`` yields docs carrying a ``cats`` mapping of
        label -> score.
    test_data: indexable sequence of (text, {'cats': {label: value}}) pairs.

    A score or gold value of 0.5 or more counts as positive. Labels predicted by the
    model but missing from an example's gold annotations are ignored.

    Returns a dict with the keys "textcat_p", "textcat_r" and "textcat_f".
    """
    true_pos = 0.0
    # The tiny non-zero starting values keep the divisions below defined even when
    # nothing at all is counted.
    false_pos = 1e-8
    false_neg = 1e-8
    true_neg = 0.0

    tokenized = (tokenizer(text) for text, _ in test_data)

    for index, doc in enumerate(textcat.pipe(tokenized)):
        gold = test_data[index][1]['cats']

        for label, score in doc.cats.items():
            if label not in gold:
                continue
            expected = gold[label]

            if score >= 0.5:
                if expected >= 0.5:
                    true_pos += 1.0
                elif expected < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if expected < 0.5:
                    true_neg += 1
                elif expected >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)

    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}



In [None]:
# def load_data(limit=0, split=0.8):
#     """Load data from the IMDB dataset."""
#     # Partition off part of the train data for evaluation
#     data, _ = thinc.extra.datasets.imdb()
#     random.shuffle(data)
#     data = data[-limit:]
    
#     data = [ (text_label_tuple[0], {"cats": {"POSITIVE": text_label_tuple[1]==1}}) for text_label_tuple in data]

#     split = int(len(data) * split)
#     train = data[:split]
#     test = data[split:]
#     return train, test

In [None]:
def load_SCM_data(limit=0, split=0.8, path='data.csv', cats=('hypertension',), seed=None):
    """Load labelled texts from a CSV file and split them into train and test sets.

    The CSV must contain a ``text`` column plus one column per entry in ``cats``.
    A category is labelled True when its cell is exactly 'Yes' or 'Maybe', and
    False for anything else (including missing values).

    limit: keep only the last ``limit`` rows of the file; the cut is made before
        shuffling. Default 0 keeps every row.
    split: fraction of the shuffled examples placed in the training set. Default 0.8.
    path: CSV file to read. Default 'data.csv'.
    cats: names of the category columns to turn into labels. Default ('hypertension',).
    seed: optional seed for a reproducible shuffle. Default None shuffles with the
        global ``random`` state.

    Returns (train_data, test_data), two lists of (text, {'cats': {category: bool}}).
    """
    df = pd.read_csv(path)

    # Vectorised label extraction; .tolist() yields plain Python bools.
    texts = df['text'].tolist()
    flags = {cat: df[cat].isin(['Yes', 'Maybe']).tolist() for cat in cats}
    data = [
        (text, {'cats': {cat: flags[cat][row] for cat in cats}})
        for row, text in enumerate(texts)
    ]

    # data[-0:] would also return everything; spell the "0 means no limit" rule out.
    if limit:
        data = data[-limit:]

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(data)

    n_train = int(len(data) * split)
    return data[:n_train], data[n_train:]

In [None]:
# Read the labelled examples and split them into train/test lists
# (load_SCM_data shuffles the rows and, by default, puts 80% in the training set).
train_data, test_data = load_SCM_data()

In [None]:
# Train the text classifier on top of the 'en_core_sci_lg' pipeline for 15 iterations
# and save the resulting pipeline to the "trainedSpacyModel" directory.
nlp = train(train_data, test_data, model='en_core_sci_lg',output_dir="trainedSpacyModel", n_iter=15)

In [None]:
# test the trained model
doc = nlp("Patient has high blood pressure")
print(doc.cats)