In [38]:
import spacy
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from spacy.util import minibatch, compounding

## Prepare Training Data

In [18]:
# Read only the ticket summary and the responsible-role column.
use_cols = ["Zusammenfassung", "Verantwortliche Rolle"]
df = pd.read_csv("data/Export_KI_ALL_Tickets_202005121335.csv", usecols=use_cols)

# Lowercase every loaded column so the role comparison below is case-insensitive.
df = df.apply(lambda column: column.str.lower())

# Binary target: 1 when the responsible role is the service desk, otherwise 0.
df["Service_Desk"] = df["Verantwortliche Rolle"].eq("service desk").astype(int)
df.head()

Unnamed: 0,Zusammenfassung,Verantwortliche Rolle,Service_Desk
0,troubleshooting win10 vpn probleme,service desk,1
1,neuaufsetzen eines it-leihsystems (deei-nb-10584),2nd level support,0
2,srq: todo b-1973 - mod it services gmbh - seba...,service desk,1
3,internet line nlet,service desk,1
4,aw: internet line nlet,service desk,1


In [40]:
# Restrict the experiment to the first n_examples tickets.
n_examples = 1000
texts = df["Zusammenfassung"].values[:n_examples]
labels = df["Service_Desk"].values[:n_examples]

# One gold-category dict per ticket; exactly one of the two entries is True.
cats = [{"SERVICE_DESK": bool(y), "OTHERS": not bool(y)} for y in labels]

# First 80% of the slice is used for training, the remainder for evaluation.
split = int(len(texts) * 0.8)
train_texts, test_texts = texts[:split], texts[split:]
train_cats, test_cats = cats[:split], cats[split:]

# Pair each training text with its annotation as (text, {"cats": {...}}).
train_data = [(text, {"cats": gold}) for text, gold in zip(train_texts, train_cats)]
train_data[:5]

[('troubleshooting win10 vpn probleme',
  {'cats': {'SERVICE_DESK': True, 'OTHERS': False}}),
 ('neuaufsetzen eines it-leihsystems (deei-nb-10584)',
  {'cats': {'SERVICE_DESK': False, 'OTHERS': True}}),
 ('srq: todo b-1973 - mod it services gmbh - sebastian heise - latitude 7480 + zubehör',
  {'cats': {'SERVICE_DESK': True, 'OTHERS': False}}),
 ('internet line nlet', {'cats': {'SERVICE_DESK': True, 'OTHERS': False}}),
 ('aw: internet line nlet', {'cats': {'SERVICE_DESK': True, 'OTHERS': False}})]

## Get a model and add TextCat to the Pipeline

In [None]:
# Load the spaCy pipeline named by `model`.
model = "de_core_news_md"
nlp = spacy.load(model)

# Reuse the text classifier if the pipeline already contains one; otherwise
# create it (nlp.create_pipe works for built-ins that are registered with
# spaCy) and append it as the last pipeline component.
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "simple_cnn"},
    )
    nlp.add_pipe(textcat, last=True)

# Register the two categories used in the training annotations.
textcat.add_label("SERVICE_DESK")
textcat.add_label("OTHERS")

## Train the TextCat

In [None]:
def evaluate(tokenizer, textcat, texts, cats, ignored_labels=("NEGATIVE", "OTHERS")):
    """Compute precision, recall and F-score of a text classifier.

    Each text is tokenized, passed through ``textcat.pipe`` and the resulting
    ``doc.cats`` scores are compared against the gold dict at the same
    position in ``cats``. A score or gold value of at least 0.5 counts as
    "positive" for that label.

    Labels listed in ``ignored_labels`` are skipped. With two mutually
    exclusive classes the complementary label mirrors the positive one, so
    counting both turns every mistake into one false positive plus one false
    negative and makes precision, recall and F-score coincide. Skipping the
    complementary label reports the metrics for the positive class only.

    Args:
        tokenizer: Callable turning a text into a doc object.
        textcat: Object whose ``pipe(docs)`` yields docs carrying a ``cats``
            mapping of label -> score.
        texts: Iterable of raw texts to classify.
        cats: Indexable collection of gold dicts (label -> truth value),
            aligned with ``texts``.
        ignored_labels: Labels excluded from the counts. Defaults to
            ``("NEGATIVE", "OTHERS")``.

    Returns:
        dict with the keys ``"textcat_p"``, ``"textcat_r"`` and
        ``"textcat_f"``.
    """
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives; the epsilon avoids a zero division below
    fn = 1e-8  # False negatives; the epsilon avoids a zero division below
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold or label in ignored_labels:
                continue
            predicted = score >= 0.5
            expected = gold[label] >= 0.5
            if predicted and expected:
                tp += 1.0
            elif predicted:
                fp += 1.0
            elif expected:
                fn += 1.0
            # True negatives do not enter precision, recall or F-score.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [None]:
# Number of passes over the training data.
n_iter = 3

# Get the names of all other pipes so they can be disabled during training;
# only the pipes listed in pipe_exceptions stay active.
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    # Batch-size schedule passed to minibatch (see spaCy's `compounding`).
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # Shuffle in place, then batch up the examples using spaCy's minibatch.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            # Batch-local names: unpacking into `texts` would silently replace
            # the notebook-level `texts` array with the last mini-batch.
            batch_texts, batch_annotations = zip(*batch)
            nlp.update(batch_texts, batch_annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # Evaluate on the held-out test_texts / test_cats split.
            scores = evaluate(nlp.tokenizer, textcat, test_texts, test_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            ))