In [1]:
import spacy
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from spacy.util import minibatch, compounding

## Prepare Train Data

In [2]:
use_cols = ["Zusammenfassung", "Verantwortliche Rolle"]
df = pd.read_csv("data/Export_KI_ALL_Tickets_202005121335.csv", usecols = use_cols)
for col in df:
    df[col] = df[col].str.lower()
df["Service_Desk"] = (df["Verantwortliche Rolle"] == "service desk").astype(int)
df.head()

Unnamed: 0,Zusammenfassung,Verantwortliche Rolle,Service_Desk
0,troubleshooting win10 vpn probleme,service desk,1
1,neuaufsetzen eines it-leihsystems (deei-nb-10584),2nd level support,0
2,srq: todo b-1973 - mod it services gmbh - seba...,service desk,1
3,internet line nlet,service desk,1
4,aw: internet line nlet,service desk,1


In [3]:
n_examples = 1000
sub_df = df[df["Service_Desk"]==1].loc[:n_examples/2].append(df[df["Service_Desk"]==0].loc[:n_examples/2])
sub_df.head(-10)

Unnamed: 0,Zusammenfassung,Verantwortliche Rolle,Service_Desk
0,troubleshooting win10 vpn probleme,service desk,1
2,srq: todo b-1973 - mod it services gmbh - seba...,service desk,1
3,internet line nlet,service desk,1
4,aw: internet line nlet,service desk,1
24,updates der mod linuxserver,service desk,1
...,...,...,...
480,bitte ticket eröffnen: zugriffsrechte j,midsize it,0
485,kein zugriff auf mail nach aktualisierung blac...,mobile support,0
486,windows-explorer reagiert nicht,midsize it,0
487,ticket ausscheidender ma - martin stein,midsize it,0


In [7]:
n_examples = 10000
sub_df = df[df["Service_Desk"]==1].loc[:n_examples/2].append(df[df["Service_Desk"]==0].loc[:n_examples/2])
sub_df = sub_df.sample(frac=1)
display(sub_df.head())
texts = sub_df["Zusammenfassung"].values
labels = sub_df["Service_Desk"].values
cats = [{"SERVICE_DESK": bool(y), "OTHERS": not bool(y)} for y in labels]
split = int(len(texts) * 0.8)
(train_texts, train_cats), (test_texts, test_cats) = (texts[:split], cats[:split]), (texts[split:], cats[split:])
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

train_data = train_data
print("Train Data")
print(train_data[:5])
print(Test Data)
print(test_texts[:5])
print(test_cats[:5])

Unnamed: 0,Zusammenfassung,Verantwortliche Rolle,Service_Desk
1956,notes funktioniert nicht mehr,service desk,1
2255,spracherkennung (autokorrektur),service desk,1
3551,lotus notes ohne funktion,midsize it,0
3293,wg: lotus notes zertifikat,midsize it,0
3941,ad konto gesperrt,service desk,1


[('notes funktioniert nicht mehr', {'cats': {'SERVICE_DESK': True, 'OTHERS': False}}), ('spracherkennung (autokorrektur)', {'cats': {'SERVICE_DESK': True, 'OTHERS': False}}), ('lotus notes ohne funktion', {'cats': {'SERVICE_DESK': False, 'OTHERS': True}}), ('wg: lotus notes zertifikat', {'cats': {'SERVICE_DESK': False, 'OTHERS': True}}), ('ad konto gesperrt', {'cats': {'SERVICE_DESK': True, 'OTHERS': False}})]
['photoshop'
 'updates und reg.key einstellungen zur schwachstellenbeseitigung'
 'wg: aktualisierte präsentation - nachricht (html)  /// ich bekomme folgende fehlermeldung. khb'
 'entfernen der duplikate im outlook 365'
 'wg: hfa-vs01 - virus/malware detected']
[{'SERVICE_DESK': False, 'OTHERS': True}, {'SERVICE_DESK': False, 'OTHERS': True}, {'SERVICE_DESK': False, 'OTHERS': True}, {'SERVICE_DESK': False, 'OTHERS': True}, {'SERVICE_DESK': False, 'OTHERS': True}]


## Get a model and add TextCat to the Pipeline

In [4]:
model = "de_core_news_md"
nlp = spacy.load(model)

# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat", 
                              config={"exclusive_classes": True, 
                                      "architecture": "simple_cnn"})
    nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe("textcat")

textcat.add_label("SERVICE_DESK")
textcat.add_label("OTHERS")

1

## Train the TextCat

In [16]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            #if label == "OTHERS":
            #    continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp+fp+tn+fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score, "acc": accuracy}

In [17]:
# get names of other pipes to disable them during training
n_iter=3
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F","Acc"))
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, test_texts, test_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
                scores["acc"]
            ))

Training the model...
LOSS 	  P  	  R  	  F  	 Acc 
11.331	0.000	0.000	0.000	0.000
1.184	0.000	0.000	0.000	0.000
0.266	0.000	0.000	0.000	0.000
