In [1]:
import spacy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from os.path import join, exists

KeyboardInterrupt: 

## Extracting WordEmbeddings with Spacy

### Params

In [101]:
data_path = "data/Export_KI_ALL_Tickets_202005121335.csv"
assert exists(data_path)
save_path = "data/"
save_name = "vectors_de_en_core_md_no_preprocessing_test.csv"
remove_stop_words = True

### Load the Models

In [130]:
# english and german pretrained  core models with bulit in word-vectors
# run ($ python -m spacy download en_core_web_md) to download model first
nlp_en_md = spacy.load("en_core_web_md")
print(nlp_en_md.lang)
print(nlp_en_md.pipe_names)
nlp_de_md = spacy.load("de_core_news_md")
print(nlp_de_md.lang)
print(nlp_de_md.pipe_names)

en
['tagger', 'parser', 'ner']



KeyboardInterrupt



### Add optional Preprocessing

In [None]:
test = "Hallo with three Cars Internet"
doc = nlp_en_md(test)
print(doc.text)

In [None]:
if remove_stop_words:
    
    def stopword_remover(doc):
        new_doc = spacy.tokens.Doc(doc.vocab, words=[token.text for token in doc if not token.is_stop])
        return new_doc
    
    def add_stopword_remover_to_pipe(model):
        if "stopword_remover" not in model.pipe_names:
            model.add_pipe(stopword_remover, first=True, name="stopword_remover")
            print(model.pipe_names)
    
    add_stopword_remover_to_pipe(nlp_de_md)
    add_stopword_remover_to_pipe(nlp_en_md)
    

In [None]:
test = "Hallo with three Cars Internet"
doc = nlp_en_md(test)
print(doc.text)

### Load and prepare the Data

In [102]:
use_cols = ["Zusammenfassung", "Description"]
df = pd.read_csv(data_path, usecols = use_cols)

In [103]:
# fill nan with empty string
df = df.fillna("")

In [106]:
df = df.iloc[:100] # shorten data for test purposes
texts_short = df["Zusammenfassung"].values
texts_long = df["Description"].values

### Process the Text with Spacy Models

In [115]:
def texts_to_docs(nlp_model, texts):
    return list(nlp_model.pipe(texts,disable=["tagger", "parser", "ner"]))

In [116]:
start = datetime.now()
docs_short_en = texts_to_docs(nlp_en_md, texts_short)
end = datetime.now()
print("Time for procesing Zusammenfassung: ",(end-start).seconds, "seconds")
docs_short_de = texts_to_docs(nlp_de_md, texts_short)

start = datetime.now()
docs_long_en = texts_to_docs(nlp_en_md, texts_long)
end = datetime.now()
print("Time for procesing Description: ",(end-start).seconds, "seconds")
docs_long_de = texts_to_docs(nlp_de_md, texts_long)

Time for procesing Zusammenfassung:  0 seconds
Time for procesing Description:  0 seconds


### Optional preprocessing

In [112]:
if remove_stop_words:
    print(docs_long_de[0].text)
    spacy_stopwords_en = spacy.lang.en.stop_words.STOP_WORDS
    print('Number of stop words EN: %d' % len(spacy_stopwords_en))
    print('First ten stop words EN: %s' % list(spacy_stopwords_en)[:10])
    spacy_stopwords_de = spacy.lang.de.stop_words.STOP_WORDS
    print('Number of stop words DE: %d' % len(spacy_stopwords_de))
    print('First ten stop words DE: %s' % list(spacy_stopwords_de)[:10])
    
    def stopword_remover(doc):
        new_doc = Doc(doc.vocab, words=[token.text for token in doc if not token.is_stop])
        return new_doc
    
    def remove_stopwords_from_doc(doc):
        return [token for token in doc if not token.is_stop]
    def remove_stopwords(docs):
        return [remove_stopwords_from_doc(doc) for doc in docs]
    
    docs_short_en = remove_stopwords(docs_short_en)
    docs_short_de = remove_stopwords(docs_short_de)
    docs_long_en = remove_stopwords(docs_long_en)
    docs_long_de = remove_stopwords(docs_long_de)
    print(docs_long_de[0].text)
    

Probleme mit SSLVPN auf WIN10 Client
Number of stop words EN: 326
First ten stop words EN: ['keep', 'behind', 'here', 'at', 'of', 'during', 'hundred', 'make', 'for', 'about']
Number of stop words DE: 543
First ten stop words DE: ['dabei', 'ach', 'eben', 'gegenüber', 'wirklich', 'rechte', 'drei', 'sondern', 'deinem', 'sechstes']


AttributeError: 'list' object has no attribute 'text'

### Extract the Vectors

In [84]:
def extract_vectors(docs):
    return [doc.vector for doc in docs]    

In [85]:
doc_vectors_short_en = extract_vectors(docs_short_en)
doc_vectors_short_de = extract_vectors(docs_short_de)
doc_vectors_long_en = extract_vectors(docs_short_en)
doc_vectors_long_de = extract_vectors(docs_short_de)

### Append Vectors to Data

In [86]:
df["Zusammenfassung_Vector_DE"] = doc_vectors_short_de
df["Zusammenfassung_Vector_EN"] = doc_vectors_short_en
df["Description_Vector_DE"] = doc_vectors_long_de
df["Description_Vector_EN"] = doc_vectors_long_en
df.head()

Unnamed: 0,Zusammenfassung,Description,Zusammenfassung_Vector_DE,Zusammenfassung_Vector_EN,Description_Vector_DE,Description_Vector_EN
0,Troubleshooting WIN10 VPN PRobleme,Probleme mit SSLVPN auf WIN10 Client,"[0.03622775, 0.0800695, -0.02087775, 0.0008647...","[0.14730251, 0.10227725, 0.2237225, -0.0277614...","[0.03622775, 0.0800695, -0.02087775, 0.0008647...","[0.14730251, 0.10227725, 0.2237225, -0.0277614..."
1,Neuaufsetzen eines IT-Leihsystems (DEEI-NB-10584),Neuaufsetzen eines IT-Leihsystems (DEEI-NB-10584),"[0.11769233, 0.006969167, 0.18289499, 0.0163, ...","[-0.10569471, 0.17416045, -0.0766649, 0.081386...","[0.11769233, 0.006969167, 0.18289499, 0.0163, ...","[-0.10569471, 0.17416045, -0.0766649, 0.081386..."
2,SRQ: ToDo B-1973 - mod IT Services GmbH - Seba...,Bitte das NB 7480 für Sebastian einrichten.Ser...,"[0.12324329, -0.039272, 0.007267351, 0.0158169...","[-0.041469928, 0.122295424, 0.01044979, 0.0932...","[0.12324329, -0.039272, 0.007267351, 0.0158169...","[-0.041469928, 0.122295424, 0.01044979, 0.0932..."
3,Internet line NLET,"Hello together,as there are no news since 17th...","[0.17060132, -0.25492266, -0.08748933, -0.0716...","[-0.08410999, 0.08107033, -0.14993, -0.17815, ...","[0.17060132, -0.25492266, -0.08748933, -0.0716...","[-0.08410999, 0.08107033, -0.14993, -0.17815, ..."
4,AW: Internet line NLET,"Hi Arnaud,any news regarding the new internet ...","[0.1216642, -0.1919932, -0.034273, -0.0213934,...","[-0.15012078, 0.1254164, -0.170406, -0.1643039...","[0.1216642, -0.1919932, -0.034273, -0.0213934,...","[-0.15012078, 0.1254164, -0.170406, -0.1643039..."


### save data

In [78]:
df.to_csv(join(save_path,save_name))