In [1]:
import spacy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from os.path import join, exists

## Extracting WordEmbeddings with Spacy

### Params

In [2]:
data_path = "data/Export_KI_ALL_Tickets_202005121335.csv"
assert exists(data_path)
save_path = "data/"
save_name = "vectors_de_en_core_md_np_stopwords.csv"

# set to true in case you want to do preprocessing before extracting the text-vectors
remove_stop_words = False
remove_punctuation = False
lemmatize = False

### Load the Models

In [3]:
# english and german pretrained  core models with bulit in word-vectors
# run ($ python -m spacy download en_core_web_md) to download model first
nlp_en_md = spacy.load("en_core_web_md")
print(nlp_en_md.lang)
print(nlp_en_md.pipe_names)
nlp_de_md = spacy.load("de_core_news_md")
print(nlp_de_md.lang)
print(nlp_de_md.pipe_names)

en
['tagger', 'parser', 'ner']
de
['tagger', 'parser', 'ner']


### Add optional Preprocessing Pipes

In [4]:
# for testing wether pipe works as intended
test_text = "older @ hello 55 searching and ):VPN three plus Internet user"
doc = nlp_en_md(test_text)
doc_vec_before = doc.vector
doc

older @ hello 55 searching and ):VPN three plus Internet user

In [5]:
if remove_stop_words:
    
    def stopword_remover(doc):
        new_doc = spacy.tokens.Doc(doc.vocab, words=[token.text for token in doc if not token.is_stop])
        return new_doc
    
    def add_stopword_remover_to_pipe(model):
        if "stopword_remover" not in model.pipe_names:
            model.add_pipe(stopword_remover, first=True, name="stopword_remover")
            print(model.pipe_names)
    
    add_stopword_remover_to_pipe(nlp_de_md)
    add_stopword_remover_to_pipe(nlp_en_md)

if remove_punctuation:
    
    def punctuation_remover(doc):
        new_doc = spacy.tokens.Doc(doc.vocab, words=[token.text for token in doc if not token.is_punct])
        return new_doc
    
    def add_punctuation_remover_to_pipe(model):
        if "punctuation_remover" not in model.pipe_names:
            model.add_pipe(punctuation_remover, first=True, name="punctuation_remover")
            print(model.pipe_names)
    
    add_punctuation_remover_to_pipe(nlp_de_md)
    add_punctuation_remover_to_pipe(nlp_en_md)


if lemmatize:
    
    def lemmatizer(doc):
        new_doc = spacy.tokens.Doc(doc.vocab, words=[token.lemma_ for token in doc])
        return new_doc
    
    def add_lemmatizer_to_pipe(model):
        if "lemmatizer" not in model.pipe_names:
            model.add_pipe(lemmatizer, first=True, name="lemmatizer")
            print(model.pipe_names)
    
    add_lemmatizer_to_pipe(nlp_de_md)
    add_lemmatizer_to_pipe(nlp_en_md)
    

In [7]:
# for testing wether pipe works as intended
doc = nlp_en_md(test_text)
doc_vec_after = doc.vector
print(test_text)
print(doc)
print((doc_vec_before == doc_vec_after).all)

older @ hello 55 searching and ):VPN three plus Internet user
older @ hello 55 searching and ):VPN three plus Internet user
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True


In [6]:
# use only if you need to remove any pipeobject for some reason
# nlp_en_md.remove_pipe('lemmatizer')
# nlp_en_md.remove_pipe('punctuation_remover')
# nlp_en_md.remove_pipe('stopword_remover')
# nlp_de_md.remove_pipe('lemmatizer')
# nlp_de_md.remove_pipe('punctuation_remover')
# nlp_de_md.remove_pipe('stopword_remover')

### Load and prepare the Data

In [8]:
use_cols = ["Zusammenfassung", "Description"]
df = pd.read_csv(data_path, usecols = use_cols)
# fill nan with empty string
df = df.fillna("")

In [9]:
# df = df.iloc[:100] # clip data for test purposes
texts_short = df["Zusammenfassung"].values
texts_long = df["Description"].values

### Process the Text with Spacy Models

In [10]:
def texts_to_docs(nlp_model, texts):
    return list(nlp_model.pipe(texts,disable=["tagger", "parser", "ner"]))

In [11]:
start = datetime.now()
docs_short_en = texts_to_docs(nlp_en_md, texts_short)
end = datetime.now()
print("Time for processing Zusammenfassung EN: ",(end-start).seconds, "seconds")
start = datetime.now()
docs_short_de = texts_to_docs(nlp_de_md, texts_short)
end = datetime.now()
print("Time for processing Zusammenfassung DE: ",(end-start).seconds, "seconds")

start = datetime.now()
docs_long_en = texts_to_docs(nlp_en_md, texts_long)
end = datetime.now()
print("Time for processing Description EN: ",(end-start).seconds, "seconds")
start = datetime.now()
docs_long_de = texts_to_docs(nlp_de_md, texts_long)
end = datetime.now()
print("Time for processing Description DE: ",(end-start).seconds, "seconds")

Time for procesing Zusammenfassung:  15 seconds
Time for procesing Description:  292 seconds


KeyboardInterrupt: 

### Extract the Vectors

In [None]:
def extract_vectors(docs):
    return [doc.vector for doc in docs]    

In [None]:
doc_vectors_short_en = extract_vectors(docs_short_en)
doc_vectors_short_de = extract_vectors(docs_short_de)
doc_vectors_long_en = extract_vectors(docs_short_en)
doc_vectors_long_de = extract_vectors(docs_short_de)

### Append Vectors to Data

In [23]:
df["Zusammenfassung_Vector_DE"] = doc_vectors_short_de
df["Zusammenfassung_Vector_EN"] = doc_vectors_short_en
df["Description_Vector_DE"] = doc_vectors_long_de
df["Description_Vector_EN"] = doc_vectors_long_en
df.head()

Unnamed: 0,Zusammenfassung,Description,Zusammenfassung_Vector_DE,Zusammenfassung_Vector_EN,Description_Vector_DE,Description_Vector_EN
0,Troubleshooting WIN10 VPN PRobleme,Probleme mit SSLVPN auf WIN10 Client,"[0.03622775, 0.0800695, -0.02087775, 0.0008647...","[0.14730251, 0.10227725, 0.2237225, -0.0277614...","[0.03622775, 0.0800695, -0.02087775, 0.0008647...","[0.14730251, 0.10227725, 0.2237225, -0.0277614..."
1,Neuaufsetzen eines IT-Leihsystems (DEEI-NB-10584),Neuaufsetzen eines IT-Leihsystems (DEEI-NB-10584),"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.038578, -0.00103804, 0.0117836, -0.00835, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.038578, -0.00103804, 0.0117836, -0.00835, ..."
2,SRQ: ToDo B-1973 - mod IT Services GmbH - Seba...,Bitte das NB 7480 für Sebastian einrichten.Ser...,"[0.14195207, -0.08712593, 0.008474229, -0.0156...","[-0.007871474, -0.050947335, 0.080323614, 0.05...","[0.14195207, -0.08712593, 0.008474229, -0.0156...","[-0.007871474, -0.050947335, 0.080323614, 0.05..."
3,Internet line NLET,"Hello together,as there are no news since 17th...","[0.17060132, -0.25492266, -0.08748933, -0.0716...","[-0.08410999, 0.08107033, -0.14993, -0.17815, ...","[0.17060132, -0.25492266, -0.08748933, -0.0716...","[-0.08410999, 0.08107033, -0.14993, -0.17815, ..."
4,AW: Internet line NLET,"Hi Arnaud,any news regarding the new internet ...","[0.15208025, -0.2399915, -0.04284125, -0.02674...","[-0.18983749, 0.073735505, -0.14006999, -0.167...","[0.15208025, -0.2399915, -0.04284125, -0.02674...","[-0.18983749, 0.073735505, -0.14006999, -0.167..."


### save data

In [24]:
df.to_csv(join(save_path,save_name))