In [1]:
import spacy
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from os.path import join, exists

## Extracting Word Embeddings with spaCy

### Params

In [2]:
# Input ticket export; stop right away if the file is not where we expect it.
data_path = "data/Export_KI_ALL_Tickets_202005121335.csv"
assert exists(data_path)

# Directory and file name used when the result is written.
save_path, save_name = "data/", "vectors_de_en_core_md_np_stopwords.csv"

# Optional preprocessing switches: set any of them to True to apply that step
# before the text vectors are extracted.
remove_stop_words = remove_punctuation = lemmatize = False

### Load the Models

In [3]:
# English and German pretrained core models with built-in word vectors.
# Download a model first if it is not installed, e.g.
#   $ python -m spacy download en_core_web_md
# (and likewise for de_core_news_md).
nlp_en_md = spacy.load("en_core_web_md")
print(nlp_en_md.lang, nlp_en_md.pipe_names, sep="\n")
nlp_de_md = spacy.load("de_core_news_md")
print(nlp_de_md.lang, nlp_de_md.pipe_names, sep="\n")

en
['tagger', 'parser', 'ner']
de
['tagger', 'parser', 'ner']


### Add optional Preprocessing Pipes

In [4]:
# For testing whether the pipes work as intended: process a sample text with the
# English model and keep its document vector in doc_vec_before, so it can be
# compared with the vector obtained after the optional pipes have been added.
test_text = "older @ hello 55 searching and ):VPN three plus Internet user"
doc = nlp_en_md(test_text)
doc_vec_before = doc.vector
# Bare expression: displays the processed doc as the cell output.
doc

older @ hello 55 searching and ):VPN three plus Internet user

In [5]:
def stopword_remover(doc):
    """Return a new Doc built from the texts of all tokens that are not stop words."""
    return spacy.tokens.Doc(doc.vocab, words=[token.text for token in doc if not token.is_stop])


def punctuation_remover(doc):
    """Return a new Doc built from the texts of all tokens that are not punctuation."""
    return spacy.tokens.Doc(doc.vocab, words=[token.text for token in doc if not token.is_punct])


def lemmatizer(doc):
    """Return a new Doc whose words are the lemmas of the tokens in ``doc``."""
    return spacy.tokens.Doc(doc.vocab, words=[token.lemma_ for token in doc])


def add_component_to_pipe(model, component, name):
    """Insert ``component`` at the front of ``model``'s pipeline under ``name``.

    Nothing happens if a pipe called ``name`` is already registered, so the
    cell can be re-run safely. When the component is added, the resulting
    pipe names are printed.

    Parameters
    ----------
    model : the loaded spaCy language model to extend
    component : callable taking a Doc and returning a Doc
    name : str, the name the component is registered under
    """
    if name not in model.pipe_names:
        # NOTE(review): passing a callable to add_pipe is the spaCy v2 calling
        # convention - confirm before upgrading spaCy.
        model.add_pipe(component, first=True, name=name)
        print(model.pipe_names)


# (config flag, component, pipe name). The order matters: every component is
# inserted with first=True, so a component listed later ends up in front of
# the ones listed before it.
optional_components = [
    (remove_stop_words, stopword_remover, "stopword_remover"),
    (remove_punctuation, punctuation_remover, "punctuation_remover"),
    (lemmatize, lemmatizer, "lemmatizer"),
]

for enabled, component, name in optional_components:
    if enabled:
        for model in (nlp_de_md, nlp_en_md):
            add_component_to_pipe(model, component, name)
    

In [13]:
# For testing whether the pipes work as intended: process the sample text again
# and compare the new document vector with the one recorded earlier.
doc = nlp_en_md(test_text)
doc_vec_after = doc.vector
print(test_text, doc, sep="\n")
# True when every component of the two vectors is equal.
vectors_unchanged = (doc_vec_before == doc_vec_after).all()
print(vectors_unchanged)

older @ hello 55 searching and ):VPN three plus Internet user
older @ hello 55 searching and ):VPN three plus Internet user
True


In [6]:
# Uncomment only the lines you need in order to remove a custom pipe component from a model again
# nlp_en_md.remove_pipe('lemmatizer')
# nlp_en_md.remove_pipe('punctuation_remover')
# nlp_en_md.remove_pipe('stopword_remover')
# nlp_de_md.remove_pipe('lemmatizer')
# nlp_de_md.remove_pipe('punctuation_remover')
# nlp_de_md.remove_pipe('stopword_remover')

### Load and prepare the Data

In [14]:
# Only these two text columns are read from the CSV.
use_cols = ["Zusammenfassung", "Description"]
# Read the file and replace missing values (NaN) with empty strings in one go.
df = pd.read_csv(data_path, usecols=use_cols).fillna("")

In [15]:
df = df.head(1000)  # clip data for test purposes
# Pull both text columns out as arrays: short = Zusammenfassung, long = Description.
texts_short, texts_long = (df[column].values for column in ("Zusammenfassung", "Description"))

### Process the Text with Spacy Models

In [16]:
def texts_to_docs(nlp_model, texts):
    """Process ``texts`` with ``nlp_model.pipe`` and return the results as a list.

    The "tagger", "parser" and "ner" pipes are passed as ``disable`` to the
    ``pipe`` call.
    """
    skipped_pipes = ["tagger", "parser", "ner"]
    doc_stream = nlp_model.pipe(texts, disable=skipped_pipes)
    return [processed for processed in doc_stream]

In [17]:
def _timed_texts_to_docs(label, nlp_model, texts):
    """Run ``texts_to_docs(nlp_model, texts)`` and print how long it took.

    Parameters
    ----------
    label : str, inserted into the printed message to identify the run
    nlp_model : the language model handed to ``texts_to_docs``
    texts : the texts handed to ``texts_to_docs``

    Returns
    -------
    The list returned by ``texts_to_docs``.
    """
    start = datetime.now()
    docs = texts_to_docs(nlp_model, texts)
    elapsed = datetime.now() - start
    # total_seconds() reports the whole duration; timedelta.seconds would drop
    # the sub-second fraction and any full days.
    print("Time for processing " + label + ": ", round(elapsed.total_seconds(), 2), "seconds")
    return docs


docs_short_en = _timed_texts_to_docs("Zusammenfassung EN", nlp_en_md, texts_short)
docs_short_de = _timed_texts_to_docs("Zusammenfassung DE", nlp_de_md, texts_short)

docs_long_en = _timed_texts_to_docs("Description EN", nlp_en_md, texts_long)
docs_long_de = _timed_texts_to_docs("Description DE", nlp_de_md, texts_long)

Time for processing Zusammenfassung EN:  14 seconds
Time for processing Zusammenfassung DE:  12 seconds
Time for processing Description EN:  10 seconds
Time for processing Description DE:  1 seconds


### Extract the Vectors

In [18]:
def extract_vectors(docs):
    """Collect the ``vector`` attribute of every item in ``docs``, in order, as a list."""
    vectors = []
    for current_doc in docs:
        vectors.append(current_doc.vector)
    return vectors

In [19]:
# Vectors of the short texts (Zusammenfassung).
doc_vectors_short_en = extract_vectors(docs_short_en)
doc_vectors_short_de = extract_vectors(docs_short_de)
# Vectors of the long texts (Description): these must be built from the
# docs_long_* results, not from the Zusammenfassung docs.
doc_vectors_long_en = extract_vectors(docs_long_en)
doc_vectors_long_de = extract_vectors(docs_long_de)

### Append Vectors to Data

In [None]:
# Attach every list of document vectors to the frame as a new column.
new_vector_columns = (
    ("Zusammenfassung_Vector_DE", doc_vectors_short_de),
    ("Zusammenfassung_Vector_EN", doc_vectors_short_en),
    ("Description_Vector_DE", doc_vectors_long_de),
    ("Description_Vector_EN", doc_vectors_long_en),
)
for column_name, vectors in new_vector_columns:
    df[column_name] = vectors
df.head()

### Save the Data

In [None]:
# Write df as CSV to the location built from save_path and save_name.
output_file = join(save_path, save_name)
df.to_csv(output_file)