# First extraction with Word2Vec

In [60]:
import pandas as pd
import gensim
from gensim.models import word2vec
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [3]:
# Load the dataset
df = pd.read_csv('../01_data/chatbot_extract-1.csv', sep=',')
df.drop('Ticket Number', axis='columns', inplace=True)
df.drop('count of replies', axis='columns', inplace=True)
df

Unnamed: 0,ticket,Customer,ticket.1,ticket_state,ticket_template
0,Fehler in der App,Swissmem,"Normenauszug 2018, Rechenbuch Metall, Tabellen...",closed successful,"Mac: Access Denied -> Cookies, Virenscanner"
1,Fehler in der App,SVBA,Ich kann die Bilder nicht mehr öffnen um mit ...,closed successful,"Coupon einlösen, Produkte nicht vorhanden"
2,Fehler in der App,hep,Leider fehlen mir paar Kapitel im Lehrmittel (...,closed successful,"Mac: Access Denied -> Cookies, Virenscanner"
3,Anmeldung / Aktivierung,AGVS,Ich kann im FK z.b. die Seite 605 nicht lesen ...,closed successful,Linux
4,Anmeldung / Aktivierung,SVBA,Ich kann das Kapitel Technisches Englisch nich...,closed successful,"Coupon einlösen, Produkte nicht vorhanden"
5,Geräteanzahl überschritten,DIHK,Bitte iphone und ipad entfernen.,closed successful,"Mac: Access Denied -> Cookies, Virenscanner"
6,Fehler in der App,BWZ,"Mein problem ist, dass ich selbst wenn ich mi...",closed_max_devices,Linux
7,Geräteanzahl überschritten,VELEDES,Nun konnte ich das Buch Spezielle Branchenkund...,closed_waiting_for_reply,"Mac: Access Denied -> Cookies, Virenscanner"
8,Fehler in der App,hep,Ich kann in der Klasse F(18)-KSS den Bildschir...,closed successful,"Mac: Access Denied -> Cookies, Virenscanner"
9,Geräteanzahl überschritten,DIHK,Ich würde gerne die Lizenzen bei dem Geräten S...,closed successful,"Mac: Access Denied -> Cookies, Virenscanner"


In [15]:
# overwrite null values
df = df.fillna('blub')

In [19]:
ticket_texts = list(df['ticket.1'])
sentences = []
for txt in ticket_texts:
    sentences.extend(tokenize.sent_tokenize(txt))
    
words = []
for sent in sentences:
    words.extend(tokenize.word_tokenize(sent))

print('number of sentences',len(sentences))
print('number of words',len(words))

number of sentences 151
number of words 2261


In [62]:
def stemming(sent):
    snowball = SnowballStemmer('german', ignore_stopwords=True)
    stemmed = []
    for word in sent:
        stemmed.append(snowball.stem(word))
    return stemmed

def remove_stop_words(sent):
    stop_words = list(set(stopwords.words('german')))
    return [t for t in sent if not t in stop_words]

def prepare_sent_for_model():
    sent_toked = []
    for sent in sentences:
        words = tokenize.word_tokenize(sent)
        words = remove_stop_words(words)
        words = stemming(words)
        sent_toked.append(words)
    return sent_toked

sent_t = prepare_sent_for_model()
model = gensim.models.Word2Vec(sent_t, size=100)
model.save('first.embedding') 
first_model = gensim.models.Word2Vec.load('first.embedding')

In [63]:
len(first_model.wv.vocab)

48

In [64]:
first_model.wv.vocab

{',': <gensim.models.keyedvectors.Vocab at 0x20b52d035f8>,
 'mehr': <gensim.models.keyedvectors.Vocab at 0x20b52d03828>,
 '.': <gensim.models.keyedvectors.Vocab at 0x20b52d03588>,
 'es': <gensim.models.keyedvectors.Vocab at 0x20b52d037f0>,
 'seit': <gensim.models.keyedvectors.Vocab at 0x20b52d03358>,
 'buch': <gensim.models.keyedvectors.Vocab at 0x20b52d03320>,
 'funktioniert': <gensim.models.keyedvectors.Vocab at 0x20b52d035c0>,
 'ich': <gensim.models.keyedvectors.Vocab at 0x20b52d03550>,
 'fehl': <gensim.models.keyedvectors.Vocab at 0x20b52d03b38>,
 'kapitel': <gensim.models.keyedvectors.Vocab at 0x20b52d03198>,
 '(': <gensim.models.keyedvectors.Vocab at 0x20b52d03ba8>,
 ')': <gensim.models.keyedvectors.Vocab at 0x20b52d03ac8>,
 'app': <gensim.models.keyedvectors.Vocab at 0x20b52d03710>,
 'neu': <gensim.models.keyedvectors.Vocab at 0x20b52cc5080>,
 'bitt': <gensim.models.keyedvectors.Vocab at 0x20b52d12da0>,
 'sie': <gensim.models.keyedvectors.Vocab at 0x20b52d12ba8>,
 'steht': <gens

In [65]:
first_model.wv.similar_by_word('gerateanzahl')

[('alt', 0.19653919339179993),
 ('moglich', 0.1824682503938675),
 ('a7', 0.14756560325622559),
 ('sie', 0.14534863829612732),
 ('``', 0.11474406719207764),
 ('heut', 0.08499954640865326),
 ('funktioniert', 0.07824072241783142),
 ('konnt', 0.07354910671710968),
 ('steht', 0.06351776421070099),
 ('bitt', 0.06187428534030914)]