In [1]:
%config IPCompleter.greedy=True

### Load csv

In [10]:
# Location and CSV dialect of the raw ticket export.
file = 'ionesoft_tickets.csv'
# 'ansi' is only registered as a codec alias on Windows (where it maps to the
# machine's active code page); elsewhere the lookup fails with LookupError.
# NOTE(review): cp1252 (Western European Windows code page) is assumed here
# because the tickets are German text - confirm against the exporting machine.
encoding = 'cp1252'
sep = ','

In [11]:
import pandas as pd

# File name of the grammar-corrected variant (presumably written by a later
# step - confirm) and the column labels applied to the raw export.
extract = 'ionesoft_tickets_corrected_grammar.csv'
names = [
    'ticket',
    'type',
    'client',
    'issuer',
    'inquiry',
    'answer',
]

# Row 0 of the file is skipped and the labels above are used instead.
corpus = pd.read_csv(
    file,
    sep=sep,
    encoding=encoding,
    names=names,
    skiprows=[0],
)
corpus.head()

Unnamed: 0,ticket,type,client,issuer,inquiry,answer
0,2019031000000000.0,Fehler in der App,SVBA,3,ich kann die bilder in übungen nicht bearbeite...,Fehlerbericht senden
1,2019030000000000.0,Fehler in der App,FOMA,3,lückentexte weg daten verloren gegangen von pp...,Fehlerbericht senden
2,2019023000000000.0,Anmeldung / Aktivierung,AGVS,3,bitte deaktivieren sie dieses gerät. vielen dank.,"Gerät entfernt, neu starten"
3,2019023000000000.0,Geräteanzahl überschritten,BEOOK,3,ich habe einen neuen laptop und möchte den alt...,"Gerät entfernt, neu starten"
4,2019023000000000.0,Geräteanzahl überschritten,BEOOK,3,offenbar habe ich die geräteanzahl überschritt...,"Gerät entfernt, neu starten"


### Pickle dump column as flat list

In [14]:
# Column to export and the pickle file it is written to.
col = 'inquiry'
out = 'inquiries.p'

In [22]:
import pickle

# Export the selected column as a plain Python list.
column = corpus[col].values.flatten().tolist()

# A context manager guarantees the file is flushed and closed, even if
# pickling raises; the bare open() call left the handle to the garbage
# collector.
with open(out, 'wb') as fh:
    pickle.dump(column, fh)

### Default NLP Pipeline
1. Normalization
1. Tokenization
1. Remove stop words
1. Stemming / Lemmatization
1. TF-IDF

#### 1. Normalization

In [63]:
import re
import numpy as np

def normalize_document(doc):
    """Lower-case a document and reduce it to letters separated by single spaces.

    Every character that is not an ASCII letter, a character in the range
    U+00C0-U+017F (e.g. ä, ö, ü, é, à, è) or whitespace is replaced by a
    space. Runs of spaces are then collapsed to a single space, the text is
    lower-cased and leading/trailing whitespace is stripped.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The normalized text.
    """
    double_spaces = r' +'
    not_in_swiss_alphabet = r'[^\u00C0-\u017Fa-zA-Z\s]'

    # The fourth positional parameter of re.sub is `count`, not `flags`.
    # Passing `re.I | re.A` (== 258) there capped each call at 258
    # replacements, so long documents were only partially cleaned. Neither
    # flag is needed: the character class lists both letter cases explicitly.
    doc = re.sub(not_in_swiss_alphabet, ' ', doc)
    doc = re.sub(double_spaces, ' ', doc)

    doc = doc.lower()
    return doc.strip()

# Element-wise variant that accepts a sequence of documents.
normalize = np.vectorize(normalize_document)

#### test normalization

In [158]:
# Two sample documents, normalized in a single step: one containing slashes
# and a hyphen, one with symbols between accented letters.
test = normalize([
    'lückentexte acht aber weg daten tester/terterin tester/-in verloren gegangen',
    'äöü!#@éàè',
])
print(test)

['lückentexte acht aber weg daten tester terterin tester in verloren gegangen'
 'äöü éàè']


#### 2. Tokenization

In [159]:
def tokenize_doc(doc):
    """Split every document of a collection into a list of word tokens.

    Parameters
    ----------
    doc : iterable of str
        The documents to tokenize.

    Returns
    -------
    list of list of str
        One token list per document, in input order. A document without
        any words yields an empty list.
    """
    # split() without an argument splits on any run of whitespace and never
    # yields empty strings. split(' ') returned [''] for an empty document
    # and left tabs/newlines inside tokens.
    return [sent.split() for sent in doc]

#### test tokenization

In [160]:
# Tokenize the normalized sample documents held in `test`.
tokens = tokenize_doc(test)
print(tokens)

[['lückentexte', 'acht', 'aber', 'weg', 'daten', 'tester', 'terterin', 'tester', 'in', 'verloren', 'gegangen'], ['äöü', 'éàè']]


#### 3.  Remove stop words

In [181]:
def remove_stopwords(tokens, stop_words):
    """Return the tokens that are not contained in ``stop_words``.

    The relative order of the remaining tokens is preserved and duplicates
    are kept.
    """
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

def remove_stopwords_doc(token_doc, stop_words):
    """Filter the stop words out of every token list in ``token_doc``.

    Returns a list with one filtered token list per entry of ``token_doc``,
    each produced by ``remove_stopwords``.
    """
    return [remove_stopwords(token_list, stop_words) for token_list in token_doc]

#### test stop word removal

In [182]:
# Load the custom stop word collection. The context manager closes the file
# handle, which the bare open() call left open.
# NOTE: pickle.load can execute arbitrary code - only load this file when it
# comes from a trusted source.
with open('./res/custom_ch_stopwords.p', 'rb') as fh:
    stop_words = pickle.load(fh)

clean = remove_stopwords_doc(tokens, stop_words)
print(clean)

[['lückentexte', 'daten', 'tester', 'terterin', 'tester', 'verloren', 'gegangen'], ['äöü', 'éàè']]


#### 4. Stemming / lemmatization

In [183]:
from nltk.stem.snowball import SnowballStemmer

def stemm(tokens, stemmer):
    """Stem each token with ``stemmer.stem`` and return the stems as a list.

    The output has the same length and order as ``tokens``.
    """
    stems = []
    for word in tokens:
        stems.append(stemmer.stem(word))
    return stems
    
def stemm_doc(token_doc, stemmer):
    """Stem every token list in ``token_doc``.

    Returns a list with one stemmed token list per entry of ``token_doc``,
    each produced by ``stemm``.
    """
    return [stemm(token_list, stemmer) for token_list in token_doc]

#### test stemming

In [184]:
# Build a German Snowball stemmer and apply it to the stop-word-free sample
# tokens in `clean`.
snowball = SnowballStemmer('german', ignore_stopwords=True)
stemmed = stemm_doc(clean, snowball)
print(stemmed)

[['luckentext', 'dat', 'test', 'terterin', 'test', 'verlor', 'gegang'], ['aou', 'éàè']]
