In [1]:
import numpy as np
import re
from nltk.corpus import stopwords


# `stop` holds NLTK's German stop-word list. It is (re)defined here for
# convenience so that this section can be run standalone, without first
# executing the earlier code in this chapter.
stop = stopwords.words('german')


# Pre-compiled patterns used by `tokenizer`. Raw strings avoid the
# invalid-escape warnings that '\)' and '\W' trigger in plain literals.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
# The search runs on lower-cased text, so the "mouth" alternatives must be
# lower-case too; an upper-case 'D|P' could never match there.
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|d|p)')
_NON_WORD_RE = re.compile(r'[\W]+')


def tokenizer(text):
    """Split raw text into lower-case tokens with stop words removed.

    Steps: strip HTML-like tags, lower-case, collect emoticons such as
    ``:)``, ``;-(`` or ``:d``, replace every run of non-word characters
    with a space, then append the emoticons (without their ``-`` nose).
    Tokens contained in the module-level ``stop`` collection are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Word tokens in document order, followed by the emoticons found.
    """
    lowered = _HTML_TAG_RE.sub('', text).lower()
    emoticons = _EMOTICON_RE.findall(lowered)
    words = _NON_WORD_RE.sub(' ', lowered)
    # The explicit ' ' keeps the first emoticon from fusing with the last
    # word when the text does not end in a non-word character.
    cleaned = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in cleaned.split() if w not in stop]


def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a one-record-per-line file.

    The first line is treated as a header and skipped. Every other
    non-blank line must end with a one-character separator followed by a
    single-digit label; everything before the separator is the text.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded file to read.

    Yields
    ------
    tuple of (str, int)
        The document text and its integer label.

    Raises
    ------
    ValueError
        If the last character of a record is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # A default value keeps an empty file from raising StopIteration
        # inside the generator (which Python turns into RuntimeError).
        next(handle, None)  # skip header
        for line in handle:
            # Strip the line ending first so the final line parses the same
            # way whether or not the file ends with a newline.
            record = line.rstrip('\r\n')
            if not record:
                continue  # ignore blank lines
            yield record[:-2], int(record[-1])

In [67]:
from nltk.stem import SnowballStemmer 

def stem_sentence(index, sentence):
    """Print each token of ``sentence`` next to its Porter and German stems.

    The sentence is split with ``tokenizer``; every resulting token is then
    stemmed by two ``SnowballStemmer`` instances ("porter" and "german")
    and the results are printed side by side.

    Parameters
    ----------
    index : int
        Number shown in the printed header to identify the sentence.
    sentence : str
        Raw sentence text.
    """
    tokens = tokenizer(sentence)

    print("\nSnowballStemmer (Porter vs. German):")
    porter, german = SnowballStemmer("porter"), SnowballStemmer("german")

    print("Sentence #", index, ":\"", sentence, "\"")
    for token in tokens:
        porter_stem = porter.stem(token)
        german_stem = german.stem(token)
        print(" => ", token, ":", porter_stem, "|", german_stem)
    
def stem_file(path, sample):
    """Run ``stem_sentence`` on every ``sample``-th data line of a file.

    The first line of the file is skipped as a header. Data lines are
    numbered from 0, and the lines whose number is a multiple of ``sample``
    are passed (unmodified, including the line ending) to ``stem_sentence``.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded file to read.
    sample : int
        Sampling interval; ``1`` processes every data line.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        next(handle)  # header line
        for row_number, row in enumerate(handle):
            if row_number % sample:
                continue
            stem_sentence(index=row_number, sentence=row)
            
# Print the stemmer comparison for every 500th data line of the corpus.
stem_file(path='german_emotions.csv', sample=500)


SnowballStemmer (Porter vs. German):
Sentence # 0 :" @Martin28a Sie haben ja auch Recht. Unser Tweet war etwas missverständlich. Dass das BVerfG Sachleistungen nicht ausschließt kritisieren wir.,0
 "
 =>  martin28a : martin28a | martin28a
 =>  ja : ja | ja
 =>  recht : recht | recht
 =>  tweet : tweet | tweet
 =>  missverständlich : missverständlich | missverstand
 =>  bverfg : bverfg | bverfg
 =>  sachleistungen : sachleistungen | sachleist
 =>  ausschließt : ausschließt | ausschliesst
 =>  kritisieren : kritisieren | kritisi
 =>  0 : 0 | 0

SnowballStemmer (Porter vs. German):
Sentence # 500 :" "@MartinSchulz Ihr hetzt permanent gegen Trump. Und diese ""Demokratiebgemeinschaft Europa"" ist auf Sand gebaut und aus dem Geld des von euch ausgebeuteten kleinen Mannes. Dann lieber ""Amerika First""  - das ist echt das ist ehrlich da kann sich der Ami (egal welche Farbe) mit identifizieren",0
 "
 =>  martinschulz : martinschulz | martinschulz
 =>  hetzt : hetzt | hetzt
 =>  permanent : pe

In [13]:
import numpy as np
import pandas as pd

def shuffle(path, rename, sep=';', header=False, random_state=None):
    """Write the rows of a CSV file back out in random order.

    The file is read with its first line as the header, the rows are
    permuted, and the result is written without the index column.

    Parameters
    ----------
    path : str
        CSV file to read.
    rename : int
        ``1`` writes to ``<name>_shuffled<ext>`` next to the source file;
        any other value overwrites ``path`` itself.
    sep : str, default ';'
        Field separator of the written file.
    header : bool, default False
        Whether to write the column names as the first line.
    random_state : int or None, default None
        Seed for a reproducible permutation. ``None`` uses NumPy's global
        random state, as before.

    Notes
    -----
    NOTE(review): with the defaults the output (``;``-separated, no header)
    differs from the input format this function reads, so a file shuffled
    in place cannot be fed through ``shuffle`` again unchanged.
    """
    import os  # local import: only needed for the path split below

    df = pd.read_csv(path, header=0)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    frame = df.reindex(rng.permutation(df.index))
    if rename == 1:
        # splitext only looks at the final path component, so a dot in a
        # directory name (or a path with no dot at all) is handled correctly.
        name, extension = os.path.splitext(path)
        newPath = name + "_shuffled" + extension
        print(path, '=>', newPath)
    else:
        newPath = path
    frame.to_csv(newPath, sep=sep, index=False, header=header)
        
# rename=1: write the shuffled rows to a new "_shuffled" file instead of
# overwriting the source file.
shuffle(path='german_emotions.csv', rename=1)

german_emotions.csv => german_emotions_shuffled.csv


In [6]:
import nltk
# Fetch the NLTK 'punkt' package.
# NOTE(review): the code in view reads `nltk.corpus.stopwords`, not 'punkt' —
# confirm whether the 'stopwords' corpus needs to be downloaded here as well.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martinkade/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Peek at the first (text, label) record to sanity-check the parsing.
next(stream_docs(path='german_emotions.csv'))

('@Martin28a Sie haben ja auch Recht. Unser Tweet war etwas missverständlich. Dass das BVerfG Sachleistungen nicht ausschließt kritisieren wir.',
 0)

In [5]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from an iterator.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced in place.
    size : int
        Maximum number of pairs to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` — two parallel lists of texts and labels. If the
        stream runs out part-way through, the shorter final batch is
        returned. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be collected.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep what was already read so the trailing partial batch is not
        # thrown away; signal exhaustion only for a completely empty batch.
        if not docs:
            return None, None
    return docs, y

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# Hashing vectorizer with 2**21 feature columns that splits documents with
# the custom `tokenizer` defined above; undecodable bytes are ignored.
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

In [7]:
import re

from sklearn import __version__ as sklearn_version

# Compare numeric (major, minor) tuples instead of using
# distutils.version.LooseVersion: distutils was removed in Python 3.12.
sklearn_major_minor = tuple(
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn_version).groups()
)

# The logistic-regression loss is called 'log_loss' from scikit-learn 1.1 on;
# the old name 'log' was removed in 1.3.
log_loss_name = 'log_loss' if sklearn_major_minor >= (1, 1) else 'log'

# `max_iter` was introduced in scikit-learn 0.19; older releases only
# understand `n_iter`.
if sklearn_major_minor < (0, 19):
    clf = SGDClassifier(loss=log_loss_name, random_state=1, n_iter=1)
else:
    clf = SGDClassifier(loss=log_loss_name, random_state=1, max_iter=1)

# Fresh document stream for the out-of-core training loop.
doc_stream = stream_docs(path='german_emotions.csv')

In [8]:
# NOTE(review): the names below look like category names from the source
# data set; only the binary labels 0/1 are used in this cell — confirm how
# they map onto these categories.
#OTHER
#PROFANITY
#ABUSE
#INSULT
#OFFENSE

# Optional progress bar (disabled; would require pyprind):
#import pyprind
#pbar = pyprind.ProgBar(45)

# partial_fit is given the full set of class labels on every call.
classes = np.array([0, 1])
# Out-of-core training: at most 45 mini-batches of 100 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=100)
    # Stop early when get_minibatch returns no documents.
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    #pbar.update()

In [9]:
# Evaluate on the next 500 documents of the same stream, i.e. the records
# that follow the ones consumed by the training loop.
# NOTE(review): get_minibatch can return (None, None) when the stream is
# exhausted; that case is not guarded here.
X_test, y_test = get_minibatch(doc_stream, size=500)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.752


In [10]:
# Fold the evaluation batch into the model as well. From here on X_test is
# no longer unseen data for `clf`.
clf = clf.partial_fit(X_test, y_test)

In [11]:
#print(clf)

In [12]:
#print(y_test)

In [13]:
# Rebind `doc_stream` to a separate validation file and score the model on
# its first 7 records.
doc_stream = stream_docs(path='german_validation.csv')

X_val, y_val = get_minibatch(doc_stream, size=7)
X_val = vect.transform(X_val)
print('Accuracy: %.3f' % clf.score(X_val, y_val))

Accuracy: 0.714


In [14]:
# Predicted class label for each of the validation documents.
predictions = clf.predict(X_val)
print(predictions)

[0 1 0 0 1 0 1]
