In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
import csv


# `stop` holds the German stop words from the NLTK corpus and is defined
# earlier in this chapter. It is repeated here for convenience, so that
# this section can be run standalone without first executing the prior
# code in this directory. `tokenizer` below drops every token found in it.
stop = stopwords.words('german')


def tokenizer(text, stop_words=None):
    """Split raw text into lowercase tokens, keeping emoticons and dropping stop words.

    Parameters
    ----------
    text : str
        Raw document; anything that looks like a ``<...>`` tag is removed first.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop``.

    Returns
    -------
    list of str
        The word tokens in document order, followed by the emoticons that
        were found (with any ``-`` "nose" removed).
    """
    if stop_words is None:
        stop_words = stop

    # Raw strings keep the regex escapes (\), \(, \W) out of Python's own
    # string-escape processing.
    lowered = re.sub(r'<[^>]*>', '', text).lower()

    # The search runs on lowercased text, so the "D"/"P" mouths have to be
    # spelled d/p here; with uppercase letters they could never match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|d|p)', lowered)

    words = re.sub(r'[\W]+', ' ', lowered)

    # The explicit ' ' separator guarantees that the first emoticon is not
    # glued onto the last word when the text ends in a word character.
    combined = words + ' ' + ' '.join(emoticons).replace('-', '')

    return [w for w in combined.split() if w not in stop_words]
         

In [2]:
import numpy as np
import pandas as pd

def shuffle(path, rename):
    """Shuffle the rows of a ';'-delimited CSV file and write them back without a header.

    The first row of the input is read as the header (``header=0``) and is
    not written to the output (``header=False``).

    Parameters
    ----------
    path : str
        Path of the CSV file to shuffle.
    rename : int
        If equal to 1, the result is written next to the input as
        ``<name>_shuffled<extension>`` and the mapping is printed.
        Otherwise the input file itself is overwritten.

    Notes
    -----
    Because the first row is always consumed as the header, shuffling a file
    in place a second time drops its first data row.
    """
    import os  # function-scope import keeps this cell runnable on its own

    df = pd.read_csv(path, header=0, delimiter=";")
    frame = df.reindex(np.random.permutation(df.index))

    if rename == 1:
        # splitext only inspects the last path component, so dots in
        # directory names and extension-less files are handled correctly.
        name, extension = os.path.splitext(path)
        newPath = name + "_shuffled" + extension
        print(path, '=>', newPath)
    else:
        newPath = path

    frame.to_csv(newPath, sep=';', index=False, header=False)
        
#shuffle(path='trainingsdata/training_gesamt_extended_cutted.csv', rename=1)

In [3]:
from nltk.stem import SnowballStemmer 

def stem_sentence(sentence, algorithm):
    """Tokenize `sentence`, stem every token and return the stems joined by single spaces.

    `algorithm` is handed unchanged to `SnowballStemmer` to select the
    stemming algorithm.
    """
    tokens = tokenizer(sentence)
    stemmer = SnowballStemmer(algorithm)
    stems = (stemmer.stem(token) for token in tokens)
    return ' '.join(stems)

In [4]:
def stream_docs(path, algorithm='porter'):
    """Lazily yield ``(stemmed_text, label)`` pairs, one per line of a UTF-8 text file.

    Each line is expected to end in a one-character separator followed by a
    single-digit label (e.g. ``some text;1``). The text is everything before
    that separator and is passed through `stem_sentence`.

    Parameters
    ----------
    path : str
        File to read. No header line is skipped.
    algorithm : str, optional
        Stemmer name forwarded to `stem_sentence`; defaults to 'porter'.
        NOTE(review): the stop words in this notebook are German - confirm
        that 'porter' rather than a German stemmer is intended.

    Yields
    ------
    (str, int)
        The stemmed text and its integer label.
    """
    with open(path, 'r', encoding='utf-8') as lines:
        for line in lines:
            # Drop the line terminator first, so a final line without a
            # trailing newline is parsed the same way as every other line.
            record = line.rstrip('\r\n')
            if not record:
                continue  # blank line: nothing to yield
            text, label = record[:-2], int(record[-1])
            yield stem_sentence(text, algorithm), label

In [5]:
# Smoke test: pull the first (text, label) pair from the shuffled training
# file so the streaming and stemming output can be inspected.
next(stream_docs(path='trainingsdata/training_gesamt_extended_cutted_shuffled.csv'))

('ch hass sarah connor aba lie music is the key oda is eig ganz gut hörbar liegt aba sicha net sarah natur 7',
 0)

In [6]:
def get_minibatch(doc_stream, size, i=0):
    """Collect up to `size` (text, label) pairs from `doc_stream`.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding (text, label) pairs; it is advanced with `next`.
    size : int
        Maximum number of documents to pull from the stream.
    i : int, optional
        Unused. Kept (now with a default) so that existing calls passing a
        batch index keep working.

    Returns
    -------
    tuple
        ``(docs, y)`` with the texts and labels as two lists. If the stream
        runs out part-way through, the partial batch is returned instead of
        being thrown away. ``(None, None)`` is returned only when the stream
        was already exhausted and nothing could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break  # keep the documents gathered so far
        docs.append(text)
        y.append(label)
    return docs, y

In [7]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# Vectorizer shared by the training, test and validation cells. It is only
# ever used through `transform` in this notebook (it is never fitted).
# - decode_error='ignore': decoding errors in the input are ignored
# - n_features=2**21: size of the hashed feature space
# - tokenizer=tokenizer: use the custom tokenizer defined at the top
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

In [8]:
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

# Build a logistic-loss SGD classifier, choosing the iteration keyword by the
# installed scikit-learn version: `n_iter` below 0.18, `max_iter` otherwise.
# NOTE(review): `distutils` (imported above for LooseVersion) is removed from
#   the standard library in Python 3.12 - confirm the target Python version.
# NOTE(review): presumably `max_iter` was only introduced in scikit-learn
#   0.19, which would send 0.18.x down the wrong branch - verify the threshold.
# NOTE(review): newer scikit-learn releases appear to name this loss
#   'log_loss' instead of 'log' - verify against the installed version.
if Version(sklearn_version) < '0.18':
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
else:
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

# One shared generator over the shuffled training file. The training loop and
# the test cell below both consume from it, so cell execution order matters.
doc_stream = stream_docs(path='trainingsdata/training_gesamt_extended_cutted_shuffled.csv')

In [9]:
# Only needed for the progress bar
import pyprind

# Number of mini-batches and documents per mini-batch. Defined once so that
# the progress-bar length and the loop bound cannot drift apart.
N_BATCHES = 35
BATCH_SIZE = 100

pbar = pyprind.ProgBar(N_BATCHES)

# partial_fit needs the full set of class labels up front.
classes = np.array([0, 1])
for i in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE, i=i)
    if not X_train:
        # Stream exhausted before all batches were drawn.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [10]:
# Evaluate on the next 500 documents of the same stream, i.e. the ones that
# follow the documents already consumed by the training loop.
X_test, y_test = get_minibatch(doc_stream, size=500, i = 0) #900
# `X_test` is rebound from the raw texts to their hashed feature matrix.
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.836


In [11]:
# Fold the test batch into the model as well. From here on the classifier has
# seen X_test, so the accuracy printed above does not describe the updated model.
clf = clf.partial_fit(X_test, y_test)

In [12]:
# Rebind `doc_stream` to a separate validation file; the training stream is
# no longer reachable under this name after this line.
doc_stream = stream_docs(path='trainingsdata/german_validation.csv')

# Request 7 validation documents, vectorize them and report the accuracy.
X_val, y_val = get_minibatch(doc_stream, size=7, i = 0)
X_val = vect.transform(X_val)
print('Accuracy: %.3f' % clf.score(X_val, y_val))

Accuracy: 0.571


In [13]:
# Show the predicted class label for each validation document.
predictions = clf.predict(X_val)
print(predictions)

[1 0 0 0 0 0 0]
