In [2]:
import numpy as np
import re
from nltk.corpus import stopwords

In [3]:
# English stop-word list from NLTK. tokenizer() drops every token found in
# this list, and the list itself is pickled next to the classifier at the end
# of the notebook.
stop = stopwords.words('english')

In [5]:
def tokenizer(text, stop_words=None):
    """Split a raw review into lower-case tokens, keeping emoticons.

    Steps:
      1. Strip HTML tags.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:p`` from the
         lower-cased text.
      3. Replace every run of non-word characters with a single space.
      4. Append the emoticons (with the ``-`` "nose" removed) as extra tokens.
      5. Drop stop words.

    Parameters
    ----------
    text : str
        Raw document text; may contain HTML markup.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the module-level ``stop`` list is
        used, which keeps the one-argument call made by the vectorizer
        working unchanged.

    Returns
    -------
    list of str
        Word tokens in document order, followed by any emoticons.
    """
    if stop_words is None:
        stop_words = stop
    # Set membership keeps the filter below linear in the number of tokens.
    blocked = set(stop_words)

    lowered = re.sub(r'<[^>]*>', '', text).lower()

    # The search runs on lower-cased text, so the "big grin"/"tongue" mouths
    # have to be listed as d/p; an upper-case D|P alternative can never match
    # here.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|d|p)', lowered)

    words = re.sub(r'[\W]+', ' ', lowered)
    # The explicit ' ' separator stops the last word from fusing with the
    # first emoticon when the text ends in a word character ("movie:)").
    combined = words + ' ' + ' '.join(emoticons).replace('-', '')

    return [w for w in combined.split() if w not in blocked]

In [7]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is split at its LAST comma: everything before it
    is the review text (returned verbatim, including any CSV quoting) and
    everything after it is parsed as the integer class label.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Yields
    ------
    tuple of (str, int)
        The review text and its label.

    Raises
    ------
    ValueError
        If a data line has no comma or its label is not an integer.
    """
    with open(path, 'r') as csv_file:
        # A default for next() keeps an empty file from leaking StopIteration
        # out of the generator (a RuntimeError under PEP 479).
        next(csv_file, None)  # skip header
        for line in csv_file:
            if not line.strip():
                continue
            # Splitting on the last comma instead of slicing fixed offsets
            # works with or without a trailing newline and with CRLF endings.
            text, label = line.rstrip('\r\n').rsplit(',', 1)
            yield text, int(label)

In [8]:
# Sanity check: pull the first (text, label) pair from a fresh, throw-away
# stream to confirm the file is parsed as expected.
next(stream_docs(path='movie_data.csv'))

('"I\'ll dispense with the obvious review of factual inaccuracies. They are too numerous to name. A much shorter list would be what they got right. 1. Dude named Noah. 2. Ark with animals on it. <br /><br />If you want a much more accurate portrayal of Noah\'s Ark and the destruction of Sodom, go rent ""The Bible"" (1966). It depicts the story of creation through Abraham attempting to sacrifice his son Isaac. It\'s a much better movie, and it may be that the abomination called ""Noah\'s Ark"" (1999) drove you to seek just such a film. http://www.imdb.com/title/tt0060164/<br /><br />I really couldn\'t stomach watching the whole movie. From reading other comments, I can see that even the atheists found it grossly inaccurate. As a Christian, it was intolerable to me. Possibly the worst movie ever made. No real point to this movie either, except maybe to showcase their sub-par computer animation.<br /><br />Was it a complete waste? Maybe not. God can use evil to work good. <br /><br />Roma

In [9]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` - two parallel lists of texts and labels. If the stream
        runs out part-way through, the documents collected so far are
        returned as a shorter final batch instead of being thrown away.
        ``(None, None)`` is returned only when the stream was already
        exhausted and nothing could be read.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    # Only report "no more data" when nothing at all was read; a partial
    # batch is still valid data for the caller.
    if exhausted and not docs:
        return None, None
    return docs, y

In [10]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Lazy stream over the review file; nothing is read until the first batch
# is requested.
doc_stream = stream_docs(path='movie_data.csv')

# Hashing-based vectorizer: it is only ever used through transform(), so it
# can be applied batch by batch. Tokenisation is delegated to tokenizer().
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# Linear classifier with log loss, trained incrementally via partial_fit().
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

In [11]:
import pyprind
# Out-of-core training: feed the classifier 45 mini-batches of 1000 documents.
pbar = pyprind.ProgBar(45)
# The full label set is passed explicitly because each partial_fit call only
# sees a single batch.
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # get_minibatch returns (None, None) once the stream is exhausted.
    if not X_train:
        break
    # Rebinds X_train from the list of raw texts to the vectorizer's output.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:34


In [12]:
# Hold-out evaluation: the next 5000 documents of the stream have not been
# used for training yet.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
# print(...) with a single parenthesised argument behaves the same on
# Python 2 and Python 3, unlike the bare print statement.
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accurancy: 0.868


In [14]:
# Fold the evaluation batch into the model as well. After this update the
# accuracy printed above no longer describes this exact model, because the
# test documents have now been used for training.
clf = clf.partial_fit(X_test, y_test)

# This is a very nice classifier so far!

# Next: Embedding this Machine Learning Model into a Web Application

In [17]:
import pickle
import os

# Serialise the stop-word list and the trained classifier for the web app.
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

# Context managers guarantee each file is flushed and closed as soon as its
# dump finishes, instead of leaving the handle open until garbage collection.
# Protocol 2 is kept so the files stay loadable from Python 2 as well.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=2)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=2)