# Classifier
Generating pickle for classifier for online deployment

In [3]:
import pyprind
import pandas as pd
import os

basepath = "aclImdb_v1/aclImdb"

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index=True)
                pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:46


In [6]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding='utf-8')

In [37]:
df = pd.read_csv('movie_data.csv', encoding = 'utf-8')
df.head(5)
df.shape

(50000, 2)

In [38]:
# import nltk
# nltk.download('stopwords')

import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(txt):
    txt = re.sub('<[^>]*>', '', txt)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', txt.lower())
    txt = re.sub('[\W]+', ' ', txt.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in txt.split() if w not in stop]
    return tokenized

In [39]:
stop[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [40]:
def stream_docs(path):
    with open(path, 'r', encoding ='utf-8') as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [41]:
next(stream_docs(path='movie_data.csv'))

('"This is a case of taking a fairy tale too far. The Enchanted Cottage delivers Dorothy McGuire as a ""terrible ugly"" spinster and Robert Young as a disfigured pilot. Long story short: Scarface marries Spinster, after which their love transforms them, miraculously (lighting, cosmetics and the removal of fake scars), into beautiful people\x97a magical change that they attribute to the enchantment of living in a seaside cottage that has been the abode of generations of honeymooners.<br /><br />If the story stopped there, fine; it would be a fable with a proverbial message: beauty is in the eye of the beholder. But it lurches ahead, reaching for reality. When Mr. and Mrs. Scarface greet their public, it comes as a painful shock to them that they\'re still homely. You see, they only appear beautiful to each other\x96 a situation which the audience is well prepared for because all the secondary characters have been sermonizing that ill-favored people really need to lower their expectation

In [42]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y


In [43]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)

clf=SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

In [44]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # print(X_train[:2])
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:19


In [45]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.859


In [46]:
import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(stop, open(os.path.join(dest,'stopwords.pkl'),'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest,'classifier.pkl'),'wb'), protocol=4)