In [1]:
import nltk

# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop = stopwords.words('english')

stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [3]:
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
df = pd.read_csv('movie_data.csv')

**Out-of-core algorithm**

In [5]:
import re

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

to read and return one document at a time

In [6]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

next (stream_docs(path='movie_data.csv'))

('"After five years in prison, Tony le Stéphanois (Jean Servais) meets his dearest friends Jo (Carl Möhner) and the Italian Mario Ferrati (Robert Manuel) and they invite Tony to steal a couple of jewels from the show-window of the famous jewelry Mappin & Webb Ltd, but he declines. Tony finds his former girlfriend Mado (Marie Sabouret), who became the lover of the gangster owner of the night-club L\' Âge d\' Or Louis Grutter (Pierre Grasset), and he humiliates her, beating on her back and taking her jewels. Then he calls Jo and Mario and proposes a burglary of the safe of the jewelry. They invite the Italian specialist in safes and elegant wolf Cesar (Perlo Vita) to join their team and they plot a perfect heist. They are successful in their plan, but the D. Juan Cesar makes things go wrong when he gives a valuable ring to his mistress.<br /><br />""Du Rififi Chez les Hommes"" is a magnificent film-noir, certainly among the best I have seen. The screenplay has credibility, supported by a

function to return a particular number of documents by the *size* parameter

In [7]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y 

using HashingVectorizer

In [8]:
import pyprind

In [13]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

In [14]:
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()



In [15]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %0.3f' %clf.score(X_test, y_test))

Accuracy: 0.855


**Serializing fitted scikit-learn estimators**

(Via the dump method of the pickle module, we then
serialized the trained logistic regression model as well as the stop word set from the
Natural Language Toolkit (NLTK) library, so that we don't have to install the NLTK
vocabulary on our server.
    
The dump method takes as its first argument the object that we want to pickle,
and for the second argument we provided an open file object that the Python object
will be written to. Via the wb argument inside the open function, we opened the file
in binary mode for pickle, and we set protocol=4 to choose the latest and most
efficient pickle protocol that has been added to Python 3.4, which is compatible with
Python 3.4 or newer)

In [16]:
import pickle
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)

pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)