The following implementation is taken almost entirely from the book **Python Machine Learning (PyTorch) by Sebastian Raschka, Yuxi (Hayden) Liu, Vahid Mirjalili**.

In [22]:
import pyprind
import numpy as np
import pandas as pd
import os
import sys

In [23]:
# Ignore all warnings from here on: the filter is installed without a
# message, category or module restriction, so it matches every warning.
# NOTE(review): presumably meant to keep cell outputs short; it also hides
# warnings that may point at real problems — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Build the review DataFrame from the raw dataset directory tree:
# <basepath>/{test,train}/{pos,neg}/<one text file per review>.
basepath = "../IMDB_dataset/"
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000, bar_char='█', stream=sys.stdout)

# Collect the rows in a plain list and build the DataFrame once at the end.
# Calling pd.concat inside the loop copies all previously loaded rows on
# every iteration, which makes the load quadratic in the number of files.
records = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        dir_path = os.path.join(basepath, split, label_name)
        # sorted() keeps the row order independent of the directory listing order.
        for fname in sorted(os.listdir(dir_path)):
            with open(os.path.join(dir_path, fname), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            records.append((txt, labels[label_name]))
            pbar.update()

df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:13


In [25]:
# Shuffle the rows with a fixed seed so the row order written to disk is
# reproducible, then store the shuffled frame as CSV (without the index).
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [26]:
# Reload the shuffled data from disk; columns that are still named '0'/'1'
# are mapped to their descriptive names (a no-op when they already match).
column_names = {'0': 'review', '1': 'sentiment'}
df = pd.read_csv("movie_data.csv", encoding='utf-8').rename(columns=column_names)
preview = df.head(3)
print(preview)
print(df.shape)

                                              review  sentiment
0  In 1974, the teenager Martha Moxley (Maggie Gr...          1
1  OK... so... I really like Kris Kristofferson a...          0
2  ***SPOILER*** Do not read this, if you think a...          0
(50000, 2)


In [27]:
# Instead of the plain bag-of-words model we shall use tf-idf weighting.
# NOTE: this cell builds a TfidfTransformer, not a TfidfVectorizer. The name
# `tfidf` is rebound to a TfidfVectorizer in the grid-search cell further
# down, and this transformer instance is not used anywhere in between.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
# Print NumPy arrays with two digits of precision.
np.set_printoptions(precision=2)

In [28]:
# prepare the data
import re
def preprocessor(text):
    """Clean a raw review string.

    Steps, in order:
      1. remove HTML-like tags (``<...>``),
      2. collect emoticons such as ``:)``, ``;-(``, ``=D`` from the text,
      3. lowercase the text and collapse every run of non-word characters
         into a single space,
      4. append the collected emoticons (with any ``-`` removed) at the end,
         separated from the text and from each other by single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings: '\)', '\(' and '\W' are regex escapes, not string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Without a separator the first emoticon would be glued to the last word
    # whenever the cleaned text ends in a word character ("really:)").
    if emoticons and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + ' '.join(emoticons).replace('-', '')

In [29]:
# Clean every review with `preprocessor` and store the result back in place.
cleaned_reviews = df["review"].apply(preprocessor)
df["review"] = cleaned_reviews

In [30]:
# two tokenizers
def tokenizer(text):
    """Return the whitespace-separated tokens of `text` as a list."""
    tokens = text.split()
    return tokens

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split `text` on whitespace and return each token stemmed by `porter`."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [31]:
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [32]:
# Use the first 25000 rows for training and the remaining rows for testing.
# Positional slicing (.iloc) is end-exclusive, so the two parts are disjoint.
# The label-based `.loc[:25000]` slice used before is end-inclusive: it put
# row 25000 into BOTH the training and the test set.
n_train = 25000
X_train = df["review"].iloc[:n_train].values
y_train = df["sentiment"].iloc[:n_train].values
X_test = df["review"].iloc[n_train:].values
y_test = df["sentiment"].iloc[n_train:].values

In [33]:
# We'll train a Naive Bayes classifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer options that stay fixed during the search; the remaining ones
# are supplied through the `vect__*` entries of the parameter grid.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Entries shared by both sub-grids.
shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'clf__alpha': [1.0, 0.1, 0.01],
}

# Sub-grid 1: default tf-idf settings, plain vs. Porter-stemming tokenizer.
tfidf_grid = {
    **shared_grid,
    'vect__stop_words': [None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
}

# Sub-grid 2: idf weighting and normalization switched off, with and
# without the stop-word list, plain tokenizer only.
counts_grid = {
    **shared_grid,
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer],
    'vect__use_idf': [False],
    'vect__norm': [None],
}

param_grid = [tfidf_grid, counts_grid]

# Step names must match the `vect__` / `clf__` prefixes used in the grids.
pipeline_steps = [
    ("vect", tfidf),
    ("clf", MultinomialNB()),
]
nb_tfidf = Pipeline(pipeline_steps)

gs_nb_tfidf = GridSearchCV(
    nb_tfidf,
    param_grid,
    scoring="accuracy",
    cv=5,
    verbose=2,
    n_jobs=1,
)
gs_nb_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x15cd14d60>; total time=   1.2s
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x15cd14d60>; total time=   1.3s
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x15cd14d60>; total time=   1.3s
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x15cd14d60>; total time=   1.3s
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x15cd14d60>; total time=   1.2s
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x15cd14c20>; total time=  36.5s
[CV] END clf__alpha=1.0, vect__ngram_range=(1, 1), vect_

In [34]:
# Parameter combination with the best mean cross-validation score.
best_params = gs_nb_tfidf.best_params_
print(f'Best parameter set: {best_params}')

Best parameter set: {'clf__alpha': 1.0, 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x15cd14d60>}


In [36]:
# Mean cross-validated accuracy of the best parameter combination.
best_cv_accuracy = gs_nb_tfidf.best_score_
print(f'CV Accuracy: {best_cv_accuracy:.3f}')

CV Accuracy: 0.857


In [37]:
# Score the best estimator found by the grid search on the held-out rows.
clf = gs_nb_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.3f}')

Test Accuracy: 0.859


In [38]:
import pickle
import os
# inside the models folder
dest = os.path.join("models", "IMDB_reviews_classifier")
# exist_ok=True replaces the separate exists()-then-create check.
os.makedirs(dest, exist_ok=True)

# The context manager closes the file even if pickling fails; the bare
# open() used before never closed the handle explicitly.
model_path = os.path.join(dest, "IMDB_reviews_classifier_multinomial_naive_bayes.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(clf, model_file, protocol=4)

In [41]:
# test the model: reload it from disk and score it on the test data again
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files written by this notebook or another trusted source.
model_path = os.path.join(dest, "IMDB_reviews_classifier_multinomial_naive_bayes.pkl")
# The context manager closes the file handle, which the bare open() did not.
with open(model_path, "rb") as model_file:
    clf = pickle.load(model_file)
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')

Test Accuracy: 0.859


ONLINE LEARNING (Larger data optimization)

In [42]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a two-column CSV file.

    The first line is treated as a header and skipped. Every other line is
    expected to end in ``,<digit>``: the final character is parsed as the
    integer label and everything before the separating comma is yielded
    verbatim as the text (CSV quoting is NOT undone).

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text part of the line and its label.

    Raises
    ------
    ValueError
        If the last character of a non-empty line is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # A default avoids StopIteration escaping from the generator (which
        # Python turns into RuntimeError) when the file is completely empty.
        next(csv_file, None)
        for line in csv_file:
            # Strip the newline first so a final line without one is parsed
            # the same way as all the others.
            line = line.rstrip('\n')
            if not line:
                continue  # skip blank lines instead of failing on indexing
            yield line[:-2], int(line[-1])

def get_minibatch(doc_stream, size):
    """Pull up to `size` documents from `doc_stream`.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` as two lists of equal length. When the stream runs out
        part-way through, the documents collected so far are returned as a
        shorter batch instead of being thrown away. ``(None, None)`` is
        returned only when the stream is exhausted and nothing was collected.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    # Keep the (None, None) sentinel for "no more data" so callers that test
    # `if not docs` continue to work.
    if exhausted and not docs:
        return None, None
    return docs, y

def tokenizer(text):
    """Clean `text` and return its tokens with stop words removed.

    The cleaning removes HTML-like tags, lowercases the text, collapses
    every run of non-word characters into one space and appends the
    emoticons found in the text (with any ``-`` removed). The result is
    split on whitespace and tokens contained in the module-level `stop`
    collection are dropped.

    NOTE: this rebinds the name `tokenizer`, which an earlier cell defined
    as a plain whitespace splitter and passed to the grid search.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, emoticons last.
    """
    # Raw strings: '\)', '\(' and '\W' are regex escapes, not string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Without a separator the first emoticon would be glued to the last word
    # whenever the cleaned text ends in a word character ("really:)").
    if emoticons and not cleaned.endswith(' '):
        cleaned += ' '
    text = cleaned + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# Hashing vectorizer: no vocabulary has to be fitted, so documents can be
# transformed batch by batch with `transform` alone.
vect = HashingVectorizer(
    n_features=2**21,
    tokenizer=tokenizer,
    preprocessor=None,
    decode_error='ignore',
)
# NOTE(review): newer scikit-learn releases name this loss 'log_loss';
# confirm the spelling against the installed version.
clf = SGDClassifier(loss='log', max_iter=1, random_state=1)
# Generator over the (text, label) rows of the shuffled CSV file.
doc_stream = stream_docs(path='movie_data.csv')

In [43]:
# Incrementally fit the classifier on up to 45 mini-batches of 1000 documents.
n_batches, batch_size = 45, 1000
pbar = pyprind.ProgBar(n_batches)
# The full label set is handed to every partial_fit call.
classes = np.array([0, 1])
for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=batch_size)
    if not X_train:
        # no batch was delivered: stop training early
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()



In [44]:
# Evaluate on the next 5000 documents of the stream, which the training
# loop above did not consume.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
holdout_accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {holdout_accuracy:.3f}')

Accuracy: 0.868
