In [65]:
import pyprind
import numpy as np
import pandas as pd

import os
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [39]:
# change the 'basepath' to the directory of the
# unzipped movie dataset
basepath = 'C:\\Users\\alejo\\OneDrive\\Documents\\Datasets\\aclImdb'

# Gather every review .txt file into a single DataFrame.
# Rows are collected in a plain list and the DataFrame is built once at the
# end: calling pd.concat inside the loop copies the whole frame again on every
# iteration, which makes the load quadratic in the number of files.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)  # progress bar sized for 50,000 files
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # sorted() keeps the row order independent of the OS listing order
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            records.append((txt, labels[l]))
            pbar.update()
df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:58


In [73]:
# Shuffle the rows with a fixed seed, then persist the shuffled frame as CSV
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv(
    'C:\\Users\\alejo\\OneDrive\\Documents\\Datasets\\movie_data.csv',
    index=False,
    encoding='utf-8',
)

In [41]:
# Read the dataset back from the same file the save step writes. The bare
# relative name 'movie_data.csv' is resolved against the current working
# directory instead, so it could silently load a different (stale) copy.
df = pd.read_csv('C:\\Users\\alejo\\OneDrive\\Documents\\Datasets\\movie_data.csv',
                 encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [42]:
# Sanity check on the loaded frame's dimensions (rows, columns)
df.shape

(50000, 2)

### Testing bag-of-words and tf-idf on toy sentences

In [47]:
# Bag-of-words demo: fit a CountVectorizer on three toy sentences, then show
# the learned vocabulary followed by the raw term-count matrix
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])
bag = count.fit_transform(docs)
print(count.vocabulary_, end='\n\n')
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [52]:
# tf-idf demo: re-weight the raw term counts of the toy sentences
tfidf = TfidfTransformer(smooth_idf=True, norm='l2', use_idf=True)
np.set_printoptions(precision=2)
term_counts = count.fit_transform(docs)
tfidf_weights = tfidf.fit_transform(term_counts)
print(tfidf_weights.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


### Cleaning up data

In [53]:
def preprocessor(text):
    """Strip markup and punctuation from a review while keeping emoticons.

    HTML-like tags are removed, the text is lower-cased and every run of
    non-word characters is collapsed to a single space. Emoticons found in
    the original text are appended at the end with their '-' nose removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text, followed by any emoticons as separate
        space-delimited tokens.
    """
    # Raw strings: the regex escapes are not valid Python string escapes and
    # trigger an invalid-escape-sequence warning in ordinary string literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Without a separator the first emoticon is glued onto the last word
    # whenever the cleaned text does not already end in a space.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + ' '.join(emoticons).replace('-', '')

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
  text = (re.sub('[\W]+', ' ', text.lower()) +


In [55]:
# Clean every review, then peek at the tail of the first cleaned review
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews
df.loc[0, 'review'][-50:]

'zation my vote is seven title brazil not available'

In [57]:
# Minimal whitespace tokenizer, followed by a quick smoke test
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting words."""
    words = text.split()
    return words
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [60]:
# Porter stemming: reduce each word to its stem (e.g. 'running' -> 'run')
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every word."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [62]:
# Fetch the NLTK stop-word corpus and load the English stop-word list
nltk.download('stopwords')

stop = stopwords.words('english')
# Demo: stem a sample sentence, then drop every stem that is a stop word
sample_stems = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[w for w in sample_stems if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alejo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['runner', 'like', 'run', 'run', 'lot']

### Splitting dataset

In [63]:
# .loc slices by label and includes BOTH endpoints, so `.loc[:25000]` put row
# 25000 in the training set while `.loc[25000:]` also put it in the test set.
# Stopping the training slice at label 24999 keeps the two halves disjoint
# (25,000 rows each for the 50,000-row frame).
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

### Finding the optimal model

In [71]:
# Grid search (5-fold cross-validation) over vectorizer and
# logistic-regression settings
tfidf = TfidfVectorizer(lowercase=False, preprocessor=None, strip_accents=None)

# Settings shared by both sub-grids
shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}
param_grid = [
    # first sub-grid: vectorizer keeps its default idf weighting and norm
    dict(shared_grid),
    # second sub-grid: idf weighting and normalisation switched off
    dict(shared_grid, **{'vect__use_idf': [False], 'vect__norm': [None]}),
]

logreg = LogisticRegression(random_state=0, solver='liblinear')
lr_tfidf = Pipeline([('vect', tfidf), ('clf', logreg)])
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=1,
)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'o



[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'o



[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'o



[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'o



[CV] END clf__C=1.0, clf__penalty=l1, vect__ngram_range=(1, 1), vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'o



KeyboardInterrupt: 

In [72]:
# Requires a completed gs_lr_tfidf.fit(...): the recorded AttributeError comes
# from running this after the grid search was interrupted.
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
# Report the best cross-validation score found by the grid search, then score
# the best estimator on the held-out test split.
print('CV Accuracy: %.3f'
    % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f'
    % clf.score(X_test, y_test))

# NOTE(review): no output is recorded for this cell (the grid search above was
# interrupted), so the ~90% accuracy figure is an expectation, not a result
# measured in this run.

## Out-of-core Learning

In [83]:
import numpy as np
import re
import pyprind
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [74]:
# English stop-word list used by the tokenizer defined below
stop = stopwords.words('english')
# tokenizer function to clean up unprocessed text data
def tokenizer(text, stop_words=None):
    """Clean a raw review and split it into stop-word-free tokens.

    HTML-like tags are removed, the text is lower-cased, runs of non-word
    characters become a single space, and emoticons found in the original
    text are appended as extra tokens with their '-' nose removed.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : iterable of str, optional
        Words to drop from the result. Defaults to the module-level ``stop``
        list, which is what the one-argument call made by a vectorizer uses.

    Returns
    -------
    list of str
        The remaining tokens, in order of appearance.
    """
    # Raw strings: the regex escapes are not valid Python string escapes and
    # trigger an invalid-escape-sequence warning in ordinary string literals.
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern needs an upper-case D or P,
    # which can never match once the text has been lower-cased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # The explicit space stops the first emoticon from being glued onto
        # the last word; split() below ignores any doubled whitespace.
        cleaned = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    # NOTE(review): if movieclassifier/vectorizer.py carries its own copy of
    # this tokenizer, keep the two in sync so training and prediction hash
    # the same tokens.
    excluded = set(stop if stop_words is None else stop_words)
    tokenized = [w for w in cleaned.split() if w not in excluded]
    return tokenized

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
  text = re.sub('[\W]+', ' ', text.lower()) \


In [78]:
# reads and returns one document at a time
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The file is parsed with the ``csv`` module rather than by slicing fixed
    character offsets off each line, so quoted fields (embedded commas,
    doubled quotes, line breaks) are decoded properly and a final row
    without a trailing newline is still read correctly.

    Parameters
    ----------
    path : str
        Path to a UTF-8 CSV file with a header row, whose last column is the
        integer label.

    Yields
    ------
    tuple of (str, int)
        The review text and its label.
    """
    import csv  # local import keeps this cell self-contained

    # newline='' is what the csv module expects, so that line breaks inside
    # quoted fields are passed through untouched
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip header (tolerates an empty file)
        for row in reader:
            if not row:
                continue  # blank line
            # Re-join any extra columns so an unquoted comma in the text does
            # not lose data; the label is always the last column.
            text, label = ','.join(row[:-1]), int(row[-1])
            yield text, label
            
# Smoke test: pull the first (text, label) pair from the CSV
first_doc = next(stream_docs(path='C:\\Users\\alejo\\OneDrive\\Documents\\Datasets\\movie_data.csv'))
first_doc

('election is a chinese mob movie or triads in this case every two years an election is held to decide on a new leader and at first it seems a toss up between big d tony leung ka fai or as i know him the other tony leung and lok simon yam who was judge in full contact though once lok wins big d refuses to accept the choice and goes to whatever lengths he can to secure recognition as the new leader unlike any other asian film i watch featuring gangsters this one is not an action movie it has its bloody moments when necessary as in goodfellas but it s basically just a really effective drama there are a lot of characters which is really hard to keep track of but i think that plays into the craziness of it all a bit a 100 year old baton which is the symbol of power i mentioned before changes hands several times before things settle down and though it may appear that the film ends at the 65 or 70 minute mark there are still a couple big surprises waiting simon yam was my favorite character 

In [81]:
# returns a small batch of documents using the doc_stream function
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream runs out part-way
        through, the documents collected so far are returned instead of being
        thrown away. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be collected.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break  # keep the partial final batch
        docs.append(text)
        y.append(label)
    return docs, y

In [89]:
# Out-of-core setup: hashing vectorizer, SGD classifier with log loss,
# and a document stream over the saved CSV
movie_csv = 'C:\\Users\\alejo\\OneDrive\\Documents\\Datasets\\movie_data.csv'
vect = HashingVectorizer(
    tokenizer=tokenizer,
    preprocessor=None,
    n_features=2**21,
    decode_error='ignore',
)
clf = SGDClassifier(random_state=1, loss='log_loss')
doc_stream = stream_docs(path=movie_csv)

In [90]:
# Out-of-core training: feed the classifier up to 45 mini-batches of
# 1000 documents each, stopping early if the stream runs dry
N_BATCHES = 45
BATCH_SIZE = 1000
classes = np.array([0, 1])
pbar = pyprind.ProgBar(N_BATCHES)
for _ in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE)
    if X_train:
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        break

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:12


In [91]:
# Evaluate on the next 5000 documents from the stream
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy: %.3f' % test_accuracy)

# roughly 86% accuracy in about 1% of the time of GridSearch

Accuracy: 0.866


In [92]:
# Now that evaluation is done, update the model with the 5000 test documents too
clf = clf.partial_fit(X_test, y_test)

## The ACTUAL start

In [93]:
# pickling the stop-word list and the trained classifier

import pickle
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok replaces the separate exists() check, which could race with
# another process creating the directory
os.makedirs(dest, exist_ok=True)
# `with` guarantees each file is flushed and closed as soon as the dump
# finishes; the bare open(...) calls never closed their handles explicitly.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stopwords_file:
    pickle.dump(stop, stopwords_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as classifier_file:
    pickle.dump(clf, classifier_file, protocol=4)

In [94]:
# unpickling
os.chdir('C:\\Users\\alejo\\OneDrive\\Escritorio\\GitHub Files\\Jupyter Notebooks\\movieclassifier')

import pickle
import re
import os
from vectorizer import vect
# Load the classifier inside a `with` block so the file handle is closed
# right after loading instead of being left open.
# Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from an untrusted file.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as classifier_file:
    clf = pickle.load(classifier_file)

finish
SGDClassifier(loss='log_loss', random_state=1)
Prediction: positive
Probability: 95.71%


In [95]:
# Classify a sample review with the loaded classifier and vectorizer
import numpy as np
label = {0:'negative', 1:'positive'}
example = ["I love this movie. It's amazing."]
X = vect.transform(example)
predicted_sentiment = label[clf.predict(X)[0]]
# highest predicted probability, expressed as a percentage
confidence_pct = np.max(clf.predict_proba(X))*100
print('Prediction: %s\nProbability: %.2f%%' % (predicted_sentiment, confidence_pct))

Prediction: positive
Probability: 95.71%


## Creating a SQLite database to store prediction data

In [96]:
import sqlite3
import os

# Two sample reviews used to seed the table: (text, sentiment label)
example1 = 'I love this movie'
example2 = 'I disliked this movie'

conn = sqlite3.connect('reviews.sqlite')
# try/finally closes the connection even if a statement raises; previously a
# failing execute() left the database file open.
try:
    c = conn.cursor()
    # Recreate the table from scratch so re-running the cell is idempotent
    c.execute('DROP TABLE IF EXISTS review_db')
    c.execute('CREATE TABLE review_db'
              ' (review TEXT, sentiment INTEGER, date TEXT)')
    # One parameterised statement for all rows instead of copy-pasted INSERTs
    c.executemany("INSERT INTO review_db"
                  " (review, sentiment, date) VALUES"
                  " (?, ?, DATETIME('now'))",
                  [(example1, 1), (example2, 0)])
    conn.commit()
finally:
    conn.close()


In [97]:
# Fetch every review dated between the start of 2017 and now
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
date_range_query = (
    "SELECT * FROM review_db WHERE date"
    " BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')"
)
c.execute(date_range_query)
results = c.fetchall()
conn.close()
print(results)

[('I love this movie', 1, '2024-04-30 03:54:51'), ('I disliked this movie', 0, '2024-04-30 03:54:51')]
