In [1]:
from tqdm.notebook import tqdm, trange # to create progress bar    

import pandas as pd

import os # to open the compressed text files
import tarfile

import numpy as np

# model persistence
from joblib import dump, load

# For NLP
from sklearn.feature_extraction.text import CountVectorizer # n-gram tokenizer
from sklearn.feature_extraction.text import TfidfTransformer # term frequency-inverse document frequency transformer
import re # regukar expression
from nltk.stem.porter import PorterStemmer # It allows us to map related words to the same stem.
import nltk # it has a list of stop words
from nltk.corpus import stopwords

In [2]:
# # uncompress the data file
# with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
#     tar.extractall()

In [3]:
# # change the 'basepath' to the directory of the
# # unzipped movie dataset
# basepath = 'aclImdb'

# labels = {'pos': 1, 'neg': 0}


# df = pd.DataFrame()

# for s in tqdm(('test', 'train')):
#     for l in tqdm(('pos', 'neg')):
#         path = os.path.join(basepath, s, l)
#         for file in tqdm(sorted(os.listdir(path))):
#             with open(os.path.join(path, file),
#                       'r', encoding='utf-8') as infile:
#                 txt = infile.read()
#             df = df.append([[txt, labels[l]]],
#                            ignore_index=True)

In [4]:
# df.columns = ['review', 'sentiment']

In [5]:
# # shuffle examples
# np.random.seed(0)
# df = df.reindex(np.random.permutation(df.index))

# # save to a csv file
# df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [6]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
...,...,...
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0


In [7]:
# # Examples of constructing a bag-of-words model
# docs = np.array(['The sun is shining',
#                  'The weather is sweet',
#                  'The sun is shining, the weather is sweet, and one and one is two'])
# print(docs)

# # unigram 1-gram
# count = CountVectorizer()
# bag = count.fit_transform(docs)
# print("\n1-gram")
# print(count.vocabulary_)
# # print(bag)
# print(bag.toarray())

# # ngram 2-gram
# count = CountVectorizer(ngram_range=(2,2))
# bag = count.fit_transform(docs)
# print("\n2-gram")
# print(count.vocabulary_)
# # print(bag)
# print(bag.toarray())

# # ngram 12-gram
# count = CountVectorizer(ngram_range=(2,2))
# bag = count.fit_transform(docs)
# print("\n12-gram")
# print(count.vocabulary_)
# # print(bag)
# print(bag.toarray())

In [8]:
# # TfidfTransformer class, which takes the raw term frequencies from the CountVectorizer class as input
# # and transforms them into tf-idfs:

# docs = np.array(['The sun is shining',
#                  'The weather is sweet',
#                  'The sun is shining, the weather is sweet, and one and one is two'])
# print(docs)
                 
# count = CountVectorizer()
# bag = count.fit_transform(docs)
# print(count.vocabulary_)
# print(bag.toarray())
# # it is a transformer that downweights these frequently occurring words in the feature vectors
# tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 

# np.set_printoptions(precision=2)

# print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

In [9]:
def preprocessor(text):
    """A function that eliminates html 
        marks and retain emoticons"""
    
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

# apply the preprocessor previously defined function on the first column of the dataframe and store in it
df['review'] = df['review'].apply(preprocessor)
df

Unnamed: 0,review,sentiment
0,in 1974 the teenager martha moxley maggie grac...,1
1,ok so i really like kris kristofferson and his...,0
2,spoiler do not read this if you think about w...,0
3,hi for all the people who have seen this wonde...,1
4,i recently bought the dvd forgetting just how ...,0
...,...,...
49995,ok lets start with the best the building altho...,0
49996,the british heritage film industry is out of c...,0
49997,i don t even know where to begin on this one i...,0
49998,richard tyler is a little boy who is scared of...,0


In [10]:
# Simple tokenizer
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [11]:
# Tokenizer that uses the Porter stemming algorithm:

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')


['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [12]:
#  Remove stop words (and, like, a, the, as, etc.)
nltk.download('stopwords')

stop = stopwords.words('english')
# [w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joses\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
################## Processing ####################

X_train = df.loc[0:25000, 'review'].values
y_train = df.loc[0:25000, 'sentiment'].values

X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None) # equivalent to CountVectorizer + TfidfTransformer

param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf), # vectoryzer 
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))]) # classifier

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=2, # cv = number of folds
                           n_jobs=-1) #n proccesors, -1 all available

In [16]:
############## Train #################

# gs_lr_tfidf.fit(X_train, y_train)
# print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
# clf = gs_lr_tfidf.best_estimator_
# dump(clf, 'movie_review_classifier.joblib') 

In [18]:
clf = load('movie_review_classifier.joblib') 
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.899


In [44]:
labels = {0: 'negative', 1: 'positive'}

print(labels[clf.predict([example_test])[0]])

example_test = "The best movie I've seen in recent times."
print(f"The prediction is {labels[clf.predict([example_test])[0]]} with a probability of {np.max(clf.predict_proba([example_test]))}")
example_test = "The worst movie I've seen in recent times."
print(f"The prediction is {labels[clf.predict([example_test])[0]]} with a probability of {np.max(clf.predict_proba([example_test]))}")


negative
The prediction is positive with a probability of 0.9759399310821698
The prediction is negative with a probability of 0.9998255800144438


In [45]:
######## dump an NLP model and stop vocabulary with pickle 
import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')

print(dest)
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)

movieclassifier\pkl_objects
