In [1]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings

In [2]:
# Read files into python 
reviews_train = []
for line in open("../data/movie_data/full_train.txt", 'r', encoding='utf-8'):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open("../data/movie_data/full_test.txt", 'r', encoding='utf-8'):
    reviews_test.append(line.strip())
    
target = [1 if i < 12500 else 0 for i in range(25000)]

In [3]:
reviews_train[5]

"This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7.7/10 from...<br />

In [4]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [5]:
reviews_train_clean[5]

"this isn't the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robin's new love of the thriller but this isn't a thriller per se this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until william's character gets close to achieving his goal i must say that i was highly entertained though this movie fails to teach guide inspect or amuse it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective in other words it felt real and i was able to subscribe to the premise of the story all in all it's worth a watch though it's definitely not friday saturday night fare it rates a   from the fiend "

#### Baseline

In [6]:
# baseline_vectorizer = CountVectorizer(binary=True)
# baseline_vectorizer.fit(reviews_train_clean)
# X_baseline = baseline_vectorizer.transform(reviews_train_clean)
# X_test_baseline = baseline_vectorizer.transform(reviews_test_clean)

# X_train, X_val, y_train, y_val = train_test_split(
#     X_baseline, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# #     Accuracy for C=0.01: 0.87328
# #     Accuracy for C=0.05: 0.88368
# #     Accuracy for C=0.25: 0.8832
# #     Accuracy for C=0.5: 0.87872
# #     Accuracy for C=1: 0.87648

# final_model = LogisticRegression(C=0.05)
# final_model.fit(X_baseline, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_model.predict(X_test_baseline)))
# # Final Accuracy: 0.88176

#### Remove Stop Words

In [7]:
# from nltk.corpus import stopwords

# english_stop_words = stopwords.words('english')
# def remove_stop_words(corpus):
#     removed_stop_words = []
#     for review in corpus:
#         removed_stop_words.append(
#             ' '.join([word for word in review.split() 
#                       if word not in english_stop_words])
#         )
#     return removed_stop_words

# no_stop_words_train = remove_stop_words(reviews_train_clean)
# no_stop_words_test = remove_stop_words(reviews_test_clean)

# cv = CountVectorizer(binary=True)
# cv.fit(no_stop_words_train)
# X = cv.transform(no_stop_words_train)
# X_test = cv.transform(no_stop_words_test)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))

#### Stemming

In [8]:
# def get_stemmed_text(corpus):
#     from nltk.stem.porter import PorterStemmer
#     stemmer = PorterStemmer()

#     return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

# stemmed_reviews_train = get_stemmed_text(reviews_train_clean)
# stemmed_reviews_test = get_stemmed_text(reviews_test_clean)

# cv = CountVectorizer(binary=True)
# cv.fit(stemmed_reviews_train)
# X = cv.transform(stemmed_reviews_train)
# X_test = cv.transform(stemmed_reviews_test)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# final_stemmed = LogisticRegression(C=0.05)
# final_stemmed.fit(X, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_stemmed.predict(X_test)))

#### Lemmatization

In [9]:
# def get_lemmatized_text(corpus):
    
#     from nltk.stem import WordNetLemmatizer
#     lemmatizer = WordNetLemmatizer()
#     return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

# lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)
# lemmatized_reviews_test = get_lemmatized_text(reviews_test_clean)

# cv = CountVectorizer(binary=True)
# cv.fit(lemmatized_reviews_train)
# X = cv.transform(lemmatized_reviews_train)
# X_test = cv.transform(lemmatized_reviews_test)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# final_lemmatized = LogisticRegression(C=0.25)
# final_lemmatized.fit(X, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_lemmatized.predict(X_test)))

#### n-grams 

In [10]:
# ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
# ngram_vectorizer.fit(reviews_train_clean)
# X = ngram_vectorizer.transform(reviews_train_clean)
# X_test = ngram_vectorizer.transform(reviews_test_clean)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# # Accuracy for C=0.01: 0.88416
# # Accuracy for C=0.05: 0.892
# # Accuracy for C=0.25: 0.89424
# # Accuracy for C=0.5: 0.89456
# # Accuracy for C=1: 0.8944
    
# final_ngram = LogisticRegression(C=0.5)
# final_ngram.fit(X, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_ngram.predict(X_test)))

# # Final Accuracy: 0.89784

#### word_count

In [11]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# wc_vectorizer = CountVectorizer(binary=False)
# wc_vectorizer.fit(reviews_train_clean)
# X = wc_vectorizer.transform(reviews_train_clean)
# X_test = wc_vectorizer.transform(reviews_test_clean)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75, 
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))
    
# # Accuracy for C=0.01: 0.87456
# # Accuracy for C=0.05: 0.88016
# # Accuracy for C=0.25: 0.87936
# # Accuracy for C=0.5: 0.87936
# # Accuracy for C=1: 0.87696
    
# final_wc = LogisticRegression(C=0.05)
# final_wc.fit(X, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_wc.predict(X_test)))

# # Final Accuracy: 0.88184

#### tf-idf

In [12]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# tfidf_vectorizer = TfidfVectorizer()
# tfidf_vectorizer.fit(reviews_train_clean)
# X = tfidf_vectorizer.transform(reviews_train_clean)
# X_test = tfidf_vectorizer.transform(reviews_test_clean)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     lr = LogisticRegression(C=c)
#     lr.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, lr.predict(X_val))))

# # Accuracy for C=0.01: 0.79632
# # Accuracy for C=0.05: 0.83168
# # Accuracy for C=0.25: 0.86768
# # Accuracy for C=0.5: 0.8736
# # Accuracy for C=1: 0.88432
    
# final_tfidf = LogisticRegression(C=1)
# final_tfidf.fit(X, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_tfidf.predict(X_test)))

# # Final Accuracy: 0.88204

### svm

In [13]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.svm import LinearSVC
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split

# ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
# ngram_vectorizer.fit(reviews_train_clean)
# X = ngram_vectorizer.transform(reviews_train_clean)
# X_test = ngram_vectorizer.transform(reviews_test_clean)

# X_train, X_val, y_train, y_val = train_test_split(
#     X, target, train_size = 0.75
# )

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
#     svm = LinearSVC(C=c)
#     svm.fit(X_train, y_train)
#     print ("Accuracy for C=%s: %s" 
#            % (c, accuracy_score(y_val, svm.predict(X_val))))
    
# # Accuracy for C=0.01: 0.89104
# # Accuracy for C=0.05: 0.88736
# # Accuracy for C=0.25: 0.8856
# # Accuracy for C=0.5: 0.88608
# # Accuracy for C=1: 0.88592
    
# final_svm_ngram = LinearSVC(C=0.01)
# final_svm_ngram.fit(X, target)
# print ("Final Accuracy: %s" 
#        % accuracy_score(target, final_svm_ngram.predict(X_test)))

# # Final Accuracy: 0.9998

#### Final Model - removing a small set of stop words along with an n-gram range from 1 to 3 and a linear support vector classifier gave me the best results.

In [15]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC


stop_words = ['in', 'of', 'at', 'a', 'the']
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state=2
)

for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    
# Accuracy for C=0.001: 0.88784
# Accuracy for C=0.005: 0.89456
# Accuracy for C=0.01: 0.89376
# Accuracy for C=0.05: 0.89264
# Accuracy for C=0.1: 0.8928
    
final = LinearSVC(C=0.01)
final.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final.predict(X_test)))

# Final Accuracy: 0.90064

Accuracy for C=0.001: 0.8856
Accuracy for C=0.005: 0.89216
Accuracy for C=0.01: 0.8944
Accuracy for C=0.05: 0.89312
Accuracy for C=0.1: 0.892
Final Accuracy: 0.90064
Wall time: 1min 56s


In [19]:
X_test.shape

(25000, 5358730)

In [16]:
import pickle

In [18]:
%%time
# save the model to disk
filename = 'SVM_model.sav'
pickle.dump(final, open(filename, 'wb'))

# save the CountVectorizer to disk
filename = 'SVM_vect.sav'
pickle.dump(ngram_vectorizer, open(filename, 'wb'))

Wall time: 21.5 s


In [None]:
# # top positive and negative features 
# feature_to_coef = {
#     word: coef for word, coef in zip(
#         ngram_vectorizer.get_feature_names(), final.coef_[0]
#     )
# }

# for best_positive in sorted(
#     feature_to_coef.items(), 
#     key=lambda x: x[1], 
#     reverse=True)[:30]:
#     print (best_positive)
    
# print("\n\n")
# for best_negative in sorted(
#     feature_to_coef.items(), 
#     key=lambda x: x[1])[:30]:
#     print (best_negative)