In [1]:
import pandas as pd
import os

In [2]:

def read_reviews(path):
    """Read a text file and return its lines with surrounding whitespace stripped.

    Parameters
    ----------
    path : str
        Location of a text file; each line becomes one list element.

    Returns
    -------
    list of str
        The stripped lines, in file order.
    """
    # The context manager closes the handle even if reading fails part-way.
    # NOTE(review): no explicit encoding is passed, so the platform default
    # applies — consider encoding='utf-8' once the file encoding is confirmed.
    with open(path, 'r') as handle:
        return [line.strip() for line in handle]


reviews_train = read_reviews('data/full_train.txt')
reviews_test = read_reviews('data/full_test.txt')

In [3]:
# Spot-check a single raw review by index to confirm the file loaded as text.
reviews_train[12498]

"A Christmas Together actually came before my time, but I've been raised on John Denver and the songs from this special were always my family's Christmas music. For years we had a crackling cassette made from a record that meant it was Christmas. A few years ago, I was finally able to track down a video of it on Ebay, so after listening to all the music for some 21 years, I got to see John and the Muppets in action for myself. If you ever get the chance, it's a lot of fun--great music, heart-warming and cheesy. It's also interesting to see the 70's versions of the Muppets and compare them to their newer versions today. I believe Denver actually took some heat for doing a show like this--I guess normally performers don't compromise their images by doing sing-a-longs with the Muppets, but I'm glad he did. Even if you can't track down the video, the soundtrack is worth it too. It has some Muppified traditional favorites, but also some original Denver tunes as well."

## Data cleaning

In [5]:
import re

# Raw strings keep the regex escapes (\[, \s, \-, \/) away from Python's own
# string-escape processing, which otherwise warns about invalid escapes.

# Punctuation that is deleted outright (no replacement character).
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Double HTML line breaks, hyphens and slashes become a single space so the
# words on either side stay separate tokens.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def preprocess_reviews(reviews):
    """Lower-case each review and normalise its punctuation.

    Each review is lower-cased, then every character matched by
    ``REPLACE_NO_SPACE`` is removed, then every match of
    ``REPLACE_WITH_SPACE`` is replaced by one space.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        Cleaned reviews, in the same order as the input.
    """
    # Order matters: punctuation is stripped before the space substitutions.
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", review.lower()))
        for review in reviews
    ]

# Run the identical cleaning step over both splits, training first.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(split) for split in (reviews_train, reviews_test)
)

## Vectorization (binary bag-of-words)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same fitted vocabulary.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

In [8]:
# Sanity check: (number of test reviews, size of the vocabulary fitted on the training set).
X_test.shape

(25000, 92715)

## Train and evaluate the model

In [9]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Labels: 1 for the first 12,500 rows, 0 for the remaining 12,500.
# NOTE(review): presumably the data files list every positive review before
# the negative ones — confirm against the dataset layout.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Hold out 25% of the training matrix for validation. A fixed random_state
# makes the split, and therefore the printed accuracies, reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    val_accuracy = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, val_accuracy))

Accuracy for C=0.01: 0.87328
Accuracy for C=0.05: 0.87968
Accuracy for C=0.25: 0.8792
Accuracy for C=0.5: 0.88112
Accuracy for C=1: 0.8808


## Select the best model

In [11]:

# Refit on the full training matrix with the chosen regularisation strength.
# NOTE(review): C is hard-coded here — confirm it matches the value preferred
# by the validation sweep before relying on this model.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)

# Build labels sized from the test matrix itself instead of reusing the
# training `target` list, so a test set of a different size is labelled to
# match its own length.
# NOTE(review): assumes the test file is laid out like the training file
# (first half positive, second half negative) — confirm.
n_test = X_test.shape[0]
target_test = [1 if i < n_test // 2 else 0 for i in range(n_test)]
print(f'Final Accuracy: {accuracy_score(target_test, final_model.predict(X_test))}')

Final Accuracy: 0.88152


### Save the final model to a pickle file

In [12]:
import pickle
# Write the fitted model to disk. The context manager flushes and closes the
# file as soon as the dump finishes, so the pickle is complete on disk.
# Only load this file back from a trusted location: unpickling runs arbitrary code.
with open('model.pickle', 'wb') as saved_model:
    pickle.dump(final_model, saved_model)