## Import Libraries

In [230]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
from sklearn.ensemble import BaggingClassifier

## Sentiment and Review Classes

In [231]:
class Sentiment:
    POSITIVE = 'POSITIVE'
    NEUTRAL = 'NEUTRAL'
    NEGATIVE = 'NEGATIVE'

class Review:
    def __init__(self, text, rating):
        self.text = text
        self.sentiment = self.get_sentiment(rating)
    
    def get_sentiment(self, rating):
        if rating >= 4:
            return Sentiment.POSITIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        if rating <= 2:
            return Sentiment.NEGATIVE
    def __repr__(self):
        return 'Text: ' + self.text[:50] + '\nSentiment: ' + self.sentiment + '\n'

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        self.balance_data()
    
    def balance_data(self):
        positive = []
        neutral = []
        negative = []
        for r in reviews:
            if r.sentiment == Sentiment.POSITIVE:
                positive += [r]
            elif r.sentiment == Sentiment.NEGATIVE:
                negative += [r]
        min_len = min(len(positive), len(negative))
        positive = positive[:min_len]
        negative = negative[:min_len]
        self.reviews = positive + negative
        random.shuffle(self.reviews)
    
    def get_reviews(self):
        return self.reviews

## Read Data

In [232]:
file_name = 'data/Books_small_10000.json'
reviews = []
with open(file_name) as f:
    for line in f:
        rev = json.loads(line)
        reviews += [Review(rev['reviewText'], int(rev['overall']))]
print(reviews[:2])

[Text: I bought both boxed sets, books 1-5.  Really a gre
Sentiment: POSITIVE
, Text: I enjoyed this short book. But it was way way to s
Sentiment: NEUTRAL
]


## Prep data

In [233]:
r_container = ReviewContainer(reviews)
reviews = r_container.get_reviews()
print(len(reviews))
train, test = train_test_split(reviews, test_size=0.33, random_state=42)
train_x = [x.text for x in train]
train_y = [x.sentiment for x in train]
test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

1288


## Vectorize X

In [234]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [235]:
vectorizer = TfidfVectorizer()
train_x_vector = vectorizer.fit_transform(raw_documents=train_x)
test_x_vector = vectorizer.transform(raw_documents=test_x)

## Classification

### Support Vector Machine

In [236]:
from sklearn.svm import SVC

clf_svm = SVC()
clf_svm.fit(train_x_vector, train_y)

SVC()

## Decision Tree

In [237]:
from sklearn.tree import DecisionTreeClassifier

clf_dct = DecisionTreeClassifier()
clf_dct.fit(train_x_vector, train_y)

DecisionTreeClassifier()

## Naive Bayes

In [238]:
from sklearn.naive_bayes import GaussianNB

clf_nb = GaussianNB()
clf_nb.fit(train_x_vector.toarray(), train_y)

GaussianNB()

## Logistic Regression

In [239]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression()
clf_lr.fit(train_x_vector, train_y)

LogisticRegression()

## Evaluation

In [240]:
# Mean Accuracy

print(clf_svm.score(test_x_vector, test_y))
print(clf_dct.score(test_x_vector, test_y))
print(clf_nb.score(test_x_vector.toarray(), test_y))
print(clf_lr.score(test_x_vector, test_y))

0.8380281690140845
0.6572769953051644
0.6596244131455399
0.8403755868544601


In [241]:
# F1 Score

from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

[0.84064665 0.         0.8353222 ]


## Test With Your Own Input

In [242]:
my_test = ['I thoroughly enjoyed this, 5 stars', 'bad book, do not buy', 'Horrible, waste of time']
test_vec = vectorizer.transform(my_test)
clf_svm.predict(test_vec)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

## Grid Search

In [248]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vector, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [249]:
print(clf.score(test_x_vector, test_y))

0.823943661971831


In [250]:
my_test = ['I thoroughly enjoyed this, 5 stars', 'bad book, do not buy', 'Horrible, waste of time', 'Never buy this book', 'Not a good book', 'It"s not that great', "good one"]
test_vec = vectorizer.transform(my_test)
clf.predict(test_vec)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE'], dtype='<U8')

## Saving Model

In [251]:
import pickle

with open('./models/svm_sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf_svm, f)

## Loading Model

In [252]:
with open('./models/svm_sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
loaded_clf.predict(test_vec)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE'], dtype='<U8')