# Data Class

In [1]:
import random

## To be consistent with sentiment use Enums
class Sentiment:
    bad = "MAD BAD"
    neutral = "MEH"
    good = "RAD CHAD"

## Make it neat by using class
class Review:
    def __init__(self, text, star):
        self.text = text #reviews['reviewText'][0]
        self.star = star #reviews['overall'][1]
        self.sentiment = self.get_sentiment() # create new object to for star
        
    def get_sentiment(self):
        if self.star < 3:
            return Sentiment.bad
        elif self.star == 3:
            return Sentiment.neutral
        else:
            return Sentiment.good
    
        
### Improve model result
### Evenly distribute positive and negative         
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    ### Make it neat to get text and sentiment
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        positive = list(filter(lambda x: x.sentiment == Sentiment.good, self.reviews))
        negative = list(filter(lambda x: x.sentiment == Sentiment.bad, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

# Load Data

In [2]:
import json

# file_name = './data/books_small.json'

### Improve model result
### Load larger dataset
file_name = './data/Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

## Alternative 1
#       reviews.append(review['reviewText'], review['overall'])             
# 
# Use index to get specific review or rating
# reviews[-5][1] # Return 5.0
#
# Rather than append to tuple we can append to class object (Review) instead


    
print(len(reviews))
print(reviews[-5].text)
print(reviews[-5].star)
print(reviews[-5].sentiment)

10000
The whole series was great!  Melody is a fantastic writer and keeps you intrigues the for the entire book love it!
5.0
RAD CHAD


# Prep Data

In [3]:
# Split train and dataset
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# print(len(test))

### Make it neat using class
train_cont = ReviewContainer(training)
test_cont = ReviewContainer(test)

# Bag of Words

In [4]:
# Get text and sentiment from training data
### Make it neat using class
train_cont.evenly_distribute() ### Evenly distribute positive and negative train data
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()

print(train_y[12:15])

# Get text and sentiment from test data
test_cont.evenly_distribute() #### do the same with test data
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

print(test_y[12:15])
print(train_y.count(Sentiment.good))
print(train_y.count(Sentiment.bad))
print(test_y.count(Sentiment.good))
print(test_y.count(Sentiment.bad))

['RAD CHAD', 'RAD CHAD', 'MAD BAD']
['MAD BAD', 'MAD BAD', 'RAD CHAD']
436
436
208
208


In [14]:
# Tokenizing text

from sklearn.feature_extraction.text import TfidfVectorizer # CountVectorizer
vectorizer = TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x) # this is a test data no need to fit data only transform

print(train_x_vectors.shape)
print(train_x_vectors[0].toarray())
print(test_x_vectors.shape)

## We can also break it down in two steps
# vectorizer.fit(train_y)
# train_y_vectors = vectorizer.transform(train_y)

# print(train_y_vectors.shape) # Return (670, 5)

(872, 8906)
[[0. 0. 0. ... 0. 0. 0.]]
(416, 8906)


# Classification

#### Linear SVM

In [15]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

print(test_x[0])
# test_x_vectors[0]

print(clf_svm.predict(test_x_vectors[0]))

It is something about the quiet old guy in the novels that draws me to a book, this book was a very good read
['RAD CHAD']


#### Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vectors, train_y)

print(test_x[0])
# test_x_vectors[0]

print(clf_dec.predict(test_x_vectors[0]))

It is something about the quiet old guy in the novels that draws me to a book, this book was a very good read
['RAD CHAD']


#### Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

clf_gnb.fit(train_x_vectors.toarray(), train_y)

# print(test_x[0])
print(clf_gnb.predict(test_x_vectors[0].toarray()))

['RAD CHAD']


#### Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)

print(test_x[0])
print(clf_log.predict(test_x_vectors[0]))

It is something about the quiet old guy in the novels that draws me to a book, this book was a very good read
['RAD CHAD']


# Evaluation

How to determine if a model is good or not? we can use Mean Accuracy and F1 to measure its performance. 

Between those two which one is more important? It depend of what we're looking for, Accuracy is used when the True Positives and True negatives are more important while F1-score is used when the False Negatives and False Positives are crucial

#### Mean Accuracy

In [19]:
print("SVM:\n", clf_svm.score(test_x_vectors, test_y))
print("DEC:\n", clf_dec.score(test_x_vectors, test_y))
print("GNB:\n", clf_gnb.score(test_x_vectors.toarray(), test_y))
print("LOG:\n", clf_log.score(test_x_vectors, test_y))

SVM:
 0.8076923076923077
DEC:
 0.6370192307692307
GNB:
 0.6610576923076923
LOG:
 0.8052884615384616


It shows better results for SVM mean accuracy.

Before using TfidfVectorizer:
- SVM:
 0.7980769230769231
- DEC:
 0.6298076923076923
- GNB:
 0.6346153846153846
- LOG:
 0.8149038461538461

It gets better than just evenly distributing (+) and (-) on training data, but using smaller dataset and not random distribute stil gives best result

Before evenly distribute (+) and (-) test data:
- SVM:
 0.7124242424242424
- DEC:
 0.6209090909090909
- GNB:
 0.44575757575757574
- LOG:
 0.7448484848484849

It gets worse.

Before load larger dataset and evenly distribute (+) and (-):
- SVM:
 0.8242424242424242
- DEC:
 0.7575757575757576
- GNB:
 0.8121212121212121
- LOG:
 0.8303030303030303


#### F1 Scores

In [20]:
from sklearn.metrics import f1_score

### Leave out Sentiment.neutral because we don't need it
print("SVM:\n", f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.good, Sentiment.bad]))#, Sentiment.neutral)
print("DEC:\n", f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.good, Sentiment.bad]))#, Sentiment.bad]))
print("GNB:\n", f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.good, Sentiment.bad]))#, Sentiment.bad]))
print("LOG:\n", f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.good, Sentiment.bad]))#, Sentiment.bad]))

SVM:
 [0.80582524 0.80952381]
DEC:
 [0.63260341 0.64133017]
GNB:
 [0.65693431 0.66508314]
LOG:
 [0.80291971 0.80760095]


After using TfidfVectorizer, LOG F1 score has insignificant increase while the rest has lower performance.

- SVM:
 [0.8028169  0.79310345]
- DEC:
 [0.63849765 0.62068966]
- GNB:
 [0.59574468 0.66666667]
- LOG:
 [0.82051282 0.808933  ]


After evenly distribute test data like training data, the F1 results shows passable result.

Before:
- SVM:
 [0.85363477 0.28146853]
- DEC:
 [0.76241722 0.18433818]
- GNB:
 [0.6199765  0.15049505]
- LOG:
 [0.8783008  0.31077216]

While the Mean Accuracy decreased, The F1 Score shows insignificant increase after after load larger dataset and evenly distribute (+) and (-) training data.

Before:
- SVM:
 [0.91319444 0.21052632 0.22222222]
- DEC:
 [0.86428571 0.15151515 0.11764706]
- GNB:
 [0.89678511 0.08510638 0.09090909]
- LOG:
 [0.91370558 0.12244898 0.1       ]
 
 This is the results without changing anything.


In [12]:
print(train_y.count(Sentiment.good))
train_y.count(Sentiment.bad)

436


436

The model gives us good number of Mean Accuracy with F1 Score skewed toward good sentiment because the data heavily bias on it.

What we can try to improve the model result:
- get larger dataset and evenly distribute positive and negative train data

it get worse on Mean Accuracy and insignificant increase on F1 bad sentiment score

- evenly distribute the same thing of test data and only use LOG model as it gives the best result

Based on F1 Score we can see better result on good and bad sentiment

- use TfidfVectorizer

It gives better result for SMV Mean Accuracy and LOG F1 Score

- Tune the model using Grid Search

### Fine tuned with Grid Search

In [45]:
from sklearn.model_selection import GridSearchCV

# We will use LOG as it shows better result for F1 Score
parameters_log = {'penalty':('l1', 'l2', 'elasticnet', 'none'), 'C':[1,4,8,16,32]}
parameters_svm = {'kernel':('linear', 'rbf'), 'C':[1,4,8,16,32]}

clf_log_grid = GridSearchCV(clf_log, parameters_log, cv=5)
clf_log_grid.fit(train_x_vectors, train_y)

clf_svm_grid = GridSearchCV(clf_svm, parameters_svm, cv=5)
clf_svm_grid.fit(train_x_vectors, train_y)

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penal

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, se

GridSearchCV(cv=5, estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 4, 8, 16, 32], 'kernel': ('linear', 'rbf')})

In [46]:
print("After GridSearch")

print("SVM Accuracy:\n", clf_svm_grid.score(test_x_vectors, test_y))
print("SVM F1 Score:\n", f1_score(test_y, clf_svm_grid.predict(test_x_vectors), average=None, labels=[Sentiment.good, Sentiment.bad]))#, Sentiment.bad]))

print("LOG Accuracy:\n", clf_log_grid.score(test_x_vectors, test_y))
print("LOG F1 Score:\n", f1_score(test_y, clf_log_grid.predict(test_x_vectors), average=None, labels=[Sentiment.good, Sentiment.bad]))#, Sentiment.bad]))

After GridSearch
SVM Accuracy:
 0.8052884615384616
SVM F1 Score:
 [0.8057554  0.80481928]
LOG Accuracy:
 0.8149038461538461
LOG F1 Score:
 [0.81967213 0.80987654]


It gives worse results for SVM but better results for LOG. (Even though it returns error and I don't know how to fix it, but I feel content with these now).

# Save/Load Model using Pickle

In [60]:
import pickle

with open('./model/sentiment_svm_clf.pkl', 'wb') as f:
    pickle.dump(clf_svm_grid, f)
    
with open('./model/sentiment_log_clf.pkl', 'wb') as f:
    pickle.dump(clf_log_grid, f)

In [61]:
# with open('./model/sentiment_svm_clf.pkl', 'rb') as f:
#     loaded_svm_clf = pickle.load(f)
    
with open('./model/sentiment_log_clf.pkl', 'rb') as f:
    loaded_log_clf = pickle.load(f)

In [62]:
print(test_x[9])
# print(loaded_svm_clf.predict(test_x_vectors[9]))

print(loaded_log_clf.predict(test_x_vectors[9]))

I loved this book and the previous books in this series. It brings out every emotion you can think of. I look forward to reading more books by this author.
['RAD CHAD']
