In [13]:
class Sentiment:
    """String labels for the three sentiment classes used as targets."""

    positive, negative, neutral = "POSITIVE", "NEGATIVE", "NEUTRAL"

class Review:
    """A single review: its text, its numeric score and the derived label.

    Attributes:
        text: the review body.
        score: the numeric rating given with the review.
        sentiment: one of the ``Sentiment`` labels, computed from ``score``
            at construction time.
    """

    def __init__(self, text, score):
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Scores of 2 or less are negative, a score of exactly 3 is neutral,
        and everything else (scores of 4 and 5) is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.negative
        if rating == 3:
            return Sentiment.neutral
        return Sentiment.positive

## Load Data

In [108]:
import json

# Read the data set: each non-empty line of the file is parsed as one JSON
# object, from which the review text and the overall score are taken.
reviews = []
# Use a context manager so the file handle is closed as soon as loading is
# done, instead of being left open until garbage collection.
with open('Books_small_10000.json') as review_file:
    for line in review_file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check the derived label of one loaded review.
reviews[7].sentiment

'POSITIVE'

## Prep Data

In [109]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the derived sentiment labels.
X = [item.text for item in reviews]
y = [item.sentiment for item in reviews]

# Hold out 33% of the reviews for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=47,
)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: turn each review text into a vector of token counts.
vectorizer = CountVectorizer()

# The vocabulary is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
x_train_vectors = vectorizer.fit_transform(x_train)
x_test_vectors = vectorizer.transform(x_test)


## Classification

#### Linear SVM

In [96]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the
# count-vectorized training reviews.
clf_svm = SVC(kernel="linear")
clf_svm.fit(x_train_vectors, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [97]:
# Sanity check: predict the label of the first test review with the SVM.
clf_svm.predict(x_test_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [71]:
# Show the raw text of the first test review (the one predicted above).
x_test[0]

'Again, I thought for sure this was written by a woman (indeed, the original cover named Rebecca Steinbeck as the author). Mr Mason has shown with a deft touch not only his ability to cross genres but to also cross genders. A superb addition to a series that has the hallmarks of being a great trilogy (Mr Mason, I have seen on Goodreads a cover for a story called 69 INCHES OF LIGHT?)....'

#### Decision tree

In [82]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier; random_state is fixed so repeated runs build the
# same tree, and min_samples_split=6 is passed instead of the library default.
clf_dt = DecisionTreeClassifier(min_samples_split=6, random_state=7)

clf_dt.fit(x_train_vectors, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=7, splitter='best')

In [83]:
# Sanity check: predict the label of the first test review with the decision
# tree. The fitted tree is `clf_dt`; the bare name `clf` is never defined in
# this notebook, so it would fail (or use a stale model) on a fresh kernel.
clf_dt.predict(x_test_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [86]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier. The training matrix is converted with
# .toarray() before fitting (presumably because GaussianNB needs dense input).
clf_nb = GaussianNB()
clf_nb.fit(x_train_vectors.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [88]:
# Sanity check: predict the label of the first test review with Naive Bayes
# (converted with .toarray(), matching how the model was fitted).
clf_nb.predict(x_test_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

#### Logistic regression

In [93]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier; the solver is allowed up to 500 iterations.
clf_lr = LogisticRegression(max_iter=500)
clf_lr.fit(x_train_vectors, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [95]:
# Sanity check: predict the label of the first test review with logistic
# regression.
clf_lr.predict(x_test_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

#### Classification Accuracy

In [101]:
# Score every fitted classifier on the held-out test split.
# Each entry is (printed label, model, whether the model needs dense input);
# only the Gaussian Naive Bayes model is scored on a .toarray() conversion.
for label, model, needs_dense in (
    ("Linear SVM:", clf_svm, False),
    ("Decision tree:", clf_dt, False),
    ("Gaussian Naive Bayes: ", clf_nb, True),
    ("Logistic Regression:", clf_lr, False),
):
    features = x_test_vectors.toarray() if needs_dense else x_test_vectors
    print(label, model.score(features, y_test))

Linear SVM: 0.796969696969697
Decision tree: 0.7545454545454545
Gaussian Naive Bayes:  0.8
Logistic Regression: 0.8181818181818182


#### F1 scores

In [107]:
from sklearn.metrics import f1_score

# Per-class F1 scores (average=None), reported in the order
# positive, neutral, negative for every fitted classifier.
f1_labels = [Sentiment.positive, Sentiment.neutral, Sentiment.negative]

# Each entry is (printed label, model, whether the model needs dense input);
# only the Gaussian Naive Bayes model predicts on a .toarray() conversion.
for label, model, needs_dense in (
    ("Linear SVM:", clf_svm, False),
    ("Decision tree:", clf_dt, False),
    ("Gaussian Naive Bayes:", clf_nb, True),
    ("Logistic Regression:", clf_lr, False),
):
    features = x_test_vectors.toarray() if needs_dense else x_test_vectors
    predictions = model.predict(features)
    print(label, f1_score(y_test, predictions, average=None, labels=f1_labels))

Linear SVM: [0.88927944 0.2295082  0.2       ]
Decision tree: [0.86476868 0.16949153 0.05128205]
Gaussian Naive Bayes: [0.8877551  0.04347826 0.15384615]
Logistic Regression: [0.90344828 0.25454545 0.08      ]


In [113]:
# Why are the F1 scores for the negative and neutral classes so low?

y_train.count(Sentiment.negative)

# Most likely because the training data is skewed: there are very few
# negative and neutral reviews for the models to learn from.

426