## Import Libraries

In [45]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Sentiment and Review Classes

In [46]:
class Sentiment:
    """Namespace for the three sentiment labels used as classification targets.

    Every attribute is a plain string equal to its own name, so labels can be
    compared, counted and printed directly.
    """
    POSITIVE, NEUTRAL, NEGATIVE = 'POSITIVE', 'NEUTRAL', 'NEGATIVE'

class Review:
    """A single review: its text plus a sentiment label derived from its rating.

    Attributes:
        text: The review text passed to the constructor.
        sentiment: One of the ``Sentiment`` labels, computed from ``rating``.
    """

    def __init__(self, text, rating):
        self.text = text
        self.sentiment = self.get_sentiment(rating)

    def get_sentiment(self, rating):
        """Map a numeric rating to a ``Sentiment`` label.

        Args:
            rating: Numeric rating (int or float).

        Returns:
            ``Sentiment.POSITIVE`` for ratings >= 4, ``Sentiment.NEGATIVE`` for
            ratings <= 2 and ``Sentiment.NEUTRAL`` for anything else.
        """
        if rating >= 4:
            return Sentiment.POSITIVE
        if rating <= 2:
            return Sentiment.NEGATIVE
        # Anything not caught above is neutral. Testing `rating == 3` here
        # instead would let fractional ratings such as 2.5 or 3.5 fall through
        # and return None, which later breaks __repr__ and the label lists.
        return Sentiment.NEUTRAL

    def __repr__(self):
        # Only the first 50 characters of the text are shown to keep listings short.
        return 'Text: ' + self.text[:50] + '\nSentiment: ' + self.sentiment + '\n'

class ReviewContainer:
    """Holds a list of reviews and can down-sample it to equal class sizes.

    Attributes:
        reviews: The list of review objects; each must expose a ``sentiment``
            attribute holding one of the ``Sentiment`` labels.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def balance_data(self):
        """Trim ``self.reviews`` so every sentiment class has the same size.

        Reviews are grouped by label, each group is cut to the size of the
        smallest one (keeping the earliest items), and ``self.reviews`` is
        replaced by a new list: positives, then neutrals, then negatives.
        Reviews whose sentiment is none of the three labels are dropped.
        The list passed to the constructor is not modified. The result is
        grouped by class, so shuffle it before any order-sensitive use.

        Returns:
            None. The balanced list is stored in ``self.reviews``.
        """
        positive = []
        neutral = []
        negative = []
        # Read from the container's own data, not from a notebook-level global.
        for review in self.reviews:
            if review.sentiment == Sentiment.POSITIVE:
                positive.append(review)
            elif review.sentiment == Sentiment.NEUTRAL:
                neutral.append(review)
            elif review.sentiment == Sentiment.NEGATIVE:
                negative.append(review)
        min_len = min(len(positive), len(neutral), len(negative))
        # Store the result; without this assignment the method has no effect.
        self.reviews = positive[:min_len] + neutral[:min_len] + negative[:min_len]

## Read Data

In [47]:
file_name = 'data/Books_small_10000.json'
reviews = []
# The file is read line by line; each non-empty line is parsed as one JSON object.
# The encoding is pinned so parsing does not depend on the platform's default locale.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate empty lines instead of crashing in json.loads
        rev = json.loads(line)
        # 'overall' is converted with int() before being mapped to a sentiment label.
        reviews.append(Review(rev['reviewText'], int(rev['overall'])))
print(reviews[:2])

[Text: I bought both boxed sets, books 1-5.  Really a gre
Sentiment: POSITIVE
, Text: I enjoyed this short book. But it was way way to s
Sentiment: NEUTRAL
]


## Prepare Data

In [48]:
# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Inputs are the raw review texts, targets are the sentiment labels.
train_x = [review.text for review in train]
test_x = [review.text for review in test]
train_y = [review.sentiment for review in train]
test_y = [review.sentiment for review in test]

## Vectorize X

In [49]:
from sklearn.feature_extraction.text import CountVectorizer


In [50]:
# Fit the vectorizer on the training texts only, then apply the same fitted
# vectorizer to the test texts.
vectorizer = CountVectorizer()
train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)

## Classification

### Support Vector Machine

In [51]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the vectorized training texts.
clf_svm = svm.SVC(kernel='linear')
# fit() is the cell's last expression, so the notebook echoes the fitted estimator.
clf_svm.fit(train_x_vector, train_y)

SVC(kernel='linear')

### Decision Tree

In [52]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, trained on the same vectorized training texts.
# NOTE(review): no random_state is passed, so the fitted tree (and its score) may
# differ between runs — consider fixing a seed for reproducibility.
clf_dct = DecisionTreeClassifier()
# fit() is the cell's last expression, so the notebook echoes the fitted estimator.
clf_dct.fit(train_x_vector, train_y)

DecisionTreeClassifier()

### Naive Bayes

In [53]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
clf_nb = GaussianNB()
# The training matrix is converted with .toarray() before fitting.
# NOTE(review): presumably because GaussianNB needs dense input — confirm; the
# dense copy can be large for a big vocabulary.
clf_nb.fit(train_x_vector.toarray(), train_y)

GaussianNB()

### Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the vectorized training texts.
# NOTE(review): defaults are used for the solver and iteration limit; if a
# convergence warning shows up on re-run, raising max_iter is the usual remedy.
clf_lr = LogisticRegression()
# fit() is the cell's last expression, so the notebook echoes the fitted estimator.
clf_lr.fit(train_x_vector, train_y)

LogisticRegression()

## Evaluation

In [55]:
# Mean Accuracy

# Score every classifier on the held-out set, printed in the order: SVM,
# decision tree, naive Bayes, logistic regression. The naive Bayes model was
# fitted on a dense array, so it is scored on a dense copy of the test matrix.
for clf in (clf_svm, clf_dct, clf_nb, clf_lr):
    features = test_x_vector.toarray() if clf is clf_nb else test_x_vector
    print(clf.score(features, test_y))

0.8124242424242424
0.7766666666666666
0.6587878787878788
0.8409090909090909


In [56]:
# F1 Score

from sklearn.metrics import f1_score

# Per-class F1 (average=None) for the SVM, reported in the order given by `labels`.
print(
    f1_score(
        test_y,
        clf_svm.predict(test_x_vector),
        average=None,
        labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE],
    )
)

[0.90738061 0.2656     0.40268456]


In [57]:
# Number of test reviews in each sentiment class: positive, neutral, negative.
for label in (Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE):
    print(test_y.count(label))

2767
325
208


## Test With Your Own Input

In [58]:
# Two hand-written snippets to classify with the trained SVM.
my_test = ["Very good", "bad"]
# Reuse the already-fitted vectorizer so the features match those used in training.
test_vec = vectorizer.transform(my_test)
# NOTE(review): the recorded output labels "bad" as POSITIVE. The test split is
# heavily skewed towards POSITIVE, which is presumably why — confirm after
# training on balanced data.
clf_svm.predict(test_vec)

array(['POSITIVE', 'POSITIVE'], dtype='<U8')