In [51]:
import pandas as pd
import json

## Define Classes

In [52]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score ==3:
            return Sentiment.NEUTRAL
        else: # score of 4 or 5
            return Sentiment.POSITIVE

## Load Data

In [53]:
file_name = './books_small.json'

In [54]:

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        #reviews.append((review['reviewText'],review['overall']))
        reviews.append(Review(review['reviewText'], review['overall']))
reviews[5].text
reviews[5].sentiment

'POSITIVE'

## prep Data

In [55]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [56]:
len(train), len(test)

(670, 330)

In [57]:
train_x = [x.text for x in train]
train_y = [x.sentiment for x in train]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]
train_x[0], train_y[0]

("Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.",
 'POSITIVE')

## Bag of words vectorization

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(text_x)


print(train_x[0])
print(train_x_vectors[0])



Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
  (0, 7086)	1
  (0, 1148)	1
  (0, 350)	2
  (0, 1800)	1
  (0, 6595)	1
  (0, 562)	1
  (0, 3054)	1
  (0, 1558)	1
  (0, 6475)	1
  (0, 6593)	1
  (0, 2895)	1
  (0, 7353)	1
  (0, 539)	1
  (0, 1515)	1
  (0, 5197)	1
  (0, 3545)	1
  (0, 2007)	1


## Classificaiton

#### Linear SVM

In [59]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]
#test_x_vectors[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [61]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [65]:
from sklearn.naive_bayes import GaussianNB

clf_gnf = DecisionTreeClassifier()
clf_gnf.fit(train_x_vectors, train_y)

clf_gnf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [66]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

### Evaluation

In [71]:
clf_svm.score(test_x_vectors, test_y), clf_dec.score(test_x_vectors, test_y),clf_log.score(test_x_vectors, test_y), clf_gnf.score(test_x_vectors, test_y)

(0.8242424242424242,
 0.7515151515151515,
 0.8303030303030303,
 0.7575757575757576)

### F1 scores


In [76]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])



array([0.91319444, 0.21052632, 0.22222222])