In [43]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        


In [44]:
import json


reviews = []
with open('./data/Books.json') as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[8].sentiment

'POSITIVE'

In [45]:
len(reviews)

10000

In [46]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [48]:
train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


436
436


### Using the bag of words vectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit_transform(train_x)

<670x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 41455 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Transform the training dataset into vectors that the machine can understand
train_x_vectors = vectorizer.fit_transform(train_x)

# Transform the test dataset into vectors
test_x_vectors = vectorizer.transform(test_x)

## Classification model

#### Using the Linear SVM

In [8]:
from sklearn.svm import SVC

# Declare an instance of the model and fit the training data to the model
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Running a score assessment on the model to determine accuracy
clf_svm.score(test_x_vectors, test_y)


0.8242424242424242

In [9]:
# Testing a sample to view result
clf_svm.predict(test_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

#### Using the Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Declare an instance of the model and fit the training data to the model
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)


# Running a score assessment on the model to determine accuracy
clf_dec.score(test_x_vectors, test_y)

0.7484848484848485

#### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Score the model
clf_gnb.score(test_x_vectors.toarray(), test_y)

0.8121212121212121

#### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.score(test_x_vectors, test_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8303030303030303

### Evaluation

#### Mean accuracy

In [13]:
print('SVM score:',clf_svm.score(test_x_vectors, test_y))
print('Decision Tree score:',clf_dec.score(test_x_vectors, test_y))
print('Naive Bayes:',clf_gnb.score(test_x_vectors.toarray(), test_y))
print('Logistic Regression:',clf_log.score(test_x_vectors, test_y))

SVM score: 0.8242424242424242
Decision Tree score: 0.7484848484848485
Naive Bayes: 0.8121212121212121
Logistic Regression: 0.8303030303030303


#### Using F1 score

In [14]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None)

array([0.22222222, 0.21052632, 0.91319444])

In [16]:
train_y[0:5]

['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE']

In [17]:
train_y.count(Sentiment.POSITIVE)

552