In [2]:
class Sentiment:
    """Namespace of the string labels used as sentiment classes."""

    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"
    
class Review:
    """A single review: its text, its numeric score, and a derived sentiment label.

    Attributes:
        text: The review text passed to the constructor.
        score: The numeric rating passed to the constructor.
        sentiment: A ``Sentiment`` label computed from ``score`` when the
            instance is created.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once here; reassigning `score` later does not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label for this review's score.

        Scores below 3 are NEGATIVE, scores above 3 are POSITIVE, and a score
        of exactly 3 is NEUTRAL. Pivoting on 3 (rather than testing ``<= 2``
        and ``== 3`` and treating everything else as positive) keeps a
        fractional score such as 2.5 from being labelled POSITIVE. Results
        for the integer scores 1-5 are unchanged.
        """
        if self.score < 3:
            return Sentiment.NEGATIVE
        if self.score > 3:
            return Sentiment.POSITIVE
        # Exactly 3, or a value that compares neither below nor above 3 (NaN).
        return Sentiment.NEUTRAL

In [3]:
import json


# Parse the data file one line at a time, one JSON object per line, keeping
# only the review text and its overall rating.
reviews = []
# Explicit UTF-8: without it open() uses the platform's locale encoding, which
# can fail or mis-decode non-ASCII review text on some systems.
with open('./data/Books_small.json', encoding='utf-8') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON and would make json.loads raise.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check the derived label of one parsed review.
reviews[8].sentiment

'POSITIVE'

In [4]:
# Number of Review objects built from the data file.
len(reviews)

1000

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for evaluation; the fixed random_state makes the
# split repeatable between runs.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [10]:
# For each split, separate the review text (model input) from the sentiment
# label (prediction target).
train_x, train_y = (
    [review.text for review in training],
    [review.sentiment for review in training],
)

test_x, test_y = (
    [review.text for review in test],
    [review.sentiment for review in test],
)


### Using bag-of-words vectorization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Exploratory: fit on the training text and display the resulting matrix.
# The return value is not stored in this cell.
vectorizer.fit_transform(train_x)

<670x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 41455 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Learn the vocabulary from the training text and encode that text as a matrix of token counts
train_x_vectors = vectorizer.fit_transform(train_x)

# Encode the test text with the vocabulary learned from the training text (transform only, no refit)
test_x_vectors = vectorizer.transform(test_x)

## Classification model

#### Using the Linear SVM

In [18]:
from sklearn.svm import SVC

# Create a support vector classifier with a linear kernel and fit it on the vectorized training text
clf_svm = SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Mean accuracy of the fitted classifier on the held-out test set
clf_svm.score(test_x_vectors, test_y)


0.8242424242424242

In [17]:
# Predict the label for a single test review (row 10 of the test matrix)
clf_svm.predict(test_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

#### Using the Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier and fit it on the vectorized training text.
# random_state is pinned because tree construction randomly permutes candidate
# features at each split, so an unseeded tree can give a different score on
# every run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)


# Mean accuracy of the fitted tree on the held-out test set
clf_dec.score(test_x_vectors, test_y)

0.7545454545454545