In [1]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

In [2]:
 import json
    

In [3]:
file_name = '../data/Books_small.json'

In [4]:
#change the text file into a dictionary
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        #print(review['reviewText'])
        #print(review['overall'])
        reviews.append(Review(review['reviewText'],review['overall']))
reviews[5].sentiment

'POSITIVE'

In [5]:
len(reviews)

1000

In [7]:
from sklearn.model_selection import train_test_split

training,test = train_test_split(reviews,test_size=0.33,random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [8]:
len(training)

670

In [9]:
len(test)

330

In [10]:
print(training[0].sentiment)

POSITIVE


In [11]:
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_x[0])
print(train_y[0])

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
POSITIVE


In [12]:
#Bag of words vectorisation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0])

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
  (0, 7086)	1
  (0, 1148)	1
  (0, 350)	2
  (0, 1800)	1
  (0, 6595)	1
  (0, 562)	1
  (0, 3054)	1
  (0, 1558)	1
  (0, 6475)	1
  (0, 6593)	1
  (0, 2895)	1
  (0, 7353)	1
  (0, 539)	1
  (0, 1515)	1
  (0, 5197)	1
  (0, 3545)	1
  (0, 2007)	1


In [13]:
#Classification selection model
#Linear SVM

from sklearn import svm
clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors,train_y)
test_x[0]
print(test[0].sentiment)
clf_svm.predict(test_x_vectors[0])

POSITIVE


array(['POSITIVE'], dtype='<U8')

In [14]:
#Classification selection model 2
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)
print(test[0].sentiment)
clf_dec.predict(test_x_vectors[0])


POSITIVE


array(['POSITIVE'], dtype='<U8')

In [15]:
#Classification selection model 3
#Naive Bayes

from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB()
clf_nb.fit(train_x_vectors.toarray(),train_y)
print(test[0].sentiment)
clf_nb.predict(test_x_vectors[0].toarray())

POSITIVE


array(['POSITIVE'], dtype='<U8')

In [16]:
#Classification selection model 4
#Logistic Regression

from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(train_x_vectors,train_y)
print(test[0].sentiment)
clf_lr.predict(test_x_vectors[0])

POSITIVE


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array(['POSITIVE'], dtype='<U8')

In [17]:
#Evaluation mean accuracy
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_nb.score(test_x_vectors.toarray(),test_y))
print(clf_lr.score(test_x_vectors,test_y))

0.8242424242424242
0.7666666666666667
0.8121212121212121
0.8303030303030303


In [18]:
#F1 Scores
from sklearn.metrics import f1_score
print(f1_score(test_y,clf_svm.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_dec.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
# print(f1_score(test_y,clf_nb.predict(test_x_vectors.toarray().reshape(-1, 1)),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_lr.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

[0.91319444 0.21052632 0.22222222]
[0.87609075 0.03508772 0.06666667]
[0.91370558 0.12244898 0.1       ]
