### Defining the data classes

In [89]:
import random

class Sentiment:
    """String labels for the three sentiment classes a review can be assigned."""

    NEGATIVE, NEUTRAL, POSITIVE = 'NEGATIVE', 'NEUTRAL', 'POSITIVE'

class Review:
    """One review: its text, its numeric score, and the sentiment label derived from the score."""

    def __init__(self, text, score):
        self.text, self.score = text, score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a Sentiment label: <= 2 is negative, == 3 is neutral, anything else positive."""
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of review objects (each exposing `.text` and `.sentiment`).

    Provides the texts and labels as parallel lists and can rebalance the
    negative/positive classes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance the container to equal numbers of negative and positive reviews.

        Reviews with any other sentiment (e.g. neutral) are dropped. The larger
        of the two classes is truncated to the size of the smaller one, and the
        result is shuffled. `self.reviews` is rebound to a new list; the list
        originally passed to the constructor is not modified.

        Args:
            seed: optional seed for the shuffle. When None (the default) the
                module-level `random.shuffle` is used, as before; otherwise a
                dedicated `random.Random(seed)` gives a reproducible order.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate BOTH classes to the smaller size. Truncating only the positives
        # leaves the data unbalanced whenever negatives outnumber positives.
        per_class = min(len(negative), len(positive))
        balanced = negative[:per_class] + positive[:per_class]

        if seed is None:
            random.shuffle(balanced)
        else:
            random.Random(seed).shuffle(balanced)

        self.reviews = balanced

### Loading json data

In [63]:
import json

filepath = './dataset/Books_small_10000.json'

reviews = []

# The file is read as one JSON object per line. The encoding is given explicitly
# so the result does not depend on the platform's default text encoding.
with open(filepath, encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Display the derived label of the second review as a quick sanity check.
reviews[1].sentiment

'NEUTRAL'

### Preparing Data

#### Splitting data into train and test sets

In [82]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews as the test set; random_state is fixed at 42 for the split.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [65]:
# Sanity check: sizes of the train and test splits.
len(train), len(test)

(6700, 3300)

#### Evenly distributing negative and positive reviews

In [90]:
# Wrap each split in a container and rebalance it. evenly_distribute() rebinds the
# container's own review list, so the `train` and `test` lists keep their contents.
train_cont = ReviewContainer(train)
train_cont.evenly_distribute()

test_cont = ReviewContainer(test)
test_cont.evenly_distribute()

In [91]:
# Number of test reviews remaining after rebalancing.
len(test_cont.reviews)

416

In [66]:
# NOTE(review): a notebook cell only displays its last expression, so the text lookup
# on the next line is evaluated but never shown; only the sentiment appears in the output.
train[1].text
train[1].sentiment

'POSITIVE'

In [67]:
## Extracting text and sentiment from train and test data

# train_x = [x.text for x in train]
# train_y = [x.sentiment for x in train]

# test_x = [x.text for x in test]
# test_y = [x.sentiment for x in test]

In [95]:
# Features (raw review text) and labels (sentiment strings) from the rebalanced containers.
train_x = train_cont.get_text()
train_y = train_cont.get_sentiment()

test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

In [96]:
# Confirm how many positive and negative labels the training set holds.
train_y.count(Sentiment.POSITIVE), train_y.count(Sentiment.NEGATIVE)

(436, 436)

### Bag-of-words vectorizer (TF-IDF)

In [111]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Fit on the training text only; the test text is transformed with the already-fitted
# vectorizer so both matrices share the same feature columns.
train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)

### Classifications
#### Linear SVM model

In [112]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the vectorized training text.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vector, train_y)

SVC(kernel='linear')

In [113]:
# Show the first test review's text; the cells below predict its label.
test_x[0]

"I wish there was a negative rating. This is quite possibly the worst book I've ever read. Almost incomprehensible, and stupid to boot. Wish I had that hour back."

In [114]:
# Predict the label of the first test review with the linear SVM.
# NOTE(review): the recorded output is 'POSITIVE' although the review text is clearly negative.
clf_svm.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [115]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, trained on the same vectors and labels.
# NOTE(review): no random_state is passed — results may vary between runs; confirm if that matters.
clf_tree = DecisionTreeClassifier()
clf_tree.fit(train_x_vector, train_y)

DecisionTreeClassifier()

In [116]:
# Predict the label of the first test review with the decision tree.
clf_tree.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

#### Naïve Bayes

In [117]:
# from sklearn.naive_bayes import GaussianNB

# clf_gnb = GaussianNB()

In [118]:
# clf_gnb.fit(train_x_vector, train_y)

#### Logistic Regression

In [119]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the same vectors and labels.
clf_log = LogisticRegression()
clf_log.fit(train_x_vector, train_y)

LogisticRegression()

In [120]:
# Predict the label of the first test review with logistic regression.
clf_log.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

In [121]:
# Mean accuracy score of each classifier (SVM, decision tree, logistic regression).
# NOTE(review): these calls score on the TRAINING data (train_x_vector / train_y), so they
# measure how well each model fits what it has already seen, not how well it generalizes.
# Pass test_x_vector / test_y to get held-out accuracy.
print(clf_svm.score(train_x_vector, train_y))
print(clf_tree.score(train_x_vector, train_y))
print(clf_log.score(train_x_vector, train_y))

0.9885321100917431
1.0
0.9655963302752294


In [122]:
# F1 Scores --> **important
from sklearn.metrics import f1_score

# Per-class F1 on the test set, one line per classifier in the order
# SVM, decision tree, logistic regression; each line is [POSITIVE, NEGATIVE].
for model in (clf_svm, clf_tree, clf_log):
    predictions = model.predict(test_x_vector)
    print(f1_score(test_y, predictions, average=None,
                   labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))

[0.80582524 0.80952381]
[0.64691358 0.66510539]
[0.80291971 0.80760095]


### Testing the model

In [125]:
# Hand-written examples: new text must go through the already-fitted vectorizer
# (transform, not fit_transform) before being passed to the classifier.
test_set = ['I thoroughly enjoyed this book', 'bad book do not buy', 'horrible waste of time', 'It is brilliant']
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

### Tuning the model with Grid Search

In [126]:
from sklearn.model_selection import GridSearchCV

# Search grid: four kernels x five values of C.
parameters = {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
# Grid search over `parameters` with cv=5, fitted on the training vectors.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vector, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32),
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [140]:
# NOTE(review): scored on the training data, so this is not a held-out estimate;
# use test_x_vector / test_y to evaluate the tuned model.
clf.score(train_x_vector, train_y)

1.0

In [141]:
# NOTE(review): get_params() returns the configuration of the search object itself (see the
# output: the grid plus the estimator's defaults), not the hyper-parameters the search selected.
# `clf.best_params_` is presumably what was intended here — confirm.
clf.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
  'C': (1, 4, 8, 16, 32)},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

### Saving the model

In [142]:
import pickle

In [144]:
# Serialize the fitted grid-search object to disk.
# NOTE(review): the path is absolute and machine-specific, so it will not exist on other
# machines; consider a project-relative path.
# NOTE(review): only `clf` is pickled. The fitted `vectorizer` is not written here, yet it is
# needed to turn new text into features before calling predict on the loaded model.
with open('/Users/abdullahalmomin/Documents/PROJECTS/sklearn_model/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

### Load the file

In [145]:
# Read the pickled classifier back from the same absolute path it was saved to.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/Users/abdullahalmomin/Documents/PROJECTS/sklearn_model/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)


In [147]:
# Check the reloaded model on the first test review, using the in-memory vectorized test data.
# NOTE(review): the recorded prediction is 'POSITIVE' for a review whose text is clearly negative.
print(test_x[0])
loaded_clf.predict(test_x_vector[0])

I wish there was a negative rating. This is quite possibly the worst book I've ever read. Almost incomprehensible, and stupid to boot. Wish I had that hour back.


array(['POSITIVE'], dtype='<U8')