### Importing Data

In [27]:

import random

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return "NEGATIVE"
        elif self.score == 3:
            return "NEUTRAL"
        else: #overall - 4 or 5
            return "POSITIVE"

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == "NEGATIVE", self.reviews))
        positive = list(filter(lambda x: x.sentiment == "POSITIVE", self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [28]:
import json

filename = 'C:\\Users\\703095428\\Desktop\\Apu\\Sentiment analysis\\books_small_10000.json'

reviews =[]
with open(filename) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        

In [29]:
print(reviews[6].sentiment)
print(len(reviews))

NEGATIVE
10000


'books.json'

### Data Prep

In [30]:

from sklearn.model_selection import train_test_split

train, test = train_test_split(reviews, test_size=0.33)


train_container = ReviewContainer(train)

test_container = ReviewContainer(test)

In [31]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

#### Bag of words vectorization

In [74]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

print(train_y.count("POSITIVE"))
print(train_y.count("NEGATIVE"))

0
0


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

In [33]:
print(train_x[1])
train_x_vectors[1]

This book should come w with a "R" reading. There where some funny parts but they where way over shadows by the trash.


<1x9531 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

## Classification

#### Linear SVM

In [34]:
from sklearn import svm

cl_svm = svm.SVC(kernel = "linear")
cl_svm.fit(train_x_vectors, train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

#### Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

cl_lg = LogisticRegression(max_iter=100)
cl_lg.fit( train_x_vectors, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Decission Tree Classifier

In [36]:
from sklearn.tree import DecisionTreeClassifier

cl_dtc = DecisionTreeClassifier()
cl_dtc.fit(train_x_vectors, train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

#### Random Forest Classifier

In [37]:
from sklearn.ensemble import RandomForestClassifier

cl_rf = RandomForestClassifier()
cl_rf.fit(train_x_vectors, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Evaluation

In [38]:
print(cl_svm.score(test_x_vectors, test_y))
print(cl_lg.score(test_x_vectors, test_y))
print(cl_dtc.score(test_x_vectors, test_y))
print(cl_rf.score(test_x_vectors, test_y))

0.8372641509433962
0.8278301886792453
0.6816037735849056
0.8042452830188679


In [39]:
#F1 Scores
from sklearn.metrics import f1_score
f1_score(test_y, cl_svm.predict(test_x_vectors), average=None, labels=["POSITIVE", "NEGATIVE"])

array([0.83610451, 0.83840749])

In [57]:
test_set = ['I dont like the book']
test_vectors = vectorizer.transform(test_set)

cl_svm.predict(test_vectors)

array(['NEGATIVE'], dtype='<U8')

### Tuning the model with GridSearch

In [63]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear','rbf'), 'C': (1,4,8,16,32,64)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=10)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32, 64),
                         'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
cl_gnb.score(test_x_vectors, test_y)

0.7739393939393939

In [64]:
print(clf.score(test_x_vectors, test_y))

0.8372641509433962


## Save model

##### save model

In [70]:
import pickle

with open(".//sentiment_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

##### load model

In [71]:
with open(".//sentiment_classifier.pkl", "rb") as f:
    loded_clf = pickle.load(f)

In [73]:
print(test_x[3])

loded_clf.predict(test_x_vectors[3])

This is a must read!  It will help you to grow in your faith and relationships with others.  Highly recommend!


array(['POSITIVE'], dtype='<U8')