### Data Class

In [83]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

### Loading Data

In [84]:
import json

file_name= r"C:\Users\delhi\Desktop\ML_PROJECT 2\Books_small_10000.json"


reviews= []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)             # creationg a dictionary so that we can use review['reviewText']
        reviews.append(Review(review['reviewText'], review['overall']))
#         print(review['reviewText'])
#         print(review['overall'])


# reviews[5]
# reviews[5][0]
# reviews[5][1]
# reviews[5].text
reviews[5].sentiment


        

'POSITIVE'

### Data Prepration

In [85]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [86]:
# train_x = [x.text for x in training]
# train_y = [x.sentiment for x in training]      DOING IT IN THE CLASS REVIEWCONTAINER    

# test_x =  [x.text for x in test]
# test_y =  [x.sentiment for x in test]

train_container.evenly_distribute()  # CALLING THE FUNCTION TO EVENLY DISTRIBUTE POSITIVE AND NEGATIVE SENTIMENT
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()  # TO IMPROVE F1 SCORE AND TO MAKE POSITIVE AND NEGATIVE EVEN
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


436
436


### Bag of words vectorization

In [87]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# This book is great !
# This book was so bad

# vectorizer = CountVectorizer()
vectorizer =  TfidfVectorizer()  # fine tuning the model
train_x_vectors = vectorizer.fit_transform(train_x) # fit and transform in the same step, can also be done separately

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())


# train_x_vectors # the two attributes on which we'll build the model
# train_y

I haven't read the previous tales from the Clifton Chronicles so i was pleasantly suprised. The villains are picturesque, the plot is unusual and the book is vintage Jeffrey Archer. Without reading the previous editions I don't have a full picture of the legacy of the Clifton family but enjoyable reading nevertheless.
[[0. 0. 0. ... 0. 0. 0.]]


### Classification

#### Linear SVM

In [88]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Decision Tree

In [89]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [90]:
from sklearn.naive_bayes import  MultinomialNB

clf_mnb =  MultinomialNB()
clf_mnb.fit(train_x_vectors, train_y)

clf_mnb.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [91]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Evaluating the algorithms

In [92]:
# Mean Accuracy
# comparing test vectors predicted values with actual values in test_y and measuring the accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_mnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6634615384615384
0.8125
0.8052884615384616


In [93]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,  Sentiment.NEGATIVE])
f1_score(test_y, clf_mnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,  Sentiment.NEGATIVE])
# f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])


array([0.79144385, 0.82969432])

In [94]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

#### Fine tuning with Grid Search

In [95]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [96]:
print(clf.score(test_x_vectors, test_y)) # improved accuracy by 1 point

0.8197115384615384


### Saving

In [99]:
import pickle

with open(r"C:\Users\delhi\Desktop\ML_PROJECT 2\Sentiment_classifier_Model\entiment_classifier.pkl", 'wb') as f:
    pickle.dump(clf, f)

#### Loading model

In [100]:
with open(r"C:\Users\delhi\Desktop\ML_PROJECT 2\Sentiment_classifier_Model\entiment_classifier.pkl", 'rb') as f:
    loaded_clf = pickle.load(f)

In [101]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

Unless you want to laugh at the ridiculousness of it all don't buy this. Thankfully I got this one for free but I should be paid for having read that. And before everyone gets up in arms telling me I have no right to trash anyone's work who was brave enough to publish l just have to quote this marvelous story and you will understand me: "her vaginal walls screeched" Yes, that's right, a direct quote. That does not sound like any woman's orgasm that I have ever read or heard about or experienced. That sounds painful. And the descriptions just get worse and worse.


array(['NEGATIVE'], dtype='<U8')