## Data Class

In [1]:
import random

#this is a enum class
class Sentiment:
    NEGATIVE="NEGATIVE"
    NEUTRAL="NEUTRAL"
    POSITIVE="POSITIVE"
    
    
#lets make a class called "Review"
class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):#the scores
        if self.score <= 2:
            return Sentiment.NEGATIVE #prints out "NEGATIVE" string
        elif self.score == 3:
            return Sentiment.NEUTRAL  #prints out "NEUTRAL" string
        else: #score of 4 or 5
            return Sentiment.POSITIVE #prints out "POSITIVE" string
        

#lets make a class called "ReviewContainer" to even out the negatives and positives
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        positive_shrunk = positive[:len(negative)]#this shrinks the positive amount to aprox the negative amount
        self.reviews = negative + positive_shrunk#this is the sum of reviews we will have
        random.shuffle(self.reviews)#this randamly shuffles our positive and negative values
        
        

## Load Data

In [2]:
import json

#file_name = './books_small.json'#rightnow its raw text, NOT a dictionary
file_name = './books_small_10000.json'#rightnow its raw text, NOT a dictionary

reviews =[]#create an empty list or dictionary and call it "reviews" 
with open(file_name) as f:
    for line in f:
        #lets make the raw text into a dictionary so that we can work with it a bit better
        #name of the dictionary will be "review"
        review = json.loads(line)
        #now i can print any column i want, lets print "reviewText" and "overall" column from that dictionary
        #review text or comments, review score or number of stars
        reviews.append(Review(review["reviewText"],review["overall"]))
        
reviews[5].text

'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

## Prep Data

In [3]:
#we currently have 1000 reviews. Since this is a Machine Learning process, we want to simpli those 1000 reviews
#into training data and test data. sklearning has built in fuctions to facilitate that process for us

from sklearn.model_selection import train_test_split

#the function that divides or split that information for us is the "train_test_split()".
#it returns 2 different values, so we have to save them in 2 values
training, test = train_test_split(reviews, test_size=0.33, random_state=42) ##33% of our data will be test data and
#66$ will be training data

#calling the class "ReviewContainer" and making a train_contaner
train_container= ReviewContainer(training)
#calling the class "ReviewContainer" and making a test_contaner
test_container= ReviewContainer(test)



In [67]:
#lets evenly distribute the amount of positives and negatives for our train data
#for this we call the funtion "evenly_distribute"
train_container.evenly_distribute()

#we want to pass values into the "bag of words" vectorizer, so basically we want to pass text and return
#if its positive or negative
# "x" is text is was we pass into our model
# "y" is the sentiment, as in "POSITIVE" or "NEGATIVE"
#so here we have our training data
train_x = train_container.get_text()
train_y = train_container.get_sentiment()


#now lets evenly distribute the amount of positives and negatives for our test data
#for this we call the funtion "evenly_distribute"
test_container.evenly_distribute()
#and here we have the testing data
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

#count the amount of positives and negatives from our train data
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


436
436


#### Bag of words vectorization

In [78]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#for our TRAIN data
vectorizer = TfidfVectorizer() #Convert a collection of text documents to a matrix of token counts
train_x_vectors = vectorizer.fit_transform(train_x)#this returns a matrix of 1 and 0

#for our TEST data
test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())


I did not finish reading this novel.  I found it hard to care about what happened to the characters and could not get into the story.
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

#### 1) Linear SVM

In [79]:
from sklearn import svm
#clasfier svm
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### 2) Decision tree

In [80]:
from sklearn.tree import DecisionTreeClassifier
#clasfier decision
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### 3) Naive Bayes

In [81]:
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
#clasfier gaussianNB
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

#### 4) Logistic Regression

In [82]:
from sklearn.linear_model import LogisticRegression


#clasfier logistics
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [83]:
#Mean Accuracy on all our test labels
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
#print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6442307692307693
0.8052884615384616


In [84]:
#F1 Scores
from sklearn.metrics import f1_score

#here we use TEST data, not TRAING
f1_score(test_y, clf_svm.predict(test_x_vectors), average =None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
#the prints values: 1st is the % chance of POSITIVE, 2do is NEUTRAL, 3rd is NEGATIVE
#f1_score(test_y, clf_log.predict(test_x_vectors), average =None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.80582524, 0.80952381])

In [72]:
#now we're gonna do a little investigacion of our data

#train_y.count(Sentiment.POSITIVE)#gives us 5611 positive reviews
#test_y.count(Sentiment.NEGATIVE)#gives us 436 negative revires

#so what we need is kinda like even out the values, right now theyre to far apart and the modal is more
#bias to always go for "Positive" values
#for this we will make a Container call "ReviewContainer". This is a class defined in the first ce

In [86]:
#lets try passing some reviews to see how it clasifies them
#test_set = ['I thoroughly enjoyed this, 5 stars',"bad book do not buy","horrible waste of time"]
test_set = ['not great, 5 stars',"i loved it","i thought it was bad"]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)


array(['POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

## Tuning our model (with Grid Search)

In [89]:
from sklearn.model_selection import GridSearchCV

#lets make a dictionary object called "parameter"
parameters = {'kernel': ('linear', 'rbf'), 'C':(1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv =5)
clf.fit(train_x_vectors, train_y)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [90]:
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


## Saving Model

In [92]:
import pickle

with open('./sentiment_classifier.pkl', 'wb') as f: #"wb" is write buffer
    pickle.dump(clf,f)#taking the value in "clf" and saving it in "f" which is our file

## Load Model

In [94]:
with open('./sentiment_classifier.pkl', 'rb') as f: #"rb" is write buffer
    loaded_clf = pickle.load(f) #load in the file
    
print(test_x[0])#prueba para ver si de verdad load todo
loaded_clf.predict(test_x_vectors[0])

Rating is for Kindle version only.No links to Endnotes in Kindle version.  No pics in Kindle version.


array(['POSITIVE'], dtype='<U8')