## Data Class

In [None]:
import random

# Enum-like holder for the three sentiment labels (plain string constants).
class Sentiment:
    """Sentiment labels used throughout the notebook; each value equals its name."""
    NEGATIVE="NEGATIVE"
    NEUTRAL="NEUTRAL"
    POSITIVE="POSITIVE"
    
    
# A single review: its text, its numeric score and the sentiment label derived from the score.
class Review:
    """One review with a sentiment label computed from its score.

    Attributes:
        text: the review body, stored as given.
        score: the numeric rating, stored as given.
        sentiment: a Sentiment label computed once, at construction time.
    """

    def __init__(self, text, score):
        self.text, self.score = text, score
        # Derived eagerly, so later changes to `score` do not update the label.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label that corresponds to ``self.score``.

        A score of 2 or lower is NEGATIVE, a score of exactly 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        # Anything left over (e.g. 4 or 5) counts as positive.
        return Sentiment.POSITIVE
        

# Wraps a list of reviews and can balance the negative and positive classes.
class ReviewContainer:
    """A collection of Review-like objects (each exposing ``text`` and ``sentiment``).

    Attributes:
        reviews: the list of reviews currently held; replaced by
            ``evenly_distribute``.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every held review, in order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every held review, in order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE.

        Both classes are trimmed to the size of the smaller one, so the result
        is balanced regardless of which class is the majority. Reviews with any
        other sentiment (e.g. NEUTRAL) are dropped. The kept reviews are then
        shuffled in place with the module-level ``random`` generator.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the smaller count; truncating only the positives
        # leaves the set unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        # Shuffle so the two classes are interleaved rather than grouped.
        random.shuffle(self.reviews)
        
        

## Load Data

In [None]:
import json


# Raw dataset: a text file that is parsed one JSON object per line.
file_name = './database/books_small_10000.json'

reviews = []
# Pin the encoding so the read does not depend on the platform's default locale
# (assumes the file is UTF-8, the standard encoding for JSON text).
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines, which json.loads would reject
        review = json.loads(line)
        # Keep only the two fields used below: the review body and its rating.
        reviews.append(Review(review["reviewText"], review["overall"]))

# Quick look at one loaded review.
reviews[5].text

## Prep Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews as test data; random_state=42 makes the split repeatable.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each partition in a ReviewContainer so it can be balanced and unpacked later.
train_container, test_container = (
    ReviewContainer(partition) for partition in (training, test)
)



In [None]:
# evenly_distribute() shuffles with the stdlib `random` module; seed it so the
# resulting order is the same on every run of this cell.
random.seed(42)

# Balance the training set, then pull out its texts (x) and labels (y).
train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()


# Same for the test set. Note that the test data is balanced as well, so the
# scores below are measured on an evenly split negative/positive sample.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Count the positives and negatives in the training labels.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


#### Bag-of-words vectorization (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TRAIN data: fit the vectorizer on the training texts and transform them in one step.
vectorizer = TfidfVectorizer() 
train_x_vectors = vectorizer.fit_transform(train_x)

# TEST data: transform only, reusing the vectorizer fitted on the training texts.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review next to its vector, converted to an array for printing.
print(train_x[0])
print(train_x_vectors[0].toarray())


## Classification

#### 1) Linear SVM

In [None]:
from sklearn import svm
# Support vector classifier with a linear kernel.
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

# Only the last expression of a cell is displayed, so print the review text
# explicitly to see it alongside the prediction below.
print(test_x[0])

# Predicted label for that same first test review.
clf_svm.predict(test_x_vectors[0])

#### 2) Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree classifier. random_state is fixed so the fitted tree (and its
# score) is the same on every run, matching the seeded train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_dec.predict(test_x_vectors[0])

#### 3) Naive Bayes

In [None]:
#for reviewing
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
# Gaussian Naive Bayes classifier.
clf_gnb = GaussianNB()
# GaussianNB requires dense input, while the vectorizer output is a sparse
# matrix, so convert with .toarray() before fitting and predicting.
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Predicted label for the first test review.
clf_gnb.predict(test_x_vectors[0].toarray())

#### 4) Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


# Logistic regression classifier with default settings.
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_log.predict(test_x_vectors[0])

## Evaluation

In [None]:
# Mean accuracy of each fitted classifier on the test vectors and labels,
# printed in the order: SVM, decision tree, logistic regression.
# clf_gnb is not scored here.
for fitted_model in (clf_svm, clf_dec, clf_log):
    print(fitted_model.score(test_x_vectors, test_y))

In [None]:
#F1 Scores
from sklearn.metrics import f1_score

# Evaluate on the TEST data, not the TRAINING data.
# average=None yields one F1 score per label, in the order given by `labels`.
f1_score(test_y, clf_svm.predict(test_x_vectors), average =None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])


In [None]:
# Try a few hand-written reviews to see how the SVM classifies them.
# Alternative examples:
#test_set = ['I thoroughly enjoyed this, 5 stars',"bad book do not buy","horrible waste of time"]
test_set = ['not great, 5 stars',"i loved it","i thought it was bad"]
# New text must go through the same fitted vectorizer (transform only) before predicting.
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)


## Tuning our model (with Grid Search)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search: two kernels crossed with five values of C.
parameters = {'kernel': ('linear', 'rbf'), 'C':(1,4,8,16,32)}

svc = svm.SVC()
# cv=5: each parameter combination is evaluated with 5-fold cross-validation.
clf = GridSearchCV(svc, parameters, cv =5)
clf.fit(train_x_vectors, train_y)


In [None]:
# Score of the fitted grid-search object on the test vectors and labels.
print(clf.score(test_x_vectors, test_y))

## Saving Model

In [None]:
import pickle

# Persist the fitted grid-search classifier to disk with pickle.
# NOTE(review): only `clf` is saved; the fitted `vectorizer` is not, so a fresh
# session would have no way to turn raw text into features for this model.
# Consider saving the vectorizer as well.
with open('./sentiment_classifier.pkl', 'wb') as f: 
    pickle.dump(clf,f)

## Load Model

In [None]:
# Restore the classifier from disk.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you
# created yourself or otherwise trust.
with open('./sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f) 
    
# Sanity check: print the first test review and the reloaded model's prediction for it.
# This relies on test_x / test_x_vectors still being defined from the cells above.
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])