In [1]:
import json
file_name = "Books_small_10000.json"
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        print(review["reviewText"])
        print(review["overall"])
        break

I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.
5.0


In [3]:
import random

In [4]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score== 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

"""class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    def evenly_distribute(self):
        negative = list([x for x in self.reviews if x== Sentiment.NEGATIVE])
        positive = list([x for x in self.reviews if x== Sentiment.POSITIVE])
        print(len(negative))
        print(len(positive))"""

class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE,self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE,self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        


In [5]:
import json

file_name = "Books_small_10000.json"
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review["reviewText"],review["overall"]))


In [6]:
reviews[5].sentiment

'POSITIVE'

### Prep Data

In [7]:
from sklearn.model_selection import train_test_split


In [8]:
training, test = train_test_split(reviews,test_size = 0.33,random_state = 42)

In [9]:
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

train_container.evenly_distribute()
test_container.evenly_distribute()


In [10]:
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

In [11]:
print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.POSITIVE))

436
436


In [12]:
print(test_y.count(Sentiment.NEGATIVE))
print(test_y.count(Sentiment.POSITIVE))

208
208


### Bags of Words Vectorization

In [45]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [49]:
vectorizer = TfidfVectorizer()
train_x_vectors=vectorizer.fit_transform(train_x)
test_x_vectors=vectorizer.transform(test_x)

In [48]:
print(train_x[0])
print(train_x_vectors[0])

I have to wonder what demographic this book was intended for...Is it targeted for teens?  It is supposedly about teenage love (17 year olds) yet this book left me wondering if I lost my mind - especially given most people gave it 5 stars...I need to explain why I am baffled and concerned: The language/dialogue used in this book reads like an over-the-top soap opera x10 spoken by 65 year olds with a lot of life experience...The inner dialogues go on for pages, while at times, even the dialogues themselves are made up of 20 sentences to answer a simple question like &#34;How are you?&#34;  I was once 17 and have a 16 year old myself.  There is no way in hell any kids (because that's what you are at that age) today would ever use the kind of expressions these kids do in this book.  The heroine is a spoiled girl who, without reason, decides that she will never marry or have kids because she is too focused on having her dream career:  a professional cheerleader!  wow!  At 17 years old, she 

# Classification

### Linear SVM

In [50]:
from sklearn import svm

In [51]:
clf_svm = svm.SVC(kernel = "linear")
clf_svm.fit(train_x_vectors,train_y)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [18]:
test_x[10]
clf_svm.predict(test_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

### Decision Trees

In [19]:
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)
clf_dec.predict(test_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB

clf_nb = MultinomialNB()
clf_nb.fit(train_x_vectors, train_y)
clf_nb.predict(test_x_vectors[10])


array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

# Evaluation

In [22]:
# Mean accuracy
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_nb.score(test_x_vectors,test_y))
print(clf_log.score(test_x_vectors,test_y))

0.8076923076923077
0.6802884615384616
0.8125
0.8028846153846154


In [28]:
# F1 Score
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors),average = None, labels = ["POSITIVE","NEGATIVE","NEUTRAL"])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


array([0.80582524, 0.80952381, 0.        ])

In [30]:
train_y.count("NEGATIVE")

436

In [31]:
test_set = ["this is absolutely icredible","my mama said this is briliant book","maybe, all of you gonna dislike it"]

In [34]:
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [52]:
import pickle

with open("clf_svm","wb") as f:
    pickle.dump((clf_svm,vectorizer),f)