In [28]:
import pandas as pd
import numpy as np
import json
import random

In [44]:
class Sentiment:
    NEGATIVE ='NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else :
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x:x.sentiment == Sentiment.NEGATIVE,self.reviews))
        positive = list(filter(lambda x:x.sentiment == Sentiment.POSITIVE,self.reviews))
        positive_shrunk = positive[:len(negative)]
        
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        

In [45]:
data = pd.read_json('Books_small_10000.json',lines=True)#importing json file
review_df = pd.DataFrame(data[['reviewText','overall']])#creating dataframe with with two fields

###### creating reviews list with Review object 

In [46]:
reviews = []
for row in review_df.itertuples():#return tuples of columns items
    reviews.append(Review(row[1],row[2]))
   

### test train split

In [47]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

train_container.evenly_distribute()

test_container.evenly_distribute()


In [60]:
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()


print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))

208
208


### Bag of words vectorization || Feature extraction
#### convert a collection of text documents to a matrix of token counts

#### machine learning model love numerical vectors, metrices, bag of words as input but not string

In [71]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

#main data
#train_x_vectors
#train_y



### Classification
    #### predictive modeling problem where a class label is predicted for a given input example data

#### linear svm

In [72]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors,train_y) #fit is like training the model with data

clf_svm.predict(test_x_vectors[0]) #predicting the output for test data


array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [12]:
from sklearn.naive_bayes import GaussianNB

### Logistic regression

In [53]:
from sklearn.linear_model import LogisticRegression

In [74]:
clf_lr = LogisticRegression()
clf_lr.fit(train_x_vectors, train_y)

clf_lr.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

In [75]:
#mean accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_lr.score(test_x_vectors, test_y))

0.8076923076923077
0.6346153846153846
0.8052884615384616


In [76]:
#f1_score accuracy

from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE,Sentiment.POSITIVE]))
print(f1_score(test_y, clf_lr.predict(test_x_vectors), average=None, labels=[Sentiment.NEUTRAL, Sentiment.NEGATIVE,Sentiment.POSITIVE]))

[0.         0.80952381 0.80582524]
[0.         0.64651163 0.62189055]
[0.         0.80760095 0.80291971]


  _warn_prf(
  _warn_prf(
  _warn_prf(


In [89]:
test_set = ['great','i love it','neutral']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

#### Tuning our model (With Grid Search)

In [82]:

from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc,parameters, cv = 5)
clf.fit(train_x_vectors, train_y)



  

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [83]:
print(clf.score(test_x_vectors,test_y))

0.8100961538461539


### Saving Model

In [85]:
import pickle #package used to save model/classifier so that we don't have to train again

with open('./models/sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf,f)



### Load model

In [87]:
with open('./models/sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)

### we can also save vectorizer as pickle

In [93]:
with open('./models/vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)

In [90]:
loaded_clf.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')