# Predicting a review is a Positive or Negative review
Data contains book reviews taken from amazon 

In [1]:
import random

class Sentiment:
    Negative = "NEGATIVE"
    Neutral = "NEUTRAL"
    Positive = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self): ## Method for defining what stars are taken as negative/positive reviews
        if self.score <= 2:
            return Sentiment.Negative
        elif self.score >= 4:
            return Sentiment.Positive
        else: 
            return Sentiment.Neutral

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self): ## Getting Equal numbers of negative and positive sentiments
        negative = list(filter(lambda x: x.sentiment == Sentiment.Negative, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.Positive, self.reviews))
        neutral = list(filter(lambda x: x.sentiment == Sentiment.Neutral, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk 
        random.shuffle(self.reviews)
        

In [2]:
# importing data
import json
file_name = './sklearn/data/Books_small_10000.json'
review = []
with open(file_name) as f:
    for line in f:
        text = json.loads(line)
        review.append(Review(text['reviewText'], text['overall'])) # Getting only reviews and stars from the data


## Splitting data in test and training sets

In [3]:

from sklearn.model_selection import train_test_split
training, test = train_test_split(review, test_size = 0.30 , random_state = 42)
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

### Solving Unbalanced dataset problem

In [4]:
# taking equal numbers of negative and positive reviews
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.Positive))
print(train_y.count(Sentiment.Negative))
print(train_y.count(Sentiment.Neutral))

461
461
0


### Converting text into vectors

In [7]:
## Bag of words vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)


 # Classification
### Linear SVM

In [8]:
from sklearn import svm
clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vector, train_y)
clf_svm.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf_dec = RandomForestClassifier()
clf_dec.fit(train_x_vector, train_y)
clf_dec.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()
clf_log.fit(train_x_vector, train_y)
clf_svm.predict(test_x_vector[0])


array(['NEGATIVE'], dtype='<U8')

In [12]:
from sklearn.naive_bayes import MultinomialNB
>>> clf_nb = MultinomialNB()
>>> clf_nb.fit(train_x_vector, train_y)
clf_nb.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

## Evaluating The Models

In [13]:
# Mean Accuracy
print("Linear SVM: ",clf_svm.score(test_x_vector,test_y))  
print("Random Forest: ",clf_dec.score(test_x_vector,test_y))   
print("Logistic Regression: ",clf_log.score(test_x_vector,test_y))
print("Naive Bayes Classifier: ",clf_nb.score(test_x_vector,test_y))



Linear SVM:  0.8387978142076503
Random Forest:  0.7868852459016393
Logistic Regression:  0.8224043715846995
Naive Bayes Classifier:  0.8224043715846995


#### Linear SVM has highest accuracy

### Testing

In [12]:

test_set = ['okay okay', "bad book do not buy", ' I can say if you are looking for a decent watch in this price go for it . I loved the features. Sound quality is good . For fitness measure purposes we can use this watch .You can see notifications on the watch .Go for it']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

#### Models are performing good only for one case (when reviews are Positive)

### Issue: In test data we have very low numbers of Negative and Positive reviews
So , added more data with equal numbers of positive and negative reviews



### Hyperparameter tuining SVM using Grid Search 


In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)} # Providing different parameters to choose from

clf_svm = svm.SVC()
clf1 = GridSearchCV(clf_svm, parameters, cv=5)
clf1.fit(train_x_vector, train_y)
print("Score: ",clf1.score(test_x_vector,test_y))## Score Improved


Score:  0.8415300546448088


In [19]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf,train_x_vector, train_y, cv=5)
print('Cross-Validation Accuracy Scores', scores)

Cross-Validation Accuracy Scores [0.77297297 0.85945946 0.82065217 0.85869565 0.8423913 ]


### HyperParameter Tuning Random Forest using Grid Search 

In [15]:
parameters = {'n_estimators': (10,100,500), 'max_features': (2,4,6)}
clf_dec = RandomForestClassifier()
clf = GridSearchCV(clf_dec, parameters, cv=5)
clf.fit(train_x_vector, train_y)
print("Best Parameter Score: ",clf.score(test_x_vector,test_y)) ## Score Improved

Best Parameter Score:  0.8306010928961749


In [21]:
scores = cross_val_score(clf,train_x_vector, train_y, cv=5)
print('Cross-Validation Accuracy Scores', scores)

Cross-Validation Accuracy Scores [0.81081081 0.84324324 0.8423913  0.83695652 0.84782609]


### Hyperparameter Tuining Naive Bayes

In [17]:
parameters={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10]}
clf_nb = MultinomialNB()
clf = GridSearchCV(clf_nb, parameters, cv=5)
clf.fit(train_x_vector, train_y)
print("Best Parameter Score: ",clf.score(test_x_vector,test_y))

Best Parameter Score:  0.8224043715846995


### Confusion matrix for SVM

In [24]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
predict = clf1.predict(test_x_vector)
metrics.confusion_matrix(test_y,predict)


array([[156,  27],
       [ 31, 152]], dtype=int64)

## Saving Model

In [110]:
import pickle

with open('./sklearn/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

#### Loading Model

In [111]:
with open('./sklearn/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)