In [1]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#Import Text Libraries
import re
import string
from wordcloud import STOPWORDS
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

#Import Other Libraries
from sklearn import metrics
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

#Import Classifers Libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Data Exploration

In [2]:
#Pull digital music review data from Json file
file = '/Users/mac/Dropbox/Thinkful_Coursework/Course/Unit_3/Lesson 6/Amazon Reviews/digital_music.json'
df = pd.read_json(file, lines=True)

In [3]:
# Number of rows and columns
x = 93
print('Shape of dataframe')
print('-' *x)
print('No. of Rows:', df.shape[0])
print('No. of Columns:', df.shape[1])
print('-' *x)

# Check data types
print('Column Data Types')
print('-' *x)
print(df.dtypes)
print('-' *x)

# Check for missing data
print('Missing Data in Dataframe')
print('-' *x)
print(df.isnull().sum().sort_values(ascending=False))
print('-' *x)

Shape of dataframe
---------------------------------------------------------------------------------------------
No. of Rows: 64706
No. of Columns: 9
---------------------------------------------------------------------------------------------
Column Data Types
---------------------------------------------------------------------------------------------
asin              object
helpful           object
overall            int64
reviewText        object
reviewTime        object
reviewerID        object
reviewerName      object
summary           object
unixReviewTime     int64
dtype: object
---------------------------------------------------------------------------------------------
Missing Data in Dataframe
---------------------------------------------------------------------------------------------
reviewerName      177
unixReviewTime      0
summary             0
reviewerID          0
reviewTime          0
reviewText          0
overall             0
helpful             0
asin           

In [4]:
#Create Sentiment Feature to classify review scores
df['sentiment'] = np.where(df['overall'] >= 4, 1, 0)

In [5]:
#Check classes for imbalance
print('Postive reviews represent', round(df['sentiment'].value_counts()[1]/len(df) * 100), '% of the dataset')
print('Negative reviews represent', round(df['sentiment'].value_counts()[0]/len(df) * 100), '% of the dataset')

Postive reviews represent 81.0 % of the dataset
Negative reviews represent 19.0 % of the dataset


# Data Cleaning and Model Preparation

In [6]:
#Clean review text
cleanup_re = re.compile('[^a-z]+')
def cleanup(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    sentence = cleanup_re.sub(' ', sentence).strip()
    return sentence

#Apply cleanup function
df['review_text_clean'] = df['reviewText'].apply(cleanup)

In [7]:
#Create train and test dataframes
split_df = df[['review_text_clean' , 'sentiment']]
train_df = split_df.sample(frac=0.8,random_state=200)
test_df =  split_df.drop(train_df.index)

X_train = train_df['review_text_clean']
X_test = test_df['review_text_clean']

y_train = train_df['sentiment']
y_test = test_df['sentiment']

In [8]:
#Convert review text to get token count matrix of token
stopwords = set(STOPWORDS)
stopwords.remove("not")

count_vect = CountVectorizer(min_df=2 ,stop_words=stopwords , ngram_range=(1,2))
tfidf_transformer = TfidfTransformer()
sm = SMOTE(ratio = 1.0)

#Normalize train count matrix to tf-idf representation
X_train_counts = count_vect.fit_transform(X_train) 
X_train_res, y_train_res = sm.fit_sample(X_train_counts, y_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_res)

#Normalize test count matrix to tf-idf representation
X_new_counts = count_vect.transform(X_test)
X_test_res, y_test_res = sm.fit_sample(X_new_counts, y_test)
X_test_tfidf = tfidf_transformer.transform(X_test_res)

In [9]:
#Fit and Run Classifer Models

In [10]:
#Define various classifers for voting
lr = LogisticRegression(solver='lbfgs' , C=1000)
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
gnb = GaussianNB()
brb = BernoulliNB()
mnb = MultinomialNB()
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
svc = SVC()

In [11]:
#Fit MultinomialNB Classifer
mnb.fit(X_train_tfidf , y_train_res)
print("Multinomial Accuracy : {}".format(mnb.score(X_test_tfidf , y_test_res)))
y_pred_mnb = mnb.predict(X_test_tfidf)
print(confusion_matrix(y_test_res, y_pred_mnb))

#Fit BernoulliNB Classifer
brb.fit(X_train_tfidf, y_train_res)
print("Bernoulli Accuracy : {}".format(brb.score(X_test_tfidf , y_test_res)))
y_pred_brb = brb.predict(X_test_tfidf)
print(confusion_matrix(y_test_res, y_pred_brb))

#Fit Logistic Regression Classifer
lr.fit(X_train_tfidf, y_train_res)
print("Logistic Regression Accuracy : {}".format(lr.score(X_test_tfidf , y_test_res)))
y_pred_lr = lr.predict(X_test_tfidf)
print(confusion_matrix(y_test_res, y_pred_lr))

#Fit Gradient Booster Classifer
gb.fit(X_train_tfidf, y_train_res)
print("Gradient Booster Accuracy : {}".format(gb.score(X_test_tfidf , y_test_res)))
y_pred_gb = gb.predict(X_test_tfidf)
print(confusion_matrix(y_test_res, y_pred_gb))

Multinomial Accuracy : 0.7944631680308136
[[7566 2819]
 [1450 8935]]
Bernoulli Accuracy : 0.7563793933558016
[[7116 3269]
 [1791 8594]]
Logistic Regression Accuracy : 0.7957149735194993
[[6639 3746]
 [ 497 9888]]
Gradient Booster Accuracy : 0.7629754453538757
[[6621 3764]
 [1159 9226]]


In [12]:
#Fit Random Forest Classifer
rf.fit(X_train_tfidf, y_train_res)
print("Random Forest Accuracy : {}".format(rf.score(X_test_tfidf , y_test_res)))
y_pred_rf = rf.predict(X_test_tfidf)
print(confusion_matrix(y_test_res, y_pred_rf))

Random Forest Accuracy : 0.7100625902744343
[[5213 5172]
 [ 850 9535]]


In [13]:
# #Fit Decision Tree Classifer
# dtc.fit(X_train_tfidf, y_train_res)
# print("Decision Tree Accuracy : {}".format(dtc.score(X_test_tfidf , y_test_res)))
# y_pred_dtc = dtc.predict(X_test_tfidf)
# print(confusion_matrix(y_test_res, y_pred_dtc))

# Model Evaluation

In [14]:
#Use Voting Classifer with Hard Voting
vc = VotingClassifier(estimators=[('lr', lr), ('rf', rf),('gb',gb) ,('mnb',mnb),('brb', brb)], voting='hard')

vc.fit(X_train_tfidf, y_train_res)
vc.score(X_test_tfidf, y_test_res)

0.7992296581608088

In [15]:
y_pred_vc = vc.predict(X_test_tfidf)
vc_cm = confusion_matrix(y_test_res, y_pred_vc)
print(vc_cm)

[[6945 3440]
 [ 730 9655]]


In [16]:
neg_accuracy = (vc_cm[0][0]/(vc_cm[0][0] + vc_cm[0][1])) * 100
print(neg_accuracy)

pos_accuracy = (vc_cm[1][1]/(vc_cm[1][0] + vc_cm[1][1])) * 100
print(pos_accuracy)

66.87530091478094
92.97063071738084


# Conclusion 

The initial data set was highly unbalanced towards positive reviews, as 81% of reviews were positive and only 19% were negative.  Running classifers on the unbalanced dataset, produced an mean accuracy score of 80% with Logistic Regression performing the best at 88%.  However, these models were extremely poor at correctly classifying negative reviews at 17% for the Bernoulli model.  After using the smote oversampling method, the overall accuracy was reduced to 80%; however, the model accuracy for negative reviews increased to 68%.  