In [51]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [36]:
# Load the Yelp reviews CSV directly from the tutorial's GitHub repository.
# index_col=0 promotes the file's first column to the row index instead of
# keeping it as an ordinary data column.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/yelp.csv"
df = pd.read_csv(filepath_or_buffer=url, index_col=0)

In [37]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0_level_0,date,review_id,stars,text,type,user_id,cool,useful,funny
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [38]:
# Keep only the extreme ratings: rows whose `stars` value is exactly 1 or 5.
df1 = df[df["stars"].isin([1, 5])]

In [39]:
# Features are the raw review text; the target is the star rating of each
# review in the filtered frame.
X = df1["text"]
y = df1["stars"]

In [40]:
# Hold out 30% of the reviews for testing. The fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=32,
)

In [41]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words="english")
# Learn the vocabulary from the training reviews only and encode them as a
# document-term matrix.
X_train_dtm = cv.fit_transform(X_train)
# Encode the test reviews with the already-fitted vocabulary (no refitting).
X_test_dtm = cv.transform(X_test)

In [42]:
# Create a multinomial Naive Bayes classifier and fit it to the training
# document-term matrix in one step (fit returns the estimator itself).
clf = MultinomialNB().fit(X_train_dtm, y_train)
# Predict star ratings for the held-out reviews.
y_pred = clf.predict(X_test_dtm)
# Mean accuracy on the test set; shown as the cell's output.
clf.score(X_test_dtm, y_test)

0.92822185970636217

In [43]:
# Confusion matrix for the Naive Bayes model (rows: actual stars, columns:
# predicted stars). Predictions are taken from `clf` directly instead of the
# shared `y_pred` variable, which the SVM and logistic-regression cells also
# overwrite, so this matrix describes Naive Bayes no matter which model cell
# was executed most recently.
print(confusion_matrix(y_test, clf.predict(X_test_dtm)))

[[154  63]
 [ 25 984]]


In [44]:
# Show the per-class feature counts the Naive Bayes model accumulated during
# fitting (one row per class; columns presumably follow the vectorizer's
# vocabulary order).
nb_feature_counts = clf.feature_count_
print(nb_feature_counts)

[[ 31.   3.   1. ...,   0.   0.   0.]
 [ 34.   9.   0. ...,   4.   1.   1.]]


In [45]:
## Try different models to compare the results.
# Linear support vector classifier trained on the same document-term matrix.
# LinearSVC comes from the import cell at the top of the notebook, so every
# dependency is declared in one place.
# random_state fixes the solver's internal data shuffling so the reported
# score is reproducible; 32 matches the seed used for the train/test split.
svm = LinearSVC(random_state=32)
svm.fit(X_train_dtm, y_train)
# Predict star ratings for the held-out reviews.
y_pred = svm.predict(X_test_dtm)
# Mean accuracy on the test set; shown as the cell's output.
svm.score(X_test_dtm, y_test)

0.92577487765089728

In [46]:
# Confusion matrix for the linear SVM (rows: actual stars, columns: predicted
# stars). Predictions come from `svm` directly rather than the shared `y_pred`
# variable, which other model cells overwrite, so re-running cells out of
# order cannot make this matrix describe a different model.
print(confusion_matrix(y_test, svm.predict(X_test_dtm)))

[[157  60]
 [ 31 978]]


In [47]:
# Initialize logistic regression with its default settings.
reg = LogisticRegression()
# Fit the model to the training document-term matrix.
reg.fit(X=X_train_dtm, y=y_train)
# Predict star ratings for the held-out reviews.
y_pred = reg.predict(X=X_test_dtm)
# Calculate the accuracy of the model on the test set; shown as the cell's output.
reg.score(X=X_test_dtm, y=y_test)

0.93393148450244701

In [48]:
# Print the confusion matrix for logistic regression (rows: actual stars,
# columns: predicted stars). Predictions come from `reg` directly rather than
# the shared `y_pred` variable, which other model cells overwrite, so the
# matrix always reflects this model regardless of cell execution order.
print(confusion_matrix(y_test, reg.predict(X_test_dtm)))

[[156  61]
 [ 20 989]]


In [52]:
# Per-class precision, recall and F1 for the logistic-regression model.
# Predictions are recomputed from `reg` instead of reading the shared `y_pred`
# variable, which every model cell overwrites; otherwise this report would
# silently describe whichever model happened to run last.
print(classification_report(y_test, reg.predict(X_test_dtm)))

             precision    recall  f1-score   support

          1       0.89      0.72      0.79       217
          5       0.94      0.98      0.96      1009

avg / total       0.93      0.93      0.93      1226

