In [1]:
# for Python 2: use print only as a function
from __future__ import print_function
from sklearn.feature_extraction.text import CountVectorizer
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# import from model_selection first and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# load the reviews dataset and check its dimensions (rows, columns)
import pandas as pd
df = pd.read_csv('reviews.csv')
df.shape

(27320, 14)

In [3]:
# show the first row to see which columns the file provides
df.head(1)

Unnamed: 0,raw_review,id,review,real_category,lda_catogories,lda_catogories_12,lda_catogories_4,lda_catogories_5,lda_catogories_6,lda_catogories_7,lda_catogories_8,lda_catogories_9,lda_catogories_10,lda_catogories_11
0,I love this phone so much! I had this phone f...,B006QMZCT0_0,"[u'love', u'phone', u'much', u'phone', u'year'...",mobilephone,"[('cameras', 0.353), ('tablets', 0.169), ('lap...","[('cameras', 0.101), ('tablets', 0.102), ('TVs...","[('laptops', 0.118), ('mobilephone', 0.314), (...","[('TVs', 0.242), ('tablets', 0.108), ('laptops...","[('mobilephone', 0.976), ('video_surveillance'...","[('TVs', 0.279), ('mobilephone', 0.18), ('vide...","[('tablets', 0.059), ('laptops', 0.061), ('tab...","[('laptops', 0.022), ('laptops', 0.242), ('TVs...","[('tablets', 0.074), ('tablets', 0.163), ('tab...","[('laptops', 0.029), ('cameras', 0.174), ('lap..."


In [4]:
# keep only the raw review text and its true category; .copy() makes reviews_df
# an independent frame, so the column added to it later does not touch df
reviews_df = df[['raw_review', 'real_category']].copy()

In [5]:
# examine the first 5 rows
reviews_df.head(5)

Unnamed: 0,raw_review,real_category
0,I love this phone so much! I had this phone f...,mobilephone
1,This phone often just freezes or turns itself ...,mobilephone
2,What I would have like to know before I purcha...,mobilephone
3,I bought two of these phones for my kids. Bot...,mobilephone
4,"Liked the phone, but after a while I started h...",mobilephone


In [6]:
# count reviews per category to check how balanced the classes are
reviews_df.real_category.value_counts()

mobilephone           4812
video_surveillance    4675
cameras               4578
tablets               4482
TVs                   4461
laptops               4312
Name: real_category, dtype: int64

In [7]:
# convert label to a numerical variable
CATEGORY_TO_NUM = {'mobilephone': 0, 'cameras': 1, 'video_surveillance': 2,
                   'TVs': 3, 'tablets': 4, 'laptops': 5}
reviews_df['real_category_num'] = reviews_df.real_category.map(CATEGORY_TO_NUM)
# Series.map yields NaN for any label missing from the dict; fail loudly here
# instead of letting NaN targets reach the classifiers.
assert reviews_df['real_category_num'].notnull().all(), 'unmapped category in real_category'

In [8]:
# confirm the numeric label column was added
reviews_df.head(2)

Unnamed: 0,raw_review,real_category,real_category_num
0,I love this phone so much! I had this phone f...,mobilephone,0
1,This phone often just freezes or turns itself ...,mobilephone,0


In [9]:
# define X (raw review text) and y (numeric category label) for use with CountVectorizer
X = reviews_df['raw_review']
y = reviews_df['real_category_num']
for series in (X, y):
    print(series.shape)

(27320,)
(27320,)


In [10]:
# hold out 20% of the reviews for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(21856,)
(5464,)
(21856,)
(5464,)


In [11]:
# instantiate CountVectorizer with non-default settings:
#   max_df=0.7           -> drop terms that occur in more than 70% of the documents
#   min_df=2             -> drop terms that occur in fewer than 2 documents
#   stop_words='english' -> remove scikit-learn's built-in English stop words
#   ngram_range=(1, 3)   -> extract unigrams, bigrams and trigrams
vect = CountVectorizer(max_df=0.7, min_df=2, stop_words='english', ngram_range=(1, 3))

In [12]:
# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = vect.fit_transform(X_train)

In [13]:
# examine the document-term matrix
X_train_dtm

<21856x324146 sparse matrix of type '<type 'numpy.int64'>'
	with 1920844 stored elements in Compressed Sparse Row format>

In [14]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<5464x324146 sparse matrix of type '<type 'numpy.int64'>'
	with 376785 stored elements in Compressed Sparse Row format>

# Naive Bayes model

In [15]:
# instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()

In [16]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 284 ms, sys: 0 ns, total: 284 ms
Wall time: 303 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [18]:
# calculate accuracy of class predictions
metrics.accuracy_score(y_test, y_pred_class)

0.9092240117130308

In [19]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[862,   2,  19,   0,  55,   3],
       [  5, 867,  38,   4,  33,   3],
       [  3,   4, 860,   4,  12,   2],
       [  4,   3,  47, 753,  43,   6],
       [ 13,   2,  31,   4, 826,   9],
       [  5,   2,  21,   3, 116, 800]])

# Logistic regression model


In [20]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [21]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 1min 46s, sys: 3.15 s, total: 1min 49s
Wall time: 1min 27s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [23]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.94125183016105418

In [24]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[909,   5,   3,   3,  13,   8],
       [  4, 913,   8,   9,  11,   5],
       [ 11,  15, 826,  11,  16,   6],
       [  6,   0,   6, 829,  10,   5],
       [ 13,   4,  10,  17, 817,  24],
       [  7,   3,  11,  16,  61, 849]])

# KNN model

In [25]:
# try K=1, record testing accuracy
# NOTE: KNeighborsClassifier defaults to Minkowski distance with p=2 (Euclidean),
# and it is applied here to the raw, unnormalized term counts
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_dtm, y_train)
y_pred = knn.predict(X_test_dtm)
metrics.accuracy_score(y_test, y_pred)

0.57430453879941434

In [26]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred)

array([[675,  39,   7, 205,  10,   5],
       [  8, 691,   4, 193,  51,   3],
       [ 23,  85, 254, 439,  70,  14],
       [  4,  34,   9, 806,   3,   0],
       [ 19,  67,   9, 440, 336,  14],
       [ 10,  64,  19, 422,  56, 376]])

# SVM model

In [27]:
# import and instantiate a support vector classifier with a linear kernel
from sklearn import svm
svm_clf = svm.SVC(kernel='linear')
# train the model using X_train_dtm (timing it with the %time magic command)
%time svm_clf.fit(X_train_dtm, y_train)

CPU times: user 2min 10s, sys: 1.26 s, total: 2min 11s
Wall time: 2min 21s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
# make class predictions for X_test_dtm
y_pred_class = svm_clf.predict(X_test_dtm)

In [29]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.92789165446559296

In [30]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[900,   3,   8,   7,  19,   4],
       [  5, 898,  14,  14,  12,   7],
       [ 12,  13, 818,  15,  21,   6],
       [  7,   0,  11, 816,  16,   6],
       [ 12,   7,  17,  19, 800,  30],
       [  8,   4,  14,  21,  62, 838]])

![alt text](http://i.imgur.com/5ZriKGT.png "Title")