### Decision Tree Classifier on Amazon Fine Food Reviews

DataSet Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

In [2]:
#Importing all the Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import cross_val_score
from collections import Counter
from sklearn import cross_validation
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Loding Bow data:
import pickle
with open('train_bow.pickle','rb') as handle:
    train_bow = pickle.load(handle)
with open('test_bow.pickle','rb') as handle:
    test_bow = pickle.load(handle)

In [4]:
#Loding Tfidf data:
import pickle
with open('train_tfidf.pickle','rb') as handle:
    train_tfidf = pickle.load(handle)
with open('test_tfidf.pickle','rb') as handle:
    test_tfidf = pickle.load(handle)

In [5]:
#Loding Avg Word2Vec data:
import pickle
with open('train_avg_word2vec.pickle','rb') as handle:
    train_avg_word2vec = pickle.load(handle)
with open('test_avg_word2vec.pickle','rb') as handle:
    test_avg_word2vec = pickle.load(handle)

In [6]:
#Loding Avg Word2Vec data:
import pickle
with open('train_tfidf_word2vec.pickle','rb') as handle:
    train_tfidf_word2vec = pickle.load(handle)
with open('test_tfidf_word2vec.pickle','rb') as handle:
    test_tfidf_word2vec = pickle.load(handle)

In [7]:
with open('y_train.pickle','rb') as handle:
    y_train = pickle.load(handle)
with open('y_test.pickle','rb') as handle:
    y_test = pickle.load(handle)

In [8]:
with open('y_train_w.pickle','rb') as handle:
    y_train_w = pickle.load(handle)
with open('y_test_w.pickle','rb') as handle:
    y_test_w = pickle.load(handle)

In [9]:
#Feature Normalization
from sklearn.preprocessing import normalize
#Bow features
train_bow_normalize = normalize(train_bow, axis=0)
test_bow_normalize = normalize(test_bow, axis=0)
#Tfidf Features
train_tfidf_normalize = normalize(train_tfidf, axis=0)
test_tfidf_normalize = normalize(test_tfidf, axis=0)
#Avg word2Vec Features
train_avgw2v_normalize = normalize(train_avg_word2vec, axis=0)
test_avgw2v_normalize = normalize(test_avg_word2vec, axis=0)
#Tfidf Weighted Word2Vec
train_tfidfw2v_normalize = normalize(train_tfidf_word2vec, axis=0)
test_tfidfw2v_normalize = normalize(test_tfidf_word2vec, axis=0)

### Featurization: Bag of words

In [10]:
#Creating list of depths for DecisionTrees
depthList = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for d in depthList:
    DT = DecisionTreeClassifier(max_depth=d)
    scores = cross_val_score(DT, train_bow_normalize, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]

#Determining best alpha
optimal_d = depthList[MSE.index(min(MSE))]
print('\nThe optimal value of d is %d.' % optimal_d)

print("The misclassification error for each 'd' value is : ", np.round(MSE,4))


The optimal value of d is 8.
The misclassification error for each 'd' value is :  [0.1447 0.1436 0.1377 0.1341 0.1311 0.1303 0.1283 0.1259 0.126  0.127
 0.1271 0.1284 0.1289 0.13   0.1334 0.1326 0.1339 0.1349 0.1364 0.1373
 0.138  0.1391 0.1411 0.142  0.1413 0.1417 0.1429 0.1449 0.1431]


In [11]:
#Create a DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=optimal_d)
#Train the model using the training sets 
model.fit(train_bow_normalize, y_train)
#Predict Output 
predicted= model.predict(test_bow_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test, predicted) * 100
print('\nThe accuracy of the DT classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test, predicted)
print(conf_matrix)


The accuracy of the DT classifier is 84.566667%
(array([0.65584416, 0.85593816]), array([0.19765166, 0.97870631]), array([0.3037594 , 0.91321462]), array([ 511, 2489], dtype=int64))
[[ 101  410]
 [  53 2436]]


### Featurization: Tfidf

In [12]:
#Creating list of depths for DecisionTrees
depthList = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for d in depthList:
    DT = DecisionTreeClassifier(max_depth=d)
    scores = cross_val_score(DT, train_tfidf_normalize, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]

#Determining best alpha
optimal_d = depthList[MSE.index(min(MSE))]
print('\nThe optimal value of d is %d.' % optimal_d)

print("The misclassification error for each 'd' value is : ", np.round(MSE,4))


The optimal value of d is 9.
The misclassification error for each 'd' value is :  [0.1451 0.1417 0.1377 0.1359 0.1311 0.1303 0.129  0.128  0.1269 0.1271
 0.128  0.1304 0.1309 0.1287 0.1311 0.1301 0.1341 0.1349 0.1363 0.1363
 0.1377 0.1401 0.1409 0.1411 0.1434 0.1441 0.1431 0.1464 0.1464]


In [13]:
#Create a DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=optimal_d)
#Train the model using the training sets 
model.fit(train_tfidf_normalize, y_train)
#Predict Output 
predicted= model.predict(test_tfidf_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test, predicted) * 100
print('\nThe accuracy of the DT classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test, predicted)
print(conf_matrix)


The accuracy of the DT classifier is 84.966667%
(array([0.66666667, 0.86134752]), array([0.23483366, 0.97589393]), array([0.34732272, 0.91504992]), array([ 511, 2489], dtype=int64))
[[ 120  391]
 [  60 2429]]


### Featurization: Avg Word2Vec

In [14]:
#Creating list of depths for DecisionTrees
depthList = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for d in depthList:
    DT = DecisionTreeClassifier(max_depth=d)
    scores = cross_val_score(DT, train_avgw2v_normalize, y_train_w, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]

#Determining best alpha
optimal_d = depthList[MSE.index(min(MSE))]
print('\nThe optimal value of d is %d.' % optimal_d)

print("The misclassification error for each 'd' value is : ", np.round(MSE,4))


The optimal value of d is 1.
The misclassification error for each 'd' value is :  [0.1444 0.1444 0.1463 0.1506 0.1526 0.1621 0.1701 0.1786 0.1827 0.1874
 0.194  0.198  0.1981 0.2071 0.211  0.2157 0.2174 0.2216 0.2199 0.225
 0.2217 0.2233 0.2267 0.2266 0.226  0.225  0.2294 0.2234 0.2254]


In [15]:
#Create a DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=optimal_d)
#Train the model using the training sets 
model.fit(train_avgw2v_normalize, y_train_w)
#Predict Output 
predicted= model.predict(test_avgw2v_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test_w, predicted) * 100
print('\nThe accuracy of the DT classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test_w, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test_w, predicted)
print(conf_matrix)


The accuracy of the DT classifier is 82.966667%
(array([0.        , 0.82966667]), array([0., 1.]), array([0.        , 0.90690472]), array([ 511, 2489], dtype=int64))
[[   0  511]
 [   0 2489]]


### Featurization: Tfidf Weighted Word2Vec

In [16]:
#Creating list of depths for DecisionTrees
depthList = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for d in depthList:
    DT = DecisionTreeClassifier(max_depth=d)
    scores = cross_val_score(DT, train_tfidfw2v_normalize, y_train_w, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]

#Determining best alpha
optimal_d = depthList[MSE.index(min(MSE))]
print('\nThe optimal value of d is %d.' % optimal_d)

print("The misclassification error for each 'd' value is : ", np.round(MSE,4))


The optimal value of d is 1.
The misclassification error for each 'd' value is :  [0.1444 0.1444 0.1446 0.1483 0.1511 0.1581 0.1649 0.1674 0.176  0.1846
 0.1871 0.1896 0.1984 0.2009 0.2003 0.2049 0.2076 0.2087 0.2181 0.2154
 0.2111 0.2119 0.2193 0.216  0.2179 0.2156 0.2164 0.2183 0.2221]


In [17]:
#Create a DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=optimal_d)
#Train the model using the training sets 
model.fit(train_tfidfw2v_normalize, y_train_w)
#Predict Output 
predicted= model.predict(test_tfidfw2v_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test_w, predicted) * 100
print('\nThe accuracy of the DT classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test_w, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test_w, predicted)
print(conf_matrix)


The accuracy of the DT classifier is 82.966667%
(array([0.        , 0.82966667]), array([0., 1.]), array([0.        , 0.90690472]), array([ 511, 2489], dtype=int64))
[[   0  511]
 [   0 2489]]
