### Random Forest Classifier on Amazon Food Reviews

DataSet Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

In [1]:
#Importing Necessary Python Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import cross_val_score
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')



In [2]:
#Loding Bow data:
import pickle
with open('train_bow.pickle','rb') as handle:
    train_bow = pickle.load(handle)
with open('test_bow.pickle','rb') as handle:
    test_bow = pickle.load(handle)

In [3]:
#Loding Tfidf data:
import pickle
with open('train_tfidf.pickle','rb') as handle:
    train_tfidf = pickle.load(handle)
with open('test_tfidf.pickle','rb') as handle:
    test_tfidf = pickle.load(handle)

In [4]:
#Loding Avg Word2Vec data:
import pickle
with open('train_avg_word2vec.pickle','rb') as handle:
    train_avg_word2vec = pickle.load(handle)
with open('test_avg_word2vec.pickle','rb') as handle:
    test_avg_word2vec = pickle.load(handle)

In [5]:
#Loding Avg Word2Vec data:
import pickle
with open('train_tfidf_word2vec.pickle','rb') as handle:
    train_tfidf_word2vec = pickle.load(handle)
with open('test_tfidf_word2vec.pickle','rb') as handle:
    test_tfidf_word2vec = pickle.load(handle)

In [6]:
with open('y_train.pickle','rb') as handle:
    y_train = pickle.load(handle)
with open('y_test.pickle','rb') as handle:
    y_test = pickle.load(handle)

In [7]:
with open('y_train_w.pickle','rb') as handle:
    y_train_w = pickle.load(handle)
with open('y_test_w.pickle','rb') as handle:
    y_test_w = pickle.load(handle)

In [8]:
#Feature Normalization
from sklearn.preprocessing import normalize
#Bow features
train_bow_normalize = normalize(train_bow, axis=0)
test_bow_normalize = normalize(test_bow, axis=0)
#Tfidf Features
train_tfidf_normalize = normalize(train_tfidf, axis=0)
test_tfidf_normalize = normalize(test_tfidf, axis=0)
#Avg word2Vec Features
train_avgw2v_normalize = normalize(train_avg_word2vec, axis=0)
test_avgw2v_normalize = normalize(test_avg_word2vec, axis=0)
#Tfidf Weighted Word2Vec
train_tfidfw2v_normalize = normalize(train_tfidf_word2vec, axis=0)
test_tfidfw2v_normalize = normalize(test_tfidf_word2vec, axis=0)

### Featurization: Bag of words

In [9]:
#Creating No of BaseLearners List.
baseLearners = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for b in baseLearners:
    RF = RandomForestClassifier(n_estimators=b,n_jobs=-1)
    scores = cross_val_score(RF, train_bow_normalize, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]

#Determining best No of base Learners.
optimal_b = baseLearners[MSE.index(min(MSE))]
print('\nThe optimal value of b is %d.' % optimal_b)

print("The misclassification error for each 'b' value is : ", np.round(MSE,4))


The optimal value of b is 12.
The misclassification error for each 'b' value is :  [0.1971 0.2386 0.1379 0.1594 0.1386 0.1427 0.1341 0.1274 0.128  0.1287
 0.1264 0.1213 0.1241 0.1236 0.1247 0.1233 0.124  0.1243 0.123  0.1231
 0.1273 0.125  0.1254 0.1234 0.1216 0.125  0.1251 0.1221 0.1251]


In [10]:
#Create a RandomForestClassifier
model = RandomForestClassifier(n_estimators=optimal_b,n_jobs=-1)
#Train the model using the training sets 
model.fit(train_bow_normalize, y_train)
#Predict Output 
predicted= model.predict(test_bow_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test, predicted) * 100
print('\nThe accuracy of the RandomForest Classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test, predicted)
print(conf_matrix)


The accuracy of the RandomForest Classifier is 85.733333%
(array([0.70048309, 0.86895811]), array([0.28375734, 0.9750904 ]), array([0.40389972, 0.91897009]), array([ 511, 2489], dtype=int64))
[[ 145  366]
 [  62 2427]]


### Featurization: Tfidf

In [11]:
#Creating No of BaseLearners List.
baseLearners = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for b in baseLearners:
    RF = RandomForestClassifier(n_estimators=b,n_jobs=-1)
    scores = cross_val_score(RF, train_tfidf_normalize, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]

#Determining best No of base Learners.
optimal_b = baseLearners[MSE.index(min(MSE))]
print('\nThe optimal value of b is %d.' % optimal_b)

print("The misclassification error for each 'b' value is : ", np.round(MSE,4))


The optimal value of b is 16.
The misclassification error for each 'b' value is :  [0.1836 0.234  0.1437 0.1506 0.1293 0.1374 0.1299 0.1271 0.128  0.1263
 0.1271 0.1243 0.1287 0.1254 0.1266 0.1206 0.129  0.1274 0.1251 0.123
 0.1281 0.1253 0.1263 0.127  0.1263 0.122  0.1259 0.1246 0.1253]


In [12]:
#Create a RandomForestClassifier
model = RandomForestClassifier(n_estimators=optimal_b,n_jobs=-1)
#Train the model using the training sets 
model.fit(train_tfidf_normalize, y_train)
#Predict Output 
predicted= model.predict(test_tfidf_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test, predicted) * 100
print('\nThe accuracy of the RandomForest Classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test, predicted)
print(conf_matrix)


The accuracy of the RandomForest Classifier is 85.300000%
(array([0.65350877, 0.86940837]), array([0.29158513, 0.96826035]), array([0.40324763, 0.91617563]), array([ 511, 2489], dtype=int64))
[[ 149  362]
 [  79 2410]]


### Featurization: Avg Word2Vec

In [13]:
#Creating No of BaseLearners List.
baseLearners = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for b in baseLearners:
    RF = RandomForestClassifier(n_estimators=b,n_jobs=-1)
    scores = cross_val_score(RF, train_avgw2v_normalize, y_train_w, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

#Changing to misclassification error
MSE = [1 - x for x in cv_scores]
#Determining best No of base Learners.
optimal_b = baseLearners[MSE.index(min(MSE))]
print('\nThe optimal value of b is %d.' % optimal_b)

print("The misclassification error for each 'b' value is : ", np.round(MSE,4))


The optimal value of b is 27.
The misclassification error for each 'b' value is :  [0.2333 0.302  0.1806 0.2091 0.1684 0.183  0.1579 0.1676 0.1567 0.1623
 0.148  0.1539 0.1476 0.1509 0.1469 0.1497 0.1477 0.1481 0.146  0.1444
 0.1437 0.1429 0.1446 0.1454 0.1454 0.1476 0.1421 0.1434 0.1449]


In [14]:
#Create a RandomForestClassifier
model = RandomForestClassifier(n_estimators=optimal_b,n_jobs=-1)
#Train the model using the training sets 
model.fit(train_avgw2v_normalize, y_train_w)
#Predict Output 
predicted= model.predict(test_avgw2v_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test_w, predicted) * 100
print('\nThe accuracy of the RandomForest Classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test_w, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test_w, predicted)
print(conf_matrix)


The accuracy of the RandomForest Classifier is 80.800000%
(array([0.20183486, 0.83085438]), array([0.04305284, 0.9650462 ]), array([0.07096774, 0.8929368 ]), array([ 511, 2489], dtype=int64))
[[  22  489]
 [  87 2402]]


### Featurization: Tfidf Weighted Word2Vec

In [15]:
#Creating No of BaseLearners List.
baseLearners = list(range(1,30))
#Empty list that will hold cv scores
cv_scores = []
#Perform 10-fold cross validation
for b in baseLearners:
    RF = RandomForestClassifier(n_estimators=b,n_jobs=-1)
    scores = cross_val_score(RF, train_tfidfw2v_normalize, y_train_w, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
#Changing to misclassification error
MSE = [1 - x for x in cv_scores]
#Determining best No of base Learners.
optimal_b = baseLearners[MSE.index(min(MSE))]
print('\nThe optimal value of b is %d.' % optimal_b)

print("The misclassification error for each 'b' value is : ", np.round(MSE,4))


The optimal value of b is 29.
The misclassification error for each 'b' value is :  [0.2321 0.2996 0.1839 0.2103 0.1617 0.1837 0.1607 0.1646 0.1521 0.1586
 0.1486 0.1556 0.1504 0.1501 0.1491 0.1481 0.1446 0.1473 0.1451 0.1451
 0.145  0.1461 0.1479 0.1439 0.1437 0.1426 0.143  0.1441 0.1407]


In [16]:
#Create a RandomForestClassifier
model = RandomForestClassifier(n_estimators=optimal_b,n_jobs=-1)
#Train the model using the training sets 
model.fit(train_tfidfw2v_normalize, y_train_w)
#Predict Output 
predicted= model.predict(test_tfidfw2v_normalize)
#Evaluate accuracy based on y_test and predicted.
acc = accuracy_score(y_test_w, predicted) * 100
print('\nThe accuracy of the RandomForest Classifier is %f%%' % (acc))
pre = precision_recall_fscore_support(y_test_w, predicted)
print(pre)
conf_matrix = confusion_matrix(y_test_w, predicted)
print(conf_matrix)


The accuracy of the RandomForest Classifier is 80.500000%
(array([0.19672131, 0.83078527]), array([0.04696673, 0.96062676]), array([0.07582938, 0.89100056]), array([ 511, 2489], dtype=int64))
[[  24  487]
 [  98 2391]]


#### Results:

* BoW---------->BaseLearners=12 ---------->Acc=85.73%
* Tfidf---------->BaseLearners=16 ----------->Acc=85.30%
* Avg Word2Vec--------->BaseLearners=27---------->Acc=80.80%
* Tfidf Word2Vec---------->BaseLearners=29---------->Acc=80.5%
* Bag of words featurization gives highest Accuracy with 12 baseLearners.