### Gradient Boosting DT on Amazon Food Reviews

DataSet Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

In [1]:
#Importing Python Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import cross_val_score
from sklearn import cross_validation
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')



In [2]:
#Loding Bow data:
import pickle
with open('train_bow.pickle','rb') as handle:
    train_bow = pickle.load(handle)
with open('test_bow.pickle','rb') as handle:
    test_bow = pickle.load(handle)

In [3]:
#Loding Tfidf data:
import pickle
with open('train_tfidf.pickle','rb') as handle:
    train_tfidf = pickle.load(handle)
with open('test_tfidf.pickle','rb') as handle:
    test_tfidf = pickle.load(handle)

In [4]:
#Loding Avg Word2Vec data:
import pickle
with open('train_avg_word2vec.pickle','rb') as handle:
    train_avg_word2vec = pickle.load(handle)
with open('test_avg_word2vec.pickle','rb') as handle:
    test_avg_word2vec = pickle.load(handle)

In [5]:
#Loding Avg Word2Vec data:
import pickle
with open('train_tfidf_word2vec.pickle','rb') as handle:
    train_tfidf_word2vec = pickle.load(handle)
with open('test_tfidf_word2vec.pickle','rb') as handle:
    test_tfidf_word2vec = pickle.load(handle)

In [6]:
with open('y_train.pickle','rb') as handle:
    y_train = pickle.load(handle)
with open('y_test.pickle','rb') as handle:
    y_test = pickle.load(handle)

In [7]:
with open('y_train_w.pickle','rb') as handle:
    y_train_w = pickle.load(handle)
with open('y_test_w.pickle','rb') as handle:
    y_test_w = pickle.load(handle)

In [8]:
#Feature Normalization
from sklearn.preprocessing import normalize
#Bow features
train_bow_normalize = normalize(train_bow, axis=0)
test_bow_normalize = normalize(test_bow, axis=0)
#Tfidf Features
train_tfidf_normalize = normalize(train_tfidf, axis=0)
test_tfidf_normalize = normalize(test_tfidf, axis=0)
#Avg word2Vec Features
train_avgw2v_normalize = normalize(train_avg_word2vec, axis=0)
test_avgw2v_normalize = normalize(test_avg_word2vec, axis=0)
#Tfidf Weighted Word2Vec
train_tfidfw2v_normalize = normalize(train_tfidf_word2vec, axis=0)
test_tfidfw2v_normalize = normalize(test_tfidf_word2vec, axis=0)

### Featurization: Bag of Words

In [9]:
#GridSearch to find No of BaseLearners, depth and LearningRate.
from sklearn.grid_search import GridSearchCV

#n_estimators = [100, 200, 300, 400, 500]
#learning_rate = [0.0001, 0.001, 0.01, 0.1]
#param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
param_grid = {'n_estimators':[100, 300, 500, 800], 'max_depth':[3, 5, 10]}
model = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy')
model.fit(train_bow_normalize, y_train)
print(model.best_estimator_)
print(model.score(test_bow_normalize,y_test))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.8706666666666667


### Featurization: Tfidf

In [10]:
#GridSearch to find No of BaseLearners, depth and LearningRate.
from sklearn.grid_search import GridSearchCV
#n_estimators = [100, 200, 300, 400, 500]
#learning_rate = [0.0001, 0.001, 0.01, 0.1]
#param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
param_grid = {'n_estimators':[100, 300, 500, 800], 'max_depth':[3, 5, 10]}
model = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy')
model.fit(train_tfidf_normalize, y_train)
print(model.best_estimator_)
print(model.score(test_tfidf_normalize,y_test))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=300,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.8703333333333333


### Featurization: Avg Word2Vec

In [13]:
#GridSearch to find No of BaseLearners, depth and LearningRate.
from sklearn.grid_search import GridSearchCV
#n_estimators = [100, 200, 300, 400, 500]
#learning_rate = [0.0001, 0.001, 0.01, 0.1]
#param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
param_grid = {'n_estimators':[100, 300, 500, 800], 'max_depth':[3, 5, 10]}
model = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy')
model.fit(train_avgw2v_normalize, y_train_w)
print(model.best_estimator_)
print(model.score(test_avgw2v_normalize,y_test_w))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=800,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.795


### Featurization: Tfidf Weighted Word2Vec

In [12]:
#GridSearch to find No of BaseLearners, depth and LearningRate.
from sklearn.grid_search import GridSearchCV
#n_estimators = [100, 200, 300, 400, 500]
#learning_rate = [0.0001, 0.001, 0.01, 0.1]
#param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
param_grid = {'n_estimators':[100, 300, 500, 800], 'max_depth':[3, 5, 10]}
model = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring='accuracy')
model.fit(train_tfidfw2v_normalize, y_train_w)
print(model.best_estimator_)
print(model.score(test_tfidfw2v_normalize, y_test_w))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
0.823


#### Observation:

* We are getting better performance with bag of words featurization with n_estimators=300 and max_depth=3.