Note: Parameter tuning for KNN, SVM, DT, and RF was done in Assignment 1 for this dataset.

In [3]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from ipynb.fs.full.ParameterTuning import ParameterTuning

In [6]:
# Column roles: the indicator columns below (presumably one-hot encodings of
# visitor type and month -- inferred from their names) and the class label.
categorical_features = [
    'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor',
    'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
    'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
]
target = "Revenue"

# Load the preprocessed table; the first CSV column becomes the row index.
preprocessed_df = pd.read_csv("../data/preprocessedOnlineShoppingData.csv", index_col=0)

# Cast every indicator column, and then the label, to the pandas category dtype.
for column_name in categorical_features + [target]:
    preprocessed_df[column_name] = preprocessed_df[column_name].astype('category')

# Show how many rows belong to each Revenue class.
print('Revenue distribution:', preprocessed_df[target].value_counts(), sep='\n')

Revenue distribution:
0    10422
1     1908
Name: Revenue, dtype: int64


In [7]:
# Balancing: random oversampling of class 1 up to the size of class 0.
#
# NOTE(review): the whole frame is oversampled before it is handed to
# ParameterTuning. If ParameterTuning cross-validates on this frame, copies of
# the same class-1 row can end up in both the training and the validation fold
# and inflate the scores -- confirm, and consider resampling inside each
# training fold instead.
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]

# Count each class from its own subset instead of unpacking value_counts():
# value_counts() orders by frequency, so positional unpacking only matches the
# labels while class 0 happens to be the larger class.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)

# Draw class-1 rows with replacement until there are as many as class-0 rows.
# The fixed random_state makes the resampled frame identical on every run.
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=42)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)

# Confirm that both classes now have the same number of rows.
print('Random over-sampling:\n' + str(oversampled_df['Revenue'].value_counts()))

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


In [11]:
# MLP: hyper-parameter grid search on the oversampled frame.

# 2 x 2 x 2 x 2 x 2 = 32 candidate configurations.
grid_params = {
    'hidden_layer_sizes': [(10, 30, 10), (20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    # scikit-learn only uses learning_rate when solver='sgd'; with 'adam' the
    # two values below describe the same model.
    'learning_rate': ['constant', 'adaptive'],
}

# Seed the estimator so weight initialisation and batch shuffling are
# repeatable between runs. Any randomness inside ParameterTuning itself
# (e.g. how folds are drawn) is not controlled from here.
# A copy of the frame is passed, as before, so the helper cannot modify
# oversampled_df.
ParameterTuning(oversampled_df.copy(), target, MLPClassifier(random_state=42), grid_params)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  5.1min finished


{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'adaptive', 'solver': 'adam'}


In [14]:
# Gradient boosting: hyper-parameter grid search on the oversampled frame.

# 4 x 5 x 5 = 100 candidate configurations.
# NOTE(review): learning rates of 10 and 100 are far above the shrinkage values
# normally used for boosting; they are kept so the search space is unchanged,
# but they account for 40 of the 100 candidates -- consider dropping them.
grid_params = {
    "n_estimators": [5, 50, 250, 500],
    "max_depth": [1, 3, 5, 7, 9],
    "learning_rate": [0.01, 0.1, 1, 10, 100],
}

# Seed the estimator so its internal randomness is repeatable between runs.
# Any randomness inside ParameterTuning itself is not controlled from here.
# A copy of the frame is passed, as before, so the helper cannot modify
# oversampled_df.
ParameterTuning(oversampled_df.copy(), target, GradientBoostingClassifier(random_state=42), grid_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.4min finished


{'learning_rate': 1, 'max_depth': 9, 'n_estimators': 500}
