In [3]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [4]:
from ipynb.fs.full.TrainingFunction import Training_Repeat
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [5]:
# Name of the label column used by every model cell below.
target = "Revenue"

# Feature columns that are stored as pandas categoricals.
categorical_features = [
    'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor',
    'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
    'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
]

# Read the preprocessed CSV (its first column is used as the row index) and
# cast the listed feature columns plus the label to the 'category' dtype in
# one astype call.
category_dtypes = {column: 'category' for column in categorical_features + [target]}
preprocessed_df = pd.read_csv(
    "../data/preprocessedOnlineShoppingData.csv", index_col=0
).astype(category_dtypes)

# Show how many rows fall into each class of the label.
revenue_counts = preprocessed_df[target].value_counts()
print('Revenue distribution:\n' + str(revenue_counts))

Revenue distribution:
0    10422
1     1908
Name: Revenue, dtype: int64


In [6]:
# Balancing (random oversampling of class 1).
#
# Rows of class 1 are drawn with replacement until there are as many of them
# as rows of class 0, then both groups are stacked into `oversampled_df`.
#
# NOTE(review): if Training_Repeat cross-validates on `oversampled_df`, copies
# of the same class-1 row can land in both the training and the test fold,
# which would inflate the reported accuracy. Confirm how Training_Repeat
# splits the data and consider resampling inside each training fold instead.

# Select each class explicitly by label. Counts are taken from these frames
# rather than unpacked from value_counts(), whose order follows frequency and
# not the label value.
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Oversample class 1 up to the size of class 0. A fixed random_state makes the
# balanced frame identical on every Restart & Run All.
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=42)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)

# Confirm that both classes now have the same number of rows.
print('Random over-sampling:\n'+ str(oversampled_df['Revenue'].value_counts()))

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


# KNN

In [7]:
# k-nearest-neighbours classifier with fixed hyperparameters.
knn_model = KNeighborsClassifier(
    weights='uniform',
    metric='manhattan',
    algorithm='brute',
    n_neighbors=13,
)

# Run Training_Repeat on a copy of oversampled_df, then pass the returned
# statistics to SavePredictionsToFile together with the output path.
knn_stats = Training_Repeat(oversampled_df.copy(), target, knn_model)
SavePredictionsToFile("../accuracy/OnlineShoppingKNN.csv", knn_stats)

__________________________________________________________________________________Iteration:0
Accuracy: 0.853 (0.008)
__________________________________________________________________________________Iteration:1
Accuracy: 0.850 (0.009)
__________________________________________________________________________________Iteration:2
Accuracy: 0.850 (0.006)
__________________________________________________________________________________Iteration:3
Accuracy: 0.848 (0.009)
__________________________________________________________________________________Iteration:4
Accuracy: 0.851 (0.008)
__________________________________________________________________________________Iteration:5
Accuracy: 0.850 (0.008)
__________________________________________________________________________________Iteration:6
Accuracy: 0.852 (0.010)
__________________________________________________________________________________Iteration:7
Accuracy: 0.853 (0.007)
________________________________________________________

# Random Forest

In [8]:
# Random forest classifier with fixed hyperparameters.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features=8,
    min_samples_split=8,
    min_samples_leaf=5,
    bootstrap=True,
)

# Run Training_Repeat on a copy of oversampled_df, then pass the returned
# statistics to SavePredictionsToFile together with the output path.
rf_stats = Training_Repeat(oversampled_df.copy(), target, rf_model)
SavePredictionsToFile("../accuracy/OnlineShoppingRF.csv", rf_stats)

__________________________________________________________________________________Iteration:0
Accuracy: 0.884 (0.006)
__________________________________________________________________________________Iteration:1
Accuracy: 0.884 (0.009)
__________________________________________________________________________________Iteration:2
Accuracy: 0.883 (0.004)
__________________________________________________________________________________Iteration:3
Accuracy: 0.883 (0.006)
__________________________________________________________________________________Iteration:4
Accuracy: 0.884 (0.007)
__________________________________________________________________________________Iteration:5
Accuracy: 0.883 (0.010)
__________________________________________________________________________________Iteration:6
Accuracy: 0.884 (0.008)
__________________________________________________________________________________Iteration:7
Accuracy: 0.884 (0.008)
________________________________________________________

# Decision Tree

In [9]:
# Decision tree classifier with fixed hyperparameters.
dt_model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=3,
    criterion='gini',
)

# Run Training_Repeat on a copy of oversampled_df, then pass the returned
# statistics to SavePredictionsToFile together with the output path.
dt_stats = Training_Repeat(oversampled_df.copy(), target, dt_model)
SavePredictionsToFile("../accuracy/OnlineShoppingDT.csv", dt_stats)

__________________________________________________________________________________Iteration:0
Accuracy: 0.852 (0.010)
__________________________________________________________________________________Iteration:1
Accuracy: 0.852 (0.009)
__________________________________________________________________________________Iteration:2
Accuracy: 0.854 (0.011)
__________________________________________________________________________________Iteration:3
Accuracy: 0.853 (0.007)
__________________________________________________________________________________Iteration:4
Accuracy: 0.853 (0.007)
__________________________________________________________________________________Iteration:5
Accuracy: 0.853 (0.008)
__________________________________________________________________________________Iteration:6
Accuracy: 0.853 (0.006)
__________________________________________________________________________________Iteration:7
Accuracy: 0.854 (0.009)
________________________________________________________

# Support Vector Machine

In [10]:
# Support vector classifier with a linear kernel; all other settings are the
# SVC defaults.
svm_model = SVC(kernel='linear')

# Run Training_Repeat on a copy of oversampled_df, then pass the returned
# statistics to SavePredictionsToFile together with the output path.
svm_stats = Training_Repeat(
    oversampled_df.copy(),
    target,
    svm_model,
)
SavePredictionsToFile("../accuracy/OnlineShoppingSVM.csv", svm_stats)

__________________________________________________________________________________Iteration:0
Accuracy: 0.822 (0.006)
__________________________________________________________________________________Iteration:1
Accuracy: 0.822 (0.008)
__________________________________________________________________________________Iteration:2
Accuracy: 0.822 (0.010)
__________________________________________________________________________________Iteration:3
Accuracy: 0.821 (0.006)
__________________________________________________________________________________Iteration:4
Accuracy: 0.821 (0.005)
__________________________________________________________________________________Iteration:5
Accuracy: 0.822 (0.012)
__________________________________________________________________________________Iteration:6
Accuracy: 0.821 (0.007)
__________________________________________________________________________________Iteration:7
Accuracy: 0.822 (0.008)
________________________________________________________

# Multi-Layer Perceptron

In [11]:
# Hyperparameter tuning for the MLP.
# Candidate values per MLPClassifier parameter; the whole mapping is handed to
# ParameterTuning together with an untrained MLPClassifier.
grid_params = dict(
    hidden_layer_sizes=[(10, 30, 10), (20,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'adaptive'],
)

ParameterTuning(oversampled_df.copy(), target, MLPClassifier(), grid_params)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:  5.1min finished


{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 30, 10), 'learning_rate': 'adaptive', 'solver': 'adam'}


In [13]:
# Multi-layer perceptron configured with the parameter set reported by the
# tuning cell above.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10, 30, 10),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate='adaptive',
)

# Run Training_Repeat on a copy of oversampled_df, then pass the returned
# statistics to SavePredictionsToFile together with the output path.
mlp_stats = Training_Repeat(oversampled_df.copy(), target, mlp_model)
SavePredictionsToFile("../accuracy/OnlineShoppingMLP.csv", mlp_stats)

__________________________________________________________________________________Iteration:0
Accuracy: 0.869 (0.006)
__________________________________________________________________________________Iteration:1
Accuracy: 0.871 (0.009)
__________________________________________________________________________________Iteration:2
Accuracy: 0.871 (0.007)
__________________________________________________________________________________Iteration:3
Accuracy: 0.867 (0.006)
__________________________________________________________________________________Iteration:4
Accuracy: 0.867 (0.009)
__________________________________________________________________________________Iteration:5
Accuracy: 0.867 (0.008)
__________________________________________________________________________________Iteration:6
Accuracy: 0.867 (0.010)
__________________________________________________________________________________Iteration:7
Accuracy: 0.865 (0.010)
________________________________________________________

# Gradient Boosting Ensemble

In [14]:
# Hyperparameter tuning for gradient boosting.
# Candidate values per GradientBoostingClassifier parameter; the whole mapping
# is handed to ParameterTuning together with an untrained classifier.
grid_params = dict(
    n_estimators=[5, 50, 250, 500],
    max_depth=[1, 3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 1, 10, 100],
)

ParameterTuning(oversampled_df.copy(), target, GradientBoostingClassifier(), grid_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.4min finished


{'learning_rate': 1, 'max_depth': 9, 'n_estimators': 500}


In [15]:
# Gradient boosting classifier configured with the parameter set reported by
# the tuning cell above.
gde_model = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=1,
)

# Run Training_Repeat on a copy of oversampled_df, then pass the returned
# statistics to SavePredictionsToFile together with the output path.
gde_stats = Training_Repeat(oversampled_df.copy(), target, gde_model)
SavePredictionsToFile("../accuracy/OnlineShoppingGDE.csv", gde_stats)

__________________________________________________________________________________Iteration:0
Accuracy: 0.962 (0.004)
__________________________________________________________________________________Iteration:1
Accuracy: 0.966 (0.006)
__________________________________________________________________________________Iteration:2
Accuracy: 0.965 (0.003)
__________________________________________________________________________________Iteration:3
Accuracy: 0.963 (0.004)
__________________________________________________________________________________Iteration:4
Accuracy: 0.965 (0.005)
__________________________________________________________________________________Iteration:5
Accuracy: 0.963 (0.005)
__________________________________________________________________________________Iteration:6
Accuracy: 0.963 (0.004)
__________________________________________________________________________________Iteration:7
Accuracy: 0.964 (0.004)
________________________________________________________

# Overall Results

In [19]:
# To copy-paste into the report.
#
# Builds the body of a LaTeX table: one row per index 0..9 of the *_stats
# sequences, one column per classifier, each value printed with five decimals.

# Column order of the table, left to right.
stats_per_model = (knn_stats, svm_stats, dt_stats, rf_stats, mlp_stats, gde_stats)

rows = []
# 10 rows are emitted; presumably this equals the number of repetitions that
# Training_Repeat performs -- confirm against its definition.
for i in range(10):
    cells = ' & '.join('%.5f' % (stats[i],) for stats in stats_per_model)
    # '\\\\' yields the two backslashes of a LaTeX row break. The previous
    # literal relied on the invalid escape '\ ', which Python warns about.
    rows.append(str(i + 1) + ' & ' + cells + ' \\\\ \\hline \n')

outputString = ''.join(rows)

print(outputString)

1 & 0.85276 & 0.82172 & 0.85195 & 0.88395 & 0.86884 & 0.96167 \\ \hline 
2 & 0.85037 & 0.82163 & 0.85219 & 0.88390 & 0.87109 & 0.96594 \\ \hline 
3 & 0.85032 & 0.82153 & 0.85372 & 0.88347 & 0.87071 & 0.96507 \\ \hline 
4 & 0.84845 & 0.82144 & 0.85329 & 0.88265 & 0.86716 & 0.96287 \\ \hline 
5 & 0.85113 & 0.82124 & 0.85319 & 0.88352 & 0.86716 & 0.96455 \\ \hline 
6 & 0.84974 & 0.82158 & 0.85257 & 0.88308 & 0.86720 & 0.96296 \\ \hline 
7 & 0.85204 & 0.82148 & 0.85252 & 0.88352 & 0.86720 & 0.96335 \\ \hline 
8 & 0.85291 & 0.82163 & 0.85353 & 0.88395 & 0.86548 & 0.96445 \\ \hline 
9 & 0.85060 & 0.82206 & 0.85214 & 0.88241 & 0.86600 & 0.96291 \\ \hline 
10 & 0.85056 & 0.82115 & 0.85334 & 0.88351 & 0.86720 & 0.96493 \\ \hline 

