In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from ipynb.fs.full.TrainingFunction import Train_Models_CV10
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [4]:
preprocessed_df = pd.read_csv("../data/preprocessedOnlineShoppingData.csv", index_col=0)

categorical_features = ['VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']
target = "Revenue"

# Convert categorical feature types
preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category') 
preprocessed_df[target] = preprocessed_df[target].astype('category') 

print('Revenue distribution:\n'+ str(preprocessed_df['Revenue'].value_counts()))

Revenue distribution:
0    10422
1     1908
Name: Revenue, dtype: int64


In [5]:
# Balancing (oversampling)
count_class_0, count_class_1 = preprocessed_df['Revenue'].value_counts()
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]
# Oversample
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)
# Split back into X_train and y_train    
print('Random over-sampling:\n'+ str(oversampled_df['Revenue'].value_counts()))

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


# Models

Get Parameters from "OnlineShoppingIntention_ParameterTuning"

In [6]:
knn_model = KNeighborsClassifier(n_neighbors=13, algorithm = 'brute', metric='manhattan', weights='uniform')
rf_model = RandomForestClassifier(bootstrap= True, max_depth=10, max_features= 8, min_samples_leaf= 5, min_samples_split = 8, n_estimators = 100)
dt_model = DecisionTreeClassifier(criterion='gini', max_depth= 4, min_samples_leaf= 3, min_samples_split = 2)
svm_model = SVC(kernel= 'linear')
mlp_model = MLPClassifier(activation='relu', alpha=0.0001, hidden_layer_sizes=(10, 30, 10), learning_rate='adaptive', solver='adam')
gde_model = GradientBoostingClassifier(learning_rate=1, max_depth=9, n_estimators=500)

models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

model_names = []

for mod in models:
    model_names.append(type(mod).__name__)

In [7]:
stats = Train_Models_CV10 (preprocessed_df, target, models)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10


In [8]:
print(stats)

{'KNeighborsClassifier': [0.8856447688564477, 0.8880778588807786, 0.8856447688564477, 0.8880778588807786, 0.9010543390105434, 0.8824006488240065, 0.8880778588807786, 0.8856447688564477, 0.884022708840227, 0.8905109489051095], 'SVC': [0.8815896188158961, 0.8905109489051095, 0.884022708840227, 0.878345498783455, 0.8921330089213301, 0.8807785888077859, 0.8791565287915653, 0.8807785888077859, 0.8880778588807786, 0.8953771289537713], 'DecisionTreeClassifier': [0.894566098945661, 0.9002433090024331, 0.894566098945661, 0.8929440389294404, 0.9156528791565288, 0.8937550689375506, 0.8929440389294404, 0.8994322789943228, 0.8913219789132197, 0.8978102189781022], 'RandomForestClassifier': [0.9075425790754258, 0.9075425790754258, 0.8953771289537713, 0.8978102189781022, 0.916463909164639, 0.8994322789943228, 0.8986212489862125, 0.9051094890510949, 0.8969991889699919, 0.9099756690997567], 'MLPClassifier': [0.9034874290348743, 0.9067315490673155, 0.8978102189781022, 0.8913219789132197, 0.92214111922141

In [13]:
# To copy-paste into the report

outputString = ""
print("      KNN       SVM       DT         RF        MLP       GDE")
for i in range (10):
    outputString = outputString+str(i+1)+' & %.5f & %.5f & %.5f & %.5f & %.5f & %.5f \\\ \\hline \n' % (stats[model_names[0]][i],stats[model_names[1]][i],stats[model_names[2]][i],stats[model_names[3]][i],stats[model_names[4]][i],stats[model_names[5]][i])

outputString = outputString+'\\hline avg'+' & %.5f & %.5f & %.5f & %.5f & %.5f & %.5f \\\ \\hline \n' % (mean(stats[model_names[0]]),mean(stats[model_names[1]]),mean(stats[model_names[2]]),mean(stats[model_names[3]]),mean(stats[model_names[4]]),mean(stats[model_names[5]]))
outputString = outputString+'std'+' & %.5f & %.5f & %.5f & %.5f & %.5f & %.5f \\\ \\hline \n' % (np.std(stats[model_names[0]]),np.std(stats[model_names[1]]),np.std(stats[model_names[2]]),np.std(stats[model_names[3]]),np.std(stats[model_names[4]]),np.std(stats[model_names[5]]))
    
print(outputString)

      KNN       SVM       DT         RF        MLP       GDE
1 & 0.88564 & 0.88159 & 0.89457 & 0.90754 & 0.90349 & 0.87835 \\ \hline 
2 & 0.88808 & 0.89051 & 0.90024 & 0.90754 & 0.90673 & 0.89457 \\ \hline 
3 & 0.88564 & 0.88402 & 0.89457 & 0.89538 & 0.89781 & 0.66910 \\ \hline 
4 & 0.88808 & 0.87835 & 0.89294 & 0.89781 & 0.89132 & 0.87186 \\ \hline 
5 & 0.90105 & 0.89213 & 0.91565 & 0.91646 & 0.92214 & 0.89700 \\ \hline 
6 & 0.88240 & 0.88078 & 0.89376 & 0.89943 & 0.89619 & 0.85726 \\ \hline 
7 & 0.88808 & 0.87916 & 0.89294 & 0.89862 & 0.88727 & 0.87997 \\ \hline 
8 & 0.88564 & 0.88078 & 0.89943 & 0.90511 & 0.90187 & 0.88159 \\ \hline 
9 & 0.88402 & 0.88808 & 0.89132 & 0.89700 & 0.89457 & 0.87997 \\ \hline 
10 & 0.89051 & 0.89538 & 0.89781 & 0.90998 & 0.90430 & 0.84347 \\ \hline 
avg & 0.88792 & 0.88508 & 0.89732 & 0.90349 & 0.90057 & 0.85531 \\ \hline 
std & 0.00490 & 0.00570 & 0.00672 & 0.00653 & 0.00923 & 0.06388 \\ \hline 



In [10]:
SavePredictionsToFile("../accuracy/accuracyOnlineShopping.csv", stats)