In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [2]:
# Silence every warning for the rest of the session: no category or module
# filter is passed, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides any library warnings raised while the models
# below are trained — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [3]:
from ipynb.fs.full.TrainingFunction import Train_Models_CV10
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [4]:
# Read the preprocessed data set; the first CSV column is used as the index.
preprocessed_df = pd.read_csv("../data/preprocessedOnlineShoppingData.csv", index_col=0)

# Columns that are stored with the pandas `category` dtype.
categorical_features = [
    'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor',
    'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
    'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
]
target = "Revenue"

# Cast the listed feature columns and the label column to `category`.
for feature in categorical_features + [target]:
    preprocessed_df[feature] = preprocessed_df[feature].astype('category')

# Show how many rows fall into each Revenue class.
class_counts = preprocessed_df['Revenue'].value_counts()
print('Revenue distribution:', class_counts, sep='\n')

Revenue distribution:
0    10422
1     1908
Name: Revenue, dtype: int64


In [5]:
# Balancing (random over-sampling of class 1 up to the size of class 0)
# NOTE(review): `oversampled_df` is built here, but the training call further
# down passes `preprocessed_df` — confirm which frame is meant to be used.
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]

# Count rows per class directly instead of unpacking value_counts(), whose
# order depends on which class happens to be more frequent.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)

# Draw class-1 rows with replacement until both classes have the same size.
# The fixed random_state makes the resampled frame reproducible across runs.
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=42)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:\n'+ str(oversampled_df['Revenue'].value_counts()))

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


# Models

The hyperparameters used below come from "OnlineShoppingIntention_ParameterTuning".

In [6]:
# One estimator per algorithm under comparison, each built with fixed
# hyperparameters (see "OnlineShoppingIntention_ParameterTuning").
knn_model = KNeighborsClassifier(
    n_neighbors=13,
    algorithm='brute',
    metric='manhattan',
    weights='uniform',
)
rf_model = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    max_features=8,
    min_samples_leaf=5,
    min_samples_split=8,
    n_estimators=100,
)
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=3,
    min_samples_split=2,
)
svm_model = SVC(kernel='linear')
mlp_model = MLPClassifier(
    activation='relu',
    alpha=0.0001,
    hidden_layer_sizes=(10, 30, 10),
    learning_rate='adaptive',
    solver='adam',
)
gde_model = GradientBoostingClassifier(
    learning_rate=1,
    max_depth=9,
    n_estimators=500,
)

# Evaluation order; the result tables further down rely on this ordering.
models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

# Class name of every estimator, in the same order as `models`.
model_names = [type(estimator).__name__ for estimator in models]

In [7]:
# Evaluate every estimator in `models` with the helper imported from the
# TrainingFunction notebook. Judging by the recorded output, `stats` maps each
# estimator's class name to an array of 10 accuracy scores — TODO confirm
# against the helper's implementation.
# NOTE(review): the frame passed here is `preprocessed_df`, not the
# over-sampled frame built earlier.
stats = Train_Models_CV10 (preprocessed_df, target, models)

KNeighborsClassifier Accuracy: 0.887 (0.004)
SVC Accuracy: 0.885 (0.007)
DecisionTreeClassifier Accuracy: 0.897 (0.008)
RandomForestClassifier Accuracy: 0.903 (0.008)
MLPClassifier Accuracy: 0.899 (0.008)
GradientBoostingClassifier Accuracy: 0.842 (0.081)


In [8]:
# Dump the raw score arrays collected for each model.
print(stats)

{'KNeighborsClassifier': array([0.88240065, 0.88240065, 0.88888889, 0.88726683, 0.88888889,
       0.88483374, 0.88402271, 0.89051095, 0.89213301, 0.89213301]), 'SVC': array([0.88402271, 0.88321168, 0.87591241, 0.88402271, 0.88240065,
       0.89618816, 0.88888889, 0.87266829, 0.88969992, 0.88807786]), 'DecisionTreeClassifier': array([0.89699919, 0.89213301, 0.88402271, 0.89051095, 0.89699919,
       0.88969992, 0.89862125, 0.90754258, 0.90916464, 0.90105434]), 'RandomForestClassifier': array([0.91240876, 0.90024331, 0.90186537, 0.90348743, 0.89213301,
       0.90592052, 0.89375507, 0.91646391, 0.89618816, 0.90754258]), 'MLPClassifier': array([0.9026764 , 0.88158962, 0.90754258, 0.8945661 , 0.90997567,
       0.89618816, 0.89618816, 0.90186537, 0.89781022, 0.90510949]), 'GradientBoostingClassifier': array([0.8215734 , 0.85320357, 0.61719384, 0.88483374, 0.87185726,
       0.88726683, 0.86618005, 0.87672344, 0.86536902, 0.87753447])}


# Overall Results

In [9]:
# To copy-paste into the report
#
# Builds one LaTeX table row per fold: "<fold> & <score> & ... \\ \hline".
# The column order follows `model_names`; the header printed below is a plain
# text reminder of that order and is not part of the LaTeX output.
n_folds = len(stats[model_names[0]])

table_rows = []
for fold in range(n_folds):
    # Five-decimal score of every model for this fold, joined as table cells.
    cells = ' & '.join('%.5f' % stats[name][fold] for name in model_names)
    # Backslashes are fully escaped so the literal contains no invalid
    # escape sequence; the emitted text is "\\ \hline".
    table_rows.append('%d & %s \\\\ \\hline \n' % (fold + 1, cells))

outputString = ''.join(table_rows)

print("      KNN       SVM       DT         RF        MLP       GDE")
print(outputString)

      KNN       SVM       DT         RF        MLP       GDE
1 & 0.88240 & 0.88402 & 0.89700 & 0.91241 & 0.90268 & 0.82157 \\ \hline 
2 & 0.88240 & 0.88321 & 0.89213 & 0.90024 & 0.88159 & 0.85320 \\ \hline 
3 & 0.88889 & 0.87591 & 0.88402 & 0.90187 & 0.90754 & 0.61719 \\ \hline 
4 & 0.88727 & 0.88402 & 0.89051 & 0.90349 & 0.89457 & 0.88483 \\ \hline 
5 & 0.88889 & 0.88240 & 0.89700 & 0.89213 & 0.90998 & 0.87186 \\ \hline 
6 & 0.88483 & 0.89619 & 0.88970 & 0.90592 & 0.89619 & 0.88727 \\ \hline 
7 & 0.88402 & 0.88889 & 0.89862 & 0.89376 & 0.89619 & 0.86618 \\ \hline 
8 & 0.89051 & 0.87267 & 0.90754 & 0.91646 & 0.90187 & 0.87672 \\ \hline 
9 & 0.89213 & 0.88970 & 0.90916 & 0.89619 & 0.89781 & 0.86537 \\ \hline 
10 & 0.89213 & 0.88808 & 0.90105 & 0.90754 & 0.90511 & 0.87753 \\ \hline 



In [11]:
# Hand the collected scores to the helper imported from the TrainingFunction
# notebook together with a CSV path; presumably it writes `stats` to that
# file — TODO confirm against the helper's implementation.
SavePredictionsToFile("../accuracy/accuracyOnlineShopping.csv", stats)

# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return a list holding element ``i`` of every row in ``matrix``.

    ``matrix`` is any iterable of indexable rows; the rows are visited in
    iteration order and an empty ``matrix`` yields an empty list.
    """
    extracted = []
    for current_row in matrix:
        extracted.append(current_row[i])
    return extracted

In [None]:
# Friedman test on the accuracy scores held in `stats`.
#
# The scores are arranged as a table with one row per fold and one column per
# classifier (columns ordered as in `model_names`):
#   N = number of folds (rows)
#   K = number of classifiers being compared (columns)
# Within every fold the classifiers are ranked 1..K (1 = highest accuracy), so
# the mean rank is (K + 1) / 2.
score_table = pd.DataFrame({name: stats[name] for name in model_names})

N = score_table.shape[0]
K = score_table.shape[1]

print("__________________________________________________________________________________________Ranks:")
# Rank across each row; tied scores share the average of the ranks they span.
rank_table = score_table.rank(axis=1, method='average', ascending=False)
ranks = rank_table.values.tolist()

for fold_ranks in ranks:
    print(fold_ranks)


print("__________________________________________________________________________________________R_bar:")
R_bar = (K + 1) / 2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Average rank of each classifier over all folds, in `model_names` order.
avg_ranks = rank_table.mean(axis=0).tolist()
# The six named averages are kept for any later cell that reads them; this
# unpacking requires exactly six classifiers.
avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde = avg_ranks

print('Average Ranks: %.5f | %.5f | %.5f | %.5f | %.5f | %.5f \n' % (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

# Spread of the per-classifier average ranks around the mean rank.
eq_2 = N * sum((avg_rank - R_bar) ** 2 for avg_rank in avg_ranks)

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

# Spread of every individual rank around the mean rank.
eq_3 = sum((entry - R_bar) ** 2 for fold_ranks in ranks for entry in fold_ranks)
eq_3 = eq_3 / (N * (K - 1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

# eq_3 is zero only when every rank equals the mean rank (all classifiers tied
# in every fold); report a statistic of 0 instead of dividing by zero.
F_stat = eq_2 / eq_3 if eq_3 else 0.0

print(F_stat)

print("__________________________________________________________________________________________Decision:")

# NOTE(review): critical value copied from the class slides — confirm it
# matches K classifiers, N folds and the intended significance level.
F_0 = 10.800          # From class slides.

if abs(F_stat) < F_0:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")
else:
    print("Reject the null (ie. they are significantly different).")