In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [2]:
import warnings

# Silence every warning category for the rest of the session so the long
# training logs below are not interleaved with warning text.
warnings.filterwarnings(action='ignore')

In [3]:
from ipynb.fs.full.TrainingFunction import Training_All_2
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [4]:
# Load the preprocessed data set; the first CSV column is used as the index.
preprocessed_df = pd.read_csv("../data/preprocessedOnlineShoppingData.csv", index_col=0)

target = "Revenue"
categorical_features = [
    'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor',
    'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
    'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
]

# Store the listed feature columns and the target with the pandas 'category' dtype.
preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category')
preprocessed_df[target] = preprocessed_df[target].astype('category')

# Report how many rows fall into each class of the target.
revenue_counts = preprocessed_df[target].value_counts()
print('Revenue distribution:\n' + str(revenue_counts))

Revenue distribution:
0    10422
1     1908
Name: Revenue, dtype: int64


In [5]:
# Balancing (oversampling)
RANDOM_SEED = 42  # fixed seed so the resampled frame is identical on every run

# Select each class by its label. Counting the selected rows (instead of
# unpacking value_counts(), which is ordered by frequency) keeps the counts
# attached to the right class whichever one is larger.
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Oversample: draw class-1 rows with replacement until there are as many of
# them as there are class-0 rows, then stack both classes into one frame.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=RANDOM_SEED)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)

# NOTE(review): the training cell in this notebook passes `preprocessed_df`,
# not `oversampled_df`, to Training_All_2 -- confirm whether the balanced
# frame is meant to be used.
print('Random over-sampling:\n'+ str(oversampled_df['Revenue'].value_counts()))

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


# Models

The hyperparameters used below come from "OnlineShoppingIntention_ParameterTuning".

In [6]:
# One estimator per algorithm under comparison, each built with fixed hyperparameters.
knn_model = KNeighborsClassifier(
    n_neighbors=13,
    algorithm='brute',
    metric='manhattan',
    weights='uniform',
)
rf_model = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    max_features=8,
    min_samples_leaf=5,
    min_samples_split=8,
    n_estimators=100,
)
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=3,
    min_samples_split=2,
)
svm_model = SVC(kernel='linear')
mlp_model = MLPClassifier(
    activation='relu',
    alpha=0.0001,
    hidden_layer_sizes=(10, 30, 10),
    learning_rate='adaptive',
    solver='adam',
)
gde_model = GradientBoostingClassifier(
    learning_rate=1,
    max_depth=9,
    n_estimators=500,
)

# The order of this list fixes the row order of the results: KNN, SVM, DT, RF, MLP, GDE.
models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

In [7]:
# Train and score every estimator in `models` on the preprocessed frame.
# NOTE(review): the two trailing 10s presumably set the number of folds and the
# number of repeated runs per fold (the log shows 10 of each) -- confirm which
# argument is which against Training_All_2.
stats = Training_All_2(
    preprocessed_df,
    target,
    models,
    10,
    10,
)

___________________________________________________________________________ Fold 1
____________________________ Model KNeighborsClassifier
1: 0.8872668288726683
2: 0.8872668288726683
3: 0.8872668288726683
4: 0.8872668288726683
5: 0.8872668288726683
6: 0.8872668288726683
7: 0.8872668288726683
8: 0.8872668288726683
9: 0.8872668288726683
10: 0.8872668288726683

 Average Accuracy: 0.8872668288726683

____________________________ Model SVC
1: 0.8872668288726683
2: 0.8872668288726683
3: 0.8872668288726683
4: 0.8872668288726683
5: 0.8872668288726683
6: 0.8872668288726683
7: 0.8872668288726683
8: 0.8872668288726683
9: 0.8872668288726683
10: 0.8872668288726683

 Average Accuracy: 0.8872668288726683

____________________________ Model DecisionTreeClassifier
1: 0.9115977291159773
2: 0.9115977291159773
3: 0.9115977291159773
4: 0.9115977291159773
5: 0.9115977291159773
6: 0.9115977291159773
7: 0.9115977291159773
8: 0.9115977291159773
9: 0.9115977291159773
10: 0.9115977291159773

 Average Accuracy: 0

10: 0.8856447688564477

 Average Accuracy: 0.8856447688564477

____________________________ Model SVC
1: 0.8832116788321168
2: 0.8832116788321168
3: 0.8832116788321168
4: 0.8832116788321168
5: 0.8832116788321168
6: 0.8832116788321168
7: 0.8832116788321168
8: 0.8832116788321168
9: 0.8832116788321168
10: 0.8832116788321168

 Average Accuracy: 0.8832116788321168

____________________________ Model DecisionTreeClassifier
1: 0.9002433090024331
2: 0.9002433090024331
3: 0.9002433090024331
4: 0.9002433090024331
5: 0.9002433090024331
6: 0.9002433090024331
7: 0.9002433090024331
8: 0.9002433090024331
9: 0.9002433090024331
10: 0.9002433090024331

 Average Accuracy: 0.9002433090024331

____________________________ Model RandomForestClassifier
1: 0.9002433090024331
2: 0.902676399026764
3: 0.9002433090024331
4: 0.8969991889699919
5: 0.8986212489862125
6: 0.9034874290348743
7: 0.8994322789943228
8: 0.9059205190592052
9: 0.9018653690186537
10: 0.902676399026764

 Average Accuracy: 0.9012165450121654

_

8: 0.9010543390105434
9: 0.9010543390105434
10: 0.9010543390105434

 Average Accuracy: 0.9010543390105434

____________________________ Model RandomForestClassifier
1: 0.9034874290348743
2: 0.9059205190592052
3: 0.9067315490673155
4: 0.9018653690186537
5: 0.9051094890510949
6: 0.9051094890510949
7: 0.9042984590429846
8: 0.9075425790754258
9: 0.9034874290348743
10: 0.902676399026764

 Average Accuracy: 0.9046228710462287

____________________________ Model MLPClassifier
1: 0.9051094890510949
2: 0.9051094890510949
3: 0.8994322789943228
4: 0.8961881589618816
5: 0.910786699107867
6: 0.9051094890510949
7: 0.8978102189781022
8: 0.8994322789943228
9: 0.9059205190592052
10: 0.902676399026764

 Average Accuracy: 0.902757502027575

____________________________ Model GradientBoostingClassifier
1: 0.8791565287915653
2: 0.6285482562854826
3: 0.8775344687753447
4: 0.8094079480940795
5: 0.8629359286293593
6: 0.8751013787510138
7: 0.8694241686942417
8: 0.8605028386050284
9: 0.8199513381995134
10: 0.88

In [8]:
# Show the accuracy matrix returned by Training_All_2; the table cell further
# down reads it as stats[model][fold].
stats

array([[0.88726683, 0.88888889, 0.89375507, 0.8864558 , 0.88564477,
        0.88807786, 0.88483374, 0.89051095, 0.89132198, 0.88240065],
       [0.88726683, 0.88483374, 0.89051095, 0.88240065, 0.88321168,
        0.8783455 , 0.88240065, 0.89862125, 0.88240065, 0.87672344],
       [0.91159773, 0.90592052, 0.89699919, 0.8945661 , 0.90024331,
        0.90916464, 0.88969992, 0.90024331, 0.90105434, 0.88807786],
       [0.90932685, 0.90884023, 0.90170316, 0.90592052, 0.90121655,
        0.9052717 , 0.89148418, 0.90762368, 0.90462287, 0.89635036],
       [0.90559611, 0.90608273, 0.9025953 , 0.90681265, 0.89854015,
        0.90243309, 0.89034874, 0.89910787, 0.9027575 , 0.89618816],
       [0.86399027, 0.87315491, 0.87112733, 0.88150852, 0.82895377,
        0.8837794 , 0.83284672, 0.87226277, 0.83682076, 0.85888078]])

# Overall Results

In [9]:
# To copy-paste into the report

# Builds one LaTeX table row per fold:
#   "<fold> & <acc> & ... & <acc> \\ \hline"
# The column order follows the rows of `stats` (KNN, SVM, DT, RF, MLP, GDE).
print("      KNN     SVM     DT     RF     MLP     GDE")

# Transpose so that each row holds one fold's accuracies across all models.
fold_scores = np.asarray(stats).T

# '\\\\' is an explicit LaTeX line break; the earlier '\\\ ' spelling relied on
# the invalid escape sequence '\ ', which newer Python versions warn about.
outputString = "".join(
    '%d & %s \\\\ \\hline \n' % (fold_no, ' & '.join('%.5f' % acc for acc in accuracies))
    for fold_no, accuracies in enumerate(fold_scores, start=1)
)

print(outputString)

      KNN     SVM     DT     RF     MLP     GDE
1 & 0.88727 & 0.88727 & 0.91160 & 0.90933 & 0.90560 & 0.86399 \\ \hline 
2 & 0.88889 & 0.88483 & 0.90592 & 0.90884 & 0.90608 & 0.87315 \\ \hline 
3 & 0.89376 & 0.89051 & 0.89700 & 0.90170 & 0.90260 & 0.87113 \\ \hline 
4 & 0.88646 & 0.88240 & 0.89457 & 0.90592 & 0.90681 & 0.88151 \\ \hline 
5 & 0.88564 & 0.88321 & 0.90024 & 0.90122 & 0.89854 & 0.82895 \\ \hline 
6 & 0.88808 & 0.87835 & 0.90916 & 0.90527 & 0.90243 & 0.88378 \\ \hline 
7 & 0.88483 & 0.88240 & 0.88970 & 0.89148 & 0.89035 & 0.83285 \\ \hline 
8 & 0.89051 & 0.89862 & 0.90024 & 0.90762 & 0.89911 & 0.87226 \\ \hline 
9 & 0.89132 & 0.88240 & 0.90105 & 0.90462 & 0.90276 & 0.83682 \\ \hline 
10 & 0.88240 & 0.87672 & 0.88808 & 0.89635 & 0.89619 & 0.85888 \\ \hline 



# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return a list holding element ``i`` of every row in ``matrix``."""
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

In [None]:
# Friedman test on the per-fold accuracies held in `stats`
# (one row per model in the order KNN, SVM, DT, RF, MLP, GDE; one column per fold).
scores = np.asarray(stats, dtype=float)

# K = number of algorithms being ranked, N = number of folds they are ranked on.
# Ranks run from 1 to K, so the mean rank below must be computed from K.
K, N = scores.shape

print("__________________________________________________________________________________________Ranks:")
# Rank the algorithms within each fold, rank 1 = highest accuracy. Algorithms
# with equal accuracy share the average of the ranks they span, so the ranks of
# every fold always add up to K*(K+1)/2.
ranks = pd.DataFrame(scores.T).rank(axis=1, ascending=False, method='average').values

for fold_ranks in ranks:
    print(fold_ranks.tolist())


print("__________________________________________________________________________________________R_bar:")
R_bar = (K + 1) / 2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Average rank of each algorithm over all folds (column order = model order).
# The unpacking expects exactly the six models listed above.
avg_ranks = ranks.mean(axis=0)
avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde = avg_ranks

print('Average Ranks: %.5f | %.5f | %.5f | %.5f | %.5f | %.5f \n' % (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

# Spread of the per-algorithm average ranks around the mean rank.
eq_2 = N * float(np.sum((avg_ranks - R_bar) ** 2))

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

# Spread of all individual ranks around the mean rank.
eq_3 = float(np.sum((ranks - R_bar) ** 2)) / (N * (K - 1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

F_stat = eq_2/eq_3

print(F_stat)

print("__________________________________________________________________________________________Decision:")

# NOTE(review): confirm this critical value is the one tabulated for
# K = 6 algorithms and N = 10 folds at the intended significance level.
F_0 = 10.800          # From class slides.

# F_stat is a ratio of sums of squares, so it is never negative.
if F_stat < F_0:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")
else:
    print("Reject the null (ie. they are significantly different).")