In [4]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [5]:
import warnings
# Install an 'ignore' filter with no category/module restriction, so every
# warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — confirm a blanket filter is intended.
warnings.filterwarnings('ignore')

In [6]:
from ipynb.fs.full.TrainingFunction import Training_All_2
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [7]:
# Read the preprocessed data set; the first CSV column becomes the index.
preprocessed_df = pd.read_csv("../data/preprocessedOnlineShoppingData.csv", index_col=0)

target = "Revenue"
categorical_features = [
    'VisitorType_New_Visitor', 'VisitorType_Other', 'VisitorType_Returning_Visitor',
    'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
    'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
]

# Cast the listed feature columns and the target to the pandas category dtype.
for column_name in categorical_features + [target]:
    preprocessed_df[column_name] = preprocessed_df[column_name].astype('category')

# Show how many rows fall into each Revenue class.
revenue_counts = preprocessed_df['Revenue'].value_counts()
print('Revenue distribution:\n' + str(revenue_counts))

Revenue distribution:
0    10422
1     1908
Name: Revenue, dtype: int64


In [8]:
# Balancing (oversampling)
# Random over-sampling: rows of class 1 are drawn with replacement until
# there are as many of them as there are rows of class 0.
# NOTE(review): the training call visible in this notebook is given
# `preprocessed_df`, so `oversampled_df` is not consumed there — confirm
# whether the balanced frame is meant to be used.
OVERSAMPLE_SEED = 42  # fixed seed so the resampled frame is identical on every run

df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]

# Count by label. Unpacking value_counts() positionally depends on which
# class happens to be more frequent, because its result is sorted by count.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# Oversample
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=OVERSAMPLE_SEED)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:\n'+ str(oversampled_df['Revenue'].value_counts()))

Random over-sampling:
1    10422
0    10422
Name: Revenue, dtype: int64


# Models

The hyperparameters used below were taken from "OnlineShoppingIntention_ParameterTuning".

In [9]:
# One estimator per algorithm, each built with fixed hyperparameters.
# Definitions appear in the same order as the `models` list at the bottom.
knn_model = KNeighborsClassifier(
    n_neighbors=13,
    algorithm='brute',
    metric='manhattan',
    weights='uniform',
)
svm_model = SVC(kernel='linear')
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=3,
    min_samples_split=2,
)
rf_model = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    max_features=8,
    min_samples_leaf=5,
    min_samples_split=8,
    n_estimators=100,
)
mlp_model = MLPClassifier(
    activation='relu',
    alpha=0.0001,
    hidden_layer_sizes=(10, 30, 10),
    learning_rate='adaptive',
    solver='adam',
)
gde_model = GradientBoostingClassifier(
    learning_rate=1,
    max_depth=9,
    n_estimators=500,
)

# Order in which the estimators are handed to the training routine.
models = [
    knn_model,
    svm_model,
    dt_model,
    rf_model,
    mlp_model,
    gde_model,
]

In [10]:
# Evaluate every estimator in `models` on the original (non-oversampled) frame
# and keep the returned accuracy matrix in `stats`.
# NOTE(review): the two trailing 10s are positional arguments of Training_All_2;
# presumably the number of folds and the number of runs per model per fold
# (the log below prints 10 runs per model) — confirm against TrainingFunction.
stats = Training_All_2 (preprocessed_df, target, models, 10, 10)

___________________________________________________________________________ Fold 1
____________________________ Model KNeighborsClassifier
1: 0.8872668288726683
2: 0.8872668288726683
3: 0.8872668288726683
4: 0.8872668288726683
5: 0.8872668288726683
6: 0.8872668288726683
7: 0.8872668288726683
8: 0.8872668288726683
9: 0.8872668288726683
10: 0.8872668288726683

 Average Accuracy: 0.8872668288726683

____________________________ Model SVC
1: 0.884022708840227
2: 0.884022708840227
3: 0.884022708840227
4: 0.884022708840227
5: 0.884022708840227
6: 0.884022708840227
7: 0.884022708840227
8: 0.884022708840227
9: 0.884022708840227
10: 0.884022708840227

 Average Accuracy: 0.884022708840227

____________________________ Model DecisionTreeClassifier
1: 0.8888888888888888
2: 0.8888888888888888
3: 0.8888888888888888
4: 0.8888888888888888
5: 0.8888888888888888
6: 0.8888888888888888
7: 0.8888888888888888
8: 0.8888888888888888
9: 0.8888888888888888
10: 0.8888888888888888

 Average Accuracy: 0.8888888888

10: 0.8791565287915653

 Average Accuracy: 0.8791565287915653

____________________________ Model SVC
1: 0.8726682887266829
2: 0.8726682887266829
3: 0.8726682887266829
4: 0.8726682887266829
5: 0.8726682887266829
6: 0.8726682887266829
7: 0.8726682887266829
8: 0.8726682887266829
9: 0.8726682887266829
10: 0.8726682887266829

 Average Accuracy: 0.8726682887266829

____________________________ Model DecisionTreeClassifier
1: 0.8872668288726683
2: 0.8872668288726683
3: 0.8872668288726683
4: 0.8872668288726683
5: 0.8872668288726683
6: 0.8872668288726683
7: 0.8872668288726683
8: 0.8872668288726683
9: 0.8872668288726683
10: 0.8872668288726683

 Average Accuracy: 0.8872668288726683

____________________________ Model RandomForestClassifier
1: 0.8978102189781022
2: 0.8937550689375506
3: 0.8937550689375506
4: 0.894566098945661
5: 0.894566098945661
6: 0.8969991889699919
7: 0.8953771289537713
8: 0.8994322789943228
9: 0.8969991889699919
10: 0.8953771289537713

 Average Accuracy: 0.8958637469586375

_

8: 0.9010543390105434
9: 0.9010543390105434
10: 0.9010543390105434

 Average Accuracy: 0.9010543390105434

____________________________ Model RandomForestClassifier
1: 0.9091646390916464
2: 0.9099756690997567
3: 0.9075425790754258
4: 0.9091646390916464
5: 0.9083536090835361
6: 0.9091646390916464
7: 0.9075425790754258
8: 0.9099756690997567
9: 0.9124087591240876
10: 0.910786699107867

 Average Accuracy: 0.9094079480940794

____________________________ Model MLPClassifier
1: 0.9051094890510949
2: 0.9059205190592052
3: 0.9083536090835361
4: 0.902676399026764
5: 0.9051094890510949
6: 0.9083536090835361
7: 0.8994322789943228
8: 0.9034874290348743
9: 0.9018653690186537
10: 0.9067315490673155

 Average Accuracy: 0.9047039740470397

____________________________ Model GradientBoostingClassifier
1: 0.8799675587996756
2: 0.8913219789132197
3: 0.8004866180048662
4: 0.8710462287104623
5: 0.8848337388483374
6: 0.884022708840227
7: 0.8751013787510138
8: 0.8272506082725061
9: 0.8864557988645579
10: 0.8

In [11]:
# Display the accuracy matrix: one row per model (in `models` order) and one
# column per fold, i.e. stats[m][i] is the average accuracy of model m on fold i+1.
stats

array([[0.88726683, 0.88564477, 0.89862125, 0.88321168, 0.87915653,
        0.8945661 , 0.88888889, 0.87753447, 0.89375507, 0.88888889],
       [0.88402271, 0.88240065, 0.88726683, 0.88807786, 0.87266829,
        0.8945661 , 0.88402271, 0.87591241, 0.88807786, 0.88969992],
       [0.88888889, 0.90592052, 0.90105434, 0.89051095, 0.88726683,
        0.90592052, 0.89781022, 0.89862125, 0.90105434, 0.89781022],
       [0.89326845, 0.90567721, 0.90754258, 0.90137875, 0.89586375,
        0.91524736, 0.90056772, 0.89010543, 0.90940795, 0.90632603],
       [0.89821573, 0.90632603, 0.9027575 , 0.8973236 , 0.89562044,
        0.90713706, 0.89375507, 0.88686131, 0.90470397, 0.90397405],
       [0.86163828, 0.85320357, 0.85823195, 0.84517437, 0.85912409,
        0.85604217, 0.84760746, 0.8702352 , 0.86869424, 0.86804542]])

# Overall Results

In [12]:
# To copy-paste into the report
#
# Builds one LaTeX table row per fold: the 1-based fold number followed by the
# accuracy of every model in that fold (five decimals), terminated by
# "\\ \hline". The number of folds and models is read from `stats` itself
# instead of being hard-coded.

print("      KNN     SVM     DT     RF     MLP     GDE")

outputString = ""
for fold_idx in range(len(stats[0])):
    # One " & "-separated cell per model, in the row order of `stats`.
    fold_cells = ' & '.join('%.5f' % model_scores[fold_idx] for model_scores in stats)
    # '\\\\' is two literal backslashes (LaTeX row break). The previous
    # '\\\ ' relied on the invalid escape sequence '\ ', which newer Python
    # versions warn about.
    outputString += '%d & %s \\\\ \\hline \n' % (fold_idx + 1, fold_cells)

print(outputString)

      KNN     SVM     DT     RF     MLP     GDE
1 & 0.88727 & 0.88402 & 0.88889 & 0.89327 & 0.89822 & 0.86164 \\ \hline 
2 & 0.88564 & 0.88240 & 0.90592 & 0.90568 & 0.90633 & 0.85320 \\ \hline 
3 & 0.89862 & 0.88727 & 0.90105 & 0.90754 & 0.90276 & 0.85823 \\ \hline 
4 & 0.88321 & 0.88808 & 0.89051 & 0.90138 & 0.89732 & 0.84517 \\ \hline 
5 & 0.87916 & 0.87267 & 0.88727 & 0.89586 & 0.89562 & 0.85912 \\ \hline 
6 & 0.89457 & 0.89457 & 0.90592 & 0.91525 & 0.90714 & 0.85604 \\ \hline 
7 & 0.88889 & 0.88402 & 0.89781 & 0.90057 & 0.89376 & 0.84761 \\ \hline 
8 & 0.87753 & 0.87591 & 0.89862 & 0.89011 & 0.88686 & 0.87024 \\ \hline 
9 & 0.89376 & 0.88808 & 0.90105 & 0.90941 & 0.90470 & 0.86869 \\ \hline 
10 & 0.88889 & 0.88970 & 0.89781 & 0.90633 & 0.90397 & 0.86805 \\ \hline 



# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return a new list holding element `i` of every row in `matrix`."""
    picked = []
    for current_row in matrix:
        picked.append(current_row[i])
    return picked

In [None]:
# Friedman test on the per-fold accuracies stored in `stats`
# (rows = models in the order KNN, SVM, DT, RF, MLP, GDE; columns = folds).
#
# With K folds and N models:
#   R_bar = (N + 1) / 2                              mean of the ranks 1..N
#   eq_2  = K * sum_j (R_j - R_bar)^2                R_j = average rank of model j
#   eq_3  = sum_ij (R_ij - R_bar)^2 / (K * (N - 1))
#   Friedman statistic = eq_2 / eq_3
K = 10   # number of folds (the blocks of the test)
N = 6    # number of models being compared

scores = np.asarray(stats, dtype=float)
assert scores.shape == (N, K), "stats must have one row per model and one column per fold"

print("__________________________________________________________________________________________Ranks:")
# Rank the models inside every fold: rank 1 = highest accuracy. Tied
# accuracies share the average of the ranks they span, which keeps the mean
# rank of every fold equal to R_bar.
rank_table = pd.DataFrame(scores).rank(axis=0, ascending=False, method='average')

# ranks[i][j] is the rank of model j in fold i (list of K rows with N entries).
ranks = rank_table.to_numpy().T.tolist()
for fold_ranks in ranks:
    print(fold_ranks)


print("__________________________________________________________________________________________R_bar:")
# Ranks run from 1 to N (the number of models), so their mean is (N+1)/2.
R_bar = (N+1)/2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Average rank of each model over all folds, in the row order of `stats`.
avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde = (float(r) for r in rank_table.mean(axis=1))

print('Average Ranks: %.5f | %.5f | %.5f | %.5f | %.5f | %.5f \n' % (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

# Spread of the average ranks around R_bar, scaled by the number of folds.
eq_2 = K * sum((avg - R_bar)**2 for avg in (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

# Spread of all individual ranks around R_bar.
eq_3 = sum((entry - R_bar)**2 for row in ranks for entry in row)

eq_3 = eq_3/(K*(N-1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

F_stat = eq_2/eq_3

print(F_stat)

print("__________________________________________________________________________________________Decision:")

F_0 = 10.800          # From class slides.
# NOTE(review): presumably the critical value for 6 models and 10 folds at the
# chosen significance level — confirm against the table it was taken from.

if F_stat >= F_0:
    print("Reject the null (ie. they are significantly different).")
else:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")