In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
from ipynb.fs.full.TrainingFunction import Training_Repeat
from ipynb.fs.full.TrainingFunction import PrintAccuracies
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

ImportError: cannot import name 'PrintAccuracies' from 'ipynb.fs.full.TrainingFunction' (unknown location)

In [None]:
# Read the preprocessed data set; the first CSV column becomes the row index.
preprocessed_df = pd.read_csv("../data/preprocessedOnlineShoppingData.csv", index_col=0)

# Visitor-type and month columns that are converted to the categorical dtype below.
categorical_features = [
    'VisitorType_New_Visitor',
    'VisitorType_Other',
    'VisitorType_Returning_Visitor',
    'Month_Aug',
    'Month_Dec',
    'Month_Feb',
    'Month_Jul',
    'Month_June',
    'Month_Mar',
    'Month_May',
    'Month_Nov',
    'Month_Oct',
    'Month_Sep',
]
# Name of the label column handed to the training helpers further down.
target = "Revenue"

# Convert categorical feature types
preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category')
preprocessed_df[target] = preprocessed_df[target].astype('category')

# Show how many rows fall into each label value.
revenue_counts = preprocessed_df['Revenue'].value_counts()
print('Revenue distribution:\n' + str(revenue_counts))

In [None]:
# Balancing (oversampling)
# Separate the rows by label value.
df_class_0 = preprocessed_df[preprocessed_df['Revenue'] == 0]
df_class_1 = preprocessed_df[preprocessed_df['Revenue'] == 1]
# Take each class size from its own subset. value_counts() is ordered by
# frequency, not by label, so unpacking it positionally would silently swap
# the two counts if class 1 ever became the larger class.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)
# Oversample: draw class-1 rows with replacement until there are as many as
# class-0 rows. The fixed seed makes the balanced frame identical on every run.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=42)
oversampled_df = pd.concat([df_class_0, df_class_1_over], axis=0)
# NOTE(review): the rows are duplicated before any train/test split that the
# training helper may perform; if it holds out test data from this frame,
# copies of the same row can land on both sides and inflate the reported
# accuracy -- confirm against Training_Repeat.
print('Random over-sampling:\n'+ str(oversampled_df['Revenue'].value_counts()))

# KNN

In [None]:
# k-nearest-neighbours classifier with fixed hyperparameters.
knn_model = KNeighborsClassifier(
    n_neighbors=13,
    weights='uniform',
    metric='manhattan',
    algorithm='brute',
)

# Run the shared training helper on a copy of the balanced frame, then pass
# its result to SavePredictionsToFile together with the output path.
knn_stats = Training_Repeat(oversampled_df.copy(), target, knn_model)
SavePredictionsToFile("../accuracy/OnlineShoppingKNN.csv", knn_stats)

# Random Forest

In [None]:
# Random forest with fixed hyperparameters.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features=8,
    min_samples_split=8,
    min_samples_leaf=5,
    bootstrap=True,
)

# Run the shared training helper on a copy of the balanced frame, then pass
# its result to SavePredictionsToFile together with the output path.
rf_stats = Training_Repeat(oversampled_df.copy(), target, rf_model)
SavePredictionsToFile("../accuracy/OnlineShoppingRF.csv", rf_stats)

# Decision Tree

In [None]:
# Single decision tree with fixed hyperparameters.
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=3,
)

# Run the shared training helper on a copy of the balanced frame, then pass
# its result to SavePredictionsToFile together with the output path.
dt_stats = Training_Repeat(oversampled_df.copy(), target, dt_model)
SavePredictionsToFile("../accuracy/OnlineShoppingDT.csv", dt_stats)

# Support Vector Machine

In [None]:
# Support vector classifier with a linear kernel; all other settings are
# left at the library defaults.
svm_model = SVC(
    kernel='linear',
)

# Run the shared training helper on a copy of the balanced frame, then pass
# its result to SavePredictionsToFile together with the output path.
svm_stats = Training_Repeat(oversampled_df.copy(), target, svm_model)
SavePredictionsToFile("../accuracy/OnlineShoppingSVM.csv", svm_stats)

# Multi-Layer Perceptron

In [None]:
# Hyperparameter Tuning

# Candidate values for each MLPClassifier hyperparameter.
grid_params = {
    'hidden_layer_sizes': [(10, 30, 10), (20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

# Hand a copy of the balanced frame, the label name, a default-configured
# estimator and the candidate values to the tuning helper.
ParameterTuning(
    oversampled_df.copy(),
    target,
    MLPClassifier(),
    grid_params,
)

In [None]:
# Multi-layer perceptron with fixed hyperparameters; every value also appears
# among the candidates in the tuning grid of this notebook.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10, 30, 10),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
)

# Run the shared training helper on a copy of the balanced frame, then pass
# its result to SavePredictionsToFile together with the output path.
mlp_stats = Training_Repeat(oversampled_df.copy(), target, mlp_model)
SavePredictionsToFile("../accuracy/OnlineShoppingMLP.csv", mlp_stats)

# Gradient Boosting Ensemble

In [None]:
# Hyperparameter Tuning

# Candidate values for each GradientBoostingClassifier hyperparameter.
grid_params = {
    "n_estimators": [5, 50, 250, 500],
    "max_depth": [1, 3, 5, 7, 9],
    "learning_rate": [0.01, 0.1, 1, 10, 100],
}

# Hand a copy of the balanced frame, the label name, a default-configured
# estimator and the candidate values to the tuning helper.
ParameterTuning(
    oversampled_df.copy(),
    target,
    GradientBoostingClassifier(),
    grid_params,
)

In [None]:
# Gradient boosting ensemble with fixed hyperparameters; every value also
# appears among the candidates in the tuning grid of this notebook.
gde_model = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=9,
    learning_rate=1,
)

# Run the shared training helper on a copy of the balanced frame, then pass
# its result to SavePredictionsToFile together with the output path.
gde_stats = Training_Repeat(oversampled_df.copy(), target, gde_model)
SavePredictionsToFile("../accuracy/OnlineShoppingGDE.csv", gde_stats)

# Overall Results

In [None]:
# To copy-paste into the report

# Per-model result sequences, in the column order of the LaTeX table:
# KNN, SVM, decision tree, random forest, MLP, gradient boosting.
stats_columns = (knn_stats, svm_stats, dt_stats, rf_stats, mlp_stats, gde_stats)

outputString = ""

# One table row per repetition: the 1-based run number followed by the value
# each model has at that position.
# NOTE(review): assumes every result sequence holds at least 10 entries --
# confirm against Training_Repeat.
for i in range(10):
    row_values = tuple(stats[i] for stats in stats_columns)
    # '\\\\' emits the LaTeX row terminator `\\`; a lone backslash before
    # `\hline` would not end the table row.
    outputString += str(i + 1) + '& %.5f & %.5f & %.5f & %.5f & %.5f & %.5f \\\\ \\hline \n' % row_values

print(outputString)