In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [2]:
import warnings
# Silence every Python warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by later cells will not be shown either.
warnings.filterwarnings('ignore')

In [3]:
from ipynb.fs.full.TrainingFunction import Train_Models_CV10
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [4]:
# Load the preprocessed data set; the first CSV column is read as the index
# and then replaced by a fresh 0..n-1 RangeIndex (the old index is dropped).
preprocessed_df = (
    pd.read_csv("../data/preprocessedMarketingCampaign.csv", index_col=0)
    .reset_index(drop=True)
)

categorical_features = [ 'Education_2n_Cycle','Education_Basic','Education_Graduation','Education_Master','Education_PhD',
                        'AcceptedCmp5_0','AcceptedCmp5_1','AcceptedCmp1_0','AcceptedCmp1_1' ]
target = "Teenhome"

# Cast the categorical features and the target to pandas' 'category' dtype in one pass.
preprocessed_df = preprocessed_df.astype(
    {col: 'category' for col in categorical_features + [target]}
)

# Class balance of the target. The label is built from `target` so it cannot
# drift out of sync with the column actually being counted.
print(target + ' distribution:\n' + str(preprocessed_df[target].value_counts()))

Revenue distribution:
1    1158
0    1158
Name: Teenhome, dtype: int64


In [5]:
# Show the loaded frame as the cell's output for a quick visual check.
preprocessed_df

Unnamed: 0,MntMeatProducts,MntFishProducts,Income,MntFruits,MntSweetProducts,NumDealsPurchases,Dt_Customer,NumWebPurchases,NumCatalogPurchases,MntGoldProds,...,Teenhome,Education_2n_Cycle,Education_Basic,Education_Graduation,Education_Master,Education_PhD,AcceptedCmp5_0,AcceptedCmp5_1,AcceptedCmp1_0,AcceptedCmp1_1
0,1.679702,2.462147,0.235327,1.551577,1.476500,0.349414,-1.531185,1.409304,2.510890,0.843207,...,0,0,0,1,0,0,1,0,1,0
1,-0.177032,1.345274,0.773633,0.570804,-0.146905,-0.685887,0.205773,1.409304,-0.226541,-0.038766,...,0,0,0,1,0,0,1,0,1,0
2,-0.651187,-0.503974,-1.022732,-0.560857,-0.583043,-0.168236,1.061881,-0.750450,-0.910898,-0.748179,...,0,0,0,1,0,0,1,0,1,0
3,-0.216914,0.155164,0.241519,0.419916,-0.001525,1.384715,0.953012,0.329427,0.115638,-0.556446,...,0,0,0,0,0,1,1,0,1,0
4,-0.491658,-0.632140,-0.750763,-0.409969,-0.631503,-0.168236,-0.313830,-0.030532,-0.910898,-0.403059,...,0,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311,-0.620167,-0.503974,0.088236,-0.661449,-0.655733,0.349414,0.002881,0.689386,-0.568720,-0.556446,...,1,0,0,0,0,1,1,0,1,0
2312,-0.425188,-0.412427,0.318020,-0.083045,-0.655733,4.490616,-1.367881,1.409304,0.115638,0.287180,...,1,0,0,1,0,0,1,0,1,0
2313,-0.699931,-0.650449,-1.541025,-0.611153,-0.631503,-0.168236,-0.417750,-1.110409,-0.910898,-0.748179,...,1,0,0,1,0,0,1,0,1,0
2314,-0.668912,-0.687068,0.000000,-0.636301,-0.655733,0.867064,-0.640437,-0.750450,-0.568720,-0.671486,...,1,0,0,1,0,0,1,0,1,0


# Models

The hyperparameters used below were taken from the "MarketingCampaign_ParameterTuning" notebook.

In [6]:
# One estimator per algorithm under comparison, each built with fixed hyperparameters.
# NOTE(review): no random_state is passed to any estimator, so repeated runs
# of the stochastic models may give slightly different scores.
knn_model = KNeighborsClassifier(n_neighbors=4, weights='distance', metric='manhattan')
svm_model = SVC(kernel='rbf')
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    min_samples_split=8,
    min_samples_leaf=1,
)
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=110,
    max_features=8,
    min_samples_split=8,
    min_samples_leaf=3,
    bootstrap=True,
)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10, 30, 10),
    activation='tanh',
    solver='adam',
    alpha=0.0001,
    learning_rate='adaptive',
)
gde_model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=9)

# The order of this list fixes the column order used by the later cells.
models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

# Class name of each estimator, in the same order as `models`.
model_names = [type(estimator).__name__ for estimator in models]

In [None]:
# Evaluate every model in `models` on the data set with the helper imported
# from the TrainingFunction notebook (presumably 10-fold cross-validation,
# going by its name -- TODO confirm). Later cells read the result as
# stats[<model class name>][<fold index>].
stats = Train_Models_CV10 (preprocessed_df, target, models)

KNeighborsClassifier Accuracy: 0.880 (0.021)
SVC Accuracy: 0.889 (0.026)
DecisionTreeClassifier Accuracy: 0.890 (0.020)
RandomForestClassifier Accuracy: 0.915 (0.013)
MLPClassifier Accuracy: 0.899 (0.025)


In [None]:
# Dump the raw per-model results returned by the training helper.
print(stats)

# Overall Results

In [None]:
# To copy-paste into the report

# Column header for reading the table in the notebook output.
print("      KNN       SVM       DT         RF        MLP       GDE")

# Build one LaTeX table row per fold: the 1-based fold number followed by the
# accuracy of every model (in `model_names` order), ending with "\\ \hline".
# Backslashes are written fully escaped; the previous literal relied on the
# invalid escape sequence "\ ", which newer Python versions warn about.
table_rows = []
for fold in range(10):
    accuracies = " & ".join("%.5f" % stats[name][fold] for name in model_names)
    table_rows.append("%d & %s \\\\ \\hline \n" % (fold + 1, accuracies))

outputString = "".join(table_rows)

print(outputString)

In [None]:
# Persist the results held in `stats` to a CSV file via the helper imported
# from the TrainingFunction notebook (the file format is defined by that helper).
SavePredictionsToFile("../accuracy/accuracyMarketingCampaign.csv", stats)

# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return a list holding element ``i`` of every row in ``matrix``.

    ``matrix`` is any iterable of indexable rows; the rows are visited in
    order, so the result has one entry per row.
    """
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

In [None]:
# Friedman test on the per-fold accuracies stored in `stats`.
#
# Each fold is one "block": the models are ranked within the fold
# (rank 1 = highest accuracy), and the statistic compares the spread of the
# models' average ranks against the overall spread of the ranks.
K = 6     # number of models being compared (one rank column per model)
N = 10    # number of cross-validation folds (one rank row per fold)

print("__________________________________________________________________________________________Ranks:")
ranks = []

for i in range(N):
    # Accuracy of every model on fold i, in `model_names` order.
    fold_scores = [stats[name][i] for name in model_names]
    # Rank descending so the best model gets rank 1; tied scores share the
    # average of the ranks they span, which keeps the mean rank at (K+1)/2.
    ranks.append(pd.Series(fold_scores).rank(ascending=False, method='average').tolist())
    print(ranks[i])


print("__________________________________________________________________________________________R_bar:")
# Mean of the ranks 1..K.
R_bar = (K+1)/2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Average rank of each model across the N folds (column j <-> model_names[j]).
avg_ranks = [mean(column(ranks, j)) for j in range(K)]
avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde = avg_ranks

print('Average Ranks: %.5f | %.5f | %.5f | %.5f | %.5f | %.5f \n' % (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

# Between-model variation: N * sum_j (R_j - R_bar)^2.
eq_2 = N * sum((avg - R_bar)**2 for avg in avg_ranks)

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

# Overall variation of the individual ranks: sum_ij (r_ij - R_bar)^2 / (N * (K - 1)).
eq_3 = 0

for row in ranks:
    for entry in row:
        eq_3 = eq_3 + (entry - R_bar)**2

eq_3 = eq_3/(N*(K-1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

F_stat = eq_2/eq_3

print(F_stat)

print("__________________________________________________________________________________________Decision:")

F_0 = 10.800          # From class slides.

# F_stat is a ratio of sums of squares, so it is never negative.
if F_stat < F_0:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")
else:
    print("Reject the null (ie. they are significantly different).")