In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [2]:
import warnings
# Silence every warning raised from this point on (no category or module filter is given).
# NOTE(review): this also hides deprecation and convergence warnings from the libraries used
# below — consider narrowing the filter once the noisy source is known.
warnings.filterwarnings('ignore')

In [3]:
from ipynb.fs.full.TrainingFunction import Train_Models_CV10
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [14]:
# Load the preprocessed marketing-campaign data. The first CSV column is read as the index
# and then discarded in favour of a fresh 0..n-1 index.
preprocessed_df = (
    pd.read_csv("../data/preprocessedMarketingCampaign.csv", index_col=0)
    .reset_index(drop=True)
)

# One-hot encoded columns that must be treated as categorical rather than numeric.
categorical_features = [ 'Education_2n_Cycle','Education_Basic','Education_Graduation','Education_Master','Education_PhD',
                        'AcceptedCmp5_0','AcceptedCmp5_1','AcceptedCmp1_0','AcceptedCmp1_1' ]
target = "Teenhome"

# Convert the categorical features and the target to the pandas 'category' dtype in one pass.
preprocessed_df = preprocessed_df.astype(
    {col: 'category' for col in categorical_features + [target]}
)

# Label the class balance with the actual target name (the old label said "Revenue",
# which is not the column being counted).
print(target + ' distribution:\n' + str(preprocessed_df[target].value_counts()))

Revenue distribution:
1    1158
0    1158
Name: Teenhome, dtype: int64


In [15]:
# Render the loaded frame as the cell output to sanity-check columns and values.
preprocessed_df

Unnamed: 0,MntMeatProducts,MntFishProducts,Income,MntFruits,MntSweetProducts,NumDealsPurchases,Dt_Customer,NumWebPurchases,NumCatalogPurchases,MntGoldProds,...,Teenhome,Education_2n_Cycle,Education_Basic,Education_Graduation,Education_Master,Education_PhD,AcceptedCmp5_0,AcceptedCmp5_1,AcceptedCmp1_0,AcceptedCmp1_1
0,1.679702,2.462147,0.235327,1.551577,1.476500,0.349414,-1.531185,1.409304,2.510890,0.843207,...,0,0,0,1,0,0,1,0,1,0
1,-0.177032,1.345274,0.773633,0.570804,-0.146905,-0.685887,0.205773,1.409304,-0.226541,-0.038766,...,0,0,0,1,0,0,1,0,1,0
2,-0.651187,-0.503974,-1.022732,-0.560857,-0.583043,-0.168236,1.061881,-0.750450,-0.910898,-0.748179,...,0,0,0,1,0,0,1,0,1,0
3,-0.216914,0.155164,0.241519,0.419916,-0.001525,1.384715,0.953012,0.329427,0.115638,-0.556446,...,0,0,0,0,0,1,1,0,1,0
4,-0.491658,-0.632140,-0.750763,-0.409969,-0.631503,-0.168236,-0.313830,-0.030532,-0.910898,-0.403059,...,0,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311,-0.620167,-0.503974,0.088236,-0.661449,-0.655733,0.349414,0.002881,0.689386,-0.568720,-0.556446,...,1,0,0,0,0,1,1,0,1,0
2312,-0.425188,-0.412427,0.318020,-0.083045,-0.655733,4.490616,-1.367881,1.409304,0.115638,0.287180,...,1,0,0,1,0,0,1,0,1,0
2313,-0.699931,-0.650449,-1.541025,-0.611153,-0.631503,-0.168236,-0.417750,-1.110409,-0.910898,-0.748179,...,1,0,0,1,0,0,1,0,1,0
2314,-0.668912,-0.687068,0.000000,-0.636301,-0.655733,0.867064,-0.640437,-0.750450,-0.568720,-0.671486,...,1,0,0,1,0,0,1,0,1,0


# Models

The hyperparameters used below were obtained from the "MarketingCampaign_ParameterTuning" notebook.

In [16]:
# Classifiers under comparison, each configured with its tuned hyperparameters.
knn_model = KNeighborsClassifier(
    metric='manhattan',
    n_neighbors=4,
    weights='distance',
)
svm_model = SVC(kernel='rbf')
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    min_samples_leaf=1,
    min_samples_split=8,
)
rf_model = RandomForestClassifier(
    bootstrap=True,
    max_depth=110,
    max_features=8,
    min_samples_leaf=3,
    min_samples_split=8,
    n_estimators=100,
)
mlp_model = MLPClassifier(
    activation='tanh',
    alpha=0.0001,
    hidden_layer_sizes=(10, 30, 10),
    learning_rate='adaptive',
    solver='adam',
)
gde_model = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=9,
    n_estimators=500,
)

# Order matters: later cells address the results positionally through `model_names`.
models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

# The estimator class names serve as the per-model labels / lookup keys.
model_names = [type(estimator).__name__ for estimator in models]

In [17]:
# Run the project's cross-validation helper on every estimator in `models`, predicting `target`.
# NOTE(review): Train_Models_CV10 is imported from the TrainingFunction notebook and is not
# visible here; judging by the logged output it reports 10 accuracies plus their average for
# each model within each fold — confirm against its definition.
# The returned `stats` is later indexed as stats[<model class name>][<fold index>].
stats = Train_Models_CV10 (preprocessed_df, target, models)

___________________________________________________________________________ Fold 1
____________________________ Model KNeighborsClassifier
1: 0.875
2: 0.875
3: 0.875
4: 0.875
5: 0.875
6: 0.875
7: 0.875
8: 0.875
9: 0.875
10: 0.875

 Average Accuracy: 0.875

____________________________ Model SVC
1: 0.8577586206896551
2: 0.8577586206896551
3: 0.8577586206896551
4: 0.8577586206896551
5: 0.8577586206896551
6: 0.8577586206896551
7: 0.8577586206896551
8: 0.8577586206896551
9: 0.8577586206896551
10: 0.8577586206896551

 Average Accuracy: 0.8577586206896551

____________________________ Model DecisionTreeClassifier
1: 0.8793103448275862
2: 0.8793103448275862
3: 0.8793103448275862
4: 0.875
5: 0.8836206896551724
6: 0.8836206896551724
7: 0.8793103448275862
8: 0.8836206896551724
9: 0.8793103448275862
10: 0.875

 Average Accuracy: 0.8797413793103448

____________________________ Model RandomForestClassifier
1: 0.9181034482758621
2: 0.9310344827586207
3: 0.9267241379310345
4: 0.9181034482758621
5: 0

9: 0.8879310344827587
10: 0.8793103448275862

 Average Accuracy: 0.8853448275862069

____________________________ Model RandomForestClassifier
1: 0.9181034482758621
2: 0.9181034482758621
3: 0.9224137931034483
4: 0.9224137931034483
5: 0.9181034482758621
6: 0.9267241379310345
7: 0.9094827586206896
8: 0.9181034482758621
9: 0.9224137931034483
10: 0.9137931034482759

 Average Accuracy: 0.9189655172413793

____________________________ Model MLPClassifier
1: 0.8922413793103449
2: 0.9008620689655172
3: 0.8793103448275862
4: 0.8879310344827587
5: 0.8922413793103449
6: 0.9094827586206896
7: 0.9181034482758621
8: 0.9051724137931034
9: 0.9094827586206896
10: 0.8879310344827587

 Average Accuracy: 0.8982758620689655

____________________________ Model GradientBoostingClassifier
1: 0.9267241379310345
2: 0.9224137931034483
3: 0.9267241379310345
4: 0.9224137931034483
5: 0.9224137931034483
6: 0.9267241379310345
7: 0.9224137931034483
8: 0.9224137931034483
9: 0.9224137931034483
10: 0.9224137931034483

 A

1: 0.8961038961038961
2: 0.8787878787878788
3: 0.8961038961038961
4: 0.9177489177489178
5: 0.9047619047619048
6: 0.9177489177489178
7: 0.9177489177489178
8: 0.9177489177489178
9: 0.9307359307359307
10: 0.9090909090909091

 Average Accuracy: 0.9086580086580086

____________________________ Model GradientBoostingClassifier
1: 0.935064935064935
2: 0.9307359307359307
3: 0.9264069264069265
4: 0.9307359307359307
5: 0.9393939393939394
6: 0.935064935064935
7: 0.9264069264069265
8: 0.922077922077922
9: 0.9393939393939394
10: 0.935064935064935

 Average Accuracy: 0.932034632034632

___________________________________________________________________________ Fold 10
____________________________ Model KNeighborsClassifier
1: 0.8571428571428571
2: 0.8571428571428571
3: 0.8571428571428571
4: 0.8571428571428571
5: 0.8571428571428571
6: 0.8571428571428571
7: 0.8571428571428571
8: 0.8571428571428571
9: 0.8571428571428571
10: 0.8571428571428571

 Average Accuracy: 0.8571428571428571

____________________

In [18]:
# Dump the raw per-model, per-fold accuracy statistics for a quick visual check.
print(stats)

array([[0.875     , 0.87068966, 0.85775862, 0.88793103, 0.88793103,
        0.89224138, 0.85714286, 0.81385281, 0.9004329 , 0.85714286],
       [0.85775862, 0.90086207, 0.9137931 , 0.875     , 0.89224138,
        0.89224138, 0.93073593, 0.86580087, 0.8961039 , 0.9004329 ],
       [0.87974138, 0.86163793, 0.86465517, 0.88146552, 0.88534483,
        0.87241379, 0.93593074, 0.83766234, 0.8995671 , 0.9021645 ],
       [0.92456897, 0.91163793, 0.92327586, 0.91681034, 0.91896552,
        0.89655172, 0.92424242, 0.88484848, 0.92683983, 0.93636364],
       [0.87887931, 0.88922414, 0.91551724, 0.89137931, 0.89827586,
        0.88232759, 0.91601732, 0.87575758, 0.90865801, 0.90692641],
       [0.93103448, 0.92068966, 0.93793103, 0.93189655, 0.9237069 ,
        0.91637931, 0.94199134, 0.92121212, 0.93203463, 0.94761905]])

# Overall Results

In [19]:
# Build the body of a LaTeX table (one row per fold, one accuracy column per model)
# to copy-paste into the report.

print("      KNN       SVM       DT         RF        MLP       GDE")

# LaTeX row terminator: a literal "\\" followed by "\hline". Every backslash is escaped
# explicitly; the previous literal relied on the invalid escape sequence "\ ".
row_end = " \\\\ \\hline \n"

# Number of folds is taken from the first model's results instead of being hard-coded.
n_folds = len(stats[model_names[0]])

table_rows = []
for fold in range(n_folds):
    # Accuracies of all models for this fold, in `model_names` order, 5 decimals each.
    cells = ' & '.join('%.5f' % stats[name][fold] for name in model_names)
    table_rows.append(str(fold + 1) + ' & ' + cells + row_end)

outputString = "".join(table_rows)

print(outputString)

      KNN     SVM     DT     RF     MLP     GDE
1 & 0.87500 & 0.85776 & 0.87974 & 0.92457 & 0.87888 & 0.93103 \\ \hline 
2 & 0.87069 & 0.90086 & 0.86164 & 0.91164 & 0.88922 & 0.92069 \\ \hline 
3 & 0.85776 & 0.91379 & 0.86466 & 0.92328 & 0.91552 & 0.93793 \\ \hline 
4 & 0.88793 & 0.87500 & 0.88147 & 0.91681 & 0.89138 & 0.93190 \\ \hline 
5 & 0.88793 & 0.89224 & 0.88534 & 0.91897 & 0.89828 & 0.92371 \\ \hline 
6 & 0.89224 & 0.89224 & 0.87241 & 0.89655 & 0.88233 & 0.91638 \\ \hline 
7 & 0.85714 & 0.93074 & 0.93593 & 0.92424 & 0.91602 & 0.94199 \\ \hline 
8 & 0.81385 & 0.86580 & 0.83766 & 0.88485 & 0.87576 & 0.92121 \\ \hline 
9 & 0.90043 & 0.89610 & 0.89957 & 0.92684 & 0.90866 & 0.93203 \\ \hline 
10 & 0.85714 & 0.90043 & 0.90216 & 0.93636 & 0.90693 & 0.94762 \\ \hline 



In [None]:
# Hand the collected statistics to the project's persistence helper.
# NOTE(review): SavePredictionsToFile comes from the TrainingFunction notebook and is not
# visible here; presumably it writes `stats` as CSV to the given relative path — confirm.
SavePredictionsToFile("../accuracyMarketingCampaign.csv", stats)

# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return the i-th entry of every row of `matrix`, in row order, as a new list.

    `matrix` is any iterable of indexable rows; an empty `matrix` yields an empty list.
    """
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

In [None]:
# Friedman test on the cross-validation accuracies: do the classifiers differ significantly?
#
# Notation: K algorithms are ranked on each of N folds (rank 1 = best accuracy).
#   R_bar = (K + 1) / 2                                   mean rank
#   eq_2  = N * sum_j (R_j - R_bar)^2                     spread of the per-algorithm mean ranks
#   eq_3  = 1 / (N * (K - 1)) * sum_ij (R_ij - R_bar)^2   spread of all individual ranks
#   F     = eq_2 / eq_3

# One row per fold, one column per model (in `model_names` order). The scores are read from
# `stats` directly, so the cell no longer depends on variables defined outside the notebook.
fold_scores = np.array([stats[name] for name in model_names], dtype=float).T

# N = number of folds, K = number of algorithms being ranked.
# (These were previously hard-coded the other way round, giving a mean rank of 5.5
# for ranks that only span 1..6.)
N, K = fold_scores.shape

print("__________________________________________________________________________________________Ranks:")
ranks = []

for i in range(N):
    # Highest accuracy gets rank 1; tied accuracies share the average of their ranks.
    ranks.append(pd.Series(fold_scores[i]).rank(ascending=False, method='average').tolist())
    print(ranks[i])


print("__________________________________________________________________________________________R_bar:")
R_bar = (K+1)/2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Mean rank of each algorithm across the folds (column-wise mean of the rank matrix).
avg_ranks = np.mean(ranks, axis=0)

# Named aliases kept for the six models configured in this notebook.
avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde = avg_ranks

print('Average Ranks: %.5f | %.5f | %.5f | %.5f | %.5f | %.5f \n' % (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

eq_2 = N * float(np.sum((avg_ranks - R_bar)**2))

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

eq_3 = float(np.sum((np.asarray(ranks) - R_bar)**2)) / (N*(K-1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

# eq_3 is zero only when every rank equals R_bar (all models tied on every fold); there is
# then no difference to detect, so report 0 instead of dividing by zero.
F_stat = eq_2/eq_3 if eq_3 else 0.0

print(F_stat)

print("__________________________________________________________________________________________Decision:")

# NOTE(review): the critical value must correspond to K algorithms and N folds at the
# chosen significance level — confirm against the table it was taken from.
F_0 = 10.800          # From class slides.

# F_stat is a ratio of sums of squares, hence never negative; no abs() needed.
if F_stat < F_0:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")
else:
    print("Reject the null (ie. they are significantly different).")