In [2]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean
from sklearn.metrics import accuracy_score,f1_score

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
def _load_dataset_frames(filename, categorical_features, target):
    """Read the three training variants and the test split of one dataset.

    Parameters
    ----------
    filename : str
        Base file name (no extension) shared by the four CSV files.
    categorical_features : list of str
        Columns to cast to the pandas ``category`` dtype.
    target : str
        Label column; also cast to ``category``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(no_resampling, undersampled, oversampled, test)``, each read with
        the first CSV column as index.
    """
    paths = (
        "../data/train/noresampling/" + filename + "_0.csv",
        "../data/train/undersampled/" + filename + "_0.csv",
        "../data/train/oversampled/" + filename + "_0.csv",
        "../data/test/" + filename + ".csv",
    )
    frames = []
    for path in paths:
        frame = pd.read_csv(path, index_col=0)
        frame[categorical_features] = frame[categorical_features].astype('category')
        frame[target] = frame[target].astype('category')
        frames.append(frame)
    return tuple(frames)


# ONLINE SHOPPING INTENTIONS

filename1 = 'online_shoppers_intentions'
target1 = 'Revenue'

numerical_features1 = ["Administrative", "Administrative_Duration", "Informational", "Informational_Duration",
                       "ProductRelated", "ProductRelated_Duration", "BounceRates", "ExitRates", "PageValues", "SpecialDay"]
categorical_features1 = ["OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend", "Month"]

df1r0, df1u0, df1o0, df1t = _load_dataset_frames(filename1, categorical_features1, target1)


# MARKETING CAMPAIGN

filename2 = 'marketing_campaign'
target2 = 'Teenhome'

numerical_features2 = ['Income', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                       'MntGoldProds', 'Year_Birth', 'Recency', 'NumDealsPurchases', 'NumWebPurchases',
                       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'Dt_Customer']
categorical_features2 = ['Education', 'Marital_Status', 'Kidhome', 'AcceptedCmp3',
                         'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response']

df2r0, df2u0, df2o0, df2t = _load_dataset_frames(filename2, categorical_features2, target2)


# HEART

filename3 = 'heart'
target3 = 'target'

numerical_features3 = ['trestbps', 'chol', 'thalach', 'oldpeak', 'age']
categorical_features3 = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

df3r0, df3u0, df3o0, df3t = _load_dataset_frames(filename3, categorical_features3, target3)

# Sanity Check
#df2t.info(verbose=True)

0      0
1      0
2      1
3      0
4      1
      ..
207    1
208    0
209    1
210    0
211    1
Name: target, Length: 212, dtype: category
Categories (2, int64): [0, 1]

In [35]:
# Fixed seed for the estimators that draw random numbers during fitting, so
# the model ranking is the same on every Restart & Run All.
RANDOM_STATE = 42

# KNeighborsClassifier has no random_state parameter; SVC's random_state is
# only used when probability=True, so both stay at their defaults.
knn = KNeighborsClassifier()
svm = SVC()
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
mlp = MLPClassifier(random_state=RANDOM_STATE)
gbe = GradientBoostingClassifier(random_state=RANDOM_STATE)

# One label -> estimator mapping is the single source of truth, so the two
# parallel lists consumed by FindBestModel cannot drift out of alignment.
_candidate_models = {'KNN': knn, "SVM": svm, "DT": dt, "RF": rf, "MLP": mlp, "GBE": gbe}

mods = list(_candidate_models.values())
model_names = list(_candidate_models.keys())

In [36]:
def FindBestModel(dfr, dfu, dfo, dft, target, numerical_features, categorical_features, models, names):
    """Print which model reaches the highest weighted F1 for each training variant.

    Every model is fitted once on each of the three training frames and scored
    on the test frame with ``f1_score(..., average='weighted')``. For each
    training variant, the name of the top-scoring model is printed (the first
    one wins on ties).

    Parameters
    ----------
    dfr, dfu, dfo : pandas.DataFrame
        Training frames: no resampling, undersampled, oversampled.
    dft : pandas.DataFrame
        Test frame used for scoring.
    target : str
        Name of the label column, present in all four frames.
    numerical_features, categorical_features : sequence of str
        Feature columns; they are concatenated in that order.
    models : sequence
        Unfitted estimators exposing ``fit`` and ``predict``.
    names : sequence of str
        Display names, positionally aligned with ``models``.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    ValueError
        If ``models`` is empty or ``names`` has fewer entries than ``models``.

    Notes
    -----
    NOTE(review): models are ranked on the test frame itself; ranking on a
    held-out validation split would avoid selection bias -- confirm intent.
    """
    # Local import keeps this cell self-contained; scikit-learn is already a
    # dependency of the notebook.
    from sklearn.base import clone

    if len(models) == 0:
        raise ValueError("models must contain at least one estimator")
    if len(names) < len(models):
        # Fail before any training rather than with an IndexError afterwards.
        raise ValueError("names must provide one label per model")

    feature_columns = list(numerical_features) + list(categorical_features)

    X_test = dft[feature_columns]
    y_test = dft[target]

    # Label -> (X_train, y_train); insertion order fixes the print order.
    train_sets = {
        "No-resampling": (dfr[feature_columns], dfr[target]),
        "Undersampling": (dfu[feature_columns], dfu[target]),
        "Oversampling": (dfo[feature_columns], dfo[target]),
    }
    f1_scores = {label: [] for label in train_sets}

    for mod in models:
        for label, (X_train, y_train) in train_sets.items():
            # Fit a fresh copy so the caller's estimator is never mutated and
            # no fitted state leaks between training variants or calls.
            # safe=False falls back to a deep copy for non-sklearn objects.
            fitted = clone(mod, safe=False).fit(X_train, y_train)
            y_pred = fitted.predict(X_test)
            f1_scores[label].append(f1_score(y_test, y_pred, average='weighted'))

    winners = [
        label + ": " + names[scores.index(max(scores))]
        for label, scores in f1_scores.items()
    ]
    print(*winners)
        

In [43]:
FindBestModel(df1r0, df1u0, df1o0, df1t, target1, numerical_features1, categorical_features1, mods, model_names)

No-resampling: GBE Undersampling: RF Oversampling: RF


In [44]:
FindBestModel(df2r0, df2u0, df2o0, df2t, target2, numerical_features2, categorical_features2, mods, model_names)

No-resampling: GBE Undersampling: RF Oversampling: RF


In [45]:
FindBestModel(df3r0, df3u0, df3o0, df3t, target3, numerical_features3, categorical_features3, mods, model_names)

No-resampling: DT Undersampling: GBE Oversampling: GBE
