In [82]:
import os
import numpy as np
import pandas as pd
import pickle

import h2o
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression 

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import auc, roc_auc_score


from category_encoders import *

## StackingClassifier: train:0.87011, test:0.86631
## MajorityVotingClassifier: train: 0.86974, test:0.85974

## Links
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html

In [59]:
#Set directories
print(os.getcwd())
dirRawData = "../input/"
dirPData = "../PData/"

/home/jovyan/Projects/Assignment/PCode


## Data loading

In [60]:
#Load Data
f_name = dirPData + "01_df_250k.pickle"

with (open(f_name,"rb")) as f:
    dict_ = pickle.load(f)

df_train = dict_['df_train']
df_test = dict_['df_test']

del f_name, dict_

In [61]:
#Load the variables information
f_name = dirPData + '01_vars.pickle'
with open(f_name,"rb") as f:
    dict_ = pickle.load(f)

vars_ind_numeric = dict_['vars_ind_numeric']
vars_ind_categorical = dict_['vars_ind_categorical']
vars_ind_hccv = dict_['vars_ind_hccv']
vars_notToUse = dict_['vars_notToUse']
var_dep = dict_['var_dep']

del f_name, dict_


## Data Preprocessing

In [62]:
#replace -99 with NaN
df_train = df_train.replace(-99, np.NaN)
df_test = df_test.replace(-99, np.NaN)

In [63]:
#We drop columns we identified in Part A which had very high amount of null values
vars_to_drop = ['a04', 'b06', 'd01' ,'d02', 'd03']
vars_ind_numeric = [var for var in vars_ind_numeric if var not in vars_to_drop]
vars_ind_categorical = [var for var in vars_ind_categorical if var not in vars_to_drop]

In [64]:
df_train = df_train.drop(columns = vars_to_drop)
df_test = df_test.drop(columns = vars_to_drop)

In [65]:
#replace NaN with the mean of each column
for var in vars_ind_numeric:
    df_train[var] = df_train[var].fillna(df_train[var].mean())
    df_test[var] = df_test[var].fillna(df_train[var].mean())

In [66]:
#Perform Target Encoding
enc = TargetEncoder(cols=vars_ind_categorical)
enc.fit_transform(df_train, df_train['target'])
df_train_encoded = enc.transform(df_train, df_train['target'])
df_test['target'] = np.NaN
df_test_encoded = enc.transform(df_test)
df_test.drop(columns = ['target'], inplace = True)
df_test_encoded.drop(columns = ['target'], inplace = True)

In [67]:
df_train = df_train_encoded
df_test = df_test_encoded
del df_train_encoded, df_test_encoded

In [68]:
df_train.shape

(250000, 92)

In [69]:
#split to train_test split
X = df_train.drop(columns=['target']).values #need to drop the dep variable
y = df_train['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [70]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(175000, 91)
(175000,)
(75000, 91)
(75000,)


## Fit Logistic Regression

In [71]:
parameters = {
    'C':[0.1, 100, 1000],
    'l1_ratio':[0.25, 0.5, 0.75],
    'penalty':['elasticnet'],
    'fit_intercept':[True],
    'solver':['saga'] #saga solver support elastic net penalty
    }
#passing the scoring function in the GridSearchCV
lr_grid_search = GridSearchCV(LogisticRegression(), parameters,scoring='roc_auc',verbose =2, refit=False,cv=3, n_jobs=-1)
lr_grid_search.fit(X_train, y_train) #fit on our train data

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  7.0min finished


GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.1, 100, 1000], 'fit_intercept': [True],
                         'l1_ratio': [0.25, 0.5, 0.75],
                         'penalty': ['elasticnet'], 'solver': ['saga']},
             refit=False, scoring='roc_auc', verbose=2)

In [72]:
lr_grid_search.best_params_

{'C': 0.1,
 'fit_intercept': True,
 'l1_ratio': 0.25,
 'penalty': 'elasticnet',
 'solver': 'saga'}

In [73]:
logreg = LogisticRegression(C=100, fit_intercept=True, l1_ratio=0.25, penalty='elasticnet', solver='saga')
logreg.fit(X_train, y_train)



LogisticRegression(C=100, l1_ratio=0.25, penalty='elasticnet', solver='saga')

## Random Forests

In [74]:
parameters = {
    'n_estimators':[200],
    'max_depth':[10, 30, 50, 80],
    'max_features':['sqrt']
    }
#passing the scoring function in the GridSearchCV
rf_grid_search = GridSearchCV(RandomForestClassifier(), parameters,scoring='roc_auc',verbose =2, refit=False,cv=3, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:  5.1min remaining: 10.3min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  8.5min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [10, 30, 50, 80],
                         'max_features': ['sqrt'], 'n_estimators': [200]},
             refit=False, scoring='roc_auc', verbose=2)

In [75]:
rf_grid_search.best_params_

{'max_depth': 30, 'max_features': 'sqrt', 'n_estimators': 200}

In [76]:
rf = RandomForestClassifier(max_depth=30, max_features='sqrt', n_estimators=200)

## Stacking

In [77]:
estimators = [
('rf', RandomForestClassifier(n_estimators=200, max_depth=30, max_features='sqrt', random_state=42)),
('logreg', LogisticRegression(C=100, fit_intercept=True, l1_ratio=0.25, penalty='elasticnet', solver='saga'))
]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [78]:
stacking_clf.fit(X_train, y_train)



StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(max_depth=30,
                                                       max_features='sqrt',
                                                       n_estimators=200,
                                                       random_state=42)),
                               ('logreg',
                                LogisticRegression(C=100, l1_ratio=0.25,
                                                   penalty='elasticnet',
                                                   solver='saga'))],
                   final_estimator=LogisticRegression())

In [79]:
stacking_clf.score(X_test, y_test) #calculate accuracy, dataset is balanced so accuracy is adequate

0.785

In [84]:
stacking_pred = stacking_clf.predict_proba(X_test)[:,1]

In [85]:
stacking_pred

array([0.2320945 , 0.64545593, 0.06937698, ..., 0.06108798, 0.33173128,
       0.94071755])

In [86]:
fpr, tpr, thresholds = roc_curve(y_test, stacking_pred)
print(auc(fpr, tpr))

0.8701136661630816


In [89]:
kaggle_pred_stacking = stacking_clf.predict_proba(df_test.values)[:,1]
df_sub_partb = pd.DataFrame({
    "unique_id": df_test["unique_id"],
    "Predicted": kaggle_pred_stacking
})
df_sub_partb.to_csv('../POutput/pd_sub_partc.csv', index = False)

## Majority Voting

In [80]:
voting = VotingClassifier(estimators=[('lr', logreg), ('rf', rf)], voting='soft')
voting.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=100, l1_ratio=0.25,
                                                 penalty='elasticnet',
                                                 solver='saga')),
                             ('rf',
                              RandomForestClassifier(max_depth=30,
                                                     max_features='sqrt',
                                                     n_estimators=200))],
                 voting='soft')

In [81]:
voting.score(X_test, y_test)

0.7844666666666666

In [94]:
voting_pred = voting.predict_proba(X_test)[:,1]

In [95]:
voting_pred

array([0.37923258, 0.5762548 , 0.27444432, ..., 0.26459112, 0.40976281,
       0.69307206])

In [96]:
fpr, tpr, thresholds = roc_curve(y_test, voting_pred)
print(auc(fpr, tpr))

0.8697356784713044


In [97]:
kaggle_pred_voting = voting.predict_proba(df_test.values)[:,1]
df_sub_partc_voting = pd.DataFrame({
    "unique_id": df_test["unique_id"],
    "Predicted": kaggle_pred_voting
})
df_sub_partc_voting.to_csv('../POutput/pd_sub_partc_voting.csv', index = False)