In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin

import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score

from IPython.display import clear_output
from gc import collect
import warnings

warnings.simplefilter('ignore')

In [45]:
# voting model

class VotingModel(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def num_params(self):
        total_params = 0
        for estimator in self.estimators:
            if hasattr(estimator, 'get_params'):
                total_params += sum(param.size for param in estimator.get_params().values() if hasattr(param, 'size'))
            elif hasattr(estimator, 'tree_'):
                total_params += estimator.tree_.node_count
            elif hasattr(estimator, 'booster_'):
                total_params += estimator.booster_.num_total_parameters()
            elif hasattr(estimator, 'tree_count_'):
                total_params += estimator.tree_count_
            else:
                raise AttributeError("The base estimator does not have an attribute to determine the number of parameters")
        return total_params

In [46]:
# files read

df_train = pd.read_csv('train_scaled.csv')
df_test = pd.read_csv('test_scaled.csv')

print('train data shape:', df_train.shape)
print('test data shape:', df_test.shape)

df_train, df_test = df_train.align(df_test, axis=1)

print('train data shape aligned:', df_train.shape)
print('test data shape aligned:', df_test.shape)

X = df_train.drop(columns='not.fully.paid')
y = df_train['not.fully.paid']

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train data shape: (40785, 20)
test data shape: (40786, 19)
train data shape aligned: (40785, 20)
test data shape aligned: (40786, 20)


In [47]:
# lightgbm gbdt

params_lgb1 = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 5,
    "learning_rate": 0.05,
    "n_estimators": 300,
    "colsample_bytree": 0.6, 
    "colsample_bynode": 0.6,
    "verbose": -1,
    "random_state": 42
}

fitted_models_lgb1 = []
scores_lgb1 = []
oof_preds_lgb1 = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y):

    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model_lgb1 = lgb.LGBMClassifier(**params_lgb1)
    model_lgb1.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(200)]
    )
    
    pred_lgb1 = model_lgb1.predict_proba(X_valid)[:, 1]
    score_lgb1 = roc_auc_score(y_valid, pred_lgb1)
    
    scores_lgb1.append(score_lgb1)
    oof_preds_lgb1[idx_valid] = pred_lgb1
    fitted_models_lgb1.append(model_lgb1)
    
    clear_output()

lgb1_auc = roc_auc_score(y, oof_preds_lgb1)
scores_lgb1.append(lgb1_auc)
print(f'LightGBM GDBT AUC: {lgb1_auc:.6f}')

LightGBM GDBT AUC: 0.794733


In [48]:
# voting lgb1

voting_lgb1 = VotingModel(estimators=fitted_models_lgb1)
scores_lgb1_df = pd.DataFrame({'LightGBM gbdt': scores_lgb1})

scores_lgb1_df

Unnamed: 0,LightGBM gbdt
0,0.794543
1,0.791751
2,0.803747
3,0.786213
4,0.797728
5,0.794733


In [49]:
# lightgbm goss

params_lgb2 = {
    "boosting_type": "goss",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 5,
    "learning_rate": 0.05,
    "n_estimators": 300,
    "colsample_bytree": 0.6, 
    "colsample_bynode": 0.6,
    "verbose": -1,
    "random_state": 42
}

fitted_models_lgb2 = []
scores_lgb2 = []
oof_preds_lgb2 = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y):

    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model_lgb2 = lgb.LGBMClassifier(**params_lgb2)
    model_lgb2.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )
    
    pred_lgb2 = model_lgb2.predict_proba(X_valid)[:, 1]
    score_lgb2 = roc_auc_score(y_valid, pred_lgb2)
    
    scores_lgb2.append(score_lgb2)
    oof_preds_lgb2[idx_valid] = pred_lgb2
    fitted_models_lgb2.append(model_lgb2)
    
    clear_output()

lgb2_auc = roc_auc_score(y, oof_preds_lgb2)
scores_lgb2.append(lgb2_auc)
print(f'LightGBM GOSS AUC: {lgb2_auc:.6f}')

LightGBM GOSS AUC: 0.791351


In [50]:
# voting lgb2

voting_lgb2 = VotingModel(estimators=fitted_models_lgb2)
scores_lgb2_df = pd.DataFrame({'LightGBM goss': scores_lgb2})

scores_lgb2_df

Unnamed: 0,LightGBM goss
0,0.791113
1,0.787646
2,0.801405
3,0.785118
4,0.793213
5,0.791351


In [51]:
# catboost

params_cat = {
    "iterations": 5000,
    "learning_rate": 0.05,
    "depth": 6,
    "eval_metric": "AUC",    
    "logging_level": "Verbose",
    "use_best_model": True,
    "random_seed": 42,
    'loss_function':'Logloss',
}

fitted_models_cat = []
scores_cat = []
oof_preds_cat = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y):
    
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model_cat = CatBoostClassifier(**params_cat)
    model_cat.fit(
        X_train, y_train, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds=200, verbose=100)
    
    pred_cat = model_cat.predict_proba(X_valid)[:, 1]
    score_cat = roc_auc_score(y_valid, pred_cat)
    
    scores_cat.append(score_cat)
    oof_preds_cat[idx_valid] = pred_cat
    fitted_models_cat.append(model_cat)
    
    clear_output()

cat_auc = roc_auc_score(y, oof_preds_cat)
scores_cat.append(cat_auc)
print(f'CatBoost AUC: {cat_auc:.6f}')

CatBoost AUC: 0.794788


In [52]:
# voting catboost

voting_cat = VotingModel(estimators=fitted_models_cat)
scores_cat_df = pd.DataFrame({'CatBoost': scores_cat})

scores_cat_df

Unnamed: 0,CatBoost
0,0.794716
1,0.790193
2,0.806
3,0.787449
4,0.795917
5,0.794788


In [53]:
# randomforest

params_rf = {
    'n_estimators': 200,      
    'max_depth': 30,       
    'min_samples_split': 10,   
    'min_samples_leaf': 2,      
    'random_state': 42,       
    'n_jobs': -1             
}

fitted_models_rf = []
scores_rf = []
oof_preds_rf = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y):
    
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model_rf = RandomForestClassifier(**params_rf)
    model_rf.fit(X_train, y_train)
    
    pred_rf = model_rf.predict_proba(X_valid)[:, 1]
    score_rf = roc_auc_score(y_valid, pred_rf)
    
    scores_rf.append(score_rf)
    oof_preds_rf[idx_valid] = pred_rf
    fitted_models_rf.append(model_rf)

    clear_output()

rf_auc = roc_auc_score(y, oof_preds_rf)
scores_rf.append(rf_auc)
print(f'RandomForest AUC: {rf_auc:.6f}')

RandomForest AUC: 0.780900


In [54]:
# voting randomforest

voting_rf = VotingModel(estimators=fitted_models_rf)
scores_rf_df = pd.DataFrame({'RandomForest': scores_rf})

scores_rf_df

Unnamed: 0,RandomForest
0,0.779917
1,0.779129
2,0.789137
3,0.776638
4,0.780058
5,0.7809


In [82]:
# xgboost

params_xgb = {
    'colsample_bylevel': 0.6, 
    'colsample_bytree': 0.6, 
    'gamma': 0.2, 
    'learning_rate': 0.05, 
    'max_depth': 6, 
    'n_estimators': 200, 
    'scale_pos_weight': 2, 
    'objective': 'binary:logistic',
    'early_stopping_rounds': 100,
    'max_delta_step': 2,
    'eval_metric': 'auc'
}

fitted_models_xgb = []
scores_xgb = []
oof_preds_xgb = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y):

    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model_xgb = xgb.XGBClassifier(**params_xgb)
    model_xgb.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)]
    )
    
    pred_xgb = model_xgb.predict_proba(X_valid)[:, 1]
    score_xgb = roc_auc_score(y_valid, pred_xgb)
    
    scores_xgb.append(score_xgb)
    oof_preds_xgb[idx_valid] = pred_xgb
    fitted_models_xgb.append(model_xgb)
    
    clear_output()

xgb_auc = roc_auc_score(y, oof_preds_xgb)
scores_xgb.append(xgb_auc)
print(f'XGBoost AUC: {xgb_auc:.6f}')

XGBoost AUC: 0.794269


In [65]:
# voting xgboost

voting_xgb = VotingModel(estimators=fitted_models_xgb)
scores_xgb_df = pd.DataFrame({'XGBoost': scores_xgb})

scores_xgb_df

Unnamed: 0,XGBoost
0,0.795939
1,0.790725
2,0.80197
3,0.786248
4,0.796814
5,0.794269


In [74]:
# stack

params_lgb1 = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 6,
    "learning_rate": 0.05,
    "n_estimators": 200,
    "colsample_bytree": 0.6, 
    "colsample_bynode": 0.6,
    "verbose": -1,
    "random_state": 42
}

params_lgb2 = {
    "boosting_type": "goss",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 6,
    "learning_rate": 0.05,
    "n_estimators": 200,
    "colsample_bytree": 0.6, 
    "colsample_bynode": 0.6,
    "verbose": -1,
    "random_state": 42
}

stack_lgb1 = lgb.LGBMClassifier(**params_lgb1)
stack_lgb2 = lgb.LGBMClassifier(**params_lgb2)

model_stack = VotingClassifier(estimators=[('lightgbm1', stack_lgb1), ('lightgbm2', stack_lgb2)], voting='soft')

fitted_models_stack = []
scores_stack = []
oof_preds_stack = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y):

    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]    


    model_stack.fit(X_train, y_train)

    pred_stack = model_stack.predict_proba(X_valid)[:, 1]
    score_stack = roc_auc_score(y_valid,pred_stack)
    
    scores_stack.append(score_stack)
    oof_preds_stack[idx_valid] = pred_stack
    fitted_models_stack.append(model_stack)

    clear_output()

stack_auc = roc_auc_score(y, oof_preds_stack)
scores_stack.append(stack_auc)
print(f'Stacking AUC: {stack_auc:.6f}')

Stacking AUC: 0.794032


In [83]:
# voting stacking

voting_stack = VotingModel(estimators=fitted_models_stack)
scores_stack_df = pd.DataFrame({'Stacking': scores_stack})

scores_stack_df

Unnamed: 0,Stacking
0,0.793443
1,0.791808
2,0.802477
3,0.786785
4,0.796023
5,0.794032


In [84]:
# summary

scores = pd.concat([scores_lgb1_df, scores_lgb2_df, scores_cat_df, scores_rf_df, scores_xgb_df, scores_stack_df], axis=1)
scores

Unnamed: 0,LightGBM gbdt,LightGBM goss,CatBoost,RandomForest,XGBoost,Stacking
0,0.794543,0.791113,0.794716,0.779917,0.795939,0.793443
1,0.791751,0.787646,0.790193,0.779129,0.790725,0.791808
2,0.803747,0.801405,0.806,0.789137,0.80197,0.802477
3,0.786213,0.785118,0.787449,0.776638,0.786248,0.786785
4,0.797728,0.793213,0.795917,0.780058,0.796814,0.796023
5,0.794733,0.791351,0.794788,0.7809,0.794269,0.794032


In [85]:
# weights

preds_catboost = voting_cat.predict_proba(X_valid)[:, 1]
preds_xgboost = voting_xgb.predict_proba(X_valid)[:, 1]
preds_lightgbm = voting_lgb1.predict_proba(X_valid)[:, 1]

final_preds = 0.4 * preds_catboost + 0.3 * preds_xgboost + 0.3 * preds_lightgbm

final_auc = roc_auc_score(y_valid, final_preds)
print(f'Final Ensemble AUC: {final_auc:.6f}')

Final Ensemble AUC: 0.848656


In [87]:
# Parameters

n_params_lgb1 = model_lgb1.booster_.num_trees()
n_params_lgb2 = model_lgb2.booster_.num_trees()
n_params_cat = model_cat.tree_count_  
n_params_rf = sum(estimator.tree_.node_count for estimator in model_rf.estimators_)
n_params_xgb = model_xgb.get_booster().best_ntree_limit
n_params_stack = int((n_params_lgb1 + n_params_lgb2) / 2)

n_param = {
    'LightGBM gbdt': n_params_lgb1,
    'LightGBM goss': n_params_lgb2,
    'CatBoost': n_params_cat,
    'RandomForest': n_params_rf,
    'XGBoost': n_params_xgb,
    'Stacking': n_params_stack
}

n_params = pd.DataFrame(n_param, index=['Parameters'])
n_params

Unnamed: 0,LightGBM gbdt,LightGBM goss,CatBoost,RandomForest,XGBoost,Stacking
Parameters,300,177,397,803748,183,238


In [88]:
# predict

X_test = df_test.drop(columns='not.fully.paid')
test_pred = voting_cat.predict_proba(X_test)[:, 1]

submit = pd.read_csv('sample_submission.csv', header=None)
submit[1] = test_pred

submit.to_csv('submission.csv', header=None, index=False)
print('Submission saved.')

Submission saved.


In [63]:
# predict weighted

X_test = df_test.drop(columns='not.fully.paid')

test_pred_cat = voting_cat.predict_proba(X_test)[:, 1]
test_pred_xgb = voting_xgb.predict_proba(X_test)[:, 1]
test_pred_lgb = voting_lgb1.predict_proba(X_test)[:, 1]

test_pred = 0.4 * test_pred_cat + 0.3 * test_pred_xgb + 0.3 * test_pred_lgb

submit = pd.read_csv('sample_submission.csv', header=None)
submit[1] = test_pred

submit.to_csv('submission.csv', header=None, index=False)
print('Submission saved.')

Submission saved.
