In [11]:
import pandas as pd
import os
import gc
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Read the pickled train/test DataFrames
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.

DATA_PATH = '../HomeCreditDefaultRisk/Data'
train_df, test_df = (
    pd.read_pickle(os.path.join(DATA_PATH, pickle_name))
    for pickle_name in ('train_df.p', 'test_df.p')
)

In [13]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

def kfold_lightgbm(train_df, test_df, num_folds, stratified = False):
    """Train LightGBM with K-fold cross-validation and average the test predictions.

    One LGBMClassifier is fitted per fold with early stopping on the fold's
    validation AUC. Out-of-fold predictions are used to report per-fold and
    overall AUC; test predictions are averaged over all folds.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data. Must contain a 'TARGET' column; every column other than
        'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV' and 'index' is
        used as a feature.
    test_df : pandas.DataFrame
        Test data containing the same feature columns. It is modified in
        place: its 'TARGET' column is set to the averaged fold predictions.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool, default False
        If True, folds are built with StratifiedKFold on 'TARGET'; otherwise a
        plain shuffled KFold is used.

    Returns
    -------
    feature_importance_df : pandas.DataFrame
        Columns 'feature', 'importance' and 'fold' (1-based), one row per
        feature per fold.
    test_df : pandas.DataFrame
        The same object that was passed in, with 'TARGET' filled.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Honor the `stratified` flag (it used to be silently ignored).
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays to store out-of-fold and averaged test predictions
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    excluded_cols = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    feats = [f for f in train_df.columns if f not in excluded_cols]

    # Select the feature/target views once instead of re-slicing the full
    # frames several times inside every fold.
    train_features = train_df[feats]
    train_target = train_df['TARGET']
    test_features = test_df[feats]

    # Per-fold importance frames; concatenated once after the loop.
    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_features, train_target)):
        train_x, train_y = train_features.iloc[train_idx], train_target.iloc[train_idx]
        valid_x, valid_y = train_features.iloc[valid_idx], train_target.iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        # NOTE(review): `silent`, and the `verbose` / `early_stopping_rounds`
        # arguments of fit(), depend on the installed LightGBM version -
        # confirm against the version in use before upgrading.
        clf = LGBMClassifier(
            nthread=4,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            scale_pos_weight=1
            )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 200)

        # Predict with the best iteration found by early stopping
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        # Each fold contributes an equal share to the final test prediction
        sub_preds += clf.predict_proba(test_features, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        print("n_splits",folds.n_splits)

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # Free per-fold objects before the next fit
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)

    print('Full AUC score %.6f' % roc_auc_score(train_target, oof_preds))

    # Store the averaged predictions on the caller's test frame (in place)
    test_df['TARGET'] = sub_preds
    #test_df[['SK_ID_CURR', 'TARGET']].to_csv('submission.csv', index= False)

    #display_importances(feature_importance_df)
    return feature_importance_df, test_df

In [14]:
feature_importance_df, test_df = kfold_lightgbm(train_df, test_df, 5, stratified = False)

Starting LightGBM. Train shape: (99997, 706), test shape: (48744, 706)
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.784006	valid_1's auc: 0.751761
[400]	training's auc: 0.821585	valid_1's auc: 0.761441
[600]	training's auc: 0.843717	valid_1's auc: 0.762006
Early stopping, best iteration is:
[549]	training's auc: 0.839032	valid_1's auc: 0.762231
n_splits 5
Fold  1 AUC : 0.762231
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.784886	valid_1's auc: 0.753989
[400]	training's auc: 0.820878	valid_1's auc: 0.764212
[600]	training's auc: 0.843411	valid_1's auc: 0.765741
[800]	training's auc: 0.860546	valid_1's auc: 0.765986
Early stopping, best iteration is:
[745]	training's auc: 0.856134	valid_1's auc: 0.766015
n_splits 5
Fold  2 AUC : 0.766015
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.784174	valid_1's auc: 0.75658
[400]	training's auc: 0.819387	valid_1's auc: 0.76593

In [None]:
# CatBoost tutorial: minimal fit / predict example on toy data

# Toy data: column indices 0, 1 and 2 are declared categorical
cat_features = [0, 1, 2]
train_data = [
    ["a", "b", 1, 4, 5, 6],
    ["a", "b", 4, 5, 6, 7],
    ["c", "d", 30, 40, 50, 60],
]
train_labels = [1, 1, -1]
test_data = [
    ["a", "b", 2, 4, 6, 8],
    ["a", "d", 1, 4, 50, 60],
]

# Build the classifier
model = CatBoostClassifier(
    iterations=2,
    learning_rate=1,
    depth=10,
    loss_function='Logloss',
)
# Train on the toy data, passing the categorical column indices
model.fit(train_data, train_labels, cat_features=cat_features)
# Predicted classes
preds_class = model.predict(test_data)
# Predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Raw formula values (prediction_type='RawFormulaVal')
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')

In [15]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot a bar chart of the 40 features with the highest mean importance.

    `feature_importance_df_` must have 'feature' and 'importance' columns.
    Features are ranked by their mean importance; every row belonging to one
    of the top 40 features is then passed to the bar plot.

    NOTE(review): relies on `plt` and `sns` being in scope; neither is
    imported in the cells visible here - confirm they are imported elsewhere.
    """
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    ranked = mean_importance.sort_values(by="importance", ascending=False)
    top_features = ranked.iloc[:40].index

    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    plot_data = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    #plt.savefig('lgbm_importances01.png')