In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the read_csv calls
# further down load their archives from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
!pip install xverse
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from scipy.stats import randint
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score,auc, roc_curve, plot_confusion_matrix, classification_report

In [None]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    ``nan_as_category`` is forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns a tuple ``(encoded_frame, created_columns)`` where
    ``created_columns`` lists the column names that were not present in the
    input frame.
    """
    columns_before = list(df.columns)
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    created_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, created_columns

def missing_data(df):
  """Summarise missing values per column of ``df``.

  Returns a frame indexed by the column names of ``df`` with two columns:
  ``number_null`` (count of nulls) and ``percent`` (nulls as a percentage
  of the row count, rounded to two decimals).
  """
  n_rows = len(df)
  summary = df.isnull().sum().to_frame('number_null')
  summary['percent'] = ((summary['number_null'] / n_rows) * 100).round(2)
  return summary

def delete_null_over_p(df, p):
  """Delete, in place, every column whose null percentage is greater than ``p``.

  The percentages come from ``missing_data(df)`` and are computed once,
  before any column is removed. The same (mutated) frame is returned.
  """
  null_percent = missing_data(df)['percent']
  too_sparse = [name for name, share in null_percent.items() if share > p]
  for name in too_sparse:
    del df[name]
  return df

def corr(df, threshold=0.8):
    """Drop, in place, one column out of every highly correlated pair.

    The absolute correlation matrix of ``df`` is computed and only its strict
    upper triangle is inspected, so for each pair the column that appears
    later in ``df`` is the one removed.

    Parameters
    ----------
    df : DataFrame to prune; it is mutated in place.
    threshold : absolute correlation above which a column is dropped
        (default 0.8).

    Returns
    -------
    The same DataFrame object, after the columns have been removed.
    """
    corr_abs = df.corr().abs()
    # The builtin ``bool`` replaces the ``np.bool`` alias, which NumPy removed
    # in 1.24 (using it raises AttributeError there).
    upper_mask = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
    upper_tri = corr_abs.where(upper_mask)
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > threshold).any()]
    df.drop(columns=to_drop, inplace=True)
    return df

# Resampling
# Imbalanced data
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
def over_sampling(X, y):
  """Resample ``(X, y)`` with imblearn's RandomOverSampler (random_state=25).

  Returns the resampled ``(X, y)`` pair.
  """
  sampler = RandomOverSampler(random_state=25)
  X_resampled, y_resampled = sampler.fit_resample(X, y)
  return X_resampled, y_resampled
def under_sampling(X, y):
  """Resample ``(X, y)`` with imblearn's RandomUnderSampler.

  The sampler is seeded with 42 and draws with replacement.
  Returns the resampled ``(X, y)`` pair.
  """
  sampler = RandomUnderSampler(random_state=42, replacement=True)
  X_resampled, y_resampled = sampler.fit_resample(X, y)
  return X_resampled, y_resampled
def smothing(X, y, random_state=42):
  """Resample ``(X, y)`` with imblearn's SMOTE.

  Parameters
  ----------
  X, y : features and labels to resample.
  random_state : seed handed to SMOTE so that repeated runs yield the same
      synthetic samples. The other two samplers in this notebook are seeded,
      whereas SMOTE was previously left unseeded.

  Returns
  -------
  The resampled ``(X, y)`` pair.
  """
  smote = SMOTE(random_state=random_state)
  X_smote, y_smote = smote.fit_resample(X, y)
  return X_smote, y_smote

from sklearn.model_selection import train_test_split
def split_data(X, y, test_size=0.2, random_state=42):
    """Split ``(X, y)`` into training and validation parts.

    Parameters
    ----------
    X, y : features and labels.
    test_size : share of the rows placed in the validation part (default 0.2).
    random_state : seed for the shuffle (default 42).

    Returns
    -------
    ``X_train, X_val, y_train, y_val`` in that order.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_val, y_train, y_val

def metrics(model, X_val, y_val):
  """Print the validation ROC AUC and the classification report of ``model``.

  The AUC uses the second column of ``predict_proba``; the report uses the
  labels from ``predict``. Nothing is returned.
  """
  predicted_labels = model.predict(X_val)
  positive_scores = model.predict_proba(X_val)[:, 1]
  print("AUC", roc_auc_score(y_val, positive_scores))
  report = classification_report(y_val, predicted_labels, target_names=["Class 0", "Class 1"])
  print(report)
  return None

##AUC - ROC
def plot_AUC(y_val, ypred_prob):
    """Draw the ROC curve for the given labels and predicted scores.

    A 1x2 figure is created; only the left panel is drawn on (ROC curve,
    chance diagonal and an AUC legend), the right panel is left empty.
    """
    from sklearn import metrics as sk_metrics

    figure, panels = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    roc_ax = panels[0]

    false_pos_rate, true_pos_rate, _ = sk_metrics.roc_curve(y_val, ypred_prob)
    area = sk_metrics.auc(false_pos_rate, true_pos_rate)

    roc_ax.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % area)
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
    roc_ax.plot([0, 1], [0, 1], 'r--')
    roc_ax.set_title('Receiver Operating Characteristic ', fontsize=10)
    roc_ax.set_ylabel('True Positive Rate', fontsize=20)
    roc_ax.set_xlabel('False Positive Rate', fontsize=15)
    roc_ax.legend(loc='lower right', prop={'size': 16})
    plt.subplots_adjust(wspace=1)

In [None]:
# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    """Load the application train/test tables, stack them and add ratio features.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows`` for both files
        (``None`` reads everything).
    nan_as_category : forwarded to ``one_hot_encoder``.

    Returns
    -------
    One DataFrame holding the train rows followed by the test rows. The
    pre-stacking row labels are kept in an ``index`` column (from
    ``reset_index``). Presumably the test file has no TARGET column, so its
    rows carry NaN there - verify against the data.
    """
    # Read data and merge
    df = pd.read_csv('/content/drive/MyDrive/Essay/Data/application_train.csv.zip', nrows= num_rows)
    test_df = pd.read_csv('/content/drive/MyDrive/Essay/Data/application_test.csv.zip', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    df = pd.concat([df, test_df]).reset_index()
    del test_df
    gc.collect()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set).
    # .copy() makes the filtered frame independent so the column assignments
    # below do not write into a slice of the stacked frame.
    df = df[df['CODE_GENDER'] != 'XNA'].copy()

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature] = pd.factorize(df[bin_feature])[0]
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # DAYS_EMPLOYED uses 365243 as a placeholder -> NaN. Assigning the result
    # back (instead of a chained inplace replace) also works under pandas
    # copy-on-write, where the chained form leaves the frame unchanged.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    return df

def bureau_and_balance(num_rows = None, nan_as_category = True):
    """Aggregate bureau.csv, enriched with bureau_balance.csv, per SK_ID_CURR.

    Both tables are one-hot encoded. bureau_balance is summarised per
    SK_ID_BUREAU and attached to bureau, which is then aggregated per
    SK_ID_CURR three times: over all rows (``BURO_`` prefix), over rows with
    ``CREDIT_ACTIVE_Active == 1`` (``ACTIVE_``) and over rows with
    ``CREDIT_ACTIVE_Closed == 1`` (``CLOSED_``). The two subset aggregates
    are left-joined onto the first, which is returned indexed by SK_ID_CURR.

    ``num_rows`` is passed to ``pd.read_csv`` as ``nrows``;
    ``nan_as_category`` is forwarded to ``one_hot_encoder``.
    """
    def _flat_names(frame, prefix):
        # Collapse the (column, statistic) pairs produced by .agg into
        # flat labels of the form PREFIX + COLUMN + "_" + STATISTIC.
        return pd.Index([prefix + pair[0] + "_" + pair[1].upper() for pair in frame.columns.tolist()])

    bureau = pd.read_csv('/content/drive/MyDrive/Essay/Data/bureau.csv.zip', nrows = num_rows)
    bb = pd.read_csv('/content/drive/MyDrive/Essay/Data/bureau_balance.csv.zip', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: aggregate per SK_ID_BUREAU and attach to bureau
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    bb_aggregations.update({col: ['mean'] for col in bb_cat})
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = _flat_names(bb_agg, '')
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau = bureau.drop(columns=['SK_ID_BUREAU'])
    del bb, bb_agg
    gc.collect()

    # Numeric columns of bureau and of the attached bureau_balance summary
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # One-hot columns of bureau, then the per-bureau means of the
    # bureau_balance one-hot columns (named <column>_MEAN after the join)
    cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ['mean'] for cat in bb_cat})

    all_aggregations = dict(num_aggregations)
    all_aggregations.update(cat_aggregations)
    bureau_agg = bureau.groupby('SK_ID_CURR').agg(all_aggregations)
    bureau_agg.columns = _flat_names(bureau_agg, 'BURO_')

    # Active credits first, then closed credits - numeric aggregations only
    for flag_column, prefix in (('CREDIT_ACTIVE_Active', 'ACTIVE_'), ('CREDIT_ACTIVE_Closed', 'CLOSED_')):
        subset = bureau[bureau[flag_column] == 1]
        subset_agg = subset.groupby('SK_ID_CURR').agg(num_aggregations)
        subset_agg.columns = _flat_names(subset_agg, prefix)
        bureau_agg = bureau_agg.join(subset_agg, how='left', on='SK_ID_CURR')
        del subset, subset_agg
        gc.collect()
    del bureau
    gc.collect()
    return bureau_agg

def previous_applications(num_rows = None, nan_as_category = True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    The table is one-hot encoded, the 365243 placeholder in five DAYS_*
    columns is replaced by NaN and an ``APP_CREDIT_PERC`` ratio is added.
    It is then aggregated three times: over all rows (``PREV_`` prefix),
    over rows with ``NAME_CONTRACT_STATUS_Approved == 1`` (``APPROVED_``)
    and over rows with ``NAME_CONTRACT_STATUS_Refused == 1`` (``REFUSED_``);
    the two subset aggregates use the numeric aggregations only.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder``. It used to be
        ignored (the call hard-coded ``True``); the default keeps that
        behaviour.

    Returns
    -------
    DataFrame indexed by SK_ID_CURR.
    """
    prev = pd.read_csv('/content/drive/MyDrive/Essay/Data/previous_application.csv.zip', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)
    # Days 365243 values -> nan. Assigning the result back (rather than a
    # chained inplace replace) also works under pandas copy-on-write.
    for days_col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                     'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        prev[days_col] = prev[days_col].replace(365243, np.nan)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }

    # Previous applications categorical features
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg


# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder``. It used to be
        ignored (the call hard-coded ``True``); the default keeps that
        behaviour.

    Returns
    -------
    DataFrame indexed by SK_ID_CURR with ``POS_``-prefixed aggregates plus
    ``POS_COUNT``, the number of rows per SK_ID_CURR.
    """
    pos = pd.read_csv('/content/drive/MyDrive/Essay/Data/POS_CASH_balance.csv.zip', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    # Every one-hot column is summarised by its mean within the group
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg

# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Adds payment ratio/difference columns and the non-negative day counts
    ``DPD`` (entry-payment day minus instalment day) and ``DBD`` (the
    reverse) before aggregating.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder``. It used to be
        ignored (the call hard-coded ``True``); the default keeps that
        behaviour.

    Returns
    -------
    DataFrame indexed by SK_ID_CURR with ``INSTAL_``-prefixed aggregates
    plus ``INSTAL_COUNT``, the number of rows per SK_ID_CURR.
    """
    ins = pd.read_csv('/content/drive/MyDrive/Essay/Data/installments_payments.csv.zip', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values).
    # where(cond, 0) keeps strictly positive values and maps everything else,
    # NaN included, to 0 - the same result as the former row-wise
    # apply(lambda x: x if x > 0 else 0), without a Python call per row.
    days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_early = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_late.where(days_late > 0, 0)
    ins['DBD'] = days_early.where(days_early > 0, 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }

    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg


# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    After one-hot encoding and dropping SK_ID_PREV, every remaining column
    is aggregated with min, max, mean, sum and var.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder``. It used to be
        ignored (the call hard-coded ``True``); the default keeps that
        behaviour.

    Returns
    -------
    DataFrame indexed by SK_ID_CURR with ``CC_``-prefixed aggregates plus
    ``CC_COUNT``, the number of rows per SK_ID_CURR.
    """
    cc = pd.read_csv('/content/drive/MyDrive/Essay/Data/credit_card_balance.csv.zip', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)
    # General aggregations
    cc = cc.drop(columns=['SK_ID_PREV'])
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

In [None]:
# Build the application frame and one aggregate table per auxiliary source.
# num_rows=None reads every row of each file.
df = application_train_test(num_rows=None, nan_as_category=False)
bureau = bureau_and_balance(num_rows=None, nan_as_category=True)
prev = previous_applications(num_rows=None, nan_as_category=True)
pos = pos_cash(num_rows=None, nan_as_category=True)
ins = installments_payments(num_rows=None, nan_as_category=True)
cc = credit_card_balance(num_rows=None, nan_as_category=True)

In [None]:
# Left-join each aggregate table onto the application frame by SK_ID_CURR,
# in the order bureau, prev, pos, ins, cc.
for aggregate_table in (bureau, prev, pos, ins, cc):
    df = df.join(aggregate_table, how='left', on='SK_ID_CURR')
del aggregate_table

In [None]:
#
missing_data(df)
df = delete_null_over_p(df, p = 75)
corr(df)
for i in df.columns:
 df[i].fillna(df[i].mode()[0], inplace=True)
print(df)

In [None]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()

## Method 1: WOE - IV => Logistic

In [None]:
# Method 1 : WOE - IV => Logistic
from xverse.transformer import WOE
def WOE_IV():
  """Fit an xverse ``WOE`` transformer on the global ``df1``.

  Every column except TARGET is used as a feature and TARGET as the label.
  Returns the fitted transformer's ``iv_df`` table.
  """
  features = df1.drop(columns='TARGET')
  target = df1.TARGET
  woe = WOE()
  woe.fit(features, target)
  return woe.iv_df

def drop_iv(iv, p):
  """Keep the rows of ``iv`` whose ``Information_Value`` is greater than ``p``.

  Returns the filtered frame and its ``Variable_Name`` column.
  """
  strong_enough = iv['Information_Value'] > p
  kept = iv.loc[strong_enough]
  return kept, kept['Variable_Name']

def X_y(iv_1):
  """Build the WOE-encoded feature matrix for the variables listed in ``iv_1``.

  Parameters
  ----------
  iv_1 : frame with a ``Variable_Name`` column naming the columns of the
      global ``df1`` to keep.

  Returns
  -------
  ``(X, y)`` where ``X`` is the WOE-transformed selection and ``y`` is
  ``df1.TARGET``.
  """
  selected_columns = list(iv_1['Variable_Name'])
  X = df1[selected_columns]
  y = df1.TARGET
  clf = WOE()
  # The transformer has to learn its bins before it can transform; the
  # previous code called transform() on a brand-new, never-fitted WOE().
  # NOTE(review): the bins are fitted on all of df1, i.e. before the
  # train/validation split - consider fitting on the training part only.
  clf.fit(X, y)
  X = clf.transform(X)
  return X, y

## Compare the plain logistic model with the models trained on resampled data
def log(X_train,y_train,X_val, y_val):
  """Fit a default LogisticRegression and print its validation metrics.

  Parameters
  ----------
  X_train, y_train : data the model is fitted on.
  X_val, y_val : data passed to ``metrics`` for the printed report.

  Returns
  -------
  The fitted LogisticRegression estimator.
  """
  # Named ``classifier`` so the local no longer shadows this function's own
  # name; the former extra predict() call is gone because its result was
  # never used (``metrics`` makes its own predictions).
  classifier = LogisticRegression()
  classifier.fit(X_train, y_train)
  metrics(classifier, X_val, y_val)
  return classifier


In [None]:
iv = WOE_IV()
# Keep the variables whose information value exceeds 0.02. The second return
# value gets its own name; it was previously unpacked into
# iv_1['Variable_Name'], i.e. written back into a filtered slice of `iv`.
iv_1, selected_variables = drop_iv(iv, 0.02)
X, y = X_y(iv_1)
X_train, X_val, y_train, y_val = split_data(X,y)
#Sampling
X_ros, y_ros = over_sampling(X_train, y_train)
X_rus, y_rus = under_sampling(X_train, y_train)
X_smooth, y_smooth = smothing(X_train, y_train)
# Baseline: logistic regression on the training split as-is
log(X_train,y_train,X_val, y_val)
# Same model fitted on each resampled training set, always scored on the
# untouched validation split
print("Oversampling:")
model1 = log(X_ros, y_ros, X_val, y_val)
print("\nUndersampling:")
model2 = log(X_rus, y_rus, X_val, y_val)
print("\nSmote: ")
model3 = log(X_smooth, y_smooth, X_val, y_val)

## Method 2: Machine Learning

In [None]:
from sklearn.preprocessing import StandardScaler
def scale(X_train, X_val):
    """Standardise both splits with a StandardScaler fitted on ``X_train`` only.

    Returns the transformed ``(X_train, X_val)`` pair; the inputs are not
    modified, so the caller has to use the returned values.
    """
    standardizer = StandardScaler().fit(X_train)
    train_scaled = standardizer.transform(X_train)
    val_scaled = standardizer.transform(X_val)
    return train_scaled, val_scaled

from sklearn.preprocessing import MinMaxScaler
def scale_minmax(X_train, X_val):
  """Rescale both splits with a MinMaxScaler fitted on ``X_train`` only.

  Returns the transformed ``(X_train, X_val)`` pair; the inputs are not
  modified, so the caller has to use the returned values.
  """
  rescaler = MinMaxScaler().fit(X_train)
  train_scaled = rescaler.transform(X_train)
  val_scaled = rescaler.transform(X_val)
  return train_scaled, val_scaled

## Logistic Regression.
def log(X_train,y_train,X_val, y_val):
  """Fit a default LogisticRegression and print its validation metrics.

  NOTE(review): a function with this exact name and signature is also
  defined in the Method 1 cell; whichever cell runs last wins.

  Parameters
  ----------
  X_train, y_train : data the model is fitted on.
  X_val, y_val : data passed to ``metrics`` for the printed report.

  Returns
  -------
  The fitted LogisticRegression estimator.
  """
  # Named ``classifier`` so the local no longer shadows this function's own
  # name; the former extra predict() call is gone because its result was
  # never used (``metrics`` makes its own predictions).
  classifier = LogisticRegression()
  classifier.fit(X_train, y_train)
  metrics(classifier, X_val, y_val)
  return classifier

## Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

def model_random_forest(X_train, y_train, X_val, y_val):
  """Fit a RandomForestClassifier and print its validation metrics.

  The forest uses the gini criterion, ``random_state=25`` and
  ``min_samples_leaf=5``.

  Parameters
  ----------
  X_train, y_train : data the model is fitted on.
  X_val, y_val : data passed to ``metrics`` for the printed report.

  Returns
  -------
  The fitted RandomForestClassifier.
  """
  clf = RandomForestClassifier(criterion="gini",
                               random_state=25,
                               min_samples_leaf=5)
  clf.fit(X_train, y_train)
  # No separate predict() here: ``metrics`` makes its own predictions and the
  # former extra pass over X_val was never used.
  metrics(clf, X_val, y_val)
  return clf

#lightgbm
def model_LGBM(X_train, y_train, X_val, y_val):
  """Fit a GOSS-boosted LGBMClassifier with fixed settings and print its metrics.

  No evaluation set is passed to ``fit``, so all 10000 estimators are
  requested. Returns the fitted classifier.
  """
  fixed_params = dict(
      boosting_type='goss',
      nthread=4,
      n_estimators=10000,
      learning_rate=0.005134,
      num_leaves=54,
      colsample_bytree=0.508716,
      subsample=1,
      max_depth=10,
      reg_alpha=0.436193,
      reg_lambda=0.479169,
      min_split_gain=0.024766,
      min_child_weight=40,
      silent=-1,
      verbose=-1,
      is_unbalance=False,
  )
  booster = lgb.LGBMClassifier(**fixed_params)
  fitted = booster.fit(X_train, y_train)
  metrics(fitted, X_val, y_val)
  return fitted

#Hyperparameters tuning with randomsearchcv
def randomsearchcv_lgbm(X_train, y_train, X_val, y_val, n_iter=10, scoring='accuracy', random_state=42):
  """Tune an LGBMClassifier with RandomizedSearchCV and print validation metrics.

  Parameters
  ----------
  X_train, y_train : data used for the 5-fold randomized search.
  X_val, y_val : data passed to ``metrics`` for the printed report.
  n_iter : number of parameter settings sampled (10 is the scikit-learn
      default that was used implicitly before).
  scoring : model-selection criterion (default ``'accuracy'``, as before).
      NOTE(review): the printed report focuses on AUC - consider 'roc_auc'.
  random_state : seed for the parameter sampling, so reruns are repeatable.

  Returns
  -------
  The fitted RandomizedSearchCV object.
  """
  clf = lgb.LGBMClassifier()
  param_dist = {
      # LightGBM requires learning_rate > 0; the grid used to start at 0,
      # which made every candidate drawing that value fail to fit.
      "learning_rate": np.linspace(0.01, 0.2, 5),
      "max_depth": randint(3, 10),
      # min_gain_to_split is a LightGBM alias of min_split_gain; both were
      # listed at once, so only the canonical name is kept, with the integer
      # range the alias had.
      "min_split_gain": randint(1, 10),
      # Canonical scikit-learn names for num_iterations / min_data_in_leaf.
      "n_estimators": randint(100, 10000),
      "min_child_samples": randint(3, 10),
      "max_bin": randint(10, 100),
  }

  model = RandomizedSearchCV(clf, param_dist, n_iter=n_iter, scoring=scoring,
                             cv=5, random_state=random_state)
  model.fit(X_train, y_train)
  # The search object predicts through its refitted best estimator.
  metrics(model, X_val, y_val)
  return model

In [None]:
# Replace infinities and NaNs with 0
df2 = df2.replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)
#Split
y = df2.TARGET
X = df2.drop('TARGET', axis = 1)
X_train, X_val, y_train, y_val = split_data(X,y)
# scale() returns new arrays and leaves its inputs untouched; the result was
# previously discarded, so the models below were trained on unscaled data.
X_train, X_val = scale(X_train, X_val)

#Sampling
X_ros, y_ros = over_sampling(X_train, y_train)
X_rus, y_rus = under_sampling(X_train, y_train)
X_smooth, y_smooth = smothing(X_train, y_train)

In [None]:
# Each model family is fitted on the plain training split and on the three
# resampled variants; every run is scored on the same validation split.
#Logistic
print("Logistic:")
log(X_train,y_train,X_val, y_val)
print("Oversampling_log:")
model1 = log(X_ros, y_ros, X_val, y_val)
print("\nUndersampling_log:")
model2 = log(X_rus, y_rus, X_val, y_val)
print("\nSmote_log: ")
model3 = log(X_smooth, y_smooth, X_val, y_val)

#Random Forest
print("\nRandom Forest:")
# `rf` was never defined (NameError); the random-forest helper is model_random_forest.
model_random_forest(X_train,y_train,X_val, y_val)
print("\nOversampling_rf:")
model1 = model_random_forest(X_ros, y_ros, X_val, y_val)
print("\nUndersampling_rf:")
model2 = model_random_forest(X_rus, y_rus, X_val, y_val)
print("\nSmote_rf: ")
model3 = model_random_forest(X_smooth, y_smooth, X_val, y_val)

#LightGBM
print("\nLightGBM:")
model_LGBM(X_train,y_train,X_val, y_val)
print("\nOversampling LightGBM:")
model1 = model_LGBM(X_ros, y_ros, X_val, y_val)
print("\nUndersampling LightGBM:")
model2 = model_LGBM(X_rus, y_rus, X_val, y_val)
print("\nSmote LightGBM: ")
model3 = model_LGBM(X_smooth, y_smooth, X_val, y_val)

# LightGBM with randomized hyperparameter search
print('\nHyperparameter with LightGBM:')
randomsearchcv_lgbm(X_train,y_train,X_val, y_val)
print("\nOversampling Hyper LightGBM:")
model1 = randomsearchcv_lgbm(X_ros, y_ros, X_val, y_val)
print("\nUndersampling Hyper LightGBM:")
model2 = randomsearchcv_lgbm(X_rus, y_rus, X_val, y_val)
print("\nSmote Hyper LightGBM: ")
model3 = randomsearchcv_lgbm(X_smooth, y_smooth, X_val, y_val)

## Method 3: K-Fold => LightGBM

In [None]:
def kfold_lightgbm(df3, num_folds, stratified = False, debug= False):
    """Train LightGBM with K-fold cross-validation and print out-of-fold AUC.

    Rows of ``df3`` with a non-null TARGET form the training set; rows with
    a null TARGET form the test set, whose predictions are averaged over the
    folds.

    Parameters
    ----------
    df3 : DataFrame with the feature columns and a TARGET column.
    num_folds : number of cross-validation splits.
    stratified : use StratifiedKFold instead of KFold when True.
    debug : accepted for backward compatibility; not used.

    Returns
    -------
    DataFrame with one row per (feature, fold) and the columns ``feature``,
    ``importance`` and ``fold``. (The function used to build this table and
    then return None.)
    """
    # Divide in training/validation and test data. The parameter is used
    # here; the body previously read a name `df` that a later `del df` turned
    # into an unbound local, so the function failed on its first line.
    train_df = df3[df3['TARGET'].notnull()]
    test_df = df3[df3['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    # NOTE(review): sub_preds is accumulated but neither returned nor saved.
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            verbose=-1, )

        # Early stopping and periodic logging go through callbacks: the
        # `early_stopping_rounds` / `verbose` arguments of fit() were removed
        # in LightGBM 4.0.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc',
            callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=200)])

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Skip test-set scoring when there are no unlabelled rows.
        if len(test_df) > 0:
            sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # One concat at the end instead of growing a frame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)
    return feature_importance_df

# Run the K-fold LightGBM routine on the merged frame `df` with 10 non-stratified folds.
feat_importance = kfold_lightgbm(df, num_folds= 10, stratified= False)