#Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#basic library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from scipy.stats import kurtosis
from scipy.stats import skew
import missingno as ms
import pickle

#Models
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn import model_selection, preprocessing, metrics
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV


#disable warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Define Functions

In [None]:
#Functions

#Split dataframe by data types
def split_dataframe(dataframe):
  """Split a dataframe into a categorical part and a continuous part.

  The first two columns (by position) are treated as identifier columns and
  are placed at the front of both outputs. Each remaining column is assigned
  to the categorical frame when its dtype is ``object`` or when every value
  is one of ``0``, ``1`` or the string ``'Nan'``; otherwise it is assigned to
  the continuous frame. Column order within each output follows the input.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Input frame whose first two columns are identifiers.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.DataFrame)
      ``(cat, cont)``. Both are independent copies, so they can be modified
      without touching ``dataframe``.
  """
  # Collect column positions first and select once at the end, instead of
  # growing two frames with pd.concat on every iteration.
  cat_positions = [0, 1]
  cont_positions = [0, 1]
  for position in range(2, dataframe.shape[1]):
    column = dataframe.iloc[:, position]
    if column.dtypes == 'object':
      cat_positions.append(position)
    # The literal string 'Nan' is matched here, not a real missing value, so a
    # 0/1 column that contains NaN ends up in the continuous frame.
    # NOTE(review): kept as-is so the resulting features stay unchanged.
    elif column.isin([0,1,'Nan']).all():
      cat_positions.append(position)
    else:
      cont_positions.append(position)
  cat = dataframe.iloc[:, cat_positions].copy()
  cont = dataframe.iloc[:, cont_positions].copy()
  return cat, cont

#Organize aggregation of datasets
def organize_agg(dataframe, stat, name):
  """Flatten two-level aggregated column labels to '<name>_<variable>_<statistic>'.

  The new names are built from the actual ``(variable, statistic)`` pairs of
  ``dataframe.columns``, in column order, so each label always describes the
  data stored under it regardless of how the MultiIndex levels are ordered.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame with two-level columns, e.g. the result of ``groupby(...).agg([...])``.
  stat : object
      Unused; kept so existing calls keep working. The statistic names are
      read from the second column level.
  name : str
      Prefix put in front of every new column name.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in, with its columns renamed in place.
  """
  dataframe.columns = ['%s_%s_%s' % (name, var, agg_name) for var, agg_name in dataframe.columns]
  return dataframe

#Number of missing values in the dataframe application
def missing_values(dataframe):
  """Summarise the missing values of each column.

  Prints how many columns contain at least one missing value and returns a
  per-column table sorted by the number of missing values, largest first.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame to inspect.

  Returns
  -------
  pandas.DataFrame
      Indexed by column name, with 'Count' (number of missing values) and
      '%' (missing values divided by the number of rows, i.e. a fraction
      between 0 and 1).
  """
  null_count = dataframe.isnull().sum()
  number_MV = pd.DataFrame({'Count' : null_count,
                            '%': null_count/len(dataframe)})
  # Count the columns directly instead of indexing a label-indexed Series by position.
  print(int((null_count > 0).sum()), 'Columns with missing values')
  return number_MV.sort_values('Count', ascending=False)


#Plot ROC AUC Curve
def plot_roc(y_test, y_predict):
    """Print the area under the ROC curve and draw the curve.

    Parameters
    ----------
    y_test : array-like
        True labels, passed to ``roc_curve`` as the first argument.
    y_predict : array-like
        Predicted scores, passed to ``roc_curve`` as the second argument.

    The curve is drawn on a new 10x8 figure together with a dashed red
    diagonal from (0, 0) to (1, 1). Nothing is returned.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_predict)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(roc_auc)

    curve_label = 'ROC curve (area = %0.2f)' % roc_auc
    plt.figure(figsize=(10, 8))
    plt.title("ROC curve")
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'r--')
    plt.legend(loc="lower right")

#Plot Feature Importances
def plot_feature_importances(model, feature_names=None):
    """Plot a model's feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the importances, in the model's feature order. When
        omitted, the names are taken from the model (``feature_name_`` or
        ``feature_names_in_``); as a last resort a notebook-level list named
        ``col`` is used with its first entry dropped.

    Returns
    -------
    pandas.DataFrame
        One 'Feature importance' column indexed by feature name, sorted in
        ascending order.

    Raises
    ------
    ValueError
        If no feature names are given and none can be found.
    """
    if feature_names is None:
        # Prefer the names stored on the fitted model over notebook globals.
        for attribute in ('feature_name_', 'feature_names_in_'):
            feature_names = getattr(model, attribute, None)
            if feature_names is not None:
                break
    if feature_names is None:
        # Legacy behaviour: a global `col` list whose first entry is skipped.
        legacy_columns = globals().get('col')
        if legacy_columns is None:
            raise ValueError('feature_names was not given and could not be read from the model')
        feature_names = legacy_columns[1:]

    fi_df = pd.DataFrame({'Feature importance': model.feature_importances_},
                         index=list(feature_names))
    fi_df = fi_df.sort_values('Feature importance', ascending=True)

    plt.figure(figsize=(20,20))
    ax = plt.subplot()
    ax.barh(list(fi_df.index), fi_df['Feature importance'])
    return fi_df

#Load Dataset

In [None]:
#Folder holding the competition CSV files, located under the Drive mount point /content/drive
path = '/content/drive/MyDrive/DS/CreditRisk/home-credit-default-risk/'

#Read every source table into memory
#application_train is only used further down to attach TARGET to some of the aggregates
application_train = pd.read_csv(path + 'application_train.csv')
bureau = pd.read_csv(path + 'bureau.csv')
bureau_balance = pd.read_csv(path + 'bureau_balance.csv')
pos_cash_balance = pd.read_csv(path + 'POS_CASH_balance.csv')
credit_card_balance = pd.read_csv(path + 'credit_card_balance.csv')
previous_application = pd.read_csv(path + 'previous_application.csv')
installments_payments = pd.read_csv(path + 'installments_payments.csv')
#Applications to be scored; the feature pipeline below starts from this table
test_application = pd.read_csv(path + 'application_test.csv')

#Pipeline

##Application

In [None]:
index_target = ['SK_ID_CURR']

numerical_features = ['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'FLOORSMAX_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'TOTALAREA_MODE',
 'LIVINGAREA_AVG',
 'APARTMENTS_AVG',
 'ENTRANCES_AVG',
 'ELEVATORS_AVG',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

#Columns with categorical data for the application test dataframe
categorial_features = ['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE']

In [None]:
#Take an explicit copy: the selected columns are modified in place below, and
#assigning into a plain selection of test_application raises SettingWithCopyWarning
df = test_application[index_target + numerical_features + categorial_features].copy()

In [None]:
time_features = ['DAYS_BIRTH',
                  'DAYS_EMPLOYED',
                  'DAYS_REGISTRATION',
                  'DAYS_ID_PUBLISH', 
                  'DAYS_LAST_PHONE_CHANGE']

In [None]:
#Convert the DAYS_* columns from (negative) day offsets to positive years
df[time_features] = df[time_features] / -365

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


##Bureau

In [None]:
selected_colums_bureau = ['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE', 'DAYS_CREDIT', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE']

Treatment of time-relative features

In [None]:
#Take an explicit copy: the following cells modify df_bureau in place, and
#assigning into a plain selection of bureau raises SettingWithCopyWarning
df_bureau = bureau[selected_colums_bureau].copy()

In [None]:
time_features = ['DAYS_CREDIT',
                 'DAYS_CREDIT_UPDATE']

In [None]:
#Convert the day offsets to years and flip their sign
df_bureau[time_features] = df_bureau[time_features] / -365

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
#After the sign flip above, values that are still negative are clamped to 0
for i in time_features:
  is_negative = df_bureau[i] < 0
  df_bureau.loc[is_negative, i] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [None]:
#Values above 100 (years, after the division by 365) are replaced by the -99999 marker
for i in time_features:
  is_too_large = df_bureau[i] > 100
  df_bureau.loc[is_too_large, i] = -99999

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Data aggregation

In [None]:
df_bureau = pd.get_dummies(df_bureau)

In [None]:
df_bureau = df_bureau.drop('SK_ID_BUREAU', axis=1).groupby('SK_ID_CURR').agg(['mean'])

In [None]:
df_bureau = organize_agg(df_bureau, 'mean', 'Bureau')

In [None]:
df_test = pd.merge(df, df_bureau, on='SK_ID_CURR', how='left', indicator=True)

In [None]:
#True for clients that have no row in the aggregated bureau table
df_test['_merge'] = (df_test['_merge'] == 'left_only')
df_test = df_test.rename(columns={'_merge': 'Bureau'})

##Bureau Balance


In [None]:
bureau_balance_en = pd.get_dummies(bureau_balance)

Afterwards, the encoded data are aggregated by taking the mean of each encoded feature for each bureau credit (`SK_ID_BUREAU`); the aggregation per client (`SK_ID_CURR`) follows further below.

In [None]:
#Names of the encoded columns, i.e. every column except MONTHS_BALANCE and SK_ID_BUREAU
#NOTE(review): `c` does not appear to be used by any later cell of this notebook
c = list(bureau_balance_en.columns)
c.remove('MONTHS_BALANCE')
c.remove('SK_ID_BUREAU' )

In [None]:
bb_mean = (bureau_balance_en.drop(['MONTHS_BALANCE'], axis=1).groupby('SK_ID_BUREAU', as_index=True).agg(['mean']))
bb_count = (bureau_balance_en[['SK_ID_BUREAU','MONTHS_BALANCE']].groupby('SK_ID_BUREAU', as_index=True).agg(['count']))

In [None]:
bb_mean = organize_agg(bb_mean, 'mean', 'bureau_balance')

In [None]:
bb_count = organize_agg(bb_count, 'count', 'bureau_balance')

In [None]:
bureau_balance_en = pd.merge(bb_mean, bb_count, on='SK_ID_BUREAU', how='left')

In [None]:
del bb_mean

In [None]:
del bb_count

Inclusion of the client ID (`SK_ID_CURR`) into the Bureau Balance dataset

In [None]:
#NOTE(review): only the key column of `bureau` is merged here, so no data columns are added;
#SK_ID_CURR is attached by a separate merge further down. Confirm whether this step is still needed.
bureau_balance_id = pd.merge(bureau_balance_en, bureau[['SK_ID_BUREAU']], on='SK_ID_BUREAU', how='left')

In [None]:
del bureau_balance_en

In [None]:
bureau_balance_id = pd.merge(bureau_balance_id, bureau[['SK_ID_CURR', 'SK_ID_BUREAU']], how = 'left', on='SK_ID_BUREAU')

In [None]:
bureau_balance_id = bureau_balance_id.dropna(axis=0,subset=['SK_ID_CURR']).groupby('SK_ID_CURR').agg(['mean'])

In [None]:
bureau_balance_id = organize_agg(bureau_balance_id, 'mean', 'Bureau_Balance')

In [None]:
bureau_balance_id.drop('Bureau_Balance_SK_ID_BUREAU_mean', axis=1, inplace=True)

Merge with the test dataset

In [None]:
df_test = pd.merge(df_test, bureau_balance_id, on='SK_ID_CURR', how='left', indicator=True)
#True for clients without any Bureau Balance row. The merge indicator value is
#spelled 'left_only' (with an underscore), as in the Bureau, Instalments and POS sections.
#NOTE(review): confirm the training pipeline builds this flag with the same spelling.
df_test['_merge'] = df_test['_merge']=='left_only'

In [None]:
df_test.rename({'_merge': 'Bureau_Balance'}, axis=1, inplace=True)

##Previous application

In [None]:
#Selected Features
selected_columns_categorical = ['CODE_REJECT_REASON', 'PRODUCT_COMBINATION', 'NAME_CONTRACT_STATUS']
selected_columns_numerical = ['DAYS_FIRST_DRAWING', 'DAYS_DECISION', 'RATE_DOWN_PAYMENT', 'AMT_ANNUITY', 'AMT_DOWN_PAYMENT', 'CNT_PAYMENT', 'DAYS_FIRST_DUE']
head_columns = ['SK_ID_CURR', 'SK_ID_PREV']

selected_columns = head_columns+selected_columns_categorical+selected_columns_numerical
pa_final = previous_application[selected_columns]

To prepare and aggregate the data, first, the features were divided according to their type, as numerical or categorical.

In [None]:
pa_categorical, pa_numerical = split_dataframe(pa_final)

Categorical data

In [None]:
pa_categorical = pd.get_dummies(pa_categorical)

pa_categorical_agg = pa_categorical.groupby('SK_ID_CURR', as_index=False).agg(['mean'])

In [None]:
pa_categorial_agg = organize_agg(pa_categorical_agg, 'mean', 'PA')

In [None]:
pa_categorial_agg.drop('PA_SK_ID_PREV_mean', axis=1, inplace=True)

Numerical data


Some of the features represent values relative to a timestamp and are counted as negative numbers of days. Positive values in these features therefore represent missing values. These features were converted to positive values (in years), so that negative values now represent missing values.

In [None]:
time_features = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_DECISION' ]

In [None]:
#Convert the day offsets to years and flip their sign
pa_numerical[time_features] = pa_numerical[time_features] / -365

In [None]:
#After the sign flip, negative values stand for missing data and get the -99999 marker
for i in time_features:
  is_negative = pa_numerical[i] < 0
  pa_numerical.loc[is_negative, i] = -99999

In [None]:
#Negative down-payment rates and amounts are clamped to 0
for i in ['RATE_DOWN_PAYMENT', 'AMT_DOWN_PAYMENT']:
  is_negative = pa_numerical[i] < 0
  pa_numerical.loc[is_negative, i] = 0

Aggregation of clients in the previous application dataset

In [None]:
pa_numerical = pa_numerical.groupby('SK_ID_CURR', as_index=False).agg(['mean'])

In [None]:
pa_numerical = organize_agg(pa_numerical, 'mean', 'PA')

In [None]:
pa_numerical.drop('PA_SK_ID_PREV_mean', axis=1, inplace=True)

In [None]:
pa_final = pd.merge(pa_numerical, pa_categorial_agg, on='SK_ID_CURR', how='left')

Merge with the test dataset

In [None]:
df_test = pd.merge(df_test, pa_final, on='SK_ID_CURR', how='left', indicator=True)
#True for clients without any previous application. The merge indicator value is
#spelled 'left_only' (with an underscore), as in the Bureau, Instalments and POS sections.
#NOTE(review): confirm the training pipeline builds this flag with the same spelling.
df_test['_merge'] = df_test['_merge']=='left_only'

In [None]:
df_test.rename({'_merge': 'PA'}, axis=1, inplace=True)

##Credit Card Balance

In [None]:
prev_cc_agg = credit_card_balance.drop('SK_ID_CURR', axis=1).groupby('SK_ID_PREV', as_index=True).agg(['max', 'min', 'mean'])

In [None]:
prev_cc_agg = organize_agg(prev_cc_agg, ['max', 'min', 'mean'], 'prev_cc_agg')

In [None]:
#Attach the client ID (SK_ID_CURR) to every per-credit aggregate
#NOTE(review): credit_card_balance may hold several rows per SK_ID_PREV; if so, this left merge
#repeats each aggregate once per row and weights the later per-client mean accordingly.
#Confirm this matches the training pipeline before changing it.
prev_cc_agg = pd.merge(prev_cc_agg, credit_card_balance[['SK_ID_PREV', 'SK_ID_CURR']], on='SK_ID_PREV', how='left')

In [None]:
prev_cc_agg = pd.merge(prev_cc_agg, application_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')

In [None]:
prev_cc_agg = prev_cc_agg.groupby('SK_ID_CURR', as_index=False).agg(['mean'])

In [None]:
prev_cc_agg = organize_agg(prev_cc_agg, 'mean', 'CC_')

In [None]:
cc_selected_feat = ['CC__prev_cc_agg_MONTHS_BALANCE_max_mean',
                    'CC__prev_cc_agg_AMT_BALANCE_max_mean',
                    'CC__prev_cc_agg_AMT_BALANCE_min_mean',
                    'CC__prev_cc_agg_AMT_BALANCE_mean_mean',
                    'CC__prev_cc_agg_AMT_RECIVABLE_max_mean',
                    'CC__prev_cc_agg_AMT_RECIVABLE_min_mean',
                    'CC__prev_cc_agg_AMT_RECIVABLE_mean_mean',
                    'CC__prev_cc_agg_CNT_DRAWINGS_CURRENT_max_mean',
                    'CC__prev_cc_agg_CNT_DRAWINGS_CURRENT_min_mean',
                    'CC__prev_cc_agg_CNT_DRAWINGS_CURRENT_mean_mean',
                    'CC__prev_cc_agg_AMT_INST_MIN_REGULARITY_max_mean',
                    'CC__prev_cc_agg_AMT_INST_MIN_REGULARITY_min_mean',
                    'CC__prev_cc_agg_AMT_INST_MIN_REGULARITY_mean_mean',
                    'CC__prev_cc_agg_AMT_DRAWINGS_CURRENT_max_mean',
                    'CC__prev_cc_agg_AMT_DRAWINGS_CURRENT_min_mean',
                    'CC__prev_cc_agg_AMT_DRAWINGS_CURRENT_mean_mean']

In [None]:
prev_cc_agg = prev_cc_agg[cc_selected_feat]

Merge with the test dataset

In [None]:
df_test = pd.merge(df_test, prev_cc_agg, on='SK_ID_CURR', how='left', indicator=True)
#True for clients without any credit card balance row. The merge indicator value is
#spelled 'left_only' (with an underscore), as in the Bureau, Instalments and POS sections.
#NOTE(review): confirm the training pipeline builds this flag with the same spelling.
df_test['_merge'] = df_test['_merge']=='left_only'

In [None]:
df_test.rename({'_merge': 'CC_Balance'}, axis=1, inplace=True)

##Installments


In [None]:
installments_payments_agg = installments_payments.groupby('SK_ID_PREV').agg(['min', 'max', 'mean'])

In [None]:
installments_payments_agg = organize_agg(installments_payments_agg,['min', 'max', 'mean'], 'installments')

In [None]:
installments_payments_agg.drop(['installments_SK_ID_CURR_min', 'installments_SK_ID_CURR_max'], inplace=True, axis=1)

In [None]:
installments_payments_agg.rename(columns={'installments_SK_ID_CURR_mean':'SK_ID_CURR'}, inplace=True)

In [None]:
installments_payments_agg = pd.merge(installments_payments_agg, application_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')

In [None]:
selected_columns_ip = ['SK_ID_CURR', 'installments_DAYS_ENTRY_PAYMENT_mean', 'installments_DAYS_INSTALMENT_mean', 'installments_NUM_INSTALMENT_NUMBER_mean', 'installments_AMT_INSTALMENT_mean']
#Take an explicit copy: the selected columns are rescaled in place below, and
#assigning into a plain column selection raises SettingWithCopyWarning
installments_payments_agg = installments_payments_agg[selected_columns_ip].copy()

In [None]:
time_features = [ 'installments_DAYS_ENTRY_PAYMENT_mean', 'installments_DAYS_INSTALMENT_mean' ]

In [None]:
#Convert the day offsets to years and flip their sign
installments_payments_agg[time_features] = installments_payments_agg[time_features] / -365

In [None]:
installments_payments_agg = installments_payments_agg.groupby('SK_ID_CURR', as_index=False).agg(['mean'])

Data aggregation

In [None]:
installments_payments_agg = organize_agg(installments_payments_agg, 'mean', 'Instalments')

Merge with the test dataset

In [None]:
df_test = df_test.merge(installments_payments_agg, on='SK_ID_CURR', how='left', indicator=True)
#True for clients that have no row in the aggregated instalments table
df_test['_merge'] = (df_test['_merge'] == 'left_only')
df_test = df_test.rename(columns={'_merge': 'Instalments'})

##POS Data

In [None]:
pos_agg = pos_cash_balance.drop('SK_ID_CURR', axis=1).groupby('SK_ID_PREV').agg(['min', 'max', 'mean'])

In [None]:
pos_agg = organize_agg(pos_agg,['min', 'max', 'mean'], 'pos')

In [None]:
pos_agg = pd.merge(pos_agg, pos_cash_balance[['SK_ID_PREV','SK_ID_CURR']], on='SK_ID_PREV', how='right')

In [None]:
pos_agg = pd.merge(pos_agg, application_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')

In [None]:
selected_columns_pos = ['SK_ID_CURR', 'pos_MONTHS_BALANCE_min', 'pos_CNT_INSTALMENT_min', 'pos_CNT_INSTALMENT_FUTURE_min', 'pos_SK_DPD_DEF_max']

In [None]:
#Take an explicit copy: the selected column is rescaled in place below, and
#assigning into a plain column selection raises SettingWithCopyWarning
pos_agg = pos_agg[selected_columns_pos].copy()

In [None]:
time_features = [ 'pos_MONTHS_BALANCE_min' ]

In [None]:
#Convert the month offsets to years and flip their sign
pos_agg[time_features] = pos_agg[time_features] / -12

In [None]:
pos_agg = pos_agg.groupby('SK_ID_CURR', as_index=False).agg(['min', 'max'])

In [None]:
pos_agg = organize_agg(pos_agg, ['min', 'max'], '')

Merge with the test dataset

In [None]:
df_test = df_test.merge(pos_agg, on='SK_ID_CURR', how='left', indicator=True)
#True for clients that have no row in the aggregated POS table
df_test['_merge'] = (df_test['_merge'] == 'left_only')
df_test = df_test.rename(columns={'_merge': 'POS'})

##Encoding dataset

In [None]:
to_encode = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE']

In [None]:
#One-hot encoded variant of the merged frame
#NOTE(review): df_en_one does not appear to be used by any later cell; the model input
#is built from df_test with integer category codes instead
df_en_one = pd.get_dummies(df_test, columns=['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE'])

In [None]:
df_not_en = df_test.copy()

In [None]:
#df_cat is a second name for the same object as df_test (no copy is made), so the
#encoding applied to df_cat below also changes df_test, from which the model input is built
df_cat = df_test

In [None]:
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48744 entries, 0 to 48743
Columns: 144 entries, SK_ID_CURR to POS
dtypes: bool(6), float64(126), int64(5), object(7)
memory usage: 52.0+ MB


In [None]:
#Replace each categorical column by its integer category code (missing values become -1)
#NOTE(review): the codes are derived from the categories present in this frame; confirm
#they line up with the encoding the model was trained with
for i in to_encode:
  df_cat[i] = df_cat[i].astype('category').cat.codes


##Load trained model

In [None]:
feature_name = [
 'SK_ID_CURR',
 'NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_2',
 'YEARS_BEGINEXPLUATATION_AVG',
 'ENTRANCES_AVG',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'FLOORSMAX_AVG',
 'TOTALAREA_MODE',
 'LIVINGAREA_AVG',
 'ELEVATORS_AVG',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'EXT_SOURCE_1',
 'Bureau_DAYS_CREDIT_mean',
 'Bureau_DAYS_CREDIT_UPDATE_mean',
 'Bureau_CREDIT_ACTIVE_Active_mean',
 'Bureau_CREDIT_ACTIVE_Bad debt_mean',
 'Bureau_CREDIT_ACTIVE_Closed_mean',
 'Bureau_CREDIT_ACTIVE_Sold_mean',
 'Bureau_CREDIT_TYPE_Another type of loan_mean',
 'Bureau_CREDIT_TYPE_Car loan_mean',
 'Bureau_CREDIT_TYPE_Cash loan (non-earmarked)_mean',
 'Bureau_CREDIT_TYPE_Consumer credit_mean',
 'Bureau_CREDIT_TYPE_Credit card_mean',
 'Bureau_CREDIT_TYPE_Interbank credit_mean',
 'Bureau_CREDIT_TYPE_Loan for business development_mean',
 'Bureau_CREDIT_TYPE_Loan for purchase of shares (margin lending)_mean',
 'Bureau_CREDIT_TYPE_Loan for the purchase of equipment_mean',
 'Bureau_CREDIT_TYPE_Loan for working capital replenishment_mean',
 'Bureau_CREDIT_TYPE_Microloan_mean',
 'Bureau_CREDIT_TYPE_Mobile operator loan_mean',
 'Bureau_CREDIT_TYPE_Mortgage_mean',
 'Bureau_CREDIT_TYPE_Real estate loan_mean',
 'Bureau_CREDIT_TYPE_Unknown type of loan_mean',
 'Bureau',
 'Bureau_Balance_bureau_balance_STATUS_0_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_1_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_2_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_3_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_4_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_5_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_C_mean_mean',
 'Bureau_Balance_bureau_balance_STATUS_X_mean_mean',
 'Bureau_Balance_bureau_balance_MONTHS_BALANCE_count_mean',
 'Bureau_Balance',
 'PA_DAYS_FIRST_DRAWING_mean',
 'PA_DAYS_DECISION_mean',
 'PA_RATE_DOWN_PAYMENT_mean',
 'PA_AMT_ANNUITY_mean',
 'PA_AMT_DOWN_PAYMENT_mean',
 'PA_CNT_PAYMENT_mean',
 'PA_DAYS_FIRST_DUE_mean',
 'PA_CODE_REJECT_REASON_CLIENT_mean',
 'PA_CODE_REJECT_REASON_HC_mean',
 'PA_CODE_REJECT_REASON_LIMIT_mean',
 'PA_CODE_REJECT_REASON_SCO_mean',
 'PA_CODE_REJECT_REASON_SCOFR_mean',
 'PA_CODE_REJECT_REASON_SYSTEM_mean',
 'PA_CODE_REJECT_REASON_VERIF_mean',
 'PA_CODE_REJECT_REASON_XAP_mean',
 'PA_CODE_REJECT_REASON_XNA_mean',
 'PA_PRODUCT_COMBINATION_Card Street_mean',
 'PA_PRODUCT_COMBINATION_Card X-Sell_mean',
 'PA_PRODUCT_COMBINATION_Cash_mean',
 'PA_PRODUCT_COMBINATION_Cash Street: high_mean',
 'PA_PRODUCT_COMBINATION_Cash Street: low_mean',
 'PA_PRODUCT_COMBINATION_Cash Street: middle_mean',
 'PA_PRODUCT_COMBINATION_Cash X-Sell: high_mean',
 'PA_PRODUCT_COMBINATION_Cash X-Sell: low_mean',
 'PA_PRODUCT_COMBINATION_Cash X-Sell: middle_mean',
 'PA_PRODUCT_COMBINATION_POS household with interest_mean',
 'PA_PRODUCT_COMBINATION_POS household without interest_mean',
 'PA_PRODUCT_COMBINATION_POS industry with interest_mean',
 'PA_PRODUCT_COMBINATION_POS industry without interest_mean',
 'PA_PRODUCT_COMBINATION_POS mobile with interest_mean',
 'PA_PRODUCT_COMBINATION_POS mobile without interest_mean',
 'PA_PRODUCT_COMBINATION_POS other with interest_mean',
 'PA_PRODUCT_COMBINATION_POS others without interest_mean',
 'PA_NAME_CONTRACT_STATUS_Approved_mean',
 'PA_NAME_CONTRACT_STATUS_Canceled_mean',
 'PA_NAME_CONTRACT_STATUS_Refused_mean',
 'PA_NAME_CONTRACT_STATUS_Unused offer_mean',
 'PA',
 'CC__prev_cc_agg_MONTHS_BALANCE_max_mean',
 'CC__prev_cc_agg_AMT_BALANCE_max_mean',
 'CC__prev_cc_agg_AMT_BALANCE_min_mean',
 'CC__prev_cc_agg_AMT_BALANCE_mean_mean',
 'CC__prev_cc_agg_AMT_RECIVABLE_max_mean',
 'CC__prev_cc_agg_AMT_RECIVABLE_min_mean',
 'CC__prev_cc_agg_AMT_RECIVABLE_mean_mean',
 'CC__prev_cc_agg_CNT_DRAWINGS_CURRENT_max_mean',
 'CC__prev_cc_agg_CNT_DRAWINGS_CURRENT_min_mean',
 'CC__prev_cc_agg_CNT_DRAWINGS_CURRENT_mean_mean',
 'CC__prev_cc_agg_AMT_INST_MIN_REGULARITY_max_mean',
 'CC__prev_cc_agg_AMT_INST_MIN_REGULARITY_min_mean',
 'CC__prev_cc_agg_AMT_INST_MIN_REGULARITY_mean_mean',
 'CC__prev_cc_agg_AMT_DRAWINGS_CURRENT_max_mean',
 'CC__prev_cc_agg_AMT_DRAWINGS_CURRENT_min_mean',
 'CC__prev_cc_agg_AMT_DRAWINGS_CURRENT_mean_mean',
 'CC_Balance',
 'Instalments_installments_DAYS_ENTRY_PAYMENT_mean_mean',
 'Instalments_installments_DAYS_INSTALMENT_mean_mean',
 'Instalments_installments_NUM_INSTALMENT_NUMBER_mean_mean',
 'Instalments_installments_AMT_INSTALMENT_mean_mean',
 'Instalments',
 '_pos_MONTHS_BALANCE_min_min',
 '_pos_MONTHS_BALANCE_min_max',
 '_pos_CNT_INSTALMENT_min_min',
 '_pos_CNT_INSTALMENT_min_max',
 '_pos_CNT_INSTALMENT_FUTURE_min_min',
 '_pos_CNT_INSTALMENT_FUTURE_min_max',
 '_pos_SK_DPD_DEF_max_min',
 '_pos_SK_DPD_DEF_max_max',
 'POS']

In [None]:
filename = path + 'model_lgbm.pkl'

In [None]:
#Load the trained classifier; the context manager closes the file handle after reading
#NOTE: pickle.load can execute arbitrary code, so only load model files from a trusted source
with open(filename, 'rb') as model_file:
  lgbmc = pickle.load(model_file)

In [None]:
df_test = df_test[feature_name]

In [None]:
x = df_test.iloc[:,1:]

In [None]:
sub = lgbmc.predict_proba(x)[:,1]

In [None]:
submission = pd.DataFrame( {'SK_ID_CURR': df_test['SK_ID_CURR'], 'TARGET': sub})

In [None]:
submission.to_csv(path+'submission.csv', index=False)

In [None]:
lgbmc

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               metric='auc', min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=10, min_split_gain=0.01, n_estimators=500,
               n_jobs=-1, num_leaves=10, objective='binary', random_state=16,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=0.5,
               subsample_for_bin=200000, subsample_freq=0)