## Library Imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
pd.options.display.float_format = '{:.2f}'.format

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from typing import List, Tuple, Optional

from sklearn.pipeline import make_pipeline
import missingno as msno

In [2]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """Aggregate ``data`` per ``groupby_id`` and flatten the result columns.

    Parameters
    ----------
    data : pd.DataFrame
        Table to aggregate.
    groupby_id : str
        Column to group by; it comes back as a regular column of the result.
    aggs : dict
        Mapping ``column -> list of aggregation names`` handed to
        ``DataFrameGroupBy.agg``. The values must be lists so that the
        aggregated columns form (feature, stat) pairs.
    prefix, suffix : str, optional
        Text put before / after every generated column name. ``None`` (or an
        empty string) adds nothing.

    Returns
    -------
    pd.DataFrame
        One row per group, with ``groupby_id`` followed by columns named
        ``{prefix}{feature}_{stat}{suffix}`` in upper case.
    """
    head = prefix or ""
    tail = suffix or ""

    stats = data.groupby(groupby_id).agg(aggs)
    # Each aggregated column label is a (feature, stat) pair; collapse it
    # into a single upper-cased string.
    flat_names = []
    for feature, stat in stats.columns:
        flat_names.append(f"{head}{feature}_{stat}{tail}".upper())
    stats.columns = flat_names

    return stats.reset_index()

In [3]:
# Load data: every raw table is read from the local data/ folder.
# Application-level tables that the features are built on.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
# Auxiliary tables that are joined / aggregated onto the applications later.
client_profile_data = pd.read_csv('data/client_profile.csv')
applications_history_data = pd.read_csv('data/applications_history.csv')
bki_data = pd.read_csv('data/bki.csv')
payments_data = pd.read_csv('data/payments.csv')

In [4]:
# Attach the client profile to every application. The left join keeps all
# application rows, and the indicator column records whether a profile row
# was found ('both') or not ('left_only').
profile_merge_options = dict(how='left', on='APPLICATION_NUMBER', indicator='_MERGE_PROFILE')
df_train = train_data.merge(client_profile_data, **profile_merge_options)
df_test = test_data.merge(client_profile_data, **profile_merge_options)

### Data cleaning

##### Drop Duplicate Observations

In [5]:
# Count fully duplicated rows in each merged frame.
train_duplicates = df_train.duplicated().sum()
test_duplicates = df_test.duplicated().sum()
print("Number of duplicates on the Train dataset:", train_duplicates)
print("Number of duplicates on the Test dataset:", test_duplicates)

Number of duplicates on the Train dataset: 0
Number of duplicates on the Test dataset: 0


##### Missing Observations

In [6]:
# Features for which an explicit "value is missing" indicator is created.
flag_missing_columns = ['OWN_CAR_AGE', 'EXTERNAL_SCORING_RATING_1', 'EXTERNAL_SCORING_RATING_2',
                        'EXTERNAL_SCORING_RATING_3', 'AMT_REQ_CREDIT_BUREAU_MON']

for frame in (df_train, df_test):
    # A value only counts as missing when the application was matched to a
    # client profile; rows without a profile are NaN in every profile column.
    has_profile = frame['_MERGE_PROFILE'] == 'both'
    for column in flag_missing_columns:
        frame['MISSING_' + column] = (has_profile & frame[column].isna()).astype('int')

    # Flag DAYS_ON_LAST_JOB > 350000
    # (presumably a sentinel for "unknown" -- TODO confirm against the data dictionary)
    frame['MISSING_DAYS_ON_LAST_JOB'] = (frame['DAYS_ON_LAST_JOB'] > 350000).astype('int')

In [7]:
# Fill missing values in numeric features with the median.
#
# Review fixes:
#  * The previous first statement reset MISSING_OWN_CAR_AGE to 0 on the train
#    frame only, which erased the flag in train while test kept it. It has been
#    removed so both frames carry the same feature.
#    NOTE(review): if the intent was to set OWN_CAR_AGE itself to 0 for those
#    rows, do that explicitly for BOTH frames.
#  * Medians are computed once, on train, and reused for test so that both
#    frames are imputed with the same value.
#  * The result of fillna is assigned back. `df[col].fillna(..., inplace=True)`
#    acts on a temporary Series and does not update the frame under pandas
#    Copy-on-Write.
numeric_columns = df_train.drop(columns=['TARGET']).select_dtypes(include='number').columns
train_medians = df_train[numeric_columns].median()
for column in numeric_columns:
    df_train[column] = df_train[column].fillna(train_medians[column])
    df_test[column] = df_test[column].fillna(train_medians[column])

In [8]:
# Fill missing values in categorical (object-dtype) features with the
# literal label "Missing".
#
# Review fixes:
#  * The column list gets its own name instead of overwriting
#    `numeric_columns` with non-numeric columns.
#  * The result of fillna is assigned back. `df[col].fillna(..., inplace=True)`
#    acts on a temporary Series and does not update the frame under pandas
#    Copy-on-Write.
categorical_columns = df_train.select_dtypes(include='object').columns
for column in categorical_columns:
    df_train[column] = df_train[column].fillna("Missing")
    df_test[column] = df_test[column].fillna("Missing")

##### Process categorical features

In [9]:
# Separate the label from the predictors; the shape is displayed as a sanity check.
target = df_train['TARGET']
features = df_train.drop(columns='TARGET')
features.shape

(110093, 32)

In [10]:
# Names of every non-numeric predictor column (these are one-hot encoded later).
cat_features = features.select_dtypes(exclude=['number']).columns

In [11]:
# Map the placeholder labels 'XNA' (GENDER) and 'Unknown' (FAMILY_STATUS) to
# the common "Missing" category.
# The replaced Series is assigned back: `df.COL.replace(..., inplace=True)`
# acts on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
for frame in (df_train, df_test):
    frame['GENDER'] = frame['GENDER'].replace('XNA', 'Missing')
    frame['FAMILY_STATUS'] = frame['FAMILY_STATUS'].replace('Unknown', 'Missing')

In [12]:
# One-hot encode the categorical predictors.
# `columns=` is passed by keyword: the second positional parameter of
# pd.get_dummies is `prefix`, so the previous positional call used
# cat_features as a prefix list and only worked when its length happened to
# equal the number of auto-detected categorical columns.
# NOTE(review): train and test are encoded independently; a category present
# in only one of them produces a column the other frame lacks. Later cells
# index df_test by df_train's column names, so verify the two stay aligned.
df_train = pd.get_dummies(df_train, columns=cat_features)
df_test = pd.get_dummies(df_test, columns=cat_features)

##### Flag outliers in numeric features

In [13]:
# Flag and clip outliers for 'TOTAL_SALARY', 'AMOUNT_CREDIT', 'AMOUNT_ANNUITY'.
#
# The 1st / 99th percentile bounds are estimated once, on train, and applied
# unchanged to test. Previously test used bounds computed from test itself,
# so the same raw value could be flagged and clipped differently in the two
# frames.
outliers_columns = ['TOTAL_SALARY', 'AMOUNT_CREDIT', 'AMOUNT_ANNUITY']

for column in outliers_columns:
    # A single percentile call returns both bounds.
    left_bound, right_bound = np.percentile(df_train[column], q=[1, 99])
    for frame in (df_train, df_test):
        # The flag is computed BEFORE clipping; afterwards no value lies outside the bounds.
        frame['OUTLIER_' + column] = ((frame[column] < left_bound) | (frame[column] > right_bound)).astype('int')
        frame[column] = np.clip(frame[column], left_bound, right_bound)

##### Process numeric features

In [14]:
# Turn CHILDRENS and FAMILY_SIZE into discrete indicator features:
# one 0/1 column for each of the exact values 0, 1, 2 and one for "3 or more".
# The source columns themselves are kept in the frames.
count_features = {'CHILDRENS': 'CHILDREN', 'FAMILY_SIZE': 'FAMILY_SIZE'}

for frame in (df_train, df_test):
    for source, stem in count_features.items():
        for value in (0, 1, 2):
            frame[f'{stem}_{value}'] = (frame[source] == value).astype('int')
        frame[f'{stem}_3+'] = (frame[source] >= 3).astype('int')

##### Generate new PROFILE metrics

In [15]:
# Generate financial metrics: ratios between the requested credit, its
# annuity and the client's salary, plus the salary left after the annuity.
for frame in (df_train, df_test):
    credit = frame['AMOUNT_CREDIT']
    annuity = frame['AMOUNT_ANNUITY']
    salary = frame['TOTAL_SALARY']

    frame['AMOUNT_CREDIT_to_AMOUNT_ANNUITY'] = credit / annuity
    frame['AMOUNT_CREDIT_to_TOTAL_SALARY'] = credit / salary
    frame['AMOUNT_ANNUITY_to_TOTAL_SALARY'] = annuity / salary
    # NOTE(review): the column name repeats TOTAL_SALARY, but the value is
    # TOTAL_SALARY minus AMOUNT_ANNUITY. The name is left unchanged because it
    # ends up in the exported CSV.
    frame['TOTAL_SALARY_and_TOTAL_SALARY_diff'] = salary - annuity

In [16]:
# Generate scoring metrics: row-wise mean / nanmedian / min / max over the
# three external scoring ratings.
# The NumPy reducer is looked up with getattr instead of eval(): same
# function object, without evaluating a dynamically built code string.
rating_columns = ["EXTERNAL_SCORING_RATING_1", "EXTERNAL_SCORING_RATING_2", "EXTERNAL_SCORING_RATING_3"]

for function_name in ["mean", "nanmedian", 'min', 'max']:
    feature_name = "EXTERNAL_SCORING_RATINGS_{}".format(function_name)
    reducer = getattr(np, function_name)
    for frame in (df_train, df_test):
        frame[feature_name] = reducer(frame[rating_columns], axis=1)

##### Generate new applications_history_data metrics

In [17]:
# Per-client summary of previously REFUSED applications: how many there were,
# plus mean/min/max of the requested amount and of DAYS_DECISION.
mask_refused = applications_history_data["NAME_CONTRACT_STATUS"].eq("Refused")
aggs_refused = {
    'PREV_APPLICATION_NUMBER': ['count'],
    'AMT_APPLICATION': ['mean', 'min', 'max'],
    'DAYS_DECISION': ['mean', 'min', 'max']
}
stats_refused = create_numerical_aggs(
    applications_history_data.loc[mask_refused],
    groupby_id="APPLICATION_NUMBER",
    aggs=aggs_refused,
    prefix="PREV_REFUSED_",
)

# Left joins: applications without refused history get NaN in the new columns.
df_train = df_train.merge(stats_refused, how='left', on='APPLICATION_NUMBER')
df_test = df_test.merge(stats_refused, how='left', on='APPLICATION_NUMBER')

In [18]:
# Per-client summary of previously APPROVED applications: how many there
# were, and the summed credit and annuity amounts.
mask_approved = applications_history_data["NAME_CONTRACT_STATUS"].eq("Approved")
aggs_approved = {
    'PREV_APPLICATION_NUMBER': ['count'],
    'AMOUNT_CREDIT': ['sum'],
    'AMOUNT_ANNUITY': ['sum'],
}
stats_approved = create_numerical_aggs(
    applications_history_data.loc[mask_approved],
    groupby_id="APPLICATION_NUMBER",
    aggs=aggs_approved,
    prefix="PREV_APPROVED_",
)

# Left joins: applications without approved history get NaN in the new columns.
df_train = df_train.merge(stats_approved, how='left', on='APPLICATION_NUMBER')
df_test = df_test.merge(stats_approved, how='left', on='APPLICATION_NUMBER')


##### Generate new bki metrics

In [19]:
# Per-client summary over ALL credit-bureau (BKI) records: number of records
# and the min/max of the overdue / prolongation columns.
aggs = {
    'APPLICATION_NUMBER': ['count'],
    'CREDIT_DAY_OVERDUE': ['min', 'max'],
    'AMT_CREDIT_MAX_OVERDUE': ['min', 'max'],
    'CNT_CREDIT_PROLONG': ['min', 'max'],
    'AMT_CREDIT_SUM_OVERDUE': ['min', 'max'],
}
stats = create_numerical_aggs(
    bki_data,
    groupby_id="APPLICATION_NUMBER",
    aggs=aggs,
    prefix="BKI_",
)

# Left joins: applications without bureau records get NaN in the new columns.
df_train = df_train.merge(stats, how='left', on='APPLICATION_NUMBER')
df_test = df_test.merge(stats, how='left', on='APPLICATION_NUMBER')

In [20]:
# Per-client totals over bureau records whose CREDIT_ACTIVE is 'Active':
# record count and summed credit, annuity, debt and overdue amounts.
mask_active = bki_data['CREDIT_ACTIVE'].eq('Active')
aggs_active = {
    'APPLICATION_NUMBER': ['count'],
    'AMT_CREDIT_SUM': ['sum'],
    'AMT_ANNUITY': ['sum'],
    'AMT_CREDIT_SUM_DEBT': ['sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['sum'],
}
stats_active = create_numerical_aggs(
    bki_data.loc[mask_active],
    groupby_id="APPLICATION_NUMBER",
    aggs=aggs_active,
    prefix="BKI_ACTIVE_",
)

# Left joins: applications without active bureau records get NaN in the new columns.
df_train = df_train.merge(stats_active, how='left', on='APPLICATION_NUMBER')
df_test = df_test.merge(stats_active, how='left', on='APPLICATION_NUMBER')

In [21]:
# Per-client number of bureau records whose CREDIT_ACTIVE is 'Closed'.
mask_closed = bki_data['CREDIT_ACTIVE'].eq('Closed')
aggs_closed = {
    'APPLICATION_NUMBER': ['count'],
}
stats_closed = create_numerical_aggs(
    bki_data.loc[mask_closed],
    groupby_id="APPLICATION_NUMBER",
    aggs=aggs_closed,
    prefix="BKI_CLOSED_",
)

# Left joins: applications without closed bureau records get NaN in the new column.
df_train = df_train.merge(stats_closed, how='left', on='APPLICATION_NUMBER')
df_test = df_test.merge(stats_closed, how='left', on='APPLICATION_NUMBER')

In [22]:
# Combine the active-bureau totals with the new application's amounts and the
# client's salary. Every feature is computed identically for train and test.
# Rows without active bureau records carry NaN in the BKI_ACTIVE_* inputs, so
# the derived values are NaN for them as well.
for frame in (df_train, df_test):
    active_credit = frame['BKI_ACTIVE_AMT_CREDIT_SUM_SUM']
    active_debt = frame['BKI_ACTIVE_AMT_CREDIT_SUM_DEBT_SUM']
    active_annuity = frame['BKI_ACTIVE_AMT_ANNUITY_SUM']
    salary = frame['TOTAL_SALARY']

    # Part of the active credit that is not outstanding debt.
    frame['BKI_CREDIT_SUM_and_BKI_DEBT_SUM_diff'] = active_credit - active_debt

    # New application plus existing active obligations.
    frame['NEW_and_BKI_CREDIT'] = frame['AMOUNT_CREDIT'] + active_credit
    frame['NEW_and_BKI_ANNUITY'] = frame['AMOUNT_ANNUITY'] + active_annuity
    total_credit = frame['NEW_and_BKI_CREDIT']
    total_annuity = frame['NEW_and_BKI_ANNUITY']

    # Credit / annuity ratios, in both directions.
    frame['BKI_CREDIT_to_BKI_ANNUITY'] = active_credit / active_annuity
    frame['BKI_ANNUITY_to_BKI_CREDIT'] = active_annuity / active_credit
    frame['NEW_and_BKI_CREDIT_to_ANNUITY'] = total_credit / total_annuity
    frame['NEW_and_BKI_ANNUITY_to_CREDIT'] = total_annuity / total_credit

    # Obligations relative to salary.
    frame['BKI_CREDIT_to_TOTAL_SALARY'] = active_credit / salary
    frame['NEW_and_BKI_CREDIT_to_TOTAL_SALARY'] = total_credit / salary
    frame['BKI_ANNUITY_to_TOTAL_SALARY'] = active_annuity / salary
    frame['NEW_and_BKI_ANNUITY_to_TOTAL_SALARY'] = total_annuity / salary

    # Salary left after the annuities.
    frame['TOTAL_SALARY_and_BKI_ANNUITY_diff'] = salary - active_annuity
    frame['TOTAL_SALARY_and_NEW_and_BKI_ANNUITY_diff'] = salary - total_annuity


In [23]:
# Preview the first five rows of the payments table.
payments_data.head(n=5)

Unnamed: 0,PREV_APPLICATION_NUMBER,APPLICATION_NUMBER,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,49011181,123664960,1.0,5,1002.0,1015.0,12156.61,12156.61
1,48683432,123497205,1.0,13,442.0,432.0,18392.53,10047.65
2,48652024,123749925,1.0,10,8.0,23.0,5499.94,5499.94
3,48398897,123550846,0.0,82,398.0,398.0,7082.15,7082.15
4,49867197,123562174,0.0,63,1359.0,1359.0,156.74,156.74


In [26]:
# Final cleanup: turn +/-inf (produced by the ratio features when the
# denominator is 0) into NaN, then fill every remaining NaN in the feature
# columns with 0. TARGET is not filled.
#
# Review fixes:
#  * The whole-frame inf -> NaN replacement runs once per frame; it used to
#    be repeated inside the per-column loop although it does not depend on
#    the loop variable.
#  * The result of fillna is assigned back. `df[col].fillna(..., inplace=True)`
#    acts on a temporary Series and does not update the frame under pandas
#    Copy-on-Write.
df_train = df_train.replace([np.inf, -np.inf], np.nan)
df_test = df_test.replace([np.inf, -np.inf], np.nan)

for column in df_train.columns.drop('TARGET'):
    df_train[column] = df_train[column].fillna(0)
    df_test[column] = df_test[column].fillna(0)

In [27]:
# Persist the engineered train / test frames, without the row index.
export_targets = (
    (df_train, 'data/df_train_missing_clean.csv'),
    (df_test, 'data/df_test_missing_clean.csv'),
)
for frame, csv_path in export_targets:
    frame.to_csv(csv_path, index=None)