In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment
# warnings) that later cells may raise - consider narrowing the filter.
import warnings
warnings.simplefilter("ignore")

In [2]:
def read_csv(path):
    """Read a CSV file into a new pandas DataFrame.

    Parameters
    --------
        path (string or path-like):
            location of the CSV file; forwarded unchanged to ``pd.read_csv``

    Return
    --------
        data (dataframe):
            the parsed file. Every call builds a brand-new dataframe, so the
            caller may modify the result freely.

    """
    # pd.read_csv already returns a fresh object on each call; an additional
    # .copy() would only double the peak memory needed for large files.
    return pd.read_csv(path, encoding='unicode_escape')

# Load the raw credit card balance table (path is relative to the working directory).
credit_card_balance = read_csv("credit_card_balance.csv")

In [3]:
# Work on a separate copy so `credit_card_balance` keeps the data exactly as loaded.
df_credit_card_balance = credit_card_balance.copy()

## Summary Info
- Credit_card_balance
    - RangeIndex: 3,840,312 entries, 0 to 3,840,311
    - Data columns (total 23 columns)
    - dtypes: float64(15), int64(7), object(1)

In [4]:
# pd.options.display.max_rows = None
# pd.options.display.max_columns = None

## Credit Card Balance

In [5]:
# Preview the first five rows of the working frame.
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [6]:
# Missing-value report for the raw table: absolute NaN count and NaN share (in %)
# per column, each sorted from the most to the least incomplete column.
total_nan = df_credit_card_balance.isna().sum().sort_values(ascending=False)
percent_nan = (
    df_credit_card_balance.isna().sum()
    .div(df_credit_card_balance.isna().count())
    .mul(100)
    .sort_values(ascending=False)
)
missing_credit_card = pd.concat(
    [total_nan, percent_nan],
    keys=['Total_nan', 'Percent_nan'],
    axis=1,
)
missing_credit_card.head(10)

Unnamed: 0,Total_nan,Percent_nan
AMT_PAYMENT_CURRENT,767988,19.998063
AMT_DRAWINGS_OTHER_CURRENT,749816,19.524872
CNT_DRAWINGS_POS_CURRENT,749816,19.524872
CNT_DRAWINGS_OTHER_CURRENT,749816,19.524872
CNT_DRAWINGS_ATM_CURRENT,749816,19.524872
AMT_DRAWINGS_ATM_CURRENT,749816,19.524872
AMT_DRAWINGS_POS_CURRENT,749816,19.524872
CNT_INSTALMENT_MATURE_CUM,305236,7.948208
AMT_INST_MIN_REGULARITY,305236,7.948208
SK_DPD_DEF,0,0.0


### New Features
1. CARD_AMT_DRAWINGS_TO_LIMIT = AMT_DRAWINGS_CURRENT / AMT_CREDIT_LIMIT_ACTUAL #Rate of drawings amount to credit limit
1. CARD_BALANCE_TO_LIMIT = AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL #Rate of balance to credit limit
1. CARD_AMT_DRAWINGS_TO_BALANCE = AMT_DRAWINGS_CURRENT / AMT_BALANCE #Rate of current drawings to current balance
1. CARD_AMT_DRAWINGS_TO_LIMIT = AMT_DRAWINGS_CURRENT / AMT_CREDIT_LIMIT_ACTUAL #Rate of current drawings to credit limit
1. CARD_AMT_ATM_TO_CURRENT_DRAWINGS = AMT_DRAWINGS_ATM_CURRENT / AMT_DRAWINGS_CURRENT #Rate of ATM drawings to current drawings
1. CARD_AMT_POS_TO_CURRENT = AMT_DRAWINGS_POS_CURRENT / AMT_DRAWINGS_CURRENT	#Rate of POS drawings to current drawings
1. CARD_MIN_TO_CURRENT_PAYMENT = AMT_INST_MIN_REGULARITY / AMT_PAYMENT_CURRENT #Rate of min payment to current payment
1. CARD_MIN_TO_TOTAL_CURRENT_PAYMENT = AMT_INST_MIN_REGULARITY / AMT_PAYMENT_TOTAL_CURRENT #Rate of min payment to total payment
1. CARD_MIN_PAYMENT_TO_BALANCE = AMT_INST_MIN_REGULARITY / AMT_BALANCE #Rate of min payment to current balance
1. CARD_CNT_ATM_TO_CURRENT_DRAWINGS = CNT_DRAWINGS_ATM_CURRENT / CNT_DRAWINGS_CURRENT #Rate of number of ATM drawings to number of current drawings	
1. CARD_PRINCIPAL_TO_RECEIVABLE = AMT_RECEIVABLE_PRINCIPAL / AMT_RECIVABLE #Rate of principal receivable to receivable
1. CARD_PRINCIPAL_TO_TOTAL_RECEIVABLE = AMT_RECEIVABLE_PRINCIPAL /  AMT_TOTAL_RECEIVABLE #Rate of principal receivable to total receivable
1. CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE = AMT_RECIVABLE / AMT_TOTAL_RECEIVABLE #Rate of receivable to total receivable

In [7]:
# Every engineered feature is the element-wise ratio of two raw columns:
# (new column name, numerator column, denominator column).
# The order below is the order in which the new columns are appended.
# CARD_AMT_DRAWINGS_TO_LIMIT used to be computed twice; it is listed once here.
card_ratio_features = [
    ('CARD_AMT_DRAWINGS_TO_LIMIT', 'AMT_DRAWINGS_CURRENT', 'AMT_CREDIT_LIMIT_ACTUAL'),
    ('CARD_BALANCE_TO_LIMIT', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL'),
    ('CARD_AMT_DRAWINGS_TO_BALANCE', 'AMT_DRAWINGS_CURRENT', 'AMT_BALANCE'),
    ('CARD_AMT_ATM_TO_CURRENT_DRAWINGS', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT'),
    ('CARD_AMT_POS_TO_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_DRAWINGS_CURRENT'),
    ('CARD_MIN_TO_CURRENT_PAYMENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT'),
    ('CARD_MIN_TO_TOTAL_CURRENT_PAYMENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_TOTAL_CURRENT'),
    ('CARD_MIN_PAYMENT_TO_BALANCE', 'AMT_INST_MIN_REGULARITY', 'AMT_BALANCE'),
    ('CARD_CNT_ATM_TO_CURRENT_DRAWINGS', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT'),
    # 'AMT_RECIVABLE' is the spelling of the column in the data set itself.
    ('CARD_PRINCIPAL_TO_RECEIVABLE', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE'),
    ('CARD_PRINCIPAL_TO_TOTAL_RECEIVABLE', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_TOTAL_RECEIVABLE'),
    ('CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE'),
]

for feature_name, numerator, denominator in card_ratio_features:
    # A zero denominator produces +/-inf (or NaN for 0/0) instead of raising.
    df_credit_card_balance[feature_name] = (
        df_credit_card_balance[numerator] / df_credit_card_balance[denominator]
    )

#### Credit card balance to limit ratio for the last 2 months to have a feature that considers more recent data
- This approach is taken from the kaggle write-up of @kingychiu. 
- https://www.kaggle.com/c/home-credit-default-risk/discussion/64598

In [8]:
# Balance-to-limit ratio restricted to the most recent records (MONTHS_BALANCE >= -2).
# The frame is built in one chain with .loc/.assign so that no column is ever
# assigned onto a filtered slice of df_credit_card_balance (chained assignment).
# Result columns: SK_ID_PREV, MONTHS_BALANCE, CARD_BALANCE_TO_LIMIT2M.
credit_balance_to_limit2m = (
    df_credit_card_balance
    .loc[
        df_credit_card_balance['MONTHS_BALANCE'] >= -2,
        ['SK_ID_PREV', 'MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL'],
    ]
    .assign(
        CARD_BALANCE_TO_LIMIT2M=lambda recent: recent['AMT_BALANCE'] / recent['AMT_CREDIT_LIMIT_ACTUAL']
    )
    # The two raw amounts were only needed to compute the ratio.
    .drop(columns=['AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL'])
)

In [9]:
# Ratios with a zero denominator evaluate to +/-inf; convert them to NaN in both
# frames so they are counted as missing values.
for ratio_frame in (df_credit_card_balance, credit_balance_to_limit2m):
    ratio_frame.replace([np.inf, -np.inf], np.nan, inplace=True)

In [10]:
# Missing-value report after feature engineering: absolute NaN count and NaN
# share (in %) per column, each sorted from the most to the least incomplete column.
total_nan = df_credit_card_balance.isna().sum().sort_values(ascending=False)
percent_nan = (
    df_credit_card_balance.isna().sum()
    .div(df_credit_card_balance.isna().count())
    .mul(100)
    .sort_values(ascending=False)
)
missing_credit_card = pd.concat(
    [total_nan, percent_nan],
    keys=['Total_nan', 'Percent_nan'],
    axis=1,
)
missing_credit_card.head(10)

Unnamed: 0,Total_nan,Percent_nan
CARD_CNT_ATM_TO_CURRENT_DRAWINGS,3229952,84.1065
CARD_AMT_ATM_TO_CURRENT_DRAWINGS,3223443,83.937008
CARD_AMT_POS_TO_CURRENT,3223443,83.937008
CARD_MIN_PAYMENT_TO_BALANCE,2187413,56.959252
CARD_MIN_TO_TOTAL_CURRENT_PAYMENT,2181632,56.808718
CARD_AMT_DRAWINGS_TO_BALANCE,2156420,56.152208
CARD_PRINCIPAL_TO_RECEIVABLE,2113816,55.042819
CARD_PRINCIPAL_TO_TOTAL_RECEIVABLE,2113643,55.038315
CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE,2113643,55.038315
CARD_MIN_TO_CURRENT_PAYMENT,1169483,30.452812


#### Drop 3 variables which have more than 80% NaN values.

In [11]:
# Engineered ratios that are NaN in more than 80% of the rows (see the
# missing-value table above) are removed from the working frame.
mostly_missing_features = [
    'CARD_CNT_ATM_TO_CURRENT_DRAWINGS',
    'CARD_AMT_ATM_TO_CURRENT_DRAWINGS',
    'CARD_AMT_POS_TO_CURRENT',
]
df_credit_card_balance.drop(columns=mostly_missing_features, inplace=True)

### Categorical Features

In [12]:
# One-hot encode the only categorical column.
df_credit_card_balance['NAME_CONTRACT_STATUS'] = df_credit_card_balance['NAME_CONTRACT_STATUS'].astype('category')
# The aggregation helpers further down tell dummy columns apart from numeric ones
# by their uint8 dtype, so the dtype is pinned explicitly: newer pandas releases
# default to bool dummies, which would break that split.
df_credit_card_balance = pd.get_dummies(
    df_credit_card_balance,
    columns=['NAME_CONTRACT_STATUS'],
    dtype='uint8',
)
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,CARD_PRINCIPAL_TO_RECEIVABLE,CARD_PRINCIPAL_TO_TOTAL_RECEIVABLE,CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,,,,1,0,0,0,0,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,0.927546,0.927546,1.0,1,0,0,0,0,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,0.855892,0.855892,1.0,1,0,0,0,0,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,0.965245,0.965245,1.0,1,0,0,0,0,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,0.976042,0.976042,1.0,1,0,0,0,0,0,0


### Imputing NaN Values (optional)
- We can use the YCImpute package to impute NaN values. The EM algorithm gives a result in a shorter time; KNN is time-consuming and Iterforest raises an error.
- A function is defined for the imputation. The dataframe has to be converted into an array, which turns all values into floats, so we add lines that restore the data types the columns had before imputation.
- __Note__: Imputing NaN values with this algorithm decreases the AUC score compared to the data set without imputation. Different algorithms, or alternatively the sklearn tools, can be used and compared, but this is very time-consuming with large data.

In [13]:
# from ycimpute.imputer import iterforest
# from ycimpute.imputer import EM
# from ycimpute.imputer import knnimput

# def nan_imputer(df):
#     int_columns = df.select_dtypes(include='int64').columns
#     cat_columns = df.select_dtypes(include='uint8').columns
#     var_names = df.columns
#     np_df = np.array(df)
#     df = EM().complete(np_df)
#     df = pd.DataFrame(df, columns = var_names)
#     df[int_columns] = df[int_columns].astype('int64')
#     df[cat_columns] = df[cat_columns].astype('uint8')
#     return df

In [14]:
# Force a garbage-collection pass; the value shown as cell output is the
# number of unreachable objects the collector found.
import gc
gc.collect()

20

In [15]:
# df_credit_card_balance = nan_imputer(df_credit_card_balance)

In [16]:
# df_credit_card_balance.isnull().sum()

In [17]:
# Print the index range, column list and dtypes of the working frame.
df_credit_card_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 38 columns):
SK_ID_PREV                             int64
SK_ID_CURR                             int64
MONTHS_BALANCE                         int64
AMT_BALANCE                            float64
AMT_CREDIT_LIMIT_ACTUAL                int64
AMT_DRAWINGS_ATM_CURRENT               float64
AMT_DRAWINGS_CURRENT                   float64
AMT_DRAWINGS_OTHER_CURRENT             float64
AMT_DRAWINGS_POS_CURRENT               float64
AMT_INST_MIN_REGULARITY                float64
AMT_PAYMENT_CURRENT                    float64
AMT_PAYMENT_TOTAL_CURRENT              float64
AMT_RECEIVABLE_PRINCIPAL               float64
AMT_RECIVABLE                          float64
AMT_TOTAL_RECEIVABLE                   float64
CNT_DRAWINGS_ATM_CURRENT               float64
CNT_DRAWINGS_CURRENT                   int64
CNT_DRAWINGS_OTHER_CURRENT             float64
CNT_DRAWINGS_POS_CURRENT               float64
CNT_

#### We merge this new feature after the imputation because the CARD_BALANCE_TO_LIMIT2M values need to be NaN for balances older than 2 months.
- This approach is taken from the kaggle write-up of @kingychiu. 
- https://www.kaggle.com/c/home-credit-default-risk/discussion/64598

In [18]:
# Attach CARD_BALANCE_TO_LIMIT2M to the matching (SK_ID_PREV, MONTHS_BALANCE) rows.
# A left join keeps every row of the working frame; rows without a match in
# credit_balance_to_limit2m get NaN in the new column.
df_credit_card_balance = pd.merge(
    df_credit_card_balance,
    credit_balance_to_limit2m,
    how='left',
    on=['SK_ID_PREV', 'MONTHS_BALANCE'],
)

In [19]:
# Preview the first five rows, now including CARD_BALANCE_TO_LIMIT2M as the last column.
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,CARD_PRINCIPAL_TO_TOTAL_RECEIVABLE,CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed,CARD_BALANCE_TO_LIMIT2M
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,,,1,0,0,0,0,0,0,
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,0.927546,1.0,1,0,0,0,0,0,0,1.421679
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,0.855892,1.0,1,0,0,0,0,0,0,
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,0.965245,1.0,1,0,0,0,0,0,0,
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,0.976042,1.0,1,0,0,0,0,0,0,1.00871


In [20]:
# Run another garbage-collection pass before the aggregation step.
gc.collect()

40

## Aggregation
- We use two functions for aggregation, one for numerical features and one for categorical features, because it is not reasonable to calculate some of the statistical values for categorical variables.
- We use groupby method for aggregations in the functions. 
- We aggregate observations based on SK_ID_CURR since we may have more than one record for a current applicant over the months. 
- @willkoehrsen
- https://www.kaggle.com/willkoehrsen/introduction-to-manual-feature-engineering

In [22]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            one row per value of the grouping variable. For every column that
            is not a dummy (uint8 / bool) column and not an 'SK_ID' column,
            the statistics (count, mean, max, min, sum, var) are calculated.
            The columns are named '<df_name>_<column>_<stat>' to keep track
            of the features created; the grouping variable keeps its name.

    """
    # Remove id variables other than grouping variable
    id_columns = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    df = df.drop(columns=id_columns)

    # Dummy columns are the job of agg_categorical. get_dummies emits uint8 or
    # bool depending on the pandas version, so both dtypes are excluded here.
    # .copy() makes the selection independent before a column is assigned to it.
    numeric_df = df.select_dtypes(exclude=['uint8', 'bool']).copy()
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum', 'var']).reset_index()

    # Flatten the (variable, stat) column pairs in their actual column order,
    # so every name is guaranteed to describe the data stored below it.
    agg.columns = [
        group_var if var == group_var else '%s_%s_%s' % (df_name, var, stat)
        for var, stat in agg.columns
    ]
    return agg

In [23]:
def agg_categorical(df, group_var, df_name):
    """Aggregates the encoded categorical values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            one row per value of the grouping variable. For every dummy
            (uint8 / bool) column the statistics (count, sum, mean) are
            calculated. The columns are named '<df_name>_<column>_<stat>' to
            keep track of the features created; the grouping variable keeps
            its name.

    """
    # Remove id variables other than grouping variable
    id_columns = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    df = df.drop(columns=id_columns)

    # get_dummies emits uint8 or bool depending on the pandas version, so both
    # dtypes are treated as encoded categorical columns.
    # .copy() makes the selection independent before a column is assigned to it.
    categorical_df = df.select_dtypes(include=['uint8', 'bool']).copy()
    categorical_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = categorical_df.groupby(group_var).agg(['count', 'sum', 'mean']).reset_index()

    # Flatten the (variable, stat) column pairs in their actual column order,
    # so every name is guaranteed to describe the data stored below it.
    agg.columns = [
        group_var if var == group_var else '%s_%s_%s' % (df_name, var, stat)
        for var, stat in agg.columns
    ]
    return agg

### Credit Card Balance Aggregation

In [24]:
# Per-applicant (SK_ID_CURR) statistics of all non-dummy columns.
credit_card_agg_num = agg_numeric(
    df_credit_card_balance.drop(columns='SK_ID_PREV'),
    'SK_ID_CURR',
    'credit_card',
)
credit_card_agg_num.head()

Unnamed: 0,SK_ID_CURR,credit_card_MONTHS_BALANCE_count,credit_card_MONTHS_BALANCE_mean,credit_card_MONTHS_BALANCE_max,credit_card_MONTHS_BALANCE_min,credit_card_MONTHS_BALANCE_sum,credit_card_MONTHS_BALANCE_var,credit_card_AMT_BALANCE_count,credit_card_AMT_BALANCE_mean,credit_card_AMT_BALANCE_max,...,credit_card_CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE_max,credit_card_CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE_min,credit_card_CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE_sum,credit_card_CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE_var,credit_card_CARD_BALANCE_TO_LIMIT2M_count,credit_card_CARD_BALANCE_TO_LIMIT2M_mean,credit_card_CARD_BALANCE_TO_LIMIT2M_max,credit_card_CARD_BALANCE_TO_LIMIT2M_min,credit_card_CARD_BALANCE_TO_LIMIT2M_sum,credit_card_CARD_BALANCE_TO_LIMIT2M_var
0,100006,6,-3.5,-1,-6,-21,3.5,6,0.0,0.0,...,,,0.0,,2,0.0,0.0,0.0,0.0,0.0
1,100011,74,-38.5,-2,-75,-2849,462.5,74,54482.111149,189000.0,...,1.0,1.0,37.0,0.0,1,0.0,0.0,0.0,0.0,
2,100013,96,-48.5,-1,-96,-4656,776.0,96,18159.919219,161420.22,...,1.0,1.0,24.0,0.0,2,0.0,0.0,0.0,0.0,0.0
3,100021,17,-10.0,-2,-18,-170,25.5,17,0.0,0.0,...,,,0.0,,1,0.0,0.0,0.0,0.0,
4,100023,8,-7.5,-4,-11,-60,6.0,8,0.0,0.0,...,,,0.0,,0,,,,0.0,


In [25]:
# Per-applicant (SK_ID_CURR) statistics of the one-hot encoded contract status.
credit_card_agg_cat = agg_categorical(
    df_credit_card_balance.drop(columns='SK_ID_PREV'),
    'SK_ID_CURR',
    'credit_card',
)
credit_card_agg_cat.head()

Unnamed: 0,SK_ID_CURR,credit_card_NAME_CONTRACT_STATUS_Active_count,credit_card_NAME_CONTRACT_STATUS_Active_sum,credit_card_NAME_CONTRACT_STATUS_Active_mean,credit_card_NAME_CONTRACT_STATUS_Approved_count,credit_card_NAME_CONTRACT_STATUS_Approved_sum,credit_card_NAME_CONTRACT_STATUS_Approved_mean,credit_card_NAME_CONTRACT_STATUS_Completed_count,credit_card_NAME_CONTRACT_STATUS_Completed_sum,credit_card_NAME_CONTRACT_STATUS_Completed_mean,...,credit_card_NAME_CONTRACT_STATUS_Demand_mean,credit_card_NAME_CONTRACT_STATUS_Refused_count,credit_card_NAME_CONTRACT_STATUS_Refused_sum,credit_card_NAME_CONTRACT_STATUS_Refused_mean,credit_card_NAME_CONTRACT_STATUS_Sent proposal_count,credit_card_NAME_CONTRACT_STATUS_Sent proposal_sum,credit_card_NAME_CONTRACT_STATUS_Sent proposal_mean,credit_card_NAME_CONTRACT_STATUS_Signed_count,credit_card_NAME_CONTRACT_STATUS_Signed_sum,credit_card_NAME_CONTRACT_STATUS_Signed_mean
0,100006,6,6,1.0,6,0,0.0,6,0,0.0,...,0.0,6,0,0.0,6,0,0.0,6,0,0.0
1,100011,74,74,1.0,74,0,0.0,74,0,0.0,...,0.0,74,0,0.0,74,0,0.0,74,0,0.0
2,100013,96,96,1.0,96,0,0.0,96,0,0.0,...,0.0,96,0,0.0,96,0,0.0,96,0,0.0
3,100021,17,7,0.411765,17,0,0.0,17,10,0.588235,...,0.0,17,0,0.0,17,0,0.0,17,0,0.0
4,100023,8,8,1.0,8,0,0.0,8,0,0.0,...,0.0,8,0,0.0,8,0,0.0,8,0,0.0


In [26]:
# Missing-value report for the aggregated numeric features: absolute NaN count
# and NaN share (in %) per column, each sorted in descending order.
# NOTE(review): the result name says "bureau" although it describes the credit
# card aggregates; the name is kept because it is a notebook-level variable.
total_nan = credit_card_agg_num.isna().sum().sort_values(ascending=False)
percent_nan = (
    credit_card_agg_num.isna().sum()
    .div(credit_card_agg_num.isna().count())
    .mul(100)
    .sort_values(ascending=False)
)
missing_bureau_data = pd.concat(
    [total_nan, percent_nan],
    keys=['Total_nan', 'Percent_nan'],
    axis=1,
)
missing_bureau_data.head(13)

Unnamed: 0,Total_nan,Percent_nan
credit_card_CARD_BALANCE_TO_LIMIT2M_var,54920,53.033083
credit_card_CARD_MIN_PAYMENT_TO_BALANCE_var,35478,34.259063
credit_card_CARD_AMT_DRAWINGS_TO_BALANCE_var,35130,33.923019
credit_card_CARD_RECEIVABLE_TO_TOTAL_RECEIVABLE_var,34925,33.725062
credit_card_CARD_PRINCIPAL_TO_TOTAL_RECEIVABLE_var,34925,33.725062
credit_card_CARD_PRINCIPAL_TO_RECEIVABLE_var,34925,33.725062
credit_card_CARD_MIN_TO_TOTAL_CURRENT_PAYMENT_var,33705,32.546979
credit_card_CARD_MIN_PAYMENT_TO_BALANCE_mean,33634,32.478418
credit_card_CARD_MIN_PAYMENT_TO_BALANCE_max,33634,32.478418
credit_card_CARD_MIN_PAYMENT_TO_BALANCE_min,33634,32.478418


In [27]:
# credit_card_agg_cat.to_csv('credit_card_agg_cat.csv', index=False)

In [28]:
# credit_card_agg_num.to_csv('credit_card_agg_num.csv', index=False)

In [None]:
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")
# pd.get_option("display.max_rows")