In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Disabling warnings
import warnings
warnings.simplefilter("ignore")

In [5]:
# This function is taken from https://www.kaggle.com/rinnqd/reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the columns of a dataframe in place to reduce its memory usage.

    For every column:

        1. Text columns (``object`` dtype, or pandas' dedicated string dtype
           where the installed pandas has one) are converted to ``category``.
        2. Signed / unsigned integer columns are cast to the narrowest integer
           type of the same signedness whose range contains the column's
           minimum and maximum (bounds included).
        3. Float columns are cast to float16 or float32 when their range fits,
           otherwise to float64.
        4. Every other dtype (bool, datetime, timedelta, category, pandas
           extension dtypes) is left untouched.

    The memory usage before and after, and the relative saving, are printed.

    NOTE: float16 carries only an 11-bit significand, so values cast to it are
    rounded (integers above 2048 are no longer all representable). The range
    check below does not protect against that loss of precision.

    Parameters
    --------
        df (dataframe):
            the dataframe to optimise; it is modified in place

    Return
    --------
        df (dataframe):
            the same dataframe object, after downcasting
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    # ``pd.StringDtype`` does not exist in old pandas releases; an empty tuple
    # makes the isinstance() check below simply evaluate to False there.
    string_dtype = getattr(pd, 'StringDtype', ())

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, string_dtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are downcast. Comparing
        # bool, datetime or category columns against numeric limits either
        # raises or silently turns them into floats, so they are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN or infinite.
                df[col] = df[col].astype(np.float64)
            continue

        candidates = signed_types if kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a column that reaches exactly the type's
            # min or max still fits. For an empty column min/max are NaN,
            # every comparison is False and the dtype is left as it is.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a dataframe and optimize its memory usage.

    Parameters
    --------
        file (string or file-like):
            path or buffer handed to ``pd.read_csv``

    Return
    --------
        df (dataframe):
            the loaded data after ``reduce_mem_usage`` has been applied
    """
    # ``keep_date_col`` is intentionally not passed: it only matters when
    # ``parse_dates`` combines several columns into one, which is not requested
    # here, and recent pandas releases deprecate and then remove the keyword.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [6]:
bureau = import_data("bureau.csv")

Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 78.57 MB
Decreased by 64.7%


In [7]:
bureau_balance = import_data("bureau_balance.csv")

Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 156.21 MB
Decreased by 75.0%


In [8]:
# Work on a copy so the frame loaded into ``bureau`` is left unchanged by the steps below.
df_bureau = bureau.copy()

In [9]:
# Work on a copy so the frame loaded into ``bureau_balance`` is left unchanged by the steps below.
df_bureau_balance = bureau_balance.copy()

## Summary Info
- Bureau
    - RangeIndex: 1,716,428 entries, 0 to 1,716,427
    - Data columns (total 17 columns)
    - dtypes: float64(8), int64(6), object(3)
- Bureau_balance
    - RangeIndex: 27,299,925 entries, 0 to 27,299,924
    - Data columns (total 3 columns)
    - dtypes: int64(2), object(1)

In [10]:
# pd.options.display.max_rows = None
# pd.options.display.max_columns = None

## Bureau

In [11]:
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


### NaN Values:
- There are 1,124,488 NaN values and 470,650 zero values in AMT_CREDIT_MAX_OVERDUE. Considering that 1,712,211 observations also have zero days in CREDIT_DAY_OVERDUE, it is reasonable to treat a missing maximum overdue amount as "no overdue", so we replace NaN with 0 here.

In [12]:
# Missing-value report for df_bureau: absolute count and percentage per column,
# each sorted so the most affected columns come first.
null_counts = df_bureau.isnull().sum()
total_nan = null_counts.sort_values(ascending=False)
percent_nan = (null_counts / len(df_bureau) * 100).sort_values(ascending=False)
missing_bureau_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_bureau_data.head(8)

Unnamed: 0,Total_nan,Percent_nan
AMT_ANNUITY,1226791,71.47349
AMT_CREDIT_MAX_OVERDUE,1124488,65.513264
DAYS_ENDDATE_FACT,633653,36.916958
AMT_CREDIT_SUM_LIMIT,591780,34.477415
AMT_CREDIT_SUM_DEBT,257669,15.011932
DAYS_CREDIT_ENDDATE,105553,6.149573
AMT_CREDIT_SUM,13,0.000757
CREDIT_TYPE,0,0.0


In [13]:
# Assign the filled column back instead of calling ``fillna(inplace=True)`` on
# the attribute-accessed Series: that chained form works on an intermediate
# object and is not guaranteed to update ``df_bureau`` (it does not under
# pandas copy-on-write).
df_bureau['AMT_CREDIT_MAX_OVERDUE'] = df_bureau['AMT_CREDIT_MAX_OVERDUE'].fillna(0)

In [14]:
display(df_bureau.AMT_CREDIT_MAX_OVERDUE.isna().sum())

0

### New Features
- BUREAU_MAX_OVERDUE_TO_CREDIT = AMT_CREDIT_MAX_OVERDUE / AMT_CREDIT_SUM #Rate of max amount overdue to current credit amount
- BUREAU_DEBT_TO_CREDIT = AMT_CREDIT_SUM_DEBT / AMT_CREDIT_SUM #Rate of current debt to current credit
- BUREAU_SUM_OVERDUE_TO_DEBT = AMT_CREDIT_SUM_OVERDUE / AMT_CREDIT_SUM_DEBT # Rate of current amount overdue to current debt
- BUREAU_SUM_OVERDUE_TO_CREDIT = AMT_CREDIT_SUM_OVERDUE / AMT_CREDIT_SUM #Rate of current amount overdue to current credit 
- BUREAU_CREDIT_TERM = AMT_CREDIT_SUM / AMT_ANNUITY #Term of the credit
- BUREAU_DEBT_TO_LIMIT = AMT_CREDIT_SUM_DEBT / AMT_CREDIT_SUM_LIMIT #Rate of current debt to credit limit for credit cards

In [15]:
# Each new feature is a row-wise ratio: (new column, numerator, denominator).
# The list order is the order in which the columns are appended to df_bureau.
ratio_features = [
    ('BUREAU_MAX_OVERDUE_TO_CREDIT', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM'),
    ('BUREAU_DEBT_TO_CREDIT', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM'),
    ('BUREAU_SUM_OVERDUE_TO_DEBT', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM_DEBT'),
    ('BUREAU_SUM_OVERDUE_TO_CREDIT', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM'),
    ('BUREAU_CREDIT_TERM', 'AMT_CREDIT_SUM', 'AMT_ANNUITY'),
    ('BUREAU_DEBT_TO_LIMIT', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT'),
]
for new_col, numerator, denominator in ratio_features:
    df_bureau[new_col] = df_bureau[numerator] / df_bureau[denominator]

In [16]:
df_bureau.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 23 columns):
SK_ID_CURR                      int32
SK_ID_BUREAU                    int32
CREDIT_ACTIVE                   category
CREDIT_CURRENCY                 category
DAYS_CREDIT                     int16
CREDIT_DAY_OVERDUE              int16
DAYS_CREDIT_ENDDATE             float16
DAYS_ENDDATE_FACT               float16
AMT_CREDIT_MAX_OVERDUE          float32
CNT_CREDIT_PROLONG              int8
AMT_CREDIT_SUM                  float32
AMT_CREDIT_SUM_DEBT             float32
AMT_CREDIT_SUM_LIMIT            float32
AMT_CREDIT_SUM_OVERDUE          float32
CREDIT_TYPE                     category
DAYS_CREDIT_UPDATE              int32
AMT_ANNUITY                     float32
BUREAU_MAX_OVERDUE_TO_CREDIT    float32
BUREAU_DEBT_TO_CREDIT           float32
BUREAU_SUM_OVERDUE_TO_DEBT      float32
BUREAU_SUM_OVERDUE_TO_CREDIT    float32
BUREAU_CREDIT_TERM              float32
BUREAU_DEBT_TO_LI

#### The new features are ratios, so a non-zero dividend with a zero divisor produces positive or negative infinity. We need to replace these infinite values with NaN.
#### When both dividend and divisor are 0, the result is already NaN.

In [17]:
# Divisions by zero above left +/-inf in the ratio columns; turn them into NaN.
infinite_values = [np.inf, -np.inf]
df_bureau = df_bureau.replace(infinite_values, np.nan)

In [18]:
# Missing-value report for df_bureau after adding the ratio features:
# absolute count and percentage per column, most affected columns first.
null_counts = df_bureau.isnull().sum()
total_nan = null_counts.sort_values(ascending=False)
percent_nan = (null_counts / len(df_bureau) * 100).sort_values(ascending=False)
missing_bureau_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_bureau_data.head(13)

Unnamed: 0,Total_nan,Percent_nan
BUREAU_DEBT_TO_LIMIT,1642022,95.665067
BUREAU_CREDIT_TERM,1483709,86.441668
BUREAU_SUM_OVERDUE_TO_DEBT,1274103,74.229912
AMT_ANNUITY,1226791,71.47349
DAYS_ENDDATE_FACT,633653,36.916958
AMT_CREDIT_SUM_LIMIT,591780,34.477415
BUREAU_DEBT_TO_CREDIT,319480,18.613073
AMT_CREDIT_SUM_DEBT,257669,15.011932
DAYS_CREDIT_ENDDATE,105553,6.149573
BUREAU_MAX_OVERDUE_TO_CREDIT,66595,3.87986


### Categorical Features
- 3 categorical features are converted through one-hot encoding.

In [19]:
bureau_cat = ['CREDIT_ACTIVE','CREDIT_CURRENCY','CREDIT_TYPE']
df_bureau[bureau_cat] = df_bureau[bureau_cat].astype('category')
# Ask for uint8 indicator columns explicitly: agg_numeric / agg_categorical
# below tell encoded categoricals apart from numeric features by the uint8
# dtype, and newer pandas releases emit bool dummies by default.
df_bureau = pd.get_dummies(df_bureau, columns = bureau_cat, dtype = np.uint8)
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
0,215354,5714462,-497,0,-153.0,-153.0,0.0,0,91323.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,215354,5714463,-208,0,1075.0,,0.0,0,225000.0,171342.0,...,0,0,0,0,0,0,0,0,0,0
2,215354,5714464,-203,0,528.0,,0.0,0,464323.5,,...,0,0,0,0,0,0,0,0,0,0
3,215354,5714465,-203,0,,,0.0,0,90000.0,,...,0,0,0,0,0,0,0,0,0,0
4,215354,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,...,0,0,0,0,0,0,0,0,0,0


#### We drop two variables which have >=85% NaN values. 

In [20]:
# Discard the two engineered ratios that are almost entirely missing.
sparse_features = ['BUREAU_DEBT_TO_LIMIT', 'BUREAU_CREDIT_TERM']
df_bureau = df_bureau.drop(columns=sparse_features)

In [21]:
# Missing-value report for df_bureau after dropping the sparse features:
# absolute count and percentage per column, most affected columns first.
null_counts = df_bureau.isnull().sum()
total_nan = null_counts.sort_values(ascending=False)
percent_nan = (null_counts / len(df_bureau) * 100).sort_values(ascending=False)
missing_bureau_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_bureau_data.head(13)

Unnamed: 0,Total_nan,Percent_nan
BUREAU_SUM_OVERDUE_TO_DEBT,1274103,74.229912
AMT_ANNUITY,1226791,71.47349
DAYS_ENDDATE_FACT,633653,36.916958
AMT_CREDIT_SUM_LIMIT,591780,34.477415
BUREAU_DEBT_TO_CREDIT,319480,18.613073
AMT_CREDIT_SUM_DEBT,257669,15.011932
DAYS_CREDIT_ENDDATE,105553,6.149573
BUREAU_SUM_OVERDUE_TO_CREDIT,66595,3.87986
BUREAU_MAX_OVERDUE_TO_CREDIT,66595,3.87986
AMT_CREDIT_SUM,13,0.000757


### Imputing NaN Values (optional)
- We can use YCImpute package to impute NaN values. EM algorithm gives result in a shorter time. KNN is time consuming and Iterforest gives error. 
- A function is formed for imputation. We need to convert the dataframe into an array and the values become float. We add lines to keep the datatypes same as before the method.
- __Note__: Imputing NaN values with this algorithm decreases the AUC score compared to the data set without imputation. Different algorithms, or alternatively sklearn tools, can be used and compared, but this is very time consuming with large data. 

In [22]:
# !pip install ycimpute

In [23]:
# from ycimpute.imputer import EM

# def nan_imputer(df):
#     int_columns = df.select_dtypes(include='int64').columns
#     cat_columns = df.select_dtypes(include='uint8').columns
#     var_names = df.columns
#     np_df = np.array(df)
#     df = EM().complete(np_df)
#     df = pd.DataFrame(df, columns = var_names)
#     df[int_columns] = df[int_columns].astype('int64')
#     df[cat_columns] = df[cat_columns].astype('uint8')
#     return df

In [24]:
# df_bureau = nan_imputer(df_bureau)

In [25]:
# display(df_bureau.head())
# display(df_bureau.info())
# display(df_bureau.isnull().sum())

## Bureau Balance

In [26]:
df_bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


### The conversion of values and categories in STATUS
- STATUS takes the values 0-5 as well as X and C. C means closed and X means unknown. It is reasonable to encode C as 0 because the credit is closed, but X can appear in any month of a customer's history. 
- So, it is reasonable to replace X with NaN and impute the values with an ML tool. 

In [27]:
# Numeric encoding of STATUS: the digit strings keep their value and 'C'
# becomes 0. 'X' is deliberately absent, so ``map`` turns it into NaN
# (as would any other value that is not listed here).
status_codes = {'C': 0, '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
# ``map`` is used instead of ``replace`` because STATUS is a category column
# after import_data, and replacing values of a categorical with values outside
# its categories is deprecated / behaves differently in newer pandas.
# The frame is rebuilt from the untouched ``bureau_balance`` so re-running this
# cell always yields the same result.
df_bureau_balance = bureau_balance.assign(
    STATUS=bureau_balance['STATUS'].map(status_codes).astype('float64')
)
df_bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,0.0
1,5715448,-1,0.0
2,5715448,-2,0.0
3,5715448,-3,0.0
4,5715448,-4,0.0


In [28]:
df_bureau_balance.STATUS.value_counts()

0.0    21146500
1.0      242347
5.0       62406
2.0       23419
3.0        8924
4.0        5847
Name: STATUS, dtype: int64

In [29]:
df_bureau_balance.isna().sum()

SK_ID_BUREAU            0
MONTHS_BALANCE          0
STATUS            5810482
dtype: int64

In [30]:
df_bureau_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
SK_ID_BUREAU      int32
MONTHS_BALANCE    int8
STATUS            float64
dtypes: float64(1), int32(1), int8(1)
memory usage: 338.5 MB


### Imputing Nan values

In [31]:
# df_bureau_balance = nan_imputer(df_bureau_balance)

In [32]:
# display(df_bureau_balance.head())
# display(df_bureau_balance.info())
# display(df_bureau_balance.isnull().sum())

## Aggregation
### Bureau and Bureau Balance Aggregation
- We use two functions for aggregation, one for numerical features and one for categorical features, because it is not reasonable to calculate some of the statistics for categorical variables. 
- We use groupby method for aggregations in the functions.
- These functions are taken from a Kaggle kernel by @willkoehrsen.
- https://www.kaggle.com/willkoehrsen/introduction-to-feature-selection

In [33]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Columns whose name contains 'SK_ID' (other than the grouping variable),
    uint8 columns (treated as one-hot encoded categoricals) and non-numeric
    columns are not aggregated.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on
        group_var (string):
            the variable by which to group df
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            one row per value of the grouping variable. The first column is
            the grouping variable; it is followed, for every aggregated
            column, by the statistics count, mean, max, min, sum and var,
            named '<df_name>_<column>_<stat>'. If there is nothing to
            aggregate, only the grouping variable column is returned.

    """
    stats = ['count', 'mean', 'max', 'min', 'sum', 'var']

    # Numeric features only; uint8 is reserved for encoded categoricals.
    numeric_cols = df.select_dtypes(include=['number'], exclude=['uint8']).columns
    # Leave out the grouping variable and any other id variable.
    value_cols = [col for col in numeric_cols
                  if col != group_var and 'SK_ID' not in col]

    if not value_cols:
        # Nothing to aggregate: return just the sorted, non-null group ids,
        # which is the key column a groupby would have produced.
        group_ids = df[group_var].dropna().drop_duplicates().sort_values()
        return group_ids.to_frame().reset_index(drop=True)

    # Group by the specified variable and calculate the statistics
    agg = df.groupby(group_var)[value_cols].agg(stats)

    # Flatten the (variable, stat) column pairs. The names are built from the
    # columns themselves, so they cannot get out of step with the data the way
    # names rebuilt from the MultiIndex levels could.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

In [34]:
def agg_categorical(df, group_var, df_name):
    """Aggregates the encoded categorical values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Encoded categoricals are recognised by dtype: uint8 or bool indicator
    columns (``pd.get_dummies`` produces one or the other depending on the
    pandas version). Columns whose name contains 'SK_ID' (other than the
    grouping variable) are not aggregated.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on
        group_var (string):
            the variable by which to group df
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            one row per value of the grouping variable. The first column is
            the grouping variable; it is followed, for every indicator
            column, by the statistics count, sum and mean, named
            '<df_name>_<column>_<stat>'. If there is nothing to aggregate,
            only the grouping variable column is returned.

    """
    stats = ['count', 'sum', 'mean']

    # Indicator columns only (uint8 or bool).
    indicator_cols = df.select_dtypes(include=['uint8', 'bool']).columns
    # Leave out the grouping variable and any other id variable.
    value_cols = [col for col in indicator_cols
                  if col != group_var and 'SK_ID' not in col]

    if not value_cols:
        # Nothing to aggregate: return just the sorted, non-null group ids,
        # which is the key column a groupby would have produced.
        group_ids = df[group_var].dropna().drop_duplicates().sort_values()
        return group_ids.to_frame().reset_index(drop=True)

    # Group by the specified variable and calculate the statistics
    agg = df.groupby(group_var)[value_cols].agg(stats)

    # Flatten the (variable, stat) column pairs. The names are built from the
    # columns themselves, so they cannot get out of step with the data the way
    # names rebuilt from the MultiIndex levels could.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

### Bureau Aggregation
- First we need to aggregate bureau_balance dataframe based on SK_ID_BUREAU.
- Then, we merge this dataframe with bureau dataframe based on SK_ID_BUREAU.
- Then, we aggregate this merged dataframe based on SK_ID_CURR and we drop SK_ID_BUREAU.
- And two aggregated dataframes are ready to merge with application train and test dataframes based on SK_ID_CURR.

In [35]:
# Collapse the balance rows to one row of statistics per SK_ID_BUREAU.
bureau_balance_agg = agg_numeric(df_bureau_balance, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_agg.head()

Unnamed: 0,SK_ID_BUREAU,bureau_balance_MONTHS_BALANCE_count,bureau_balance_MONTHS_BALANCE_mean,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_sum,bureau_balance_MONTHS_BALANCE_var,bureau_balance_STATUS_count,bureau_balance_STATUS_mean,bureau_balance_STATUS_max,bureau_balance_STATUS_min,bureau_balance_STATUS_sum,bureau_balance_STATUS_var
0,5001709,97,-48.0,0,-96,-4656.0,792.166667,86,0.0,0.0,0.0,0.0,0.0
1,5001710,83,-41.0,0,-82,-3403.0,581.0,53,0.0,0.0,0.0,0.0,0.0
2,5001711,4,-1.5,0,-3,-6.0,1.666667,3,0.0,0.0,0.0,0.0,0.0
3,5001712,19,-9.0,0,-18,-171.0,31.666667,19,0.0,0.0,0.0,0.0,0.0
4,5001713,22,-10.5,0,-21,-231.0,42.166667,0,,,,0.0,


In [36]:
# Left join: every row of df_bureau is kept; rows whose SK_ID_BUREAU has no
# match in bureau_balance_agg get NaN in the bureau_balance_* columns.
df_bureau_merged = df_bureau.merge(bureau_balance_agg, on = 'SK_ID_BUREAU', how = 'left')
df_bureau_merged.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,...,bureau_balance_MONTHS_BALANCE_max,bureau_balance_MONTHS_BALANCE_min,bureau_balance_MONTHS_BALANCE_sum,bureau_balance_MONTHS_BALANCE_var,bureau_balance_STATUS_count,bureau_balance_STATUS_mean,bureau_balance_STATUS_max,bureau_balance_STATUS_min,bureau_balance_STATUS_sum,bureau_balance_STATUS_var
0,215354,5714462,-497,0,-153.0,-153.0,0.0,0,91323.0,0.0,...,,,,,,,,,,
1,215354,5714463,-208,0,1075.0,,0.0,0,225000.0,171342.0,...,,,,,,,,,,
2,215354,5714464,-203,0,528.0,,0.0,0,464323.5,,...,,,,,,,,,,
3,215354,5714465,-203,0,,,0.0,0,90000.0,,...,,,,,,,,,,
4,215354,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,...,,,,,,,,,,


In [37]:
# Aggregate the numeric columns to one row per SK_ID_CURR. SK_ID_BUREAU is
# dropped first so the per-credit id is not aggregated as a feature.
bureau_agg_num = agg_numeric(df_bureau_merged.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau_merged')
bureau_agg_num.head()

Unnamed: 0,SK_ID_CURR,bureau_merged_DAYS_CREDIT_count,bureau_merged_DAYS_CREDIT_mean,bureau_merged_DAYS_CREDIT_max,bureau_merged_DAYS_CREDIT_min,bureau_merged_DAYS_CREDIT_sum,bureau_merged_DAYS_CREDIT_var,bureau_merged_CREDIT_DAY_OVERDUE_count,bureau_merged_CREDIT_DAY_OVERDUE_mean,bureau_merged_CREDIT_DAY_OVERDUE_max,...,bureau_merged_bureau_balance_STATUS_sum_max,bureau_merged_bureau_balance_STATUS_sum_min,bureau_merged_bureau_balance_STATUS_sum_sum,bureau_merged_bureau_balance_STATUS_sum_var,bureau_merged_bureau_balance_STATUS_var_count,bureau_merged_bureau_balance_STATUS_var_mean,bureau_merged_bureau_balance_STATUS_var_max,bureau_merged_bureau_balance_STATUS_var_min,bureau_merged_bureau_balance_STATUS_var_sum,bureau_merged_bureau_balance_STATUS_var_var
0,100001,7,-735.0,-49,-1572,-5145,240043.666667,7,0.0,0,...,1.0,0.0,1.0,0.142857,6,0.012821,0.076923,0.0,0.076923,0.000986
1,100002,8,-874.0,-103,-1437,-6992,186150.0,8,0.0,0,...,6.0,0.0,27.0,8.267857,8,0.182234,0.333333,0.0,1.457875,0.019663
2,100003,4,-1400.75,-606,-2586,-5603,827783.583333,4,0.0,0,...,,,0.0,,0,,,,0.0,
3,100004,2,-867.0,-408,-1326,-1734,421362.0,2,0.0,0,...,,,0.0,,0,,,,0.0,
4,100005,3,-190.666667,-62,-373,-572,26340.333333,3,0.0,0,...,0.0,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0


In [38]:
# Aggregate the one-hot encoded columns to one row per SK_ID_CURR. SK_ID_BUREAU
# is dropped first so the per-credit id is not aggregated as a feature.
bureau_agg_cat = agg_categorical(df_bureau_merged.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau_merged')
bureau_agg_cat.head()

Unnamed: 0,SK_ID_CURR,bureau_merged_CREDIT_ACTIVE_Active_count,bureau_merged_CREDIT_ACTIVE_Active_sum,bureau_merged_CREDIT_ACTIVE_Active_mean,bureau_merged_CREDIT_ACTIVE_Bad debt_count,bureau_merged_CREDIT_ACTIVE_Bad debt_sum,bureau_merged_CREDIT_ACTIVE_Bad debt_mean,bureau_merged_CREDIT_ACTIVE_Closed_count,bureau_merged_CREDIT_ACTIVE_Closed_sum,bureau_merged_CREDIT_ACTIVE_Closed_mean,...,bureau_merged_CREDIT_TYPE_Mobile operator loan_mean,bureau_merged_CREDIT_TYPE_Mortgage_count,bureau_merged_CREDIT_TYPE_Mortgage_sum,bureau_merged_CREDIT_TYPE_Mortgage_mean,bureau_merged_CREDIT_TYPE_Real estate loan_count,bureau_merged_CREDIT_TYPE_Real estate loan_sum,bureau_merged_CREDIT_TYPE_Real estate loan_mean,bureau_merged_CREDIT_TYPE_Unknown type of loan_count,bureau_merged_CREDIT_TYPE_Unknown type of loan_sum,bureau_merged_CREDIT_TYPE_Unknown type of loan_mean
0,100001,7,3,0.428571,7,0,0.0,7,4,0.571429,...,0.0,7,0,0.0,7,0,0.0,7,0,0.0
1,100002,8,2,0.25,8,0,0.0,8,6,0.75,...,0.0,8,0,0.0,8,0,0.0,8,0,0.0
2,100003,4,1,0.25,4,0,0.0,4,3,0.75,...,0.0,4,0,0.0,4,0,0.0,4,0,0.0
3,100004,2,0,0.0,2,0,0.0,2,2,1.0,...,0.0,2,0,0.0,2,0,0.0,2,0,0.0
4,100005,3,2,0.666667,3,0,0.0,3,1,0.333333,...,0.0,3,0,0.0,3,0,0.0,3,0,0.0


In [39]:
# Missing-value report for the categorical aggregates: absolute count and
# percentage per column, most affected columns first.
null_counts = bureau_agg_cat.isnull().sum()
total_nan = null_counts.sort_values(ascending=False)
percent_nan = (null_counts / len(bureau_agg_cat) * 100).sort_values(ascending=False)
missing_bureau_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_bureau_data.head()

Unnamed: 0,Total_nan,Percent_nan
bureau_merged_CREDIT_TYPE_Unknown type of loan_mean,0,0.0
bureau_merged_CREDIT_TYPE_Another type of loan_count,0,0.0
bureau_merged_CREDIT_CURRENCY_currency 3_count,0,0.0
bureau_merged_CREDIT_CURRENCY_currency 3_sum,0,0.0
bureau_merged_CREDIT_CURRENCY_currency 3_mean,0,0.0


In [40]:
# Missing-value report for the numeric aggregates: absolute count and
# percentage per column, most affected columns first.
null_counts = bureau_agg_num.isnull().sum()
total_nan = null_counts.sort_values(ascending=False)
percent_nan = (null_counts / len(bureau_agg_num) * 100).sort_values(ascending=False)
missing_bureau_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_bureau_data.head(20)

Unnamed: 0,Total_nan,Percent_nan
bureau_merged_AMT_ANNUITY_var,213412,69.785587
bureau_merged_bureau_balance_STATUS_var_var,195586,63.956496
bureau_merged_bureau_balance_STATUS_min_var,193936,63.416947
bureau_merged_bureau_balance_STATUS_mean_var,193936,63.416947
bureau_merged_bureau_balance_STATUS_max_var,193936,63.416947
bureau_merged_bureau_balance_MONTHS_BALANCE_var_var,189307,61.903267
bureau_merged_bureau_balance_MONTHS_BALANCE_min_var,188909,61.773121
bureau_merged_bureau_balance_STATUS_count_var,188909,61.773121
bureau_merged_bureau_balance_MONTHS_BALANCE_sum_var,188909,61.773121
bureau_merged_bureau_balance_MONTHS_BALANCE_count_var,188909,61.773121


#### We can use the following code to save the DataFrames as CSV files.

In [41]:
# bureau_agg_cat.to_csv('bureau_agg_cat.csv', index=False)

In [42]:
# bureau_agg_num.to_csv('bureau_agg_num.csv', index=False)

In [None]:
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")
# pd.get_option("display.max_rows")