In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Disabling warnings
import warnings
warnings.simplefilter("ignore")

In [12]:
def read_csv(path):
    """Load the CSV file at ``path`` into a data frame and return a copy of it.

    The file is decoded with the 'unicode_escape' codec.
    """
    return pd.read_csv(path, encoding='unicode_escape').copy()

# Load the previous-applications table; the relative path means the file is
# looked up in the notebook's working directory.
previous_application = read_csv("previous_application.csv")

In [13]:
# All cleaning and feature engineering below runs on this copy, so the frame
# returned by read_csv stays as loaded.
df_previous_application = previous_application.copy()

## Summary Info
- Previous_application
    - RangeIndex: 1,670,214 entries, 0 to 1,670,213
    - Data columns (total 37 columns)
    - dtypes: float64(15), int64(6), object(16)

In [14]:
# pd.options.display.max_rows = None
# pd.options.display.max_columns = None

## Previous Application

In [15]:
# Preview the first rows of the raw table before any cleaning.
df_previous_application.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [16]:
# Column dtypes and non-null counts of the raw table.
df_previous_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 37 columns):
SK_ID_PREV                     1670214 non-null int64
SK_ID_CURR                     1670214 non-null int64
NAME_CONTRACT_TYPE             1670214 non-null object
AMT_ANNUITY                    1297979 non-null float64
AMT_APPLICATION                1670214 non-null float64
AMT_CREDIT                     1670213 non-null float64
AMT_DOWN_PAYMENT               774370 non-null float64
AMT_GOODS_PRICE                1284699 non-null float64
WEEKDAY_APPR_PROCESS_START     1670214 non-null object
HOUR_APPR_PROCESS_START        1670214 non-null int64
FLAG_LAST_APPL_PER_CONTRACT    1670214 non-null object
NFLAG_LAST_APPL_IN_DAY         1670214 non-null int64
RATE_DOWN_PAYMENT              774370 non-null float64
RATE_INTEREST_PRIMARY          5951 non-null float64
RATE_INTEREST_PRIVILEGED       5951 non-null float64
NAME_CASH_LOAN_PURPOSE         1670214 non-null object
NAME_CONTRA

In [17]:
# Missing values per column in the raw table: absolute count and share of rows.
nan_counts = df_previous_application.isna().sum()
total_nan = nan_counts.sort_values(ascending=False)
percent_nan = (nan_counts / len(df_previous_application) * 100).sort_values(ascending=False)
missing_previous_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_previous_data.head(17)

Unnamed: 0,Total_nan,Percent_nan
RATE_INTEREST_PRIVILEGED,1664263,99.643698
RATE_INTEREST_PRIMARY,1664263,99.643698
RATE_DOWN_PAYMENT,895844,53.63648
AMT_DOWN_PAYMENT,895844,53.63648
NAME_TYPE_SUITE,820405,49.119754
DAYS_TERMINATION,673065,40.298129
NFLAG_INSURED_ON_APPROVAL,673065,40.298129
DAYS_FIRST_DRAWING,673065,40.298129
DAYS_FIRST_DUE,673065,40.298129
DAYS_LAST_DUE_1ST_VERSION,673065,40.298129


### 365,243 values and NaN values
- The value 365,243, which shows up as the maximum in DAYS_FIRST_DRAWING, DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE and DAYS_TERMINATION, will be replaced with NaN. One of the SK_ID_CURR values is also 365243, so SK_ID_CURR has to be left out of the replacement.
- NaN values in AMT_DOWN_PAYMENT and RATE_DOWN_PAYMENT will be replaced with 0, because a down payment exists only for POS loans.
- NaN values in NAME_TYPE_SUITE will be replaced with the "Unknown" category.
- DAYS_FIRST_DRAWING, DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE and DAYS_TERMINATION contain the value 365,243, an automated "infinite" placeholder assigned by the system. These need to be replaced by NaN. However, there is one genuine 365243 value in SK_ID_CURR, so it would be misleading to replace every 365243 in the table with NaN.

In [18]:
# 365243 is the placeholder found in these day-offset columns; it is turned
# into NaN here. The replacement is limited to these columns because the same
# number also occurs as a real id in SK_ID_CURR.
sentinel_day_columns = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                        'DAYS_LAST_DUE', 'DAYS_TERMINATION']
df_previous_application[sentinel_day_columns] = (
    df_previous_application[sentinel_day_columns].replace(365243, np.nan)
)

# The filled columns are assigned back to the frame. Calling
# fillna(..., inplace=True) on a column obtained via attribute access is
# chained assignment: with pandas Copy-on-Write it only changes a temporary
# Series and leaves the frame untouched (and the warning about it is silenced
# by the global warnings filter in this notebook).
df_previous_application['AMT_DOWN_PAYMENT'] = df_previous_application['AMT_DOWN_PAYMENT'].fillna(0)
df_previous_application['RATE_DOWN_PAYMENT'] = df_previous_application['RATE_DOWN_PAYMENT'].fillna(0)
df_previous_application['NAME_TYPE_SUITE'] = df_previous_application['NAME_TYPE_SUITE'].fillna('Unknown')

### XAP and XNA values
- Besides NaN values, we have a lot of XNA and XAP values, particularly in categorical variables. We need to handle these values and replace them with appropriate ones.
- NAME_CONTRACT_TYPE: 
    - 346 XNA; Unknown
- NAME_CASH_LOAN_PURPOSE:
    - 677,918 XNA; Unknown
    - 922,661 XAP; Unknown
- NAME_PAYMENT_TYPE
    - 627,384 XNA; Unknown
- CODE_REJECT_REASON
    - 5244 XNA; Other
    - 1,353,093 XAP; Not refused (it is not applicable because we have 290,678 refused and 26,436 Unused offer in NAME_CONTRACT_STATUS. The rest of the credit applications fall into other categories and they are not refused.)
- NAME_CLIENT_TYPE
    - 1941 XNA; Unknown
- NAME_GOODS_CATEGORY
    - 950,809 XNA; Unknown (We cannot know the goods category, especially for Cash Loans and Revolving Loans.)
- NAME_PORTFOLIO
    - 372,230 XNA; NaN
- NAME_PRODUCT_TYPE
    - 1,063,666 XNA; Unknown
- NAME_SELLER_INDUSTRY
    - 855,720 XNA; Unknown
- NAME_YIELD_GROUP
    - 517,215 XNA; 0 (low_action and low_normal = 1, middle = 3, high = 4)

In [19]:
# Recode the 'XNA' / 'XAP' placeholders column by column. Every result is
# assigned back to the frame: replace(..., inplace=True) on a column obtained
# via attribute access is chained assignment and does not reach the parent
# frame when pandas Copy-on-Write is active.
placeholder_recodes = {
    'NAME_CONTRACT_TYPE': {'XNA': 'Unknown'},
    'NAME_CASH_LOAN_PURPOSE': {'XNA': 'Unknown', 'XAP': 'Unknown'},
    'NAME_PAYMENT_TYPE': {'XNA': 'Unknown'},
    'CODE_REJECT_REASON': {'XAP': 'Not refused', 'XNA': 'Other'},
    'NAME_CLIENT_TYPE': {'XNA': 'Unknown'},
    'NAME_GOODS_CATEGORY': {'XNA': 'Unknown'},
    'NAME_PORTFOLIO': {'XNA': np.nan},
    'NAME_PRODUCT_TYPE': {'XNA': 'Unknown'},
    'NAME_SELLER_INDUSTRY': {'XNA': 'Unknown'},
}
for column, recode in placeholder_recodes.items():
    df_previous_application[column] = df_previous_application[column].replace(recode)

# NAME_YIELD_GROUP becomes an ordinal code. infer_objects() turns the recoded
# column into an integer column when every label was mapped, instead of relying
# on replace() to downcast the object column by itself; labels that are not in
# the mapping are kept as they are.
yield_group_codes = {'XNA': 0, 'low_normal': 1, 'low_action': 1, 'middle': 3, 'high': 4}
df_previous_application['NAME_YIELD_GROUP'] = (
    df_previous_application['NAME_YIELD_GROUP'].replace(yield_group_codes).infer_objects()
)

In [20]:
# for i in df_previous_application.columns:
#     print(i)
#     print(df_previous_application[i][df_previous_application[i]=="XNA"].count())
#     print(df_previous_application[i][df_previous_application[i]=="XAP"].count())

In [21]:
# Missing values per column after the sentinel / XNA / XAP clean-up.
nan_counts = df_previous_application.isna().sum()
total_nan = nan_counts.sort_values(ascending=False)
percent_nan = (nan_counts / len(df_previous_application) * 100).sort_values(ascending=False)
missing_previous_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_previous_data.head(17)

Unnamed: 0,Total_nan,Percent_nan
RATE_INTEREST_PRIVILEGED,1664263,99.643698
RATE_INTEREST_PRIMARY,1664263,99.643698
DAYS_FIRST_DRAWING,1607509,96.245691
DAYS_TERMINATION,898978,53.824121
DAYS_LAST_DUE,884286,52.944473
DAYS_LAST_DUE_1ST_VERSION,766929,45.918008
DAYS_FIRST_DUE,713710,42.73165
NFLAG_INSURED_ON_APPROVAL,673065,40.298129
AMT_GOODS_PRICE,385515,23.081773
AMT_ANNUITY,372235,22.286665


#### We drop the 3 features that have more than 90% NaN values after the NaN, XAP, XNA and 365243 arrangements.

In [22]:
# Columns that are more than 90% NaN in the missing-value summary.
mostly_missing_columns = ['RATE_INTEREST_PRIVILEGED', 'RATE_INTEREST_PRIMARY', 'DAYS_FIRST_DRAWING']
df_previous_application.drop(columns=mostly_missing_columns, inplace=True)

### New Features
- PREVIOUS_TERM = AMT_CREDIT / AMT_ANNUITY #Term of the credit
- PREVIOUS_AMT_TO_APPLICATION = AMT_CREDIT / AMT_APPLICATION #Rate of credit amount to application amount
- PREVIOUS_CREDIT_TO_PRICE = AMT_GOODS_PRICE / AMT_CREDIT #Rate of goods price to credit amount
- PREVIOUS_DAYSLASTDUE1ST_DAYSFIRSTDUE_DIFF = DAYS_LAST_DUE_1ST_VERSION - DAYS_FIRST_DUE #Days difference between last due date 1st version and first due date
- PREVIOUS_DAYSLASTDUE_DAYSFIRSTDUE_DIFF = DAYS_LAST_DUE - DAYS_FIRST_DUE #Days difference between last due date and first due date
- PREVIOUS_DAYSLASTDUE_DAYSLASTDUE1ST_DIFF = DAYS_LAST_DUE - DAYS_LAST_DUE_1ST_VERSION #Days difference between last due date 1st version and last due date
- PREVIOUS_TERMINATION_DAYSLASTDUE_DIFF = DAYS_TERMINATION - DAYS_LAST_DUE #Days difference between termination and last due date

In [23]:
# Ratio features: (new column, numerator, denominator)
ratio_features = [
    ('PREVIOUS_TERM', 'AMT_CREDIT', 'AMT_ANNUITY'),
    ('PREVIOUS_AMT_TO_APPLICATION', 'AMT_CREDIT', 'AMT_APPLICATION'),
    ('PREVIOUS_CREDIT_TO_PRICE', 'AMT_GOODS_PRICE', 'AMT_CREDIT'),
]
for new_col, numerator, denominator in ratio_features:
    df_previous_application[new_col] = (
        df_previous_application[numerator] / df_previous_application[denominator]
    )

# Day-gap features: (new column, minuend, subtrahend)
day_gap_features = [
    ('PREVIOUS_DAYSLASTDUE1ST_DAYSFIRSTDUE_DIFF', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_FIRST_DUE'),
    ('PREVIOUS_DAYSLASTDUE_DAYSFIRSTDUE_DIFF', 'DAYS_LAST_DUE', 'DAYS_FIRST_DUE'),
    ('PREVIOUS_DAYSLASTDUE_DAYSLASTDUE1ST_DIFF', 'DAYS_LAST_DUE', 'DAYS_LAST_DUE_1ST_VERSION'),
    ('PREVIOUS_TERMINATION_DAYSLASTDUE_DIFF', 'DAYS_TERMINATION', 'DAYS_LAST_DUE'),
]
for new_col, minuend, subtrahend in day_gap_features:
    df_previous_application[new_col] = (
        df_previous_application[minuend] - df_previous_application[subtrahend]
    )

#### Interest Rate Calculations
- We use number of payments, annuity and credit to calculate interest amount, interest rate and share of interest amount in credit. 
- The interest calculations are taken from the kaggle write-up of @kingychiu. 
- https://www.kaggle.com/c/home-credit-default-risk/discussion/64598

In [24]:
# Interest paid over the whole schedule = total of all annuities minus the credit.
n_payments = df_previous_application['CNT_PAYMENT']
annuity = df_previous_application['AMT_ANNUITY']
credit = df_previous_application['AMT_CREDIT']

interest = n_payments * annuity - credit
df_previous_application['INTEREST'] = interest
df_previous_application['INTEREST_RATE'] = 2 * 12 * interest / (credit * (n_payments + 1))
df_previous_application['INTEREST_SHARE'] = interest / credit

#### Some of the new features are created by division, so whenever the divisor is 0 and the dividend is non-zero we get positive or negative infinite values. We need to replace them with NaN.

In [25]:
# Division by zero in the ratio features leaves +/-inf behind; store it as NaN.
infinite_values = [np.inf, -np.inf]
df_previous_application.replace(to_replace=infinite_values, value=np.nan, inplace=True)

In [26]:
# Missing values per column once the engineered features have been added.
nan_counts = df_previous_application.isna().sum()
total_nan = nan_counts.sort_values(ascending=False)
percent_nan = (nan_counts / len(df_previous_application) * 100).sort_values(ascending=False)
missing_previous_data = pd.concat(
    [total_nan, percent_nan],
    axis=1,
    keys=['Total_nan', 'Percent_nan'],
)
missing_previous_data.head(20)

Unnamed: 0,Total_nan,Percent_nan
PREVIOUS_DAYSLASTDUE_DAYSLASTDUE1ST_DIFF,908297,54.382073
PREVIOUS_TERMINATION_DAYSLASTDUE_DIFF,904090,54.130189
DAYS_TERMINATION,898978,53.824121
PREVIOUS_DAYSLASTDUE_DAYSFIRSTDUE_DIFF,884286,52.944473
DAYS_LAST_DUE,884286,52.944473
PREVIOUS_DAYSLASTDUE1ST_DAYSFIRSTDUE_DIFF,774193,46.352922
DAYS_LAST_DUE_1ST_VERSION,766929,45.918008
DAYS_FIRST_DUE,713710,42.73165
NFLAG_INSURED_ON_APPROVAL,673065,40.298129
PREVIOUS_AMT_TO_APPLICATION,392402,23.494115


### Categorical Features
- FLAG_LAST_APPL_PER_CONTRACT; binary encoding, the rest one-hot encoding.

In [27]:
# Binary encoding: 'Y' -> 1, anything else (including missing values) -> 0.
is_last_application = df_previous_application['FLAG_LAST_APPL_PER_CONTRACT'] == 'Y'
df_previous_application['FLAG_LAST_APPL_PER_CONTRACT'] = is_last_application.astype('int64')

In [28]:
# One-hot encode every remaining object column.
previous_cat = df_previous_application.select_dtypes(include='object').columns
df_previous_application[previous_cat] = df_previous_application[previous_cat].astype('category')
# The dummy dtype is pinned to uint8 because agg_numeric / agg_categorical
# tell dummy columns apart from numeric ones by that dtype. Without it the
# result depends on the pandas version (get_dummies returns bool by default
# from pandas 2.0 on), and the dummies would be aggregated as numeric columns.
df_previous_application = pd.get_dummies(df_previous_application, columns=previous_cat, dtype=np.uint8)
df_previous_application.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,...,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest
0,2030495,271877,1730.43,17145.0,17145.0,0.0,17145.0,15,1,1,...,0,0,0,0,0,0,1,0,0,0
1,2802425,108129,25188.615,607500.0,679671.0,0.0,607500.0,11,1,1,...,1,0,0,0,0,0,0,0,0,0
2,2523466,122040,15060.735,112500.0,136444.5,0.0,112500.0,11,1,1,...,0,0,0,0,0,0,0,0,0,0
3,2819243,176158,47041.335,450000.0,470790.0,0.0,450000.0,7,1,1,...,0,1,0,0,0,0,0,0,0,0
4,1784265,202054,31924.395,337500.0,404055.0,0.0,337500.0,9,1,1,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Shape, dtypes and memory footprint after one-hot encoding.
df_previous_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Columns: 165 entries, SK_ID_PREV to PRODUCT_COMBINATION_POS others without interest
dtypes: float64(22), int64(8), uint8(135)
memory usage: 597.3 MB


### Imputing NaN Values (optional)
- We can use YCImpute package to impute NaN values. EM algorithm gives result in a shorter time. KNN is time consuming and Iterforest gives error. 
- A function is formed for imputation. We need to convert the dataframe into an array and the values become float. We add lines to keep the datatypes same as before the method.
- __Note__: Imputing NaN values with this algorithm decreases the AUC score compared to the data set without imputation. Different algorithms, or alternatively the sklearn tools, can be used and compared, but this is very time consuming with large data.

In [30]:
# from ycimpute.imputer import iterforest
# from ycimpute.imputer import EM
# from ycimpute.imputer import knnimput

# def nan_imputer(df):
#     int_columns = df.select_dtypes(include='int64').columns
#     cat_columns = df.select_dtypes(include='uint8').columns
#     var_names = df.columns
#     np_df = np.array(df)
#     df = EM().complete(np_df)
#     df = pd.DataFrame(df, columns = var_names)
#     df[int_columns] = df[int_columns].astype('int64')
#     df[cat_columns] = df[cat_columns].astype('uint8')
#     return df
# num_columns = df_previous_application.select_dtypes(exclude='uint8').columns
# df_previous_application[num_columns] = nan_imputer(df_previous_application[num_columns])

In [31]:
# display(df_previous_application.head())
# display(df_previous_application.info())
# display(df_previous_application.isnull().sum())

## Aggregation
- We use two functions for aggregation, one for numerical features and one for categorical features, because it is not reasonable to calculate some of the statistics for categorical variables.
- We use groupby method for aggregations in the functions. 
- This function is taken from kaggle kernel of @willkoehrsen.
- https://www.kaggle.com/willkoehrsen/introduction-to-manual-feature-engineering
- We aggregate the observation based on SK_ID_CURR since we may have more than one previous credit used by an applicant.

In [32]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Every column whose dtype is not uint8 is treated as numeric; uint8 columns
    (the one-hot dummies) are left to ``agg_categorical``.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with one row per distinct value of the grouping
            variable. The first column is the grouping variable; it is
            followed, for every aggregated column, by the statistics
            (count, mean, max, min, sum, var) in that order, named
            ``<df_name>_<column>_<stat>``.

    """
    # Remove id variables other than grouping variable
    other_id_cols = [col for col in df.columns if col != group_var and 'SK_ID' in col]

    # The grouping column is taken out of the values and handed to groupby as
    # the key, so it is never aggregated itself and nothing has to be assigned
    # onto the frame returned by select_dtypes.
    numeric_df = (
        df.drop(columns=other_id_cols)
          .select_dtypes(exclude=['uint8'])
          .drop(columns=[group_var], errors='ignore')
    )

    # Group by the specified variable and calculate the statistics
    stats = ['count', 'mean', 'max', 'min', 'sum', 'var']
    agg = numeric_df.groupby(df[group_var]).agg(stats)

    # Build each name from the column's own (variable, stat) pair. Walking
    # agg.columns.levels instead would depend on how the MultiIndex happens to
    # store its levels, which need not match the order of the columns.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]
    return agg.reset_index()

In [33]:
def agg_categorical(df, group_var, df_name):
    """Aggregates the encoded categorical values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Only uint8 columns (the one-hot dummies) are aggregated.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with one row per distinct value of the grouping
            variable. The first column is the grouping variable; it is
            followed, for every uint8 column, by the statistics
            (count, sum, mean) in that order, named
            ``<df_name>_<column>_<stat>``.

    """
    # Remove id variables other than grouping variable
    other_id_cols = [col for col in df.columns if col != group_var and 'SK_ID' in col]

    # The grouping column is handed to groupby as the key rather than being
    # assigned onto the frame returned by select_dtypes; it is dropped from the
    # values in case it is itself stored as uint8.
    categorical_df = (
        df.drop(columns=other_id_cols)
          .select_dtypes(include=['uint8'])
          .drop(columns=[group_var], errors='ignore')
    )

    # Group by the specified variable and calculate the statistics
    stats = ['count', 'sum', 'mean']
    agg = categorical_df.groupby(df[group_var]).agg(stats)

    # Build each name from the column's own (variable, stat) pair. Walking
    # agg.columns.levels instead would depend on how the MultiIndex happens to
    # store its levels, which need not match the order of the columns.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]
    return agg.reset_index()

### Previous Aggregation

In [34]:
# Per-applicant statistics of the numeric columns of the previous applications.
previous_agg_num = agg_numeric(
    df_previous_application.drop(columns=['SK_ID_PREV']),
    'SK_ID_CURR',
    'previous',
)
previous_agg_num.head()

Unnamed: 0,SK_ID_CURR,previous_AMT_ANNUITY_count,previous_AMT_ANNUITY_mean,previous_AMT_ANNUITY_max,previous_AMT_ANNUITY_min,previous_AMT_ANNUITY_sum,previous_AMT_ANNUITY_var,previous_AMT_APPLICATION_count,previous_AMT_APPLICATION_mean,previous_AMT_APPLICATION_max,...,previous_INTEREST_RATE_max,previous_INTEREST_RATE_min,previous_INTEREST_RATE_sum,previous_INTEREST_RATE_var,previous_INTEREST_SHARE_count,previous_INTEREST_SHARE_mean,previous_INTEREST_SHARE_max,previous_INTEREST_SHARE_min,previous_INTEREST_SHARE_sum,previous_INTEREST_SHARE_var
0,100001,1,3951.0,3951.0,3951.0,3951.0,,1,24835.5,24835.5,...,0.876781,0.876781,0.876781,,1,0.328793,0.328793,0.328793,0.328793,
1,100002,1,9251.775,9251.775,9251.775,9251.775,,1,179055.0,179055.0,...,0.230477,0.230477,0.230477,,1,0.24008,0.24008,0.24008,0.24008,
2,100003,3,56553.99,98356.995,6737.31,169661.97,2146706000.0,3,435436.5,900000.0,...,0.381257,0.257354,0.985692,0.004095,3,0.146201,0.188002,0.1112,0.438602,0.001509
3,100004,1,5357.25,5357.25,5357.25,5357.25,,1,24282.0,24282.0,...,0.315846,0.315846,0.315846,,1,0.065801,0.065801,0.065801,0.065801,
4,100005,1,4813.2,4813.2,4813.2,4813.2,,2,22308.75,44617.5,...,0.809428,0.809428,0.809428,,1,0.43844,0.43844,0.43844,0.43844,


In [35]:
# Per-applicant statistics of the one-hot encoded categorical columns.
previous_agg_cat = agg_categorical(
    df_previous_application.drop(columns=['SK_ID_PREV']),
    'SK_ID_CURR',
    'previous',
)
previous_agg_cat.head()

Unnamed: 0,SK_ID_CURR,previous_NAME_CONTRACT_TYPE_Cash loans_count,previous_NAME_CONTRACT_TYPE_Cash loans_sum,previous_NAME_CONTRACT_TYPE_Cash loans_mean,previous_NAME_CONTRACT_TYPE_Consumer loans_count,previous_NAME_CONTRACT_TYPE_Consumer loans_sum,previous_NAME_CONTRACT_TYPE_Consumer loans_mean,previous_NAME_CONTRACT_TYPE_Revolving loans_count,previous_NAME_CONTRACT_TYPE_Revolving loans_sum,previous_NAME_CONTRACT_TYPE_Revolving loans_mean,...,previous_PRODUCT_COMBINATION_POS mobile with interest_mean,previous_PRODUCT_COMBINATION_POS mobile without interest_count,previous_PRODUCT_COMBINATION_POS mobile without interest_sum,previous_PRODUCT_COMBINATION_POS mobile without interest_mean,previous_PRODUCT_COMBINATION_POS other with interest_count,previous_PRODUCT_COMBINATION_POS other with interest_sum,previous_PRODUCT_COMBINATION_POS other with interest_mean,previous_PRODUCT_COMBINATION_POS others without interest_count,previous_PRODUCT_COMBINATION_POS others without interest_sum,previous_PRODUCT_COMBINATION_POS others without interest_mean
0,100001,1,0,0.0,1,1,1.0,1,0,0.0,...,1.0,1,0,0.0,1,0,0.0,1,0,0.0
1,100002,1,0,0.0,1,1,1.0,1,0,0.0,...,0.0,1,0,0.0,1,1,1.0,1,0,0.0
2,100003,3,1,0.333333,3,2,0.666667,3,0,0.0,...,0.0,3,0,0.0,3,0,0.0,3,0,0.0
3,100004,1,0,0.0,1,1,1.0,1,0,0.0,...,0.0,1,1,1.0,1,0,0.0,1,0,0.0
4,100005,2,1,0.5,2,1,0.5,2,0,0.0,...,0.5,2,0,0.0,2,0,0.0,2,0,0.0


In [2]:
# previous_agg_cat.to_csv('previous_agg_cat.csv', index=False)

In [1]:
# previous_agg_num.to_csv('previous_agg_num.csv', index=False)

In [5]:
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")
# pd.get_option("display.max_rows")

60