In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For data visualization
import matplotlib.pyplot as plt
import matplotlib as mpl

# Disabling warnings
import warnings
warnings.simplefilter("ignore")

In [2]:
def read_csv(path, encoding='unicode_escape'):
    """Read a CSV file into a new pandas data frame.

    Parameters
    --------
        path (string or path-like):
            location of the CSV file, handed straight to ``pd.read_csv``
        encoding (string):
            text encoding used to decode the file; defaults to
            'unicode_escape', the value this notebook has always used

    Return
    --------
        data (dataframe):
            the parsed file. ``pd.read_csv`` builds a fresh frame on every
            call, so the caller already owns the result.

    """
    # No trailing .copy(): the frame returned by pd.read_csv is not shared
    # with anything else, and duplicating a multi-million-row table only
    # doubles peak memory.
    return pd.read_csv(path, encoding=encoding)

# Load the raw instalment-payment records; the relative path is resolved against the working directory.
installments_payments = read_csv("installments_payments.csv")

In [3]:
# Work on a copy so the frame loaded from disk stays unchanged while features are added.
df_installments_payments = installments_payments.copy()

## Summary Info
- Installments_payments
    - RangeIndex: 13,605,401 entries, 0 to 13,605,400
    - Data columns (total 8 columns)
    - dtypes: float64(5), int64(3)

In [4]:
# pd.options.display.max_rows = None
# pd.options.display.max_columns = None

## Installments Payments

In [5]:
# Preview the first rows of the working frame before any feature is added.
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


### New Features
- INSTALLMENTS_DAYS_DIFF = DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT # Difference between the supposed and the actual payment date (positive when DAYS_ENTRY_PAYMENT is smaller than DAYS_INSTALMENT).
- INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT = AMT_INSTALMENT / AMT_PAYMENT # Ratio of the supposed amount to the paid amount.
- INSTALLMENTS_DAYS_DIFF_LASTYR # The same date difference, kept only for payments entered within the last year (DAYS_ENTRY_PAYMENT >= -365); NaN otherwise.

In [6]:
# Supposed payment day minus the day the payment was actually entered.
supposed_day = df_installments_payments['DAYS_INSTALMENT']
actual_day = df_installments_payments['DAYS_ENTRY_PAYMENT']
df_installments_payments['INSTALLMENTS_DAYS_DIFF'] = supposed_day - actual_day

# Supposed instalment amount relative to the amount that was paid.
supposed_amount = df_installments_payments['AMT_INSTALMENT']
paid_amount = df_installments_payments['AMT_PAYMENT']
df_installments_payments['INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT'] = supposed_amount / paid_amount

In [7]:
# Keep the entry day only where it is >= -365; older or missing entry days
# become NaN, so the difference below is NaN for those rows as well.
# Series.where does this in a single vectorised pass, which replaces a
# Python-level loop over every row plus the temporary helper column (and its
# in-place drop) that the loop required.
entry_day_last_year = df_installments_payments['DAYS_ENTRY_PAYMENT'].where(
    df_installments_payments['DAYS_ENTRY_PAYMENT'] >= -365
)
df_installments_payments['INSTALLMENTS_DAYS_DIFF_LASTYR'] = (
    df_installments_payments['DAYS_INSTALMENT'] - entry_day_last_year
)
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,INSTALLMENTS_DAYS_DIFF,INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT,INSTALLMENTS_DAYS_DIFF_LASTYR
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,7.0,1.0,
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525,0.0,1.0,
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0,0.0,1.0,0.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13,8.0,1.0,
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585,-17.0,1.002062,


In [8]:
# Number of missing values per column after adding the new features.
df_installments_payments.isnull().sum()

SK_ID_PREV                                        0
SK_ID_CURR                                        0
NUM_INSTALMENT_VERSION                            0
NUM_INSTALMENT_NUMBER                             0
DAYS_INSTALMENT                                   0
DAYS_ENTRY_PAYMENT                             2905
AMT_INSTALMENT                                    0
AMT_PAYMENT                                    2905
INSTALLMENTS_DAYS_DIFF                         2905
INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT        2907
INSTALLMENTS_DAYS_DIFF_LASTYR              10247450
dtype: int64

In [9]:
# Column dtypes and memory footprint of the enriched frame.
df_installments_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 11 columns):
SK_ID_PREV                                 int64
SK_ID_CURR                                 int64
NUM_INSTALMENT_VERSION                     float64
NUM_INSTALMENT_NUMBER                      int64
DAYS_INSTALMENT                            float64
DAYS_ENTRY_PAYMENT                         float64
AMT_INSTALMENT                             float64
AMT_PAYMENT                                float64
INSTALLMENTS_DAYS_DIFF                     float64
INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT    float64
INSTALLMENTS_DAYS_DIFF_LASTYR              float64
dtypes: float64(8), int64(3)
memory usage: 1.1 GB


In [10]:
# Treat infinite values as missing. NOTE(review): these presumably come from the
# AMT_INSTALMENT / AMT_PAYMENT ratio when AMT_PAYMENT is zero -- confirm.
df_installments_payments.replace([np.inf, -np.inf], np.nan,inplace=True)

In [11]:
# Count the missing values once and derive both summary columns from that
# single result, instead of rebuilding the full boolean null mask three times.
null_counts = df_installments_payments.isnull().sum()
total_nan = null_counts.sort_values(ascending = False)
# Every column holds len(frame) entries, so dividing by the row count is the
# same as dividing by isnull().count().
percent_nan = (null_counts / len(df_installments_payments) * 100).sort_values(ascending = False)
missing_installments  = pd.concat([total_nan, percent_nan], axis=1, keys=['Total_nan', 'Percent_nan'])
missing_installments

Unnamed: 0,Total_nan,Percent_nan
INSTALLMENTS_DAYS_DIFF_LASTYR,10247450,75.318985
INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT,4345,0.031936
INSTALLMENTS_DAYS_DIFF,2905,0.021352
AMT_PAYMENT,2905,0.021352
DAYS_ENTRY_PAYMENT,2905,0.021352
AMT_INSTALMENT,0,0.0
DAYS_INSTALMENT,0,0.0
NUM_INSTALMENT_NUMBER,0,0.0
NUM_INSTALMENT_VERSION,0,0.0
SK_ID_CURR,0,0.0


In [12]:
import gc
# Run a garbage-collection pass before the aggregation step; the displayed
# number is the count of unreachable objects found.
gc.collect()

135

## Aggregation
- We use two functions for aggregation, one for numerical features and one for categorical features, because some statistics are not meaningful for categorical variables.
- Both functions aggregate with the `groupby` method.
- We aggregate observations by SK_ID_CURR, since each applicant may have multiple records.
- @willkoehrsen
- https://www.kaggle.com/willkoehrsen/introduction-to-manual-feature-engineering

In [14]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on; it is not modified
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with one row per value of the grouping variable.
            Every column whose dtype is not uint8 (uint8 columns are left to
            agg_categorical) gets the statistics
            (count, mean, max, min, sum, var) calculated.
            The columns are renamed to '<df_name>_<variable>_<stat>' to keep
            track of features created; the grouping column keeps its name.
    
    """
    # Remove id variables other than grouping variable. They are dropped in
    # one call so the frame is copied once rather than once per id column.
    id_cols = [col for col in df if col != group_var and 'SK_ID' in col]
    if id_cols:
        df = df.drop(columns = id_cols)

    numeric_df = df.select_dtypes(exclude=['uint8'])
    # The grouping column is only missing from the selection when it is
    # itself uint8; attach it then, without writing into the selected frame.
    if group_var not in numeric_df.columns:
        numeric_df = numeric_df.assign(**{group_var: df[group_var]})

    # Group by the specified variable and calculate the statistics
    stats = ['count', 'mean', 'max', 'min', 'sum', 'var']
    agg = numeric_df.groupby(group_var).agg(stats).reset_index()

    # Build every new name from the column's own (variable, stat) pair.
    # Reading the pairs off the columns themselves keeps each name attached
    # to the right data; the MultiIndex levels are not guaranteed to be
    # stored in column order.
    agg.columns = [
        group_var if var == group_var else '%s_%s_%s' % (df_name, var, stat)
        for var, stat in agg.columns
    ]
    return agg

In [15]:
def agg_categorical(df, group_var, df_name):
    """Aggregates the encoded categorical values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on; it is not modified
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with one row per value of the grouping variable.
            Every uint8 column (the dtype this function treats as an encoded
            categorical) gets the statistics (count, sum, mean) calculated.
            The columns are renamed to '<df_name>_<variable>_<stat>' to keep
            track of features created; the grouping column keeps its name.
    
    """
    # Remove id variables other than grouping variable. They are dropped in
    # one call so the frame is copied once rather than once per id column.
    id_cols = [col for col in df if col != group_var and 'SK_ID' in col]
    if id_cols:
        df = df.drop(columns = id_cols)

    categorical_df = df.select_dtypes(include=['uint8'])
    # The grouping column is dropped by the selection unless it is itself
    # uint8; attach it then, without writing into the selected frame.
    if group_var not in categorical_df.columns:
        categorical_df = categorical_df.assign(**{group_var: df[group_var]})

    # Group by the specified variable and calculate the statistics
    stats = ['count', 'sum', 'mean']
    agg = categorical_df.groupby(group_var).agg(stats).reset_index()

    # Build every new name from the column's own (variable, stat) pair.
    # Reading the pairs off the columns themselves keeps each name attached
    # to the right data; the MultiIndex levels are not guaranteed to be
    # stored in column order.
    agg.columns = [
        group_var if var == group_var else '%s_%s_%s' % (df_name, var, stat)
        for var, stat in agg.columns
    ]
    return agg

### Installments Payments Aggregation

In [16]:
# agg_numeric already removes every 'SK_ID' column other than the grouping
# variable, so SK_ID_PREV does not need to be dropped beforehand; passing the
# frame directly avoids one extra copy of the whole table.
installments_agg_num = agg_numeric(df_installments_payments, group_var = 'SK_ID_CURR', df_name = 'installments')
installments_agg_num.head()

Unnamed: 0,SK_ID_CURR,installments_NUM_INSTALMENT_VERSION_count,installments_NUM_INSTALMENT_VERSION_mean,installments_NUM_INSTALMENT_VERSION_max,installments_NUM_INSTALMENT_VERSION_min,installments_NUM_INSTALMENT_VERSION_sum,installments_NUM_INSTALMENT_VERSION_var,installments_NUM_INSTALMENT_NUMBER_count,installments_NUM_INSTALMENT_NUMBER_mean,installments_NUM_INSTALMENT_NUMBER_max,...,installments_INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT_max,installments_INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT_min,installments_INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT_sum,installments_INSTALLMENTS_AMT_INSTALLMENT_TO_PAYMENT_var,installments_INSTALLMENTS_DAYS_DIFF_LASTYR_count,installments_INSTALLMENTS_DAYS_DIFF_LASTYR_mean,installments_INSTALLMENTS_DAYS_DIFF_LASTYR_max,installments_INSTALLMENTS_DAYS_DIFF_LASTYR_min,installments_INSTALLMENTS_DAYS_DIFF_LASTYR_sum,installments_INSTALLMENTS_DAYS_DIFF_LASTYR_var
0,100001,7,1.142857,2.0,1.0,8.0,0.142857,7,2.714286,4,...,1.0,1.0,7.0,0.0,0,,,,0.0,
1,100002,19,1.052632,2.0,1.0,20.0,0.052632,19,10.0,19,...,1.0,1.0,19.0,0.0,11,17.363636,24.0,12.0,191.0,10.654545
2,100003,25,1.04,2.0,1.0,26.0,0.04,25,5.08,12,...,1.0,1.0,25.0,0.0,0,,,,0.0,
3,100004,3,1.333333,2.0,1.0,4.0,0.333333,3,2.0,3,...,1.0,1.0,3.0,0.0,0,,,,0.0,
4,100005,9,1.111111,2.0,1.0,10.0,0.111111,9,5.0,9,...,1.0,1.0,9.0,0.0,0,,,,0.0,


In [1]:
# installments_agg_num.to_csv('installments_agg_num.csv', index=False)

In [18]:
# Run another garbage-collection pass now that the aggregation is done.
gc.collect()

131

In [None]:
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")
# pd.get_option("display.max_rows")