In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which entries are present in the ../input directory.
print(os.listdir("../input"))
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
from matplotlib_venn import venn2
from matplotlib_venn import venn3
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import scipy
import itertools

['POS_CASH_balance.csv', 'bureau_balance.csv', 'application_train.csv', 'previous_application.csv', 'installments_payments.csv', 'credit_card_balance.csv', 'sample_submission.csv', 'application_test.csv', 'bureau.csv']


In [2]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Each plain-NumPy integer column (signed or unsigned) is cast to the
    narrowest integer type of the same signedness whose range contains the
    column's min and max (bounds inclusive).  Float columns wider than 32 bits
    are cast to float32 when their finite range fits.  Every other column
    (object, bool, datetime, categorical, pandas extension dtypes, float16,
    ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Prints the memory footprint before and after, plus the relative saving.
    float32 keeps only about 7 significant decimal digits, so values in
    downcast float columns may be rounded.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate targets, narrowest first, grouped by signedness.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int / uint / float columns can be safely downcast.
        # Casting bools to float would *grow* them, and min()/comparison on
        # datetimes, categoricals or extension dtypes may raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                # An empty column yields NaN here, so no candidate matches and
                # the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    if np.dtype(candidate) != col_type:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            # min()/max() skip NaN; an all-NaN or infinite column fails the
            # range check and therefore stays at its current width.
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-sized frame does not raise ZeroDivisionError.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
# Notebook-wide display settings.
# Show every column of wide frames and keep each row on a single line
# instead of wrapping the repr across several blocks.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

# Use the 'fivethirtyeight' matplotlib style sheet for all figures.
plt.style.use('fivethirtyeight')

In [4]:
# Read the previous-applications table from ../input and pass it straight
# through reduce_mem_usage so its numeric columns are downcast on load.
previous_application = reduce_mem_usage(pd.read_csv('../input/previous_application.csv'))

Memory usage of dataframe is 471.48 MB
Memory usage after optimization is: 324.94 MB
Decreased by 31.1%


In [5]:
# Aggregate the numeric columns of previous_application per client.
#
# * SK_ID_PREV is dropped first: it is a row identifier, not a feature.
# * select_dtypes keeps only numeric columns, because 'mean' and 'sum' are not
#   defined for the text columns and recent pandas versions raise instead of
#   silently skipping them.
# * The frame is grouped with the default as_index=True and reset_index() is
#   called exactly once, so SK_ID_CURR comes back as a single regular column
#   regardless of how the pandas version treats as_index with a list of
#   aggregation functions.
#
# The result has two-level columns: (variable, statistic).
# NOTE(review): the DAYS_* columns contain 365243 in the sample rows, which
# looks like a placeholder rather than a real day offset -- confirm before
# trusting their mean/sum.
previous_application_agg = (
    previous_application
    .drop('SK_ID_PREV', axis=1)
    .select_dtypes(include='number')
    .groupby('SK_ID_CURR')
    .agg(['count', 'mean', 'max', 'min', 'sum'])
    .reset_index()
)
previous_application_agg.head()

Unnamed: 0_level_0,SK_ID_CURR,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,AMT_ANNUITY,AMT_APPLICATION,AMT_APPLICATION,AMT_APPLICATION,AMT_APPLICATION,AMT_APPLICATION,AMT_CREDIT,AMT_CREDIT,AMT_CREDIT,AMT_CREDIT,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_DOWN_PAYMENT,AMT_DOWN_PAYMENT,AMT_DOWN_PAYMENT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,AMT_GOODS_PRICE,AMT_GOODS_PRICE,AMT_GOODS_PRICE,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,NFLAG_LAST_APPL_IN_DAY,NFLAG_LAST_APPL_IN_DAY,NFLAG_LAST_APPL_IN_DAY,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_DOWN_PAYMENT,RATE_DOWN_PAYMENT,RATE_DOWN_PAYMENT,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,RATE_INTEREST_PRIVILEGED,RATE_INTEREST_PRIVILEGED,RATE_INTEREST_PRIVILEGED,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,DAYS_DECISION,DAYS_DECISION,DAYS_DECISION,DAYS_DECISION,SELLERPLACE_AREA,SELLERPLACE_AREA,SELLERPLACE_AREA,SELLERPLACE_AREA,SELLERPLACE_AREA,CNT_PAYMENT,CNT_PAYMENT,CNT_PAYMENT,CNT_PAYMENT,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DRAWING,DAYS_FIRST_DRAWING,DAYS_FIRST_DRAWING,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_FIRST_DUE,DAYS_FIRST_DUE,DAYS_FIRST_DUE,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_LAST_DUE,DAYS_LAST_DUE,DAYS_LAST_DUE,DAYS_LAST_DUE,DAYS_TERMINATION,DAYS_TERMINATION,DAYS_TERMINATION,DAYS_TERMINATION,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,NFLAG_INSURED_ON_APPROVAL,NFLAG_INSURED_ON_APPROVAL,NFLAG_INSURED_ON_APPROVAL,NFLAG_INSURED_ON_APPROVAL
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum,count,mean,max,min,sum
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,24835.5,1,23787.0,23787.0,23787.0,23787.0,1,2520.0,2520.0,2520.0,2520.0,1,24835.5,24835.5,24835.5,24835.5,1,13.0,13,13,13.0,1,1.0,1,1,1,1,0.104326,0.104326,0.104326,0.104326,0,,,,0.0,0,,,,0.0,1,-1740.0,-1740,-1740,-1740.0,1,23.0,23,23,23,1,8.0,8.0,8.0,8.0,1,365243.0,365243.0,365243.0,365243.0,1,-1709.0,-1709.0,-1709.0,-1709.0,1,-1499.0,-1499.0,-1499.0,-1499.0,1,-1619.0,-1619.0,-1619.0,-1619.0,1,-1612.0,-1612.0,-1612.0,-1612.0,1,0.0,0.0,0.0,0.0
1,100002,1,9251.775391,9251.775391,9251.775391,9251.775391,1,179055.0,179055.0,179055.0,179055.0,1,179055.0,179055.0,179055.0,179055.0,1,0.0,0.0,0.0,0.0,1,179055.0,179055.0,179055.0,179055.0,1,9.0,9,9,9.0,1,1.0,1,1,1,1,0.0,0.0,0.0,0.0,0,,,,0.0,0,,,,0.0,1,-606.0,-606,-606,-606.0,1,500.0,500,500,500,1,24.0,24.0,24.0,24.0,1,365243.0,365243.0,365243.0,365243.0,1,-565.0,-565.0,-565.0,-565.0,1,125.0,125.0,125.0,125.0,1,-25.0,-25.0,-25.0,-25.0,1,-17.0,-17.0,-17.0,-17.0,1,0.0,0.0,0.0,0.0
2,100003,3,56553.988281,98356.992188,6737.310059,169661.96875,3,435436.5,900000.0,68809.5,1306309.5,3,484191.0,1035882.0,68053.5,1452573.0,2,3442.5,6885.0,0.0,6885.0,3,435436.5,900000.0,68809.5,1306309.5,3,14.666667,17,12,44.0,3,1.0,1,1,3,2,0.05003,0.100061,0.0,0.100061,0,,,,0.0,0,,,,0.0,3,-1305.0,-746,-2341,-3915.0,3,533.0,1400,-1,1599,3,10.0,12.0,6.0,30.0,3,365243.0,365243.0,365243.0,1095729.0,3,-1274.333374,-716.0,-2310.0,-3823.0,3,-1004.333313,-386.0,-1980.0,-3013.0,3,-1054.333374,-536.0,-1980.0,-3163.0,3,-1047.333374,-527.0,-1976.0,-3142.0,3,0.666667,1.0,0.0,2.0
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,24282.0,1,20106.0,20106.0,20106.0,20106.0,1,4860.0,4860.0,4860.0,4860.0,1,24282.0,24282.0,24282.0,24282.0,1,5.0,5,5,5.0,1,1.0,1,1,1,1,0.212008,0.212008,0.212008,0.212008,0,,,,0.0,0,,,,0.0,1,-815.0,-815,-815,-815.0,1,30.0,30,30,30,1,4.0,4.0,4.0,4.0,1,365243.0,365243.0,365243.0,365243.0,1,-784.0,-784.0,-784.0,-784.0,1,-694.0,-694.0,-694.0,-694.0,1,-724.0,-724.0,-724.0,-724.0,1,-714.0,-714.0,-714.0,-714.0,1,0.0,0.0,0.0,0.0
4,100005,1,4813.200195,4813.200195,4813.200195,4813.200195,2,22308.75,44617.5,0.0,44617.5,2,20076.75,40153.5,0.0,40153.5,1,4464.0,4464.0,4464.0,4464.0,1,44617.5,44617.5,44617.5,44617.5,2,10.5,11,10,21.0,2,1.0,1,1,2,1,0.108964,0.108964,0.108964,0.108964,0,,,,0.0,0,,,,0.0,2,-536.0,-315,-757,-1072.0,2,18.0,37,-1,36,1,12.0,12.0,12.0,12.0,1,365243.0,365243.0,365243.0,365243.0,1,-706.0,-706.0,-706.0,-706.0,1,-376.0,-376.0,-376.0,-376.0,1,-466.0,-466.0,-466.0,-466.0,1,-460.0,-460.0,-460.0,-460.0,1,0.0,0.0,0.0,0.0


In [6]:
# Build flat column names from the two-level (variable, statistic) columns.
#
# The names are derived from the column tuples themselves, in their actual
# left-to-right order, rather than from MultiIndex.levels: level values are a
# de-duplicated vocabulary whose order is not guaranteed to match the column
# order, so pairing them up positionally can mislabel features.
columns = []
for var, stat in previous_application_agg.columns:
    if stat == '':
        # Columns restored by reset_index() (e.g. SK_ID_CURR) have an empty
        # statistic level; keep their original name.
        columns.append(var)
    else:
        # Make a new column name for the variable and stat
        columns.append('previous_application_%s_%s' % (var, stat))

In [7]:
# Replace the two-level column index with the flat names in `columns`
# (pandas raises if the number of names does not match the number of columns).
previous_application_agg.columns = columns
# Downcast the aggregated features; reduce_mem_usage prints the before/after size.
previous_application_agg = reduce_mem_usage(previous_application_agg)
previous_application_agg.head()

Memory usage of dataframe is 151.56 MB
Memory usage after optimization is: 99.53 MB
Decreased by 34.3%


Unnamed: 0,SK_ID_CURR,previous_application_AMT_ANNUITY_count,previous_application_AMT_ANNUITY_mean,previous_application_AMT_ANNUITY_max,previous_application_AMT_ANNUITY_min,previous_application_AMT_ANNUITY_sum,previous_application_AMT_APPLICATION_count,previous_application_AMT_APPLICATION_mean,previous_application_AMT_APPLICATION_max,previous_application_AMT_APPLICATION_min,previous_application_AMT_APPLICATION_sum,previous_application_AMT_CREDIT_count,previous_application_AMT_CREDIT_mean,previous_application_AMT_CREDIT_max,previous_application_AMT_CREDIT_min,previous_application_AMT_CREDIT_sum,previous_application_AMT_DOWN_PAYMENT_count,previous_application_AMT_DOWN_PAYMENT_mean,previous_application_AMT_DOWN_PAYMENT_max,previous_application_AMT_DOWN_PAYMENT_min,previous_application_AMT_DOWN_PAYMENT_sum,previous_application_AMT_GOODS_PRICE_count,previous_application_AMT_GOODS_PRICE_mean,previous_application_AMT_GOODS_PRICE_max,previous_application_AMT_GOODS_PRICE_min,previous_application_AMT_GOODS_PRICE_sum,previous_application_HOUR_APPR_PROCESS_START_count,previous_application_HOUR_APPR_PROCESS_START_mean,previous_application_HOUR_APPR_PROCESS_START_max,previous_application_HOUR_APPR_PROCESS_START_min,previous_application_HOUR_APPR_PROCESS_START_sum,previous_application_NFLAG_LAST_APPL_IN_DAY_count,previous_application_NFLAG_LAST_APPL_IN_DAY_mean,previous_application_NFLAG_LAST_APPL_IN_DAY_max,previous_application_NFLAG_LAST_APPL_IN_DAY_min,previous_application_NFLAG_LAST_APPL_IN_DAY_sum,previous_application_RATE_DOWN_PAYMENT_count,previous_application_RATE_DOWN_PAYMENT_mean,previous_application_RATE_DOWN_PAYMENT_max,previous_application_RATE_DOWN_PAYMENT_min,previous_application_RATE_DOWN_PAYMENT_sum,previous_application_RATE_INTEREST_PRIMARY_count,previous_application_RATE_INTEREST_PRIMARY_mean,previous_application_RATE_INTEREST_PRIMARY_max,previous_application_RATE_INTEREST_PRIMARY_min,previous_application_RATE_INTEREST_PRIMARY_sum,previous_application_RATE_INTER
EST_PRIVILEGED_count,previous_application_RATE_INTEREST_PRIVILEGED_mean,previous_application_RATE_INTEREST_PRIVILEGED_max,previous_application_RATE_INTEREST_PRIVILEGED_min,previous_application_RATE_INTEREST_PRIVILEGED_sum,previous_application_DAYS_DECISION_count,previous_application_DAYS_DECISION_mean,previous_application_DAYS_DECISION_max,previous_application_DAYS_DECISION_min,previous_application_DAYS_DECISION_sum,previous_application_SELLERPLACE_AREA_count,previous_application_SELLERPLACE_AREA_mean,previous_application_SELLERPLACE_AREA_max,previous_application_SELLERPLACE_AREA_min,previous_application_SELLERPLACE_AREA_sum,previous_application_CNT_PAYMENT_count,previous_application_CNT_PAYMENT_mean,previous_application_CNT_PAYMENT_max,previous_application_CNT_PAYMENT_min,previous_application_CNT_PAYMENT_sum,previous_application_DAYS_FIRST_DRAWING_count,previous_application_DAYS_FIRST_DRAWING_mean,previous_application_DAYS_FIRST_DRAWING_max,previous_application_DAYS_FIRST_DRAWING_min,previous_application_DAYS_FIRST_DRAWING_sum,previous_application_DAYS_FIRST_DUE_count,previous_application_DAYS_FIRST_DUE_mean,previous_application_DAYS_FIRST_DUE_max,previous_application_DAYS_FIRST_DUE_min,previous_application_DAYS_FIRST_DUE_sum,previous_application_DAYS_LAST_DUE_1ST_VERSION_count,previous_application_DAYS_LAST_DUE_1ST_VERSION_mean,previous_application_DAYS_LAST_DUE_1ST_VERSION_max,previous_application_DAYS_LAST_DUE_1ST_VERSION_min,previous_application_DAYS_LAST_DUE_1ST_VERSION_sum,previous_application_DAYS_LAST_DUE_count,previous_application_DAYS_LAST_DUE_mean,previous_application_DAYS_LAST_DUE_max,previous_application_DAYS_LAST_DUE_min,previous_application_DAYS_LAST_DUE_sum,previous_application_DAYS_TERMINATION_count,previous_application_DAYS_TERMINATION_mean,previous_application_DAYS_TERMINATION_max,previous_application_DAYS_TERMINATION_min,previous_application_DAYS_TERMINATION_sum,previous_application_NFLAG_INSURED_ON_APPROVAL_count,previous_application_NFLAG_INSU
RED_ON_APPROVAL_mean,previous_application_NFLAG_INSURED_ON_APPROVAL_max,previous_application_NFLAG_INSURED_ON_APPROVAL_min,previous_application_NFLAG_INSURED_ON_APPROVAL_sum
0,100001,1,3951.0,3951.0,3951.0,3951.0,1,24835.5,24835.5,24835.5,24835.5,1,23787.0,23787.0,23787.0,23787.0,1,2520.0,2520.0,2520.0,2520.0,1,24835.5,24835.5,24835.5,24835.5,1,13.0,13,13,13.0,1,1.0,1,1,1,1,0.104326,0.104326,0.104326,0.104326,0,,,,0.0,0,,,,0.0,1,-1740.0,-1740,-1740,-1740.0,1,23.0,23,23,23,1,8.0,8.0,8.0,8.0,1,365243.0,365243.0,365243.0,365243.0,1,-1709.0,-1709.0,-1709.0,-1709.0,1,-1499.0,-1499.0,-1499.0,-1499.0,1,-1619.0,-1619.0,-1619.0,-1619.0,1,-1612.0,-1612.0,-1612.0,-1612.0,1,0.0,0.0,0.0,0.0
1,100002,1,9251.775391,9251.775391,9251.775391,9251.775391,1,179055.0,179055.0,179055.0,179055.0,1,179055.0,179055.0,179055.0,179055.0,1,0.0,0.0,0.0,0.0,1,179055.0,179055.0,179055.0,179055.0,1,9.0,9,9,9.0,1,1.0,1,1,1,1,0.0,0.0,0.0,0.0,0,,,,0.0,0,,,,0.0,1,-606.0,-606,-606,-606.0,1,500.0,500,500,500,1,24.0,24.0,24.0,24.0,1,365243.0,365243.0,365243.0,365243.0,1,-565.0,-565.0,-565.0,-565.0,1,125.0,125.0,125.0,125.0,1,-25.0,-25.0,-25.0,-25.0,1,-17.0,-17.0,-17.0,-17.0,1,0.0,0.0,0.0,0.0
2,100003,3,56553.988281,98356.992188,6737.310059,169661.96875,3,435436.5,900000.0,68809.5,1306309.5,3,484191.0,1035882.0,68053.5,1452573.0,2,3442.5,6885.0,0.0,6885.0,3,435436.5,900000.0,68809.5,1306309.5,3,14.666667,17,12,44.0,3,1.0,1,1,3,2,0.05003,0.100061,0.0,0.100061,0,,,,0.0,0,,,,0.0,3,-1305.0,-746,-2341,-3915.0,3,533.0,1400,-1,1599,3,10.0,12.0,6.0,30.0,3,365243.0,365243.0,365243.0,1095729.0,3,-1274.333374,-716.0,-2310.0,-3823.0,3,-1004.333313,-386.0,-1980.0,-3013.0,3,-1054.333374,-536.0,-1980.0,-3163.0,3,-1047.333374,-527.0,-1976.0,-3142.0,3,0.666667,1.0,0.0,2.0
3,100004,1,5357.25,5357.25,5357.25,5357.25,1,24282.0,24282.0,24282.0,24282.0,1,20106.0,20106.0,20106.0,20106.0,1,4860.0,4860.0,4860.0,4860.0,1,24282.0,24282.0,24282.0,24282.0,1,5.0,5,5,5.0,1,1.0,1,1,1,1,0.212008,0.212008,0.212008,0.212008,0,,,,0.0,0,,,,0.0,1,-815.0,-815,-815,-815.0,1,30.0,30,30,30,1,4.0,4.0,4.0,4.0,1,365243.0,365243.0,365243.0,365243.0,1,-784.0,-784.0,-784.0,-784.0,1,-694.0,-694.0,-694.0,-694.0,1,-724.0,-724.0,-724.0,-724.0,1,-714.0,-714.0,-714.0,-714.0,1,0.0,0.0,0.0,0.0
4,100005,1,4813.200195,4813.200195,4813.200195,4813.200195,2,22308.75,44617.5,0.0,44617.5,2,20076.75,40153.5,0.0,40153.5,1,4464.0,4464.0,4464.0,4464.0,1,44617.5,44617.5,44617.5,44617.5,2,10.5,11,10,21.0,2,1.0,1,1,2,1,0.108964,0.108964,0.108964,0.108964,0,,,,0.0,0,,,,0.0,2,-536.0,-315,-757,-1072.0,2,18.0,37,-1,36,1,12.0,12.0,12.0,12.0,1,365243.0,365243.0,365243.0,365243.0,1,-706.0,-706.0,-706.0,-706.0,1,-376.0,-376.0,-376.0,-376.0,1,-466.0,-466.0,-466.0,-466.0,1,-460.0,-460.0,-460.0,-460.0,1,0.0,0.0,0.0,0.0


In [8]:
# Display (rows, columns) of the flattened feature table.
previous_application_agg.shape

(338857, 96)

In [9]:
# Run a full garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

161

In [10]:
# Write the aggregated features to the working directory; index=False omits the row index.
previous_application_agg.to_csv('previous_application_agg.csv', index=False)