# Data Merging
Notebook to merge data from provided files

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd

# Allows the use of display() for DataFrames
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import missingno as msno

from utils import *

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import gc
gc.collect()

11

Utility function returns values counts to evaluate most frequent value for category columns.

In [4]:
most_frequent = lambda x: x.value_counts().index[0]

## Ext Bureau
bureau.csv

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).
For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

In [5]:
bureau = pd.read_csv('input/bureau.csv.zip')
bureau.head(2)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,


bureau_balance.csv

Monthly balances of previous credits in Credit Bureau.
This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

In [6]:
bureau_balance = pd.read_csv('input/bureau_balance.csv.zip')
display(bureau_balance.head(2))

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C


Merging bureau and balance data on SK_ID_BUREAU.

In [None]:
bureau_balance_by_id = bureau_balance.groupby('SK_ID_BUREAU')
bureau_grouped_size = bureau_balance_by_id['MONTHS_BALANCE'].size()
bureau_grouped_max = bureau_balance_by_id['MONTHS_BALANCE'].max()
bureau_grouped_min = bureau_balance_by_id['MONTHS_BALANCE'].min()

# create separate column for each STATUS in bureau_balance table.
bureau_counts = bureau_balance_by_id['STATUS'].value_counts(normalize = False)
bureau_counts_unstacked = bureau_counts.unstack('STATUS')
bureau_counts_unstacked.columns = ['STATUS_0', 'STATUS_1','STATUS_2','STATUS_3','STATUS_4','STATUS_5','STATUS_C','STATUS_X',]

bureau_counts_unstacked['MONTHS_COUNT'] = bureau_grouped_size
bureau_counts_unstacked['MONTHS_MIN'] = bureau_grouped_min
bureau_counts_unstacked['MONTHS_MAX'] = bureau_grouped_max

bureau = bureau.join(bureau_counts_unstacked, how='left', on='SK_ID_BUREAU')

In [8]:
bureau.head(2)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,MONTHS_COUNT,MONTHS_MIN,MONTHS_MAX
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,,,,,,,,,,,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,,,,,,,,,,,


Relation between applicant to bureau is one to many, so to join bureau data we can e.g. avarage data per applicant.
Let's group by sk_id_curr and take average, this way we get data that we can merge with app_train/test data

In [9]:
bureau_by_skid = bureau.groupby('SK_ID_CURR')
avg_bureau = bureau_by_skid.mean()
avg_bureau['BUREAU_CNT'] = bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].groupby('SK_ID_CURR').count()['SK_ID_BUREAU']

In [10]:
avg_bureau.drop(columns=['SK_ID_BUREAU'], inplace=True)
avg_bureau = avg_bureau.reset_index()
avg_bureau.head(2)

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,STATUS_0,STATUS_1,STATUS_2,STATUS_3,STATUS_4,STATUS_5,STATUS_C,STATUS_X,MONTHS_COUNT,MONTHS_MIN,MONTHS_MAX,BUREAU_CNT
0,100001,-735.0,0.0,82.428571,-825.5,,0.0,207623.571429,85240.928571,0.0,0.0,-93.142857,3545.357143,4.428571,1.0,,,,,27.5,6.0,24.571429,-23.571429,0.0,7
1,100002,-874.0,0.0,-349.0,-697.5,1681.029,0.0,108131.945625,49156.2,7997.14125,0.0,-499.875,0.0,5.625,4.5,,,,,3.833333,2.5,13.75,-28.25,-15.5,8


In [11]:
bureau_cols = avg_bureau.columns.tolist()

### Previous applications groups by sk id as well and take average, like for bureau data
previous_application.csv

All previous applications for Home Credit loans of clients who have loans in our sample.
There is one row for each previous application related to loans in our data sample.

In [12]:
previous_application = pd.read_csv('input/previous_application.csv.zip')
previous_application.head(2)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0


Previous application relation to applicant data is one-to-many, so to join data we need to avarage data per applicant.

In [None]:
avg_previous_app = previous_application.groupby('SK_ID_CURR').mean()
cnt_previous_app = previous_application[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
# engeneering new column - number of previous applications
avg_previous_app['NUM_APPS'] = cnt_previous_app['SK_ID_PREV']
avg_previous_app.drop(columns=['SK_ID_PREV'], inplace=True)

Reset index after removing column

In [14]:
avg_previous_app = avg_previous_app.reset_index()

Rename conflicting with other files columns.

In [15]:
avg_previous_app = avg_previous_app.rename(index=str, columns={'AMT_ANNUITY': 'PREV_APP_AMT_ANNUITY'})
prev_app_cols = avg_previous_app.columns.tolist()

In [16]:
avg_previous_app.head(2)

Unnamed: 0,SK_ID_CURR,PREV_APP_AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,NUM_APPS
0,100001,3951.0,24835.5,23787.0,2520.0,24835.5,13.0,1.0,0.104326,,,-1740.0,23.0,8.0,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0,1
1,100002,9251.775,179055.0,179055.0,0.0,179055.0,9.0,1.0,0.0,,,-606.0,500.0,24.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0,1


### Credit card balance
POS_CASH_balance.csv

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

In [17]:
POS_CASH_balance = pd.read_csv('input/POS_CASH_balance.csv.zip')
POS_CASH_balance.head(2)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0


Processing categorical columns. Additional columns added: number of unique statuses and most frequent status.

In [None]:
# number of unqiue statues
nunique_status = POS_CASH_balance[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').nunique()
# find most frequent status
max_status = POS_CASH_balance[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').agg(most_frequent)
POS_CASH_balance['NUNIQUE_STATUS'] = nunique_status['NAME_CONTRACT_STATUS']
POS_CASH_balance['MAX_STATUS'] = max_status['NAME_CONTRACT_STATUS']

POS_CASH_balance.drop(columns=['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], inplace=True)

In [19]:
POS_CASH_balance.head(2)

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,NUNIQUE_STATUS,MAX_STATUS
0,182943,-31,48.0,45.0,0,0,,
1,367990,-33,36.0,35.0,0,0,,


Avarage data per applicant to be able to join to main data file.

In [20]:
avg_POS_CASH_balance = POS_CASH_balance.groupby('SK_ID_CURR').mean().reset_index()

In [21]:
avg_POS_CASH_balance.head(2)

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,NUNIQUE_STATUS
0,100001,-72.555556,4.0,1.444444,0.777778,0.777778,
1,100002,-10.0,24.0,15.0,0.0,0.0,


In [22]:
pos_cash_cols = avg_POS_CASH_balance.columns.tolist()

### Payment data
installments_payments.csv

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
There is a) one row for every payment that was made plus b) one row each for missed payment.
One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [23]:
installments_payments = pd.read_csv('input/installments_payments.csv.zip')
installments_payments.head(2)
installments_payments.drop(columns=['SK_ID_PREV'], inplace=True)

Avarage data per applicant to be able to join to main data file. Group be skid and take mean, max and min values for each grouping.

In [24]:
avg_payments = installments_payments.groupby('SK_ID_CURR').mean().reset_index()
avg_payments_cols = avg_payments.columns.tolist()
avg_payments.head(2)

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,100001,1.142857,2.714286,-2187.714286,-2195.0,5885.132143,5885.132143
1,100002,1.052632,10.0,-295.0,-315.421053,11559.247105,11559.247105


In [25]:
max_payments = installments_payments.groupby('SK_ID_CURR').max().reset_index()
max_payments_cols = max_payments.columns.tolist()
max_payments.head(2)

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,100001,2.0,4,-1619.0,-1628.0,17397.9,17397.9
1,100002,2.0,19,-25.0,-49.0,53093.745,53093.745


In [26]:
min_payments = installments_payments.groupby('SK_ID_CURR').min().reset_index()
min_payments_cols = min_payments.columns.tolist()
min_payments.head(2)

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,100001,1.0,1,-2916.0,-2916.0,3951.0,3951.0
1,100002,1.0,1,-565.0,-587.0,9251.775,9251.775


### Credit cards data
credit_card_balance.csv

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

In [27]:
credit_card_balance = pd.read_csv('input/credit_card_balance.csv.zip')
credit_card_balance.head(2)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0


In [None]:
# processing categorical columns before merge merge
# find number of unique statuses
nunique_status = credit_card_balance[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').nunique()
# find most frequent status
max_status = credit_card_balance[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').agg(most_frequent)
credit_card_balance['NUNIQUE_STATUS'] = nunique_status['NAME_CONTRACT_STATUS']
credit_card_balance['MAX_STATUS'] = max_status['NAME_CONTRACT_STATUS']

credit_card_balance.drop(columns=['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], inplace=True)

In [29]:
credit_card_balance.head(2)

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF,NUNIQUE_STATUS,MAX_STATUS
0,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,0,0,,
1,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,0,0,,


In [30]:
credit_card_balance = credit_card_balance.groupby('SK_ID_CURR').mean().reset_index()

In [31]:
credit_card_balance.head(2)

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF,NUNIQUE_STATUS
0,100006,-3.5,0.0,270000.0,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0.0,,,0.0,0.0,0.0,
1,100011,-38.5,54482.111149,164189.189189,2432.432432,2432.432432,0.0,0.0,3956.221849,4843.064189,4520.067568,52402.088919,54433.179122,54433.179122,0.054054,0.054054,0.0,0.0,25.767123,0.0,0.0,1.0


#### Conflicting columns renaming

In [32]:
cols_rename = {'MONTHS_BALANCE': 'CC_MONTHS_BALANCE',
               'NUNIQUE_STATUS': 'CC_NUNIQUE_STATUS',
               'SK_DPD': 'CC_SK_DPD',
               'SK_DPD_DEF': 'CC_SK_DPD_DEF'}
credit_card_balance = credit_card_balance.rename(index=str, columns=cols_rename)

In [33]:
credit_card_balance_cols = credit_card_balance.columns.tolist()

In [34]:
cols_rename = {'AMT_ANNUITY': 'B_AMT_ANNUITY'}
avg_bureau = avg_bureau.rename(index=str, columns=cols_rename)

In [35]:
bureau_cols = avg_bureau.columns.tolist()

In [36]:
cols_rename = {'AMT_CREDIT': 'P_AMT_CREDIT',
               'AMT_GOODS_PRICE': 'P_AMT_GOODS_PRICE',
               'HOUR_APPR_PROCESS_START': 'P_HOUR_APPR_PROCESS_START'}
avg_previous_app = avg_previous_app.rename(index=str, columns=cols_rename)

In [37]:
prev_app_cols = avg_previous_app.columns.tolist()

### Check Column Names Conflicts
Files contains the same column names, to join them with out automatic renaming we need to find rename those beforehand.

bureau_cols  
prev_app_cols  
pos_cash_cols  
avg_payments_cols  
min_payments_cols  
max_payments_cols  
credit_card_balance_cols

In [38]:
set(bureau_cols).intersection(prev_app_cols)

{'SK_ID_CURR'}

In [39]:
all_cols = set(bureau_cols + prev_app_cols)
all_cols.intersection(pos_cash_cols)

{'SK_ID_CURR'}

In [40]:
all_cols = set(bureau_cols + prev_app_cols + pos_cash_cols)
all_cols.intersection(avg_payments_cols)

{'SK_ID_CURR'}

In [41]:
all_cols = set(bureau_cols + prev_app_cols + pos_cash_cols + avg_payments_cols)
all_cols.intersection(credit_card_balance_cols)

{'SK_ID_CURR'}

In [None]:
app_train_cols = app_train.columns.tolist()
all_cols = set(bureau_cols + prev_app_cols + pos_cash_cols + avg_payments_cols + credit_card_balance_cols)
all_cols.intersection(app_train_cols)

## Merging everything to train and test data
Join prepared data and store into train.csv and test.csv to use all provided data by models.

In [None]:
app_train = pd.read_csv('input/application_train.csv.zip')

In [None]:
app_train = app_train.merge(avg_bureau, how='left', on='SK_ID_CURR')

In [None]:
app_train = app_train.merge(avg_previous_app, how='left', on='SK_ID_CURR')

In [None]:
app_train = app_train.merge(avg_POS_CASH_balance, how='left', on='SK_ID_CURR')

In [None]:
app_train = app_train.merge(credit_card_balance, how='left', on='SK_ID_CURR')

In [None]:
app_train = app_train.merge(avg_payments, how='left', on='SK_ID_CURR')

In [None]:
#app_train = app_train.merge(min_payments, how='left', on='SK_ID_CURR')

In [43]:
#app_train = app_train.merge(max_payments, how='left', on='SK_ID_CURR')

In [None]:
app_train.shape

In [None]:
app_train.to_csv('input/train.csv', index=False)

In [None]:
app_train.head(2)

Test data

In [None]:
app_test = pd.read_csv('input/application_test.csv.zip')

In [None]:
app_test = app_test.merge(avg_bureau, how='left', on='SK_ID_CURR')

In [None]:
app_test = app_test.merge(avg_previous_app, how='left', on='SK_ID_CURR')

In [None]:
app_test = app_test.merge(avg_POS_CASH_balance, how='left', on='SK_ID_CURR')

In [None]:
app_test = app_test.merge(credit_card_balance, how='left', on='SK_ID_CURR')

In [None]:
app_test = app_test.merge(avg_payments, how='left', on='SK_ID_CURR')

In [None]:
#app_test = app_test.merge(min_payments, how='left', on='SK_ID_CURR')

In [None]:
#app_test = app_test.merge(max_payments, how='left', on='SK_ID_CURR')

In [None]:
app_test.shape

In [None]:
app_test.to_csv('input/test.csv', index=False)

In [None]:
app_test.head(2)