# kaggle home-credit-default-risk

## Project plan

### 1) Here we go
* Information about competition
* All datasets are downloaded with the Kaggle CLI
    ```
    source activate <env>
    pip install kaggle
    kaggle competitions download -c home-credit-default-risk
    ```
* Read datasets with pandas
* Decide on goal (AUC?)

### 2) Build flow/pipeline (Iteration 1 - Application data)
* Data analysis
* Feature engineering
* Modelling
    1. Decision trees?
    2. Logistic regression?
    3. Deep network?
* Validation
* Model v1.0

### 3) Iterate
Iterate and compare with v1.0

### 4) Final modelling
Decide on strategy and model

### 5) Conclusions
Conclusions and thoughts

# Import datasets

In [10]:
import pandas as pd
import numpy as np
import featuretools as ft

path_data = '~/.kaggle/competitions/home-credit-default-risk'


def _read_zipped(file_name):
    """Read one zip-compressed competition CSV stored under `path_data`."""
    return pd.read_csv(path_data + file_name, compression='zip')


# The column description file is an uncompressed CSV and is read as latin1
# (the original note here mentions an encoding error).
description = pd.read_csv(
    path_data + '/HomeCredit_columns_description.csv',
    encoding='latin1',
)

# One DataFrame per competition table, loaded in the same order as before.
bureau_balance = _read_zipped('/bureau_balance.csv.zip')
POS_CASH_balance = _read_zipped('/POS_CASH_balance.csv.zip')
credit_card_balance = _read_zipped('/credit_card_balance.csv.zip')
application_test = _read_zipped('/application_test.csv.zip')
installments_payments = _read_zipped('/installments_payments.csv.zip')
application_train = _read_zipped('/application_train.csv.zip')
previous_application = _read_zipped('/previous_application.csv.zip')
bureau = _read_zipped('/bureau.csv.zip')
sample_submission = _read_zipped('/sample_submission.csv.zip')

# DAYS_DECISION is stored as a day offset; anchor it to a fixed reference date
# so the column becomes an absolute timestamp.
today = pd.to_datetime('2018-06-11')
decision_offset = pd.to_timedelta(previous_application['DAYS_DECISION'], unit='d')
previous_application['DAYS_DECISION'] = today + decision_offset

In [2]:
def add_row_id(frame, id_column):
    """Return `frame` with its current index exposed as a column named `id_column`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that needs a per-row identifier.
    id_column : str
        Name to give the new identifier column.

    Returns
    -------
    pandas.DataFrame
        `frame` itself when `id_column` is already present, otherwise a new
        frame produced by `reset_index()` with the resulting 'index' column
        renamed to `id_column`.
    """
    # Guard makes the cell safe to re-run: without it a second execution would
    # reset the index again and rename the new 'index' column to a label that
    # already exists, leaving two columns with the same name.
    if id_column in frame.columns:
        return frame
    return frame.reset_index().rename(columns={'index': id_column})


# Give each table a per-row id column.
# The bureau_balance id was disabled in the original cell and is kept disabled:
#bureau_balance = add_row_id(bureau_balance, 'SK_ID_BUREAU_BALANCE')

POS_CASH_balance = add_row_id(POS_CASH_balance, 'SK_ID_POS_CASH_BALANCE')

credit_card_balance = add_row_id(credit_card_balance, 'SK_ID_CREDIT_CARD_BALANCE')

installments_payments = add_row_id(installments_payments, 'SK_ID_INSTALLMENTS_PAYMENTS')

In [3]:
# Peek at the layout the competition expects for a submission file.
print('sample_submission:', sample_submission.head(), sep='\n')

sample_submission:
   SK_ID_CURR  TARGET
0      100001     0.5
1      100005     0.5
2      100013     0.5
3      100028     0.5
4      100038     0.5


The submission must contain the key (`SK_ID_CURR`) and the target (`TARGET`, given as a probability).

In [11]:
# Print the first rows of every loaded table (application_test is skipped here).
# Each entry is (label, frame, number of rows to show); bureau_balance shows
# 100 rows, every other table the default 5.
tables_to_preview = [
    ('application_train:', application_train, 5),
    ('previous_application:', previous_application, 5),
    ('bureau:', bureau, 5),
    ('bureau_balance:', bureau_balance, 100),
    ('POS_CASH_balance:', POS_CASH_balance, 5),
    ('credit_card_balance:', credit_card_balance, 5),
    ('installments_payments:', installments_payments, 5),
]

for label, frame, n_rows in tables_to_preview:
    print(label)
    print(frame.head(n=n_rows))

application_train:
   SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0      100002       1         Cash loans           M            N   
1      100003       0         Cash loans           F            N   
2      100004       0    Revolving loans           M            Y   
3      100006       0         Cash loans           F            N   
4      100007       0         Cash loans           M            N   

  FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
0               Y             0          202500.0    406597.5      24700.5   
1               N             0          270000.0   1293502.5      35698.5   
2               Y             0           67500.0    135000.0       6750.0   
3               Y             0          135000.0    312682.5      29686.5   
4               Y             0          121500.0    513000.0      21865.5   

              ...              FLAG_DOCUMENT_18 FLAG_DOCUMENT_19  \
0             ...            

In [5]:
#print('bureau_balance:')
#print(bureau_balance.describe())
#print('POS_CASH_balance:')
#print(POS_CASH_balance.describe())
#print('credit_card_balance:')
#print(credit_card_balance.describe())
#print('application_test:')
#print(application_test.describe())
#print('installments_payments:')
#print(installments_payments.describe())
#print('application_train:')
#print(application_train.describe())
#print('previous_application:')
#print(previous_application.describe())
#print('bureau:')
#print(bureau.describe())

In [13]:
# Build the featuretools EntitySet: one entity per table, then the
# parent -> child relationships between them.
es = ft.EntitySet(id="applications")

# (entity_id, dataframe, index column, time_index column or None)
entity_specs = [
    ("applications", application_train, "SK_ID_CURR", None),
    ("prev_applications", previous_application, "SK_ID_PREV", "DAYS_DECISION"),
    ("bureau", bureau, "SK_ID_BUREAU", None),
    ("bureau_balance", bureau_balance, "SK_ID_BUREAU_BALANCE", None),
    ("pos_cash_balance", POS_CASH_balance, "SK_ID_POS_CASH_BALANCE", None),
    ("credit_card_balance", credit_card_balance, "SK_ID_CREDIT_CARD_BALANCE", None),
    ("installments_payments", installments_payments, "SK_ID_INSTALLMENTS_PAYMENTS", None),
]

for entity_id, dataframe, index, time_index in entity_specs:
    es = es.entity_from_dataframe(
        entity_id=entity_id,
        dataframe=dataframe,
        index=index,
        time_index=time_index,
        # Some surrogate id columns may not exist in the frame (the
        # bureau_balance id is never created in this notebook). Ask
        # featuretools to create the index explicitly in that case instead of
        # relying on its implicit fallback, and use the column as-is when it
        # is already present.
        make_index=index not in dataframe.columns,
    )


def _link(parent_entity, child_entity, key):
    """Return a Relationship joining `parent_entity` and `child_entity` on the column `key`.

    Both entities are looked up in the module-level `es`, and the same column
    name is used on the parent and the child side.
    """
    return ft.Relationship(es[parent_entity][key], es[child_entity][key])


applications_to_prev_applications = _link(
    "applications", "prev_applications", "SK_ID_CURR")

applications_to_bureau = _link(
    "applications", "bureau", "SK_ID_CURR")

bureau_to_bureau_balance = _link(
    "bureau", "bureau_balance", "SK_ID_BUREAU")

applications_to_pos_cash_balance = _link(
    "applications", "pos_cash_balance", "SK_ID_CURR")

prev_applications_to_pos_cash_balance = _link(
    "prev_applications", "pos_cash_balance", "SK_ID_PREV")

applications_to_credit_card_balance = _link(
    "applications", "credit_card_balance", "SK_ID_CURR")

prev_applications_to_credit_card_balance = _link(
    "prev_applications", "credit_card_balance", "SK_ID_PREV")

applications_to_installments_payments = _link(
    "applications", "installments_payments", "SK_ID_CURR")

prev_applications_to_installments_payments = _link(
    "prev_applications", "installments_payments", "SK_ID_PREV")

# Register the relationships in the same order as before.
for relationship in (
    applications_to_prev_applications,
    applications_to_bureau,
    bureau_to_bureau_balance,
    applications_to_pos_cash_balance,
    prev_applications_to_pos_cash_balance,
    applications_to_credit_card_balance,
    prev_applications_to_credit_card_balance,
    applications_to_installments_payments,
    prev_applications_to_installments_payments,
):
    es = es.add_relationship(relationship)



In [14]:
# Display the EntitySet summary to check the registered entities and relationships.
es

Entityset: applications
  Entities:
    applications [Rows: 307511, Columns: 122]
    prev_applications [Rows: 1670214, Columns: 37]
    bureau [Rows: 1716428, Columns: 17]
    bureau_balance [Rows: 27299925, Columns: 4]
    pos_cash_balance [Rows: 10001358, Columns: 9]
    credit_card_balance [Rows: 3840312, Columns: 24]
    installments_payments [Rows: 13605401, Columns: 9]
  Relationships:
    prev_applications.SK_ID_CURR -> applications.SK_ID_CURR
    bureau.SK_ID_CURR -> applications.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    pos_cash_balance.SK_ID_CURR -> applications.SK_ID_CURR
    pos_cash_balance.SK_ID_PREV -> prev_applications.SK_ID_PREV
    credit_card_balance.SK_ID_CURR -> applications.SK_ID_CURR
    credit_card_balance.SK_ID_PREV -> prev_applications.SK_ID_PREV
    installments_payments.SK_ID_CURR -> applications.SK_ID_CURR
    installments_payments.SK_ID_PREV -> prev_applications.SK_ID_PREV