### Analysis
Now that we've sanity-checked the fields, let's take a deeper look at what we have.

In [None]:
# Calculate which features correlate most strongly to the TARGET variable.
# Only numeric/boolean columns can be correlated; select them explicitly because
# pandas >= 2.0 no longer drops string columns inside corr() and raises instead.
numeric_columns = application_train.select_dtypes(include=['number', 'bool'])
correlations = numeric_columns.corr()['TARGET'].sort_values()

In [None]:
# `correlations` is sorted ascending, so the head holds the most negative values.
strongest_negative = correlations[:10]

# We want to omit TARGET: it correlates perfectly with itself, so it sorts last,
# and the -10:-1 slice stops just before it.
strongest_positive = correlations[-10:-1]

# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
strong_correlations = pd.concat([strongest_negative, strongest_positive])

In [None]:
#display(correlations)
display(strong_correlations)

Positive values are more closely tied to failure to repay the loan.  Negative values are more closely tied to successful repayments.

### Looking at  Age vs. Repayment
Age is closely correlated with failure to repay.  People who have payment problems trend a few years younger. Not super surprising.

In [None]:
# DAYS_BIRTH is stored as a negative number of days relative to the application date,
# so its absolute value divided by 365 approximates the applicant's age in years.
repaid_age_days = abs(application_train.loc[application_train['TARGET'] == 0, 'DAYS_BIRTH'])
default_age_days = abs(application_train.loc[application_train['TARGET'] == 1, 'DAYS_BIRTH'])

# Mean age in years for each group: paid on time vs. defaulted
avg_good = np.average(repaid_age_days) / 365
avg_bad = np.average(default_age_days) / 365

# Overlay both age distributions and mark each group's mean with a vertical line
plt.figure(figsize=(15,10))
plt.title("Age Distribution in Years")
plt.axvline(avg_good, color="blue")    # mean age, paid as expected
plt.axvline(avg_bad, color="orange")   # mean age, payment problems

sns.kdeplot(repaid_age_days / 365, label = "Paid As Expected", shade=True)
sns.kdeplot(default_age_days / 365, label = "Payment Problems", shade=True)

print("Average age of someone who defaults: {0}  Avg age of someone who pays: {1}".format(avg_bad, avg_good))

### Employment Length vs Repayment

In [None]:
# DAYS_EMPLOYED is represented as a negative integer: days before the application date.
# The absolute value divided by 365 should give us employment length in years...
# NOTE(review): no values are filtered out here, so any placeholder/sentinel
# durations present in DAYS_EMPLOYED feed into both averages and both curves — confirm intended.

# The average employment length of someone who pays on time
avg_good = np.average(abs(application_train.loc[application_train['TARGET'] == 0, 'DAYS_EMPLOYED'])) / 365

# The average employment length of someone who defaults
avg_bad = np.average(abs(application_train.loc[application_train['TARGET'] == 1, 'DAYS_EMPLOYED'])) / 365

# Examine distribution
plt.figure(figsize=(15,10))
plt.title("Employment in Years")
plt.axvline(avg_good, color="blue")    # Mean employment length, paid as expected
plt.axvline(avg_bad, color="orange") # Mean employment length, payment problems

sns.kdeplot(abs(application_train.loc[application_train['TARGET'] == 0, 'DAYS_EMPLOYED']) / 365, label = "Paid As Expected", shade=True)
sns.kdeplot(abs(application_train.loc[application_train['TARGET'] == 1, 'DAYS_EMPLOYED']) / 365, label = "Payment Problems", shade=True)

print("Average employment of someone who defaults: {0}  Avg employment of someone who pays: {1}".format(avg_bad, avg_good))

NOTE: There's a large section of the population that has an impossibly high tenure (~1000 years).  This is clearly problematic, and we should probably handle it differently.

### Output

Output lists of fields that need transformation for consumption by the preprocessing script

In [None]:
lb = LabelBinarizer()
bool_phone = lb.fit_transform(application_train['FLAG_PHONE'])

explore_non_numeric(bool_phone)

string_to_bool_features.append('FLAG_PHONE')

#### FLAG_EMAIL
Did client provide email (1=YES, 0=NO)

In [None]:
explore_non_numeric(application_train['FLAG_EMAIL'])

### Preprocessing 
This is a Y/N string field.  We'll re-encode it as a bool for easier analysis later

In [None]:
lb = LabelBinarizer()
bool_email = lb.fit_transform(application_train['FLAG_EMAIL'])

explore_non_numeric(bool_email)

string_to_bool_features.append('FLAG_EMAIL')

#### OCCUPATION_TYPE
What kind of occupation the client has

In [None]:
# Work on a copy of the column with missing values replaced by the label "Missing"
occupation_type = application_train['OCCUPATION_TYPE']
occupation_type = occupation_type.replace(np.nan, "Missing")

explore_non_numeric(occupation_type)

# Store the cleaned values in a new OCCUPATION_TYPE_NORMALIZED column.
# NOTE(review): drop() returns a new frame bound to `result`; application_train itself
# still contains the original OCCUPATION_TYPE column, and `result` is not used in this cell.
application_train['OCCUPATION_TYPE_NORMALIZED'] = occupation_type
result = application_train.drop('OCCUPATION_TYPE',axis=1)

# Track this for one-hot encoding later.
# NOTE(review): the tracked name is the original column, not the _NORMALIZED one — confirm
# which of the two the preprocessing script expects.
non_numeric_features.append("OCCUPATION_TYPE");

#### CNT_FAM_MEMBERS
How many family members does client have

In [None]:
cnt_fam_members_numeric = np.nan_to_num(application_train['CNT_FAM_MEMBERS'])
explore_numeric(cnt_fam_members_numeric)
numeric_features.append("CNT_FAM_MEMBERS")

#### REGION_RATING_CLIENT
Our rating of the region where client lives (1,2,3)

In [None]:
explore_numeric(application_train['REGION_RATING_CLIENT'], graphType="hist", bins=3)

#### REGION_RATING_CLIENT_W_CITY
Our rating of the region where client lives with taking city into account (1,2,3)

In [None]:
obs_30 = np.nan_to_num(application_train['OBS_30_CNT_SOCIAL_CIRCLE'])
explore_numeric(obs_30)

#### Preprocessing: 
This feature is highly skewed, in that we have a large number of values near zero, and a body of other values distributed across the range.

It's common to do a log transformation for fields like this. The result looks much more like a normal distribution, with mean and average values near the peak.

In [None]:
obs_30 = pd.DataFrame(data=obs_30)
obs_30_transformed = obs_30.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(obs_30_transformed, showMeanLines=False)

log_transform_features.append('OBS_30_CNT_SOCIAL_CIRCLE')

#### DEF_30_CNT_SOCIAL_CIRCLE
How many observation of client's social surroundings defaulted on 30 DPD (days past due)

In [None]:
def_30 = np.nan_to_num(application_train['DEF_30_CNT_SOCIAL_CIRCLE'])
explore_numeric(def_30)

In [None]:
# We need it to be a DataFrame column
def_30 = pd.DataFrame(data=def_30)

# Select the column with the actual data
def_30 =  def_30[0]

# Apply a log transformation
def_30_transformed = def_30.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(def_30_transformed)

#numeric_features.append('DEF_30_CNT_SOCIAL_CIRCLE')
log_transform_features.append('DEF_30_CNT_SOCIAL_CIRCLE')

#### OBS_60_CNT_SOCIAL_CIRCLE
How many observations of the client's social surroundings with observable 60 DPD (days past due) default

In [None]:
obs_60 = np.nan_to_num(application_train['OBS_60_CNT_SOCIAL_CIRCLE'])
explore_numeric(obs_60)

In [None]:
# We need it to be a DataFrame column
obs_60 = pd.DataFrame(data=obs_60)

# Select the column in the DataFrame with the actual data
obs_60 = obs_60[0]

obs_60_transformed = obs_60.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(obs_60_transformed, showMeanLines=False)

#numeric_features.append('OBS_60_CNT_SOCIAL_CIRCLE')
log_transform_features.append('OBS_60_CNT_SOCIAL_CIRCLE')

#### DEF_60_CNT_SOCIAL_CIRCLE
How many observation of client's social surroundings defaulted on 60 DPD (days past due)

In [None]:
explore_non_numeric(application_train['REG_CITY_NOT_LIVE_CITY'])

#### REG_CITY_NOT_WORK_CITY
Flag if client's permanent address does not match work address (1=different, 0=same, at city level)

In [None]:
explore_non_numeric(application_train['REG_CITY_NOT_WORK_CITY'])

#### LIVE_CITY_NOT_WORK_CITY
Flag if client's contact address does not match work address (1=different, 0=same, at city level)

In [None]:
explore_non_numeric(application_train['LIVE_CITY_NOT_WORK_CITY'])

#### ORGANIZATION_TYPE
Type of organization where client works

In [None]:
explore_non_numeric(application_train['ORGANIZATION_TYPE'])

# Keep track for one-hot encoding later
non_numeric_features.append("ORGANIZATION_TYPE")

#### FONDKAPREMONT_MODE
Not sure - Described as "normalized"

In [None]:
#explore_non_numeric(application_train['FONDKAPREMONT_MODE'])
application_train['FONDKAPREMONT_MODE'] = application_train['FONDKAPREMONT_MODE'].replace({np.nan: "not specified"})

explore_non_numeric(application_train['FONDKAPREMONT_MODE'])

# Keep track for one-hot encoding later
non_numeric_features.append("FONDKAPREMONT_MODE")

#### EMERGENCYSTATE_MODE
Not sure - Described as "normalized"

In [None]:
#explore_non_numeric(application_train['FONDKAPREMONT_MODE'])
application_train['EMERGENCYSTATE_MODE'] = application_train['EMERGENCYSTATE_MODE'].replace({np.nan: "not specified"})

explore_non_numeric(application_train['EMERGENCYSTATE_MODE'])

# Keep track for one-hot encoding later
non_numeric_features.append("EMERGENCYSTATE_MODE")

#### HOUSETYPE_MODE
Not sure - Described as "normalized"

In [None]:
explore_numeric(application_train['REGION_RATING_CLIENT_W_CITY'],graphType="hist",bins=3)

#### WEEKDAY_APPR_PROCESS_START
On which day of the week did the client apply for the loan

In [None]:
explore_non_numeric(application_train['WEEKDAY_APPR_PROCESS_START'])
non_numeric_features.append("WEEKDAY_APPR_PROCESS_START")

#### HOUR_APPR_PROCESS_START
Approximately at what hour did the client apply for the loan

In [None]:
# NOTE: Although this field is an integer, it represents the hour at which the application was filed.
# Simply treating it as an integer would impart a higher weight to applications filed later in the day,
# which doesn't really make any sense.
#
# Instead, I'm treating them as 24 separate categories and one-hot encoding them.
#
# This should ensure that the application time is considered, but not arbitrarily weighted.
explore_non_numeric(application_train['HOUR_APPR_PROCESS_START'])
non_numeric_features.append("HOUR_APPR_PROCESS_START")

#### REG_REGION_NOT_LIVE_REGION
Flag if client's permanent address does not match contact address (1=different, 0=same, at region level)

In [None]:
explore_non_numeric(application_train['REG_REGION_NOT_LIVE_REGION'])

#### REG_REGION_NOT_WORK_REGION
Flag if client's permanent address does not match work address (1=different, 0=same, at region level)

In [None]:
explore_non_numeric(application_train['REG_REGION_NOT_WORK_REGION'])

#### LIVE_REGION_NOT_WORK_REGION
Flag if client's contact address does not match work address (1=different, 0=same, at region level)

In [None]:
explore_non_numeric(application_train['LIVE_REGION_NOT_WORK_REGION'])

#### REG_CITY_NOT_LIVE_CITY
Flag if client's permanent address does not match contact address (1=different, 0=same, at city level)

In [None]:
# Convert Non-Numerics
amt_annuity = np.nan_to_num(application_train['AMT_ANNUITY'], copy=True)
explore_numeric(amt_annuity)

#### Preprocessing: 
This feature is skewed right, in that we have a large number of values near zero, and body of other values distributed across the range.

It's common to do a log transformation for fields like this. The result looks normally distributed.

In [None]:
annuity = application_train['AMT_ANNUITY']
annuity_transformed = annuity.apply(lambda x: np.log(x + 1))
annuity_transformed_numeric = np.nan_to_num(annuity_transformed, copy=True)

#visualize the new distribution
explore_numeric(annuity_transformed_numeric)

numeric_features.append('AMT_ANNUITY')
#log_transform_features.append('AMT_ANNUITY')

#### AMT_GOODS_PRICE
For consumer loans it is the price of the goods for which the loan is given

In [None]:
# Convert Non-Numerics
amt_goods_price = np.nan_to_num(application_train['AMT_GOODS_PRICE'], copy=True)
explore_numeric(amt_goods_price)

In [None]:
goods_price = application_train['AMT_GOODS_PRICE']
goods_price_transformed = goods_price.apply(lambda x: np.log(x + 1))
goods_price_transformed_numeric = np.nan_to_num(goods_price_transformed, copy=True)

#visualize the new distribution
explore_numeric(goods_price_transformed_numeric)

numeric_features.append('AMT_GOODS_PRICE')
#log_transform_features.append('AMT_GOODS_PRICE')

#### NAME_TYPE_SUITE
Who was accompanying client when he was applying for the loan

In [None]:
# Replace any missing values with a string label for easier encoding.
# fillna() catches every missing value; the earlier `x is np.nan` identity test only
# matched the np.nan singleton object and would let other NaN objects slip through.
name_type_suite = application_train['NAME_TYPE_SUITE'].fillna('missing').tolist()

explore_non_numeric(name_type_suite)

# Store the cleaned values in a new NAME_TYPE_SUITE_NORMALIZED column.
# drop() returns a new frame bound to `result`; application_train keeps the original column.
application_train['NAME_TYPE_SUITE_NORMALIZED'] = name_type_suite
result = application_train.drop('NAME_TYPE_SUITE',axis=1)

# One-Hot Encode This Later
non_numeric_features.append('NAME_TYPE_SUITE')

#### NAME_INCOME_TYPE
Clients income type (businessman, working, maternity leave)

In [None]:
explore_non_numeric(application_train['NAME_INCOME_TYPE'])

# One-Hot Encode This Later
non_numeric_features.append("NAME_INCOME_TYPE")

#### NAME_EDUCATION_TYPE
Level of highest education the client achieved

In [None]:
explore_numeric(application_train['DAYS_EMPLOYED'])

#### Preprocessing: 
There's a large body of outliers that indicate that someone has worked ~1000 years (a value of 365243 days).  This is clearly wrong.  Also, I don't expect this field to be normally distributed (I think you'd see a decreasing number of people employed in the same role over time and a body of people with no job or no work history), so doing a log transformation seems inappropriate.


In [None]:
days_employed = application_train['DAYS_EMPLOYED']

# A bunch of records carry the value 365243, i.e. roughly 1000 years of employment.
# That cannot be a real duration, so we replace it with 0.
days_employed = days_employed.replace({365243: 0})

#visualize the new distribution
explore_numeric(days_employed)

# Store the cleaned values in a new DAYS_EMPLOYED_TRANSFORMED column.
# NOTE(review): drop() returns a new frame bound to `result`; application_train itself
# still contains the original DAYS_EMPLOYED column, and `result` is not used in this cell.
application_train['DAYS_EMPLOYED_TRANSFORMED'] = days_employed
result = application_train.drop('DAYS_EMPLOYED',axis=1)

# Add to our list of numeric features so that we scale the values appropriately later
# NOTE(review): this tracks the original column name, not DAYS_EMPLOYED_TRANSFORMED — confirm.
numeric_features.append('DAYS_EMPLOYED')

#### DAYS_REGISTRATION
How many days before the application did client change his registration

In [None]:
explore_numeric(application_train['DAYS_REGISTRATION'])
numeric_features.append('DAYS_REGISTRATION')

#### DAYS_ID_PUBLISH
How many days before the application did client change the identity document with which he applied for the loan

In [None]:
days_id_publish = application_train['DAYS_ID_PUBLISH']
numeric_features.append('DAYS_ID_PUBLISH')

#### OWN_CAR_AGE
Age of client's car

In [None]:
own_car_age = application_train['OWN_CAR_AGE']
own_car_age_numeric = np.nan_to_num(application_train['OWN_CAR_AGE'])
own_car_age_numeric = pd.DataFrame(data=own_car_age_numeric)
explore_numeric(own_car_age_numeric, showMeanLines=False)

#### Preprocessing: 
This feature is skewed right, in that we have a large number of values near zero, and body of other values distributed across the range.

It's common to do a log transformation for fields like this. The result looks more normally distributed, and I think the high representation of people with new or no cars is more reasonable.

In [None]:
own_car_age_numeric_transformed = own_car_age_numeric.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(own_car_age_numeric_transformed, showMeanLines=False)

#numeric_features.append('OWN_CAR_AGE')
log_transform_features.append('OWN_CAR_AGE')

#### FLAG_MOBIL
Did client provide mobile phone (1=YES, 0=NO)

In [None]:
explore_numeric(application_train['AMT_INCOME_TOTAL'])

In [None]:
income_total = application_train['AMT_INCOME_TOTAL']
income_total_transformed = income_total.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(income_total_transformed)

# Let's just try this with outlier removal
numeric_features.append('AMT_INCOME_TOTAL')

#log_transform_features.append('AMT_INCOME_TOTAL')

#### AMT_CREDIT
Credit amount of the loan

In [None]:
explore_numeric(application_train['AMT_CREDIT'])

In [None]:
credit_total = application_train['AMT_CREDIT']
credit_total_transformed = credit_total.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(credit_total_transformed)


numeric_features.append('AMT_CREDIT')

#log_transform_features.append('AMT_CREDIT')

#### AMT_ANNUITY
Loan annuity

In [None]:
def_60 = np.nan_to_num(application_train['DEF_60_CNT_SOCIAL_CIRCLE'])
explore_numeric(def_60)

In [None]:
# We need it to be a DataFrame column
def_60 = pd.DataFrame(data=def_60)

# Select the column in the DataFrame with the actual data
def_60 = def_60[0]

def_60_transformed = def_60.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(def_60_transformed)

#numeric_features.append('DEF_60_CNT_SOCIAL_CIRCLE')
log_transform_features.append('DEF_60_CNT_SOCIAL_CIRCLE')

#### DAYS_LAST_PHONE_CHANGE
How many days before application did client change phone

In [None]:
application_train['HOUSETYPE_MODE'] = application_train['HOUSETYPE_MODE'].replace({np.nan: "not specified"})

explore_non_numeric(application_train['HOUSETYPE_MODE'])

# Keep track for one-hot encoding later
non_numeric_features.append("HOUSETYPE_MODE")

#### WALLSMATERIAL_MODE
Not sure - Described as "normalized"

In [None]:
application_train['WALLSMATERIAL_MODE'] = application_train['WALLSMATERIAL_MODE'].replace({np.nan: "not specified"})

explore_non_numeric(application_train['WALLSMATERIAL_MODE'])

# Keep track for one-hot encoding later
non_numeric_features.append("WALLSMATERIAL_MODE")

#### EXT_SOURCE_1
Normalized score from external data source

In [None]:
ext_source_1 = np.nan_to_num(application_train['EXT_SOURCE_1'])
explore_numeric(ext_source_1)

#### EXT_SOURCE_2
Normalized score from external data source

In [None]:
# Treat missing scores as 0 so the summary statistics and plot can be computed
ext_source_2 = np.nan_to_num(application_train['EXT_SOURCE_2'])
# Explore EXT_SOURCE_2 itself (this previously re-plotted ext_source_1 by mistake)
explore_numeric(ext_source_2)

#### EXT_SOURCE_3
Normalized score from external data source

In [None]:
ext_source_3 = np.nan_to_num(application_train['EXT_SOURCE_3'])
explore_numeric(ext_source_3)

#### APARTMENTS_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
explore_non_numeric(application_train['FLAG_OWN_REALTY'])

In [None]:
lb = LabelBinarizer()
bool_own_realty = lb.fit_transform(application_train['FLAG_OWN_REALTY'])

explore_non_numeric(bool_own_realty)

string_to_bool_features.append('FLAG_OWN_REALTY')

### CNT_CHILDREN
Number of children the client has

In [None]:
explore_numeric(application_train['CNT_CHILDREN'])   

#### Preprocessing: 
This feature is highly skewed, in that we have a large number of values near zero, and body of other values distributed across the range.

It's common to do a log transformation for fields like this. 

In [None]:
cnt_children = application_train['CNT_CHILDREN']
cnt_children_transformed = cnt_children.apply(lambda x: np.log(x + 1))

#visualize the new distribution
explore_numeric(cnt_children_transformed, graphType="hist", bins=5)

# I think outlier removal might be a better way to handle this
#numeric_features.append('CNT_CHILDREN')
log_transform_features.append('CNT_CHILDREN')



##### AMT_INCOME_TOTAL
Income of the client

In [None]:
apartments_avg = np.nan_to_num(application_train['APARTMENTS_AVG'])
explore_numeric(apartments_avg)

#### BASEMENTAREA_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
basementarea_avg = np.nan_to_num(application_train['BASEMENTAREA_AVG'])
explore_numeric(basementarea_avg)

#### YEARS_BEGINEXPLUATATION_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
non_numeric_features = []    # non-numeric features for one-hot encoding
numeric_features = []        # numeric features (for feature scaling)
string_to_bool_features = [] # boolean features (that need conversion from string to binary)
log_transform_features = []  # skewed features that need log transformation

### Inspect Data

#### Application Data

In [None]:
display(application_train.head(n=10))

### Utility Functions

In [None]:
# Visualize the contents of a non-numeric column

def explore_non_numeric(field, showDetails=True):
    """Print the frequency of each distinct value in `field` and draw the split as a pie chart.

    field       -- array-like of values; anything np.unique accepts
    showDetails -- when true, print one line per distinct value with its count and percentage
    """

    # Frequency counting via np.unique, inspired by:
    # https://stackoverflow.com/questions/10741346/numpy-most-efficient-frequency-counts-for-unique-values-in-an-array
    values, frequencies = np.unique(field, return_counts=True)
    record_count = np.size(field)

    if showDetails:
        for value, frequency in zip(values, frequencies):
            share = (frequency / record_count) * 100
            print("Value: {0} Count: {1} Percentage: {2}".format(value, frequency, share))

    # Pie chart adapted from: https://matplotlib.org/examples/pie_and_polar_charts/pie_demo_features.html
    figure, axes = plt.subplots()
    axes.pie(frequencies, labels=values, autopct='%1.1f%%',shadow=True, startangle=90)
    axes.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.show()

In [None]:
# Visualize the contents of a numeric column

def explore_numeric(field, showUnique=False, graphType="dist", bins=20, showMeanLines=True):
    """Print summary statistics for a numeric field and plot its distribution.

    field         -- array-like of numeric values to summarize and plot
    showUnique    -- when true, also print the count and percentage of each distinct value
    graphType     -- "hist" passes `bins` to the plot; any other value uses the default binning
    bins          -- number of bins, only used when graphType == "hist"
    showMeanLines -- when true, draw vertical lines at the average (red) and mean (green)
    """
    mean = np.mean(field)
    # np.average is called without weights, so it yields the same value as np.mean
    avg = np.average(field)

    # Take a look at the range
    print("Max: {0} Min: {1} Mean: {2} Avg: {3} Std: {4}".format(
        np.max(field),
        np.min(field),
        mean,
        avg,
        np.std(field)))

    if showUnique:
        # Report on the field that was passed in (this used to be hard-coded to
        # application_train['CNT_CHILDREN'], regardless of the argument).
        unique, counts = np.unique(field, return_counts=True)
        total_records = np.size(field)

        for value, count in zip(unique, counts):
            percentage = (count / total_records) * 100
            print("Value: {0} Count: {1} Percentage: {2}".format(value, count, percentage))

    # Examine distribution
    plt.figure(figsize=(15,10))
    plt.title("Distribution Graph")
    if showMeanLines:
        plt.axvline(avg, color="red")    # Plot the average
        plt.axvline(mean, color="green") # Plot the mean

    if graphType == "hist":
        sns.distplot(field, bins=bins)
    else:
        sns.distplot(field)

#### TARGET

Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)

In [None]:
explore_non_numeric(application_train['TARGET'])

#### CODE_GENDER
Gender of the client

In [None]:
explore_non_numeric(application_train['CODE_GENDER'])

# Categorize it for one-hot encoding later
non_numeric_features.append('CODE_GENDER')

#### NAME_CONTRACT_TYPE
Identification if loan is cash or revolving

In [None]:
days = np.nan_to_num(application_train['DAYS_LAST_PHONE_CHANGE'])
explore_numeric(days)

In [None]:
days = pd.DataFrame(data=days)

# Select the column in the DataFrame with the actual data
days = days[0]

# Negative numbers are undefined for log, so offset them into the positive range.
# The minimum is kept in `days_min` rather than `min`, which would shadow Python's
# builtin min() for every cell executed afterwards.
days_min = np.min(days)
offset = 2 * abs(days_min)

days_transformed = days.apply(lambda x: np.log(x + offset + 1))

#visualize the new distribution
explore_numeric(np.nan_to_num(days_transformed), showMeanLines=False)

#numeric_features.append('DAYS_LAST_PHONE_CHANGE')
log_transform_features.append('DAYS_LAST_PHONE_CHANGE')

#### AMT_REQ_CREDIT_BUREAU_HOUR
Number of enquiries to Credit Bureau about the client one hour before application

In [None]:
hour = np.nan_to_num(application_train['AMT_REQ_CREDIT_BUREAU_HOUR'])
explore_numeric(hour)
numeric_features.append("AMT_REQ_CREDIT_BUREAU_HOUR")

#### AMT_REQ_CREDIT_BUREAU_DAY
Number of enquiries to Credit Bureau about the client one day before application (excluding one hour before application)

In [None]:
day = np.nan_to_num(application_train['AMT_REQ_CREDIT_BUREAU_DAY'])
explore_numeric(day)
numeric_features.append("AMT_REQ_CREDIT_BUREAU_DAY")

#### AMT_REQ_CREDIT_BUREAU_WEEK
Number of enquiries to Credit Bureau about the client one week before application (excluding one day before application)

In [None]:
# Treat missing enquiry counts as 0. Stored as `week` so this cell no longer
# overwrites the `hour` array built from AMT_REQ_CREDIT_BUREAU_HOUR.
week = np.nan_to_num(application_train['AMT_REQ_CREDIT_BUREAU_WEEK'])
explore_numeric(week)
numeric_features.append("AMT_REQ_CREDIT_BUREAU_WEEK")

#### AMT_REQ_CREDIT_BUREAU_MONTH
Number of enquiries to Credit Bureau about the client one month before application (excluding one week before application)

#### LANDAREA_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
landarea_avg = np.nan_to_num(application_train['LANDAREA_AVG'])
explore_numeric(landarea_avg)

#### LIVINGAPARTMENTS_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
livingapartments_avg = np.nan_to_num(application_train['LIVINGAPARTMENTS_AVG'])
explore_numeric(livingapartments_avg)

#### LIVINGAREA_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
livingarea_avg = np.nan_to_num(application_train['LIVINGAREA_AVG'])
explore_numeric(livingarea_avg)

### Note: It looks like these cells are well-normalized and usable as-is.  Skipping the remainder of the normalized housing stats.

#### OBS_30_CNT_SOCIAL_CIRCLE
How many observation of client's social surroundings with observable 30 DPD (days past due) default

In [None]:
explore_non_numeric(application_train['NAME_EDUCATION_TYPE'])

# One-Hot Encode This Later
non_numeric_features.append("NAME_EDUCATION_TYPE")

#### NAME_FAMILY_STATUS
Family status of the client

In [None]:
explore_non_numeric(application_train['NAME_FAMILY_STATUS'])

# One-Hot Encode This Later
non_numeric_features.append("NAME_FAMILY_STATUS")

#### NAME_HOUSING_TYPE
What is the housing situation of the client (renting, living with parents, ...)

In [None]:
explore_non_numeric(application_train['NAME_HOUSING_TYPE'])

# One-Hot Encode This Later
non_numeric_features.append("NAME_HOUSING_TYPE")

#### REGION_POPULATION_RELATIVE
Normalized population of region where client lives (higher number means the client lives in more populated region)

#### FLOORSMAX_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
floorsmax_avg = np.nan_to_num(application_train['FLOORSMAX_AVG'])
explore_numeric(floorsmax_avg)

#### FLOORSMIN_AVG
Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor

In [None]:
floorsmin_avg = np.nan_to_num(application_train['FLOORSMIN_AVG'])
explore_numeric(floorsmin_avg)

In [None]:
month = np.nan_to_num(application_train['AMT_REQ_CREDIT_BUREAU_MON'])
explore_numeric(month)
numeric_features.append("AMT_REQ_CREDIT_BUREAU_MON")

#### AMT_REQ_CREDIT_BUREAU_QRT
Number of enquiries to Credit Bureau about the client three months before application (excluding one month before application)

In [None]:
explore_non_numeric(application_train['NAME_CONTRACT_TYPE'])

# Categorize it for one-hot encoding later
non_numeric_features.append('NAME_CONTRACT_TYPE')

#### FLAG_OWN_CAR

Flag if the client owns a car

In [None]:
explore_non_numeric(application_train['FLAG_OWN_CAR'])

In [None]:
lb = LabelBinarizer()
bool_own_car = lb.fit_transform(application_train['FLAG_OWN_CAR'])

explore_non_numeric(bool_own_car)

string_to_bool_features.append("FLAG_OWN_CAR")

#### FLAG_OWN_REALTY

Flag if client owns a house or flat

In [None]:
explore_non_numeric(application_train['FLAG_EMP_PHONE'])

In [None]:
lb = LabelBinarizer()
bool_emp_phone = lb.fit_transform(application_train['FLAG_EMP_PHONE'])

explore_non_numeric(bool_emp_phone)

string_to_bool_features.append('FLAG_EMP_PHONE')

#### FLAG_WORK_PHONE
Did client provide home phone (1=YES, 0=NO)

In [None]:
explore_non_numeric(application_train['FLAG_WORK_PHONE'])

In [None]:
lb = LabelBinarizer()
bool_work_phone = lb.fit_transform(application_train['FLAG_WORK_PHONE'])

explore_non_numeric(bool_work_phone)

string_to_bool_features.append('FLAG_WORK_PHONE')

#### FLAG_PHONE
Did client provide home phone (1=YES, 0=NO)

In [None]:
explore_non_numeric(application_train['FLAG_PHONE'])

In [None]:
explore_numeric(application_train['REGION_POPULATION_RELATIVE'])

# Add to our list of numeric features so that we scale the values appropriately later
numeric_features.append('REGION_POPULATION_RELATIVE')

#### DAYS_BIRTH
Client's age in days at the time of application (stored as a negative number of days counted back from the application date)

In [None]:
explore_numeric(application_train['DAYS_BIRTH'])

# Add to our list of numeric features so that we scale the values appropriately later
numeric_features.append('DAYS_BIRTH')

#### DAYS_EMPLOYED
How many days before the application the person started current employment

In [None]:
qrt = np.nan_to_num(application_train['AMT_REQ_CREDIT_BUREAU_QRT'])
explore_numeric(qrt)
numeric_features.append("AMT_REQ_CREDIT_BUREAU_QRT")

#### AMT_REQ_CREDIT_BUREAU_YEAR
Number of enquiries to Credit Bureau about the client one year before application (excluding last 3 months before application)

In [None]:
year = np.nan_to_num(application_train['AMT_REQ_CREDIT_BUREAU_YEAR'])
explore_numeric(year)
numeric_features.append("AMT_REQ_CREDIT_BUREAU_YEAR")



#### FLAG_DOCUMENT_4 - 21
Did client provide document 4

In [None]:
# Show the value breakdown for each of the FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21 columns in turn
for document_number in range(2, 22):
    print("Document {0}".format(document_number))
    explore_non_numeric(application_train['FLAG_DOCUMENT_{0}'.format(document_number)])

    # Blank line between documents, matching the spacing of the printed output
    # is not needed: each call ends with its own chart.

In [None]:
# Add some more "normalized" fields for numeric processing.
# The names are listed once each (the LANDAREA_* trio used to be appended twice),
# in the same order as before.
normalized_features = [
    "APARTMENTS_AVG", "APARTMENTS_MEDI", "APARTMENTS_MODE",
    "BASEMENTAREA_AVG", "BASEMENTAREA_MEDI", "BASEMENTAREA_MODE",
    "COMMONAREA_AVG", "COMMONAREA_MEDI", "COMMONAREA_MODE",
    "ELEVATORS_AVG", "ELEVATORS_MEDI", "ELEVATORS_MODE",
    "ENTRANCES_AVG", "ENTRANCES_MEDI", "ENTRANCES_MODE",
    "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3",
    "FLOORSMAX_AVG", "FLOORSMAX_MEDI", "FLOORSMAX_MODE",
    "FLOORSMIN_AVG", "FLOORSMIN_MEDI", "FLOORSMIN_MODE",
    "LANDAREA_AVG", "LANDAREA_MEDI", "LANDAREA_MODE",
    "LIVINGAPARTMENTS_AVG", "LIVINGAPARTMENTS_MEDI", "LIVINGAPARTMENTS_MODE",
    "LIVINGAREA_AVG", "LIVINGAREA_MEDI", "LIVINGAREA_MODE",
    "NONLIVINGAPARTMENTS_AVG", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAPARTMENTS_MODE",
    "NONLIVINGAREA_AVG", "NONLIVINGAREA_MEDI", "NONLIVINGAREA_MODE",
    "TOTALAREA_MODE",
    "YEARS_BEGINEXPLUATATION_AVG", "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BEGINEXPLUATATION_MODE",
    "YEARS_BUILD_AVG", "YEARS_BUILD_MEDI", "YEARS_BUILD_MODE",
]

# Skip names that are already registered so that re-running this cell (or an
# earlier cell having added the same column) never produces duplicate entries.
for feature_name in normalized_features:
    if feature_name not in numeric_features:
        numeric_features.append(feature_name)

In [None]:
# Explore the FLAG_MOBIL column with the notebook's explore_non_numeric helper
# (defined outside this section).
explore_non_numeric(application_train['FLAG_MOBIL'])

NOTE: Since 99.999% of applicants provided a mobile phone, FLAG_MOBIL has essentially no value for differentiating between them, so we can drop this feature.

#### FLAG_EMP_PHONE

Did client provide work phone (1=YES, 0=NO)

In [None]:
# NOTE(review): this cell follows the FLAG_EMP_PHONE heading but explores
# YEARS_BEGINEXPLUATATION_AVG -- confirm the cell order of the notebook.
# np.nan_to_num replaces NaN entries with 0 before the column is explored.
years_beginexpluatation_avg = np.nan_to_num(application_train['YEARS_BEGINEXPLUATATION_AVG'])
explore_numeric(years_beginexpluatation_avg)

#### YEARS_BUILD_AVG
Normalized information about the building where the client lives: the average (_AVG suffix), mode (_MODE suffix), or median (_MEDI suffix) of apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, and number of floors.

In [None]:
# np.nan_to_num replaces NaN entries with 0 before the column is explored.
years_build_avg = np.nan_to_num(application_train['YEARS_BUILD_AVG'])
# explore_numeric is a helper defined outside this section of the notebook.
explore_numeric(years_build_avg)

#### COMMONAREA_AVG
Normalized information about the building where the client lives: the average (_AVG suffix), mode (_MODE suffix), or median (_MEDI suffix) of apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, and number of floors.

In [None]:
# np.nan_to_num replaces NaN entries with 0 before the column is explored.
commonarea_avg = np.nan_to_num(application_train['COMMONAREA_AVG'])
# explore_numeric is a helper defined outside this section of the notebook.
explore_numeric(commonarea_avg)

#### ELEVATORS_AVG
Normalized information about the building where the client lives: the average (_AVG suffix), mode (_MODE suffix), or median (_MEDI suffix) of apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, and number of floors.

In [None]:
# np.nan_to_num replaces NaN entries with 0 before the column is explored.
elevators_avg = np.nan_to_num(application_train['ELEVATORS_AVG'])
# explore_numeric is a helper defined outside this section of the notebook.
explore_numeric(elevators_avg)

#### ENTRANCES_AVG
Normalized information about the building where the client lives: the average (_AVG suffix), mode (_MODE suffix), or median (_MEDI suffix) of apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, and number of floors.

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display # Allows the use of display() for DataFrames
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer

# Show all of the data in a dataframe: a max_columns of None removes the cap on
# how many columns pandas renders, so wide frames are not truncated.
pd.set_option('display.max_columns', None)

### Load Data

In [None]:
# Training set: read the application table and report its dimensions.
application_train = pd.read_csv('data/application_train.csv')
# DataFrame.shape is (rows, columns), so it unpacks straight into the two format slots.
print("Loaded Training Set: {0} rows {1} columns".format(*application_train.shape))

# Column-description table (going by its file name -- contents not shown in this section).
columns = pd.read_csv('data/HomeCredit_columns_description.csv')

In [None]:
# NOTE(review): the ENTRANCES_AVG heading appears before the Imports / Load Data
# cells, while this cell comes after them -- confirm the cell order of the notebook.
# np.nan_to_num replaces NaN entries with 0 before the column is explored.
entrances_avg = np.nan_to_num(application_train['ENTRANCES_AVG'])
explore_numeric(entrances_avg)