In [24]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 5000)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (12,8)

Data

In [25]:
data = pd.read_parquet("data/data_target_4xgb.parquet.gzip")
data.shape

(1857375, 121)

# Drop columns

## Co-borrowers

In [26]:
co_borr_cols = [
    "annual_inc_joint", "application_type", "dti_joint", "revol_bal_joint", "sec_app_fico_range_low", "sec_app_fico_range_high",
    "sec_app_earliest_cr_line", "sec_app_inq_last_6mths", "sec_app_mort_acc", "sec_app_open_acc", "sec_app_revol_util", "sec_app_open_act_il", "sec_app_num_rev_accts",
    "sec_app_chargeoff_within_12_mths", "sec_app_collections_12_mths_ex_med"
]

data.drop(co_borr_cols, axis=1, inplace=True)

## Hardship

In [27]:
hardship_cols = [
    "hardship_flag", "hardship_type", "hardship_reason", "hardship_status", "deferral_term", "hardship_amount", "hardship_start_date", "hardship_end_date", 
    "payment_plan_start_date", "hardship_length", "hardship_dpd", "hardship_loan_status", "orig_projected_additional_accrued_interest", "hardship_payoff_balance_amount",
    "hardship_last_payment_amount"
]

data.drop(hardship_cols, axis=1, inplace=True)

# Ordinal encodeing

## Process emp_length

In [28]:
mapping_dict = {
    "emp_length": {
        "10+ years": '10',
        "9 years": '9',
        "8 years": '8',
        "7 years": '7',
        "6 years": '6',
        "5 years": '5',
        "4 years": '4',
        "3 years": '3',
        "2 years": '2',
        "1 year": '1',
        "< 1 year": '0',
        "n/a": '0'
    },
}
data = data.replace(mapping_dict)
data['emp_length'] = pd.to_numeric(data['emp_length'])
data[['emp_length']].head()

Unnamed: 0,emp_length
0,10.0
1,0.0
2,10.0
3,10.0
4,1.0


In [29]:
data['emp_length'] = data['emp_length'].fillna(0)

## Process sub_grade

In [30]:
int_grade = []

dd = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6
}

for elem in data["sub_grade"]:
    int_grade.append(
        dd[elem[0]] * 5 + int(elem[1])
    )
data["sub_grade"] = int_grade

Quick check

In [31]:
for i in (1, 36):
    assert i in data["sub_grade"]

# One-hot encoding

In [32]:
nominal_columns = ["home_ownership", "purpose", "verification_status"]

for column in nominal_columns:
    dummy_df = pd.get_dummies(data[column])
    dummy_df = dummy_df.add_prefix(f"{column}_ohe_")
    data = pd.concat([data, dummy_df], axis=1)

### One-hot top ones

In [33]:
value_counts = data['addr_state'].value_counts()

most_frequent = value_counts[value_counts >= 100_000].index
one_hot_encoded = pd.get_dummies(data['addr_state'][data['addr_state'].isin(most_frequent)])

In [34]:
one_hot_encoded

Unnamed: 0,CA,FL,NY,TX
3,1,0,0,0
7,1,0,0,0
8,1,0,0,0
9,0,0,0,1
11,1,0,0,0
...,...,...,...,...
1857365,0,1,0,0
1857367,1,0,0,0
1857369,1,0,0,0
1857373,0,0,1,0


In [35]:
data = pd.concat([data, one_hot_encoded], axis=1)

In [36]:
value_counts = data['emp_title'].value_counts()

most_frequent = value_counts[value_counts >= 100_000].index
one_hot_encoded = pd.get_dummies(data['emp_title'][data['emp_title'].isin(most_frequent)])
one_hot_encoded

In [37]:
data = pd.concat([data, one_hot_encoded], axis=1)

# Debt settlement flag

In [38]:
data.loc[data["debt_settlement_flag"] == 'N', "debt_settlement_flag"] = '0'
data.loc[data["debt_settlement_flag"] == 'Y', "debt_settlement_flag"] = '1'

data["debt_settlement_flag"] = pd.to_numeric(data["debt_settlement_flag"])

In [39]:
data.loc[data["initial_list_status"] == 'w', "initial_list_status"] = '1'
data.loc[data["initial_list_status"] == 'f', "initial_list_status"] = '0'

data["initial_list_status"] = pd.to_numeric(data["initial_list_status"])

# New columns

Let's now go ahead and create a column for the average of fico_range_low and fico_range_high columns and name it fico_average. Note that this is not the average FICO score for each borrower, but rather an average of the high and low range that we know the borrower is in.

In [40]:
data['fico_average'] = (data['fico_range_high'] + data['fico_range_low']) / 2
drop_cols = ['fico_range_low', 'fico_range_high']
data = data.drop(drop_cols, axis=1)

# Fill nans

In [41]:
data['acc_open_past_24mths'] = data['acc_open_past_24mths'].fillna(0)

# Filter non-numeric data

In [42]:
dd = dict(data.dtypes)
numeric_cols = []

for column, type_name in dd.items():
    type_name = str(type_name).lower()
    if 'float' not in type_name and 'int' not in type_name:
        pass
    else:
        numeric_cols.append(column)

len(numeric_cols)

107

In [43]:
data_numeric = data[numeric_cols].copy()
data_numeric.shape

(1857375, 107)

In [44]:
half_count = len(data_numeric) / 2
data_numeric.dropna(thresh=half_count, axis=1, inplace=True) # Drop any column with more than 50% missing values
data_numeric.shape

(1857375, 97)

In [45]:
data_numeric.to_parquet("data/data_with_feats_4xgb.parquet.gzip", compression='gzip')

# Drafts

In [46]:
# drop_cols = ['last_credit_pull_d', 'addr_state', 'title','earliest_cr_line']
# data = data.drop(drop_cols,axis=1)