In [27]:
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 5000)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (12,8)

Data

In [28]:
data = pd.read_parquet("data/data_target_4xgb.parquet.gzip")
data.shape

(1860764, 119)

In [31]:
for col in data.columns:
    print(col)

loan_amnt
term
int_rate
installment
sub_grade
emp_title
emp_length
home_ownership
annual_inc
verification_status
url
purpose
zip_code
addr_state
dti
delinq_2yrs
earliest_cr_line
fico_range_low
fico_range_high
inq_last_6mths
mths_since_last_delinq
mths_since_last_record
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
last_credit_pull_d
collections_12_mths_ex_med
mths_since_last_major_derog
application_type
annual_inc_joint
dti_joint
verification_status_joint
acc_now_delinq
tot_coll_amt
tot_cur_bal
open_acc_6m
open_act_il
open_il_12m
open_il_24m
mths_since_rcnt_il
total_bal_il
il_util
open_rv_12m
open_rv_24m
max_bal_bc
all_util
total_rev_hi_lim
inq_fi
total_cu_tl
inq_last_12m
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
bc_util
chargeoff_within_12_mths
delinq_amnt
mo_sin_old_il_acct
mo_sin_old_rev_tl_op
mo_sin_rcnt_rev_tl_op
mo_sin_rcnt_tl
mort_acc
mths_since_recent_bc
mths_since_recent_bc_dlq
mths_since_recent_inq
mths_since_recent_revol_delinq
num_accts_ever_120_

# Drop columns

## Co-borrowers

In [3]:
co_borr_cols = [
    "annual_inc_joint", "application_type", "dti_joint", "revol_bal_joint", "sec_app_fico_range_low", "sec_app_fico_range_high",
    "sec_app_earliest_cr_line", "sec_app_inq_last_6mths", "sec_app_mort_acc", "sec_app_open_acc", "sec_app_revol_util", "sec_app_open_act_il", "sec_app_num_rev_accts",
    "sec_app_chargeoff_within_12_mths", "sec_app_collections_12_mths_ex_med"
]

data.drop(co_borr_cols, axis=1, inplace=True)

## Hardship

In [4]:
hardship_cols = [
    "hardship_flag", "hardship_type", "hardship_reason", "hardship_status", "deferral_term", "hardship_amount", "hardship_start_date", "hardship_end_date", 
    "payment_plan_start_date", "hardship_length", "hardship_dpd", "hardship_loan_status", "orig_projected_additional_accrued_interest", "hardship_payoff_balance_amount",
    "hardship_last_payment_amount"
]

data.drop(hardship_cols, axis=1, inplace=True)

# Ordinal encodeing

## Process emp_length

In [5]:
mapping_dict = {
    "emp_length": {
        "10+ years": '10',
        "9 years": '9',
        "8 years": '8',
        "7 years": '7',
        "6 years": '6',
        "5 years": '5',
        "4 years": '4',
        "3 years": '3',
        "2 years": '2',
        "1 year": '1',
        "< 1 year": '0',
        "n/a": '0'
    },
}
data = data.replace(mapping_dict)
data['emp_length'] = pd.to_numeric(data['emp_length'])
data[['emp_length']].head()

Unnamed: 0,emp_length
0,10.0
1,0.0
2,10.0
3,10.0
4,1.0


In [6]:
data['emp_length'] = data['emp_length'].fillna(0)

## Process sub_grade

In [7]:
int_grade = []

dd = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6
}

for elem in data["sub_grade"]:
    int_grade.append(
        dd[elem[0]] * 5 + int(elem[1])
    )
data["sub_grade"] = int_grade

Quick check

In [8]:
for i in (1, 36):
    assert i in data["sub_grade"]

# One-hot encoding

In [9]:
nominal_columns = ["home_ownership", "purpose", "verification_status"]
dummy_df = pd.get_dummies(data[nominal_columns])
data = pd.concat([data, dummy_df], axis=1)
data = data.drop(nominal_columns, axis=1)

### One-hot top ones

In [10]:
value_counts = data['addr_state'].value_counts()

most_frequent = value_counts[value_counts >= 100_000].index
one_hot_encoded = pd.get_dummies(data['addr_state'][data['addr_state'].isin(most_frequent)])

In [11]:
one_hot_encoded

Unnamed: 0,CA,FL,NY,TX
3,1,0,0,0
7,1,0,0,0
8,1,0,0,0
9,0,0,0,1
11,1,0,0,0
...,...,...,...,...
1860754,0,1,0,0
1860756,1,0,0,0
1860758,1,0,0,0
1860762,0,0,1,0


In [12]:
data = pd.concat([data, one_hot_encoded], axis=1)

In [13]:
value_counts = data['emp_title'].value_counts()

most_frequent = value_counts[value_counts >= 100_000].index
one_hot_encoded = pd.get_dummies(data['emp_title'][data['emp_title'].isin(most_frequent)])
one_hot_encoded

In [14]:
data = pd.concat([data, one_hot_encoded], axis=1)

# Debt settlement flag

In [15]:
data.loc[data["debt_settlement_flag"] == 'N', "debt_settlement_flag"] = '0'
data.loc[data["debt_settlement_flag"] == 'Y', "debt_settlement_flag"] = '1'

data["debt_settlement_flag"] = pd.to_numeric(data["debt_settlement_flag"])

In [16]:
data.loc[data["initial_list_status"] == 'w', "initial_list_status"] = '1'
data.loc[data["initial_list_status"] == 'f', "initial_list_status"] = '0'

data["initial_list_status"] = pd.to_numeric(data["initial_list_status"])

In [17]:
data["term"].value_counts()

36 months    1393606
60 months     467158
Name: term, dtype: Int64

In [18]:
data.loc[data["term"] == ' 36 months', "term"] = '36'
data.loc[data["term"] == ' 60 months', "term"] = '60'

data["term"] = pd.to_numeric(data["term"])

# New columns

Let's now go ahead and create a column for the average of fico_range_low and fico_range_high columns and name it fico_average. Note that this is not the average FICO score for each borrower, but rather an average of the high and low range that we know the borrower is in.

In [19]:
data['fico_average'] = (data['fico_range_high'] + data['fico_range_low']) / 2
drop_cols = ['fico_range_low', 'fico_range_high']
data = data.drop(drop_cols, axis=1)

# Fill nans

In [20]:
data['acc_open_past_24mths'] = data['acc_open_past_24mths'].fillna(0)

# Filter non-numeric data

In [21]:
dd = dict(data.dtypes)
numeric_cols = []

for column, type_name in dd.items():
    type_name = str(type_name).lower()
    if 'float' not in type_name and 'int' not in type_name:
        pass
    else:
        numeric_cols.append(column)

len(numeric_cols)

82

In [22]:
data_numeric = data[numeric_cols].copy()
data_numeric.shape

(1860764, 82)

In [23]:
half_count = len(data_numeric) / 2
data_numeric.dropna(thresh=half_count, axis=1, inplace=True) # Drop any column with more than 50% missing values
data_numeric.shape

(1860764, 72)

In [24]:
data_numeric.to_parquet("data/data_with_feats_4xgb.parquet.gzip", compression='gzip')

# Drafts

In [25]:
# drop_cols = ['last_credit_pull_d', 'addr_state', 'title','earliest_cr_line']
# data = data.drop(drop_cols,axis=1)