In [1]:
# Import libraries and data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

frame = pd.read_csv("data-with-ids/loan.csv", index_col = None)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Cast emp_length to int
def emp_length_to_int(s):
    return max([int(n) for n in str(s).split() if n.isdigit()] + [0])
frame['emp_length_num'] = frame['emp_length'].apply(emp_length_to_int)

# Add late and default dummies
frame['late_16_30'] = (frame.loan_status == "Late (16-30 days)") * 1
frame['late_31_120'] = (frame.loan_status == "Late (31-120 days)") * 1
frame['default'] = (frame.loan_status == "Default") * 1

# Add interest-to-total-received and late-fees-to-total-received ratios
frame['int_to_total'] = (frame['total_rec_int'] / frame['total_pymnt']).fillna(0)
frame['late_fees_to_total'] = (frame['total_rec_late_fee'] / frame['total_pymnt']).fillna(0)

# Convert earliest credit line to date
frame['earliest_cr_line_date'] = pd.to_datetime(frame.earliest_cr_line)
frame['earliest_cr_line_date'] = (frame['earliest_cr_line_date'] - frame['earliest_cr_line_date'].min()) / np.timedelta64(1,'M')


In [3]:
frame['earliest_cr_line_date'].min()

0.0

## m a c h i n e l e a r n i n g

In [6]:
# Construct data frame of just pre-application characteristics

post_variables = ["acc_now_delinq", "acc_open_past_24mths", "all_util", 
                "chargeoff_within_12_mths", "collection_recovery_fee", 
                "funded_amnt", "funded_amnt_inv", "grade", "id", "initial_list_status",
                "installment", "int_rate", "issue_d", "last_credit_pull_d", 
                "last_fico_range_high", "last_fico_range_low", "last_pymnt_amnt",
                "last_pymnt_d", "member_id", "next_pymnt_d", "out_prncp", 
                "out_prncp_inv", "pymnt_plan", "sub_grade", "term",
                "total_pymnt", "total_pymnt_inv", "total_rec_int", "total_rec_late_fee",
                "total_rec_prncp", "hardship_flag", "hardship_type", "hardship_reason",
                "hardship_status", "deferral_term", "hardship_amount",
                "hardship_start_date", "hardship_end_date", "payment_plan_start_date",
                "hardship_length", "hardship_dpd", "hardship_loan_status",
                "earliest_cr_line",
                "zip_code", "title", "purpose", "desc", "url", "loan_status", "emp_length",
                "emp_title"]
post_variables = list(set(post_variables).intersection(frame.columns))

pre_data = frame.drop(post_variables, axis=1)
pre_data_dummies = pd.get_dummies(pre_data)

# Drop nas
pre_data_dummies_no_na = pre_data_dummies[pre_data_dummies.columns[pre_data_dummies.isnull().sum() < 100]].dropna()

In [30]:
## Random forest regression of default on pre-application characteristics


# Split data into traning and test

from sklearn.model_selection import train_test_split

regressands = ["late_16_30", "late_31_120", "default", "int_to_total", "late_fees_to_total"]
features = pre_data_dummies_no_na.drop(regressands, axis=1)
labels = pre_data_dummies_no_na[regressands]
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.8, 
                                                                            random_state = 42)

# Fit random forest

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 20, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [31]:
# Check feature importance for RF

feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = train_features.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances[:10]

Unnamed: 0,importance
dti,0.122996
loan_amnt,0.118713
revol_bal,0.110505
earliest_cr_line_date,0.101277
annual_inc,0.091214
total_acc,0.075174
open_acc,0.061035
emp_length_num,0.042507
inq_last_6mths,0.029105
delinq_2yrs,0.017839


In [32]:
# Predict

predictions = rf.predict(test_features)

In [43]:
# Check mean absolute prediction error

np.mean(np.abs(predictions - test_labels), axis=0)

late_16_30            0.006152
late_31_120           0.029241
default               0.003137
int_to_total          0.115799
late_fees_to_total    0.000167
dtype: float64