## Load and clean data

In [1]:
# Import libraries and data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

frame = pd.read_csv("data-with-ids/loan.csv", index_col = None)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
frame.shape

(887379, 74)

In [2]:
# Cast emp_length to int
def emp_length_to_int(s):
    return max([int(n) for n in str(s).split() if n.isdigit()] + [0])
frame['emp_length_num'] = frame['emp_length'].apply(emp_length_to_int)

# Add late and default dummies
frame['late_16_30'] = (frame.loan_status == "Late (16-30 days)") * 1
frame['late_31_120'] = (frame.loan_status == "Late (31-120 days)") * 1
frame['default'] = (frame.loan_status == "Default") * 1

# Add interest-to-total-received and late-fees-to-total-received ratios
frame['int_to_total'] = (frame['total_rec_int'] / frame['total_pymnt']).fillna(0)
frame['late_fees_to_total'] = (frame['total_rec_late_fee'] / frame['total_pymnt']).fillna(0)

# Convert earliest credit line to date
frame['earliest_cr_line_date'] = pd.to_datetime(frame.earliest_cr_line)
frame['earliest_cr_line_date'] = (frame['earliest_cr_line_date'] - frame['earliest_cr_line_date'].min()) / np.timedelta64(1,'M')


In [3]:
frame['earliest_cr_line_date'].min()

0.0

## m a c h i n e l e a r n i n g

In [6]:
# Construct data frame of just pre-application characteristics

post_variables = ["acc_now_delinq", "acc_open_past_24mths", "all_util", 
                "chargeoff_within_12_mths", "collection_recovery_fee", 
                "funded_amnt", "funded_amnt_inv", "grade", "id", "initial_list_status",
                "installment", "int_rate", "issue_d", "last_credit_pull_d", 
                "last_fico_range_high", "last_fico_range_low", "last_pymnt_amnt",
                "last_pymnt_d", "member_id", "next_pymnt_d", "out_prncp", 
                "out_prncp_inv", "pymnt_plan", "sub_grade", "term",
                "total_pymnt", "total_pymnt_inv", "total_rec_int", "total_rec_late_fee",
                "total_rec_prncp", "hardship_flag", "hardship_type", "hardship_reason",
                "hardship_status", "deferral_term", "hardship_amount",
                "hardship_start_date", "hardship_end_date", "payment_plan_start_date",
                "hardship_length", "hardship_dpd", "hardship_loan_status",
                "earliest_cr_line",
                "zip_code", "title", "purpose", "desc", "url", "loan_status", "emp_length",
                "emp_title"]
post_variables = list(set(post_variables).intersection(frame.columns))

pre_data = frame.drop(post_variables, axis=1)
pre_data_dummies = pd.get_dummies(pre_data)

# Drop nas
pre_data_dummies_no_na = pre_data_dummies[pre_data_dummies.columns[pre_data_dummies.isnull().sum() < 100]].dropna()

In [61]:
## Random forest regression of default on pre-application characteristics


# Split data into traning and test

from sklearn.model_selection import train_test_split

regressands = ["late_16_30", "late_31_120", "default", "int_to_total", "late_fees_to_total"]
features = pre_data_dummies_no_na.drop(regressands, axis=1)
labels = pre_data_dummies_no_na[regressands]
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.8, 
                                                                            random_state = 42)

# Fit random forest

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 64, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=64, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [62]:
# Check feature importance for RF

feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = train_features.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
feature_importances[:10]

Unnamed: 0,importance
dti,0.123189
loan_amnt,0.118567
revol_bal,0.110298
earliest_cr_line_date,0.100971
annual_inc,0.091306
total_acc,0.075287
open_acc,0.061161
emp_length_num,0.042546
inq_last_6mths,0.029077
delinq_2yrs,0.017829


In [101]:
# Predict out of sample with random forest
predictions = rf.predict(test_features)

In [105]:
# Getted fitted values (predictions in sample)
fitted_vals = rf.predict(train_features)

In [117]:
# Check in sample performance
np.sqrt(np.mean((fitted_vals - train_labels) ** 2, axis=0))

late_16_30            0.020328
late_31_120           0.044456
default               0.014521
int_to_total          0.054906
late_fees_to_total    0.000972
dtype: float64

In [116]:
# Check mean squared error out of sample
np.sqrt(np.mean((predictions - test_labels) ** 2, axis=0))

late_16_30            0.052858
late_31_120           0.116974
default               0.038159
int_to_total          0.145281
late_fees_to_total    0.002458
dtype: float64

In [118]:
# Mean squared error of naive estimator that guesses 0 for everything
np.sqrt(np.mean(train_labels ** 2))

late_16_30            0.052331
late_31_120           0.114582
default               0.036851
int_to_total          0.314678
late_fees_to_total    0.002751
dtype: float64

In [65]:
# Append predictions to test data
predicted_cols = [c + "_pred" for c in train_labels.columns]
predictions_df = pd.DataFrame(predictions, columns = predicted_cols, index = test_features.index)

test_data_with_preds = test_features.join(predictions_df)

## Logistic regression model

In [188]:
## Logistic regression of default (and other risks) on pre-application characteristics

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_features, train_labels.default)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [189]:
np.sqrt(np.mean((lr.predict_proba(train_features)[:,1] - train_labels.default) ** 2))

0.036900647586326128

In [194]:
lr.predict_proba(train_features)[:,1]

array([ 0.00605395,  0.00118431,  0.00117701, ...,  0.00053972,
        0.00277197,  0.00227416])

In [193]:
train_features.iloc[69742]

loan_amnt                                     6000.000000
annual_inc                                   19328.000000
dti                                             34.220000
delinq_2yrs                                      0.000000
inq_last_6mths                                   0.000000
open_acc                                         4.000000
pub_rec                                          0.000000
revol_bal                                    18626.000000
total_acc                                        8.000000
recoveries                                       0.000000
policy_code                                      1.000000
emp_length_num                                   0.000000
earliest_cr_line_date                          120.018892
home_ownership_ANY                               0.000000
home_ownership_MORTGAGE                          0.000000
home_ownership_NONE                              0.000000
home_ownership_OTHER                             0.000000
home_ownership