In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

from imblearn.over_sampling import SMOTE
import pickle

In [2]:
import sklearn
# Record the scikit-learn version in the notebook output for reproducibility.
version_message = 'The sklearn version is {}.'.format(sklearn.__version__)
print(version_message)

The sklearn version is 0.20.1.


In [3]:
# Load the cleaned loan tables: 2013-14 is used for training, 2015 for testing.
# low_memory=False disables chunked parsing so each column's dtype is inferred
# from the whole file.
data_train = pd.read_csv("cleaned_2013_14", low_memory=False)
data_test = pd.read_csv("cleaned_2015", low_memory=False)

In [4]:
# Display the column labels of the training table.
data_train.columns

Index(['funded_amnt', 'int_rate', 'total_pymnt', 'annual_inc', 'dti',
       'loan_status', 'revol_util', 'term', 'term_adj', 'emp_length_1 year',
       'emp_length_10+ years', 'emp_length_2 years', 'emp_length_3 years',
       'emp_length_4 years', 'emp_length_5 years', 'emp_length_6 years',
       'emp_length_7 years', 'emp_length_8 years', 'emp_length_9 years',
       'emp_length_< 1 year', 'home_ownership_ANY', 'home_ownership_MORTGAGE',
       'home_ownership_OWN', 'home_ownership_RENT',
       'verification_status_Not Verified',
       'verification_status_Source Verified', 'verification_status_Verified',
       'grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F',
       'grade_G', 'purpose_car', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_home_improvement',
       'purpose_house', 'purpose_major_purchase', 'purpose_medical',
       'purpose_moving', 'purpose_other', 'purpose_renewable_energy',
       'purpose_small_business', 'purpose_vacat

In [5]:
# (rows, columns) of the test table.
data_test.shape

(346845, 49)

In [6]:
# Element-wise check that the test and training tables have the same column
# labels in the same positions (one boolean per column).
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [7]:
def split_data(df, cols):
    """Separate a loan table into model features and the ``paid`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a ``paid`` column and every label in ``cols``.
    cols : list of str
        Column labels to leave out of the feature matrix.

    Returns
    -------
    tuple
        ``(features, target)``: ``df`` without the columns in ``cols``, and
        the ``paid`` column of ``df``. ``df`` itself is not modified.
    """
    features = df.drop(labels=cols, axis='columns')
    target = df.paid
    return features, target

# Columns kept out of the feature matrix: 'paid' is the target, and 'amnt',
# 'total_pymnt' and 'term_adj' are used later to compute realised returns.
# NOTE(review): 'loan_status' is presumably dropped to avoid leaking the outcome — confirm.
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term_adj',
]

x_train_initial, y_train_initial = split_data(df=data_train, cols=cols_to_drop_training)
x_test, y_test = split_data(df=data_test, cols=cols_to_drop_training)

In [8]:
# Oversample the training data with SMOTE (fixed seed for repeatability).
# NOTE(review): `ratio` and `fit_sample` are the older imbalanced-learn names;
# newer releases call them `sampling_strategy` and `fit_resample` — confirm the
# installed version before upgrading the library.
sm = SMOTE(ratio=1.0, random_state=1)
x_train, y_train = sm.fit_sample(x_train_initial, y_train_initial)

In [9]:
# Shapes of the SMOTE-resampled training features and of the test features.
x_train.shape, x_test.shape

((578050, 44), (346845, 44))

In [11]:
#create a function to take advantage of sklearn make_pipeline
#pipeline adds second order terms and interaction terms to X_train and then fits model

def rf_model(x, y, d, random_state=None, n_jobs=None):
    """Fit a random forest on degree-2 polynomial features of ``x``.

    The pipeline first expands ``x`` with all second-order and interaction
    terms (no bias column) and then fits a 100-tree random forest that
    considers 100 candidate features per split.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Training labels.
    d : int or None
        ``max_depth`` of every tree in the forest.
    random_state : int or None, optional
        Seed forwarded to the forest. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible fits.
    n_jobs : int or None, optional
        Number of parallel jobs forwarded to the forest. The default ``None``
        keeps the previous behaviour; ``-1`` uses all cores.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.

    Notes
    -----
    ``max_features=100`` requires the expanded feature matrix to have at
    least 100 columns.
    """
    model = make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        RandomForestClassifier(
            max_features=100,
            max_depth=d,
            n_estimators=100,
            verbose=3,
            random_state=random_state,
            n_jobs=n_jobs,
        ),
    )
    model.fit(x, y)
    return model

In [12]:
# Train the polynomial-feature random forest (max_depth=10) on the SMOTE-resampled data.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# `rf_complex` used further down was not produced by this saved execution —
# confirm where the model comes from before trusting downstream results.
rf_complex = rf_model(x_train, y_train, 10)

KeyboardInterrupt: 

In [12]:
# with open('Random_Forest_Model_100features.pkl', 'rb') as file:  
#     rf_complex = pickle.load(file)

In [6]:
def create_df(data_1, data_2, penal = 0.4, distress_return = None):
    """Build a per-loan frame with predicted and realised return columns.

    Parameters
    ----------
    data_1 : dict or pandas.DataFrame
        Must provide a ``proba`` column (predicted probability of default).
        Any other columns are carried through unchanged.
    data_2 : pandas.DataFrame
        Loan table providing ``int_rate``, ``amnt``, ``total_pymnt``, ``term``
        and ``term_adj``. The columns are copied by index alignment, so rows
        of ``data_1`` whose index is absent from ``data_2`` receive NaN.
    penal : float, optional
        Multiplier applied to the distressed-loan return in the predicted ROI.
    distress_return : float or None, optional
        Return factor assumed for loans that default. When ``None`` the
        notebook-level variable ``drr`` is used, which must already be defined.

    Returns
    -------
    pandas.DataFrame
        ``data_1`` plus the copied loan columns and three derived columns:
        ``ROI`` (predicted), ``Real_ROI`` (from actual payments) and
        ``annualized_amnt`` (``amnt / term_adj``).
    """
    if distress_return is None:
        # Backward-compatible default: read the notebook global at call time.
        distress_return = drr

    df = pd.DataFrame(data_1)
    for col in ('int_rate', 'amnt', 'total_pymnt', 'term', 'term_adj'):
        df[col] = data_2[col]

    p_default = df['proba']
    # Expected gross return: full interest if repaid, penalised distressed
    # return if defaulted; subtract 1 to express it as a net rate.
    df['ROI'] = (((1 + df['int_rate']) * (1 - p_default)) + (p_default * distress_return * penal)) - 1

    # Realised return: total growth factor spread over term_adj periods,
    # scaled by the loan amount.
    growth = df['total_pymnt'] / df['amnt']
    df['Real_ROI'] = df['amnt'] * ((growth ** (1 / df['term_adj'])) - 1)
    df['annualized_amnt'] = df['amnt'] * (1 / df['term_adj'])
    return df

In [13]:
# Return factor of distressed loans (paid == 1, the class plotted as
# 'Defaulted'): total payments received over total amount lent, taken to the
# power 1 / (mean term) of those loans.
distress = data_train[data_train.paid == 1]
recovered_fraction = np.sum(distress.total_pymnt) / np.sum(distress.amnt)
mean_distress_term = np.mean(distress.term)
drr = recovered_fraction ** (1 / mean_distress_term)
print(drr)

0.8999757837386154


In [14]:
# Score the real (un-resampled) training loans rather than the SMOTE output.
# SMOTE adds synthetic rows that have no matching record in data_train, so
# create_df's index-aligned column copies would fill them with NaN and the
# training plots would mix real and synthetic loans.
train_data = {
    'proba': rf_complex.predict_proba(x_train_initial)[:, 1],
    'true_val': y_train_initial,
}
test_data = {
    'proba': rf_complex.predict_proba(x_test)[:, 1],
    'true_val': y_test,
}

df_train = create_df(train_data, data_train)
df_test = create_df(test_data, data_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Mean predicted ROI of each set; the selection thresholds are multiples of it.
baseline_train, baseline_test = np.mean(df_train.ROI), np.mean(df_test.ROI)
print(baseline_train, baseline_test)

# Multipliers applied to the baseline to form the lower / upper ROI thresholds.
# NOTE(review): alpha_high = -inf gives an open upper bound (+inf) only when the
# baseline is negative; with a positive baseline the bound becomes -inf and no
# loan is selected — confirm this sign trick is intended.
alpha_low, alpha_high = 0, -np.inf

In [None]:
# Keep the loans whose predicted ROI lies strictly between the two
# baseline-scaled thresholds.
train_in_band = (df_train.ROI > alpha_low*baseline_train) & (df_train.ROI < alpha_high*baseline_train)
loans_to_buy_train = df_train.loc[train_in_band]

test_in_band = (df_test.ROI > alpha_low*baseline_test) & (df_test.ROI < alpha_high*baseline_test)
loans_to_buy_test = df_test.loc[test_in_band]

In [None]:
def return_performance(df_sub, df_main):
    """Compare the pooled realised return of a loan subset with the full set.

    Parameters
    ----------
    df_sub : pandas.DataFrame
        Selected loans; needs ``Real_ROI`` and ``annualized_amnt`` columns.
    df_main : pandas.DataFrame
        All loans; needs the same two columns.

    Returns
    -------
    tuple
        ``(subset_return, full_set_return, subset_return - full_set_return)``
        where each return is ``sum(Real_ROI) / sum(annualized_amnt)``.
    """
    def _pooled_return(frame):
        # Ratio of summed realised return to summed annualised amount.
        return np.sum(frame.Real_ROI) / np.sum(frame.annualized_amnt)

    subset_return = _pooled_return(df_sub)
    full_set_return = _pooled_return(df_main)
    return (subset_return, full_set_return, subset_return - full_set_return)

In [None]:
# Realised performance of the selected loans versus all loans, per data set.
training_set_performance = return_performance(df_sub=loans_to_buy_train, df_main=df_train)
testing_set_performance = return_performance(df_sub=loans_to_buy_test, df_main=df_test)

In [None]:
# Each tuple is (selected-loan return, all-loan return, difference);
# training set first, then test set.
for performance in (training_set_performance, testing_set_performance):
    print(performance)

In [None]:
# Normalised histograms of the predicted default probability on the test set,
# one per actual outcome (true_val 0 = paid off, 1 = defaulted).
fig, ax = plt.subplots(figsize=(15, 6))

ax.hist(df_test.loc[df_test.true_val == 0, 'proba'], density=True, bins=50, label='Paid Off', alpha=0.6)
ax.hist(df_test.loc[df_test.true_val == 1, 'proba'], density=True, bins=50, label='Defaulted', alpha=0.5)
# The plotted column is the model's default probability, not ROI, so the title
# and axis labels name it as such.
ax.set_title('Predicted Default Probability for Defaulted & Non-Defaulted Customers with Higher Order Random Forest (Test Set)')
ax.set_xlabel('Predicted Probability of Default')
ax.set_ylabel('Density')
ax.legend(loc='best')

plt.show()

In [None]:
# Histograms (raw counts) of the predicted default probability on the training
# set, one per actual outcome (true_val 0 = paid off, 1 = defaulted).
fig, ax = plt.subplots(figsize=(15, 6))

ax.hist(df_train.loc[df_train.true_val == 0, 'proba'], bins=50, label='Paid Off', alpha=0.6)
ax.hist(df_train.loc[df_train.true_val == 1, 'proba'], bins=50, label='Defaulted', alpha=0.5)
# The plotted column is the model's default probability, not ROI, so the title
# and axis labels name it as such.
ax.set_title('Predicted Default Probability for Defaulted & Non-Defaulted Customers with Higher Order Random Forest (Training Set)')
ax.set_xlabel('Predicted Probability of Default')
ax.set_ylabel('Number of Loans')
ax.legend(loc='best')

plt.show()

In [None]:
# Kernel density of the predicted default probability on the training set,
# one curve per actual outcome.
train_paid_proba = df_train.loc[df_train.true_val == 0, 'proba']
train_default_proba = df_train.loc[df_train.true_val == 1, 'proba']
sns.kdeplot(train_paid_proba, label='Paid Out')
sns.kdeplot(train_default_proba, label='Defaulted');

In [None]:
# Kernel density of the predicted default probability on the test set, one
# curve per actual outcome; the figure is saved as a PNG.
# sns.set(rc=...) changes the default figure size for this and later plots.
sns.set(rc={'figure.figsize': (16, 8)})

test_paid_proba = df_test.loc[df_test.true_val == 0, 'proba']
test_default_proba = df_test.loc[df_test.true_val == 1, 'proba']
sns.kdeplot(test_paid_proba, label='Paid Out')
sns.kdeplot(test_default_proba, label='Defaulted', linestyle='--')

plt.xlim(0, 1)
plt.ylim(0, 3)
plt.xlabel('Probability of Default', size=15)
plt.ylabel('Density of Distributions', size=15)
plt.legend(prop={'size': 13})
plt.savefig('default_probability_distribution.png', bbox_inches='tight')

In [None]:
# Kernel density of the predicted ROI for the test and training sets; the
# figure is saved as a PNG.
# NOTE(review): the training table is read from "cleaned_2013_14", so the
# '2014 Data' legend label may understate its coverage — confirm.
predicted_roi_test = df_test['ROI']
predicted_roi_train = df_train['ROI']
sns.kdeplot(predicted_roi_test, label='2015 Data')
sns.kdeplot(predicted_roi_train, label='2014 Data', linestyle='--')

plt.xlim(-0.5, 0.1)
plt.ylim(0, 5)
plt.xlabel('Predicted Return on Investment', size=15)
plt.ylabel('Density of Distributions', size=15)
plt.legend(prop={'size': 13})
plt.savefig('ROI_distribution.png', bbox_inches='tight')

In [None]:
# Persist the fitted random-forest pipeline with pickle.
decision_tree_pkl_filename = 'RF_Model_retrained.pkl'
# The context manager closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open(decision_tree_pkl_filename, 'wb') as decision_tree_model_pkl:
    pickle.dump(rf_complex, decision_tree_model_pkl)