In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from keras.models import load_model

from imblearn.over_sampling import SMOTE
import pickle

In [2]:
# Read the two cleaned loan tables (training and test, judging by the variable names).
# low_memory=False is forwarded to pandas unchanged for both files.
train_path = "../lipika/cleaned_2013_14"
test_path = "../lipika/cleaned_2015"

data_train = pd.read_csv(train_path, low_memory=False)
data_test = pd.read_csv(test_path, low_memory=False)

In [3]:
# Element-wise comparison of the two tables' column labels, position by position;
# the displayed boolean array shows whether train and test share the same schema.
data_test.columns == data_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [4]:
def split_data(df, cols):
    """Separate a loan table into a feature frame and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a ``paid`` column plus every column named in ``cols``.
    cols : list of str
        Column labels to remove when building the feature frame.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``df`` without ``cols`` and
        ``target`` is the ``paid`` column. ``df`` itself is not modified.
    """
    target = df['paid']
    features = df.drop(columns=cols)
    return features, target

# Columns removed from the feature matrix: the `paid` target itself plus columns that
# are presumably outcome/identifier fields rather than predictors (TODO confirm).
cols_to_drop_training = [
    'loan_status',
    'paid',
    'amnt',
    'total_pymnt',
    'term_adj',
    'zip_code',
]

# The same drop list is applied to both tables so features stay column-compatible.
x_train_initial, y_train_initial = split_data(df=data_train, cols=cols_to_drop_training)
x_test, y_test = split_data(df=data_test, cols=cols_to_drop_training)

In [5]:
# Oversample the training split with SMOTE using a fixed seed; the test split is not resampled.
# NOTE(review): `ratio` / `fit_sample` are the older imbalanced-learn spellings (later releases
# use `sampling_strategy` / `fit_resample`) — confirm against the installed version.
# NOTE(review): after resampling, x_train / y_train presumably hold more rows than data_train —
# verify any later step that lines them up row-by-row with data_train.
sm = SMOTE(random_state=1, ratio = 1.0)
x_train, y_train = sm.fit_sample(x_train_initial, y_train_initial)

In [6]:
# Restore the previously fitted classifiers from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted artifacts.
# Each entry is (path, extra keyword arguments for pickle.load); every file except the
# random forest is read with encoding='latin1'.
pickled_models = [
    ('Tuned_RF.pkl', {}),
    ('QDA.pkl', {'encoding': 'latin1'}),
    ('Tuned_LGBM.pkl', {'encoding': 'latin1'}),
    ('../anthony/AdaboostGS.pkl', {'encoding': 'latin1'}),
    ('../bhaven/Tuned_logReg_all_training_data.pkl', {'encoding': 'latin1'}),
]

restored = []
for pkl_path, load_kwargs in pickled_models:
    with open(pkl_path, 'rb') as file:
        restored.append(pickle.load(file, **load_kwargs))

# Unpack in the same order as the table above.
rf, qda, lgbm, adaboost, logreg = restored





In [7]:
# Unpickled classifiers to evaluate, in a fixed order; the scoring loop iterates over this list.
models = [rf, qda, lgbm, adaboost, logreg]

In [11]:
def create_df(data_1, data_2, penal = 0.5, drr = None):
    """Combine model predictions with loan columns and derive return metrics.

    Parameters
    ----------
    data_1 : dict or DataFrame-like
        Prediction data passed to ``pd.DataFrame``; must provide a ``proba`` column.
    data_2 : pandas.DataFrame
        Loan table providing ``int_rate``, ``amnt``, ``total_pymnt`` and ``term_adj``.
        These columns are assigned with pandas index alignment, so rows of the result
        whose index label is missing from ``data_2`` end up as NaN.
    penal : float, default 0.5
        Multiplier applied to ``drr`` in the ``proba``-weighted term of ``ROI``.
    drr : float, optional
        Rate used in the ``proba``-weighted term of ``ROI``. When omitted, the
        module-level variable ``drr`` is used, as the function did originally.

    Returns
    -------
    pandas.DataFrame
        ``data_1`` plus the four loan columns and the derived columns ``ROI``,
        ``Real_ROI`` and ``annualized_amnt``.

    Raises
    ------
    NameError
        If ``drr`` is neither passed in nor defined at module level.
    """
    if drr is None:
        # Fall back to the notebook-level `drr` so existing calls keep working,
        # but fail with an actionable message instead of a bare NameError.
        try:
            drr = globals()['drr']
        except KeyError:
            raise NameError(
                "name 'drr' is not defined: pass drr=... to create_df "
                "or define `drr` before calling it"
            )

    df = pd.DataFrame(data_1)

    # Bring over the loan columns needed for the return calculations.
    for col in ('int_rate', 'amnt', 'total_pymnt', 'term_adj'):
        df[col] = data_2[col]

    # Predicted return: full interest weighted by (1 - proba), plus the penalised
    # `drr` weighted by proba, minus the unit of principal.
    df['ROI'] = (((1 + df['int_rate'])*(1-df['proba']))+(df['proba']*drr*penal))-1

    # Realised return in currency units: total_pymnt/amnt compounded over 1/term_adj, times amnt.
    df['Real_ROI'] = df['amnt']*(((df['total_pymnt']/df['amnt'])**(1/df['term_adj']))-1)
    df['annualized_amnt'] = df['amnt']*(1/df['term_adj'])

    return df

In [12]:
# Raw prediction dicts and derived return frames for each model, keyed by the model object.
train_data = {}
test_data= {}
df_train = {}
df_test = {}

for model in models:
    # Training split: column 1 of predict_proba is used as the score, alongside hard predictions.
    tr = {'proba' : model.predict_proba(x_train)[:, 1],
          'pred' : model.predict(x_train),
          'true_val' : y_train}
    # NOTE(review): x_train / y_train come from SMOTE resampling, so their rows may not
    # line up one-to-one with data_train — confirm the alignment inside create_df.
    df_tr = create_df(tr, data_train)
    train_data[model] = tr
    df_train[model] = df_tr

    # Test split: labels and loan columns must come from the test set. The original used
    # y_train / data_train here, pairing test predictions with training labels and loans.
    te = {'proba' : model.predict_proba(x_test)[:, 1],
          'pred' : model.predict(x_test),
          'true_val' : y_test}
    df_te = create_df(te, data_test)
    test_data[model] = te
    df_test[model] = df_te

NameError: name 'models' is not defined

In [None]:
# Load the saved Keras network and add its predictions to the per-model result dicts.
nn = load_model('NN_final_model.h5')

# Column 1 of the network output is used as the score.
# NOTE(review): assumes nn.predict returns a two-column array — confirm the model's output layer.
nn_train = {'proba': nn.predict(x_train)[:, 1], 'true_val': y_train}
nn_test = {'proba': nn.predict(x_test)[:, 1], 'true_val': y_test}

# Hard labels: 1 when the score is strictly above 0.5, otherwise 0.
nn_train['pred'] = np.where(nn_train['proba'] > 0.5, 1, 0)
nn_test['pred'] = np.where(nn_test['proba'] > 0.5, 1, 0)

# Stored under the string key 'nn'.
df_train['nn'] = create_df(nn_train, data_train)
df_test['nn'] =  create_df(nn_test, data_test)


In [None]:
# Persist both per-model result dictionaries with the highest available pickle protocol.
for out_path, payload in (('df_train_dict.pickle', df_train),
                          ('df_test_dict.pickle', df_test)):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)



In [None]:
# distress = data_train[data_train.paid == 1]
# drr = (np.sum(distress.total_pymnt)/np.sum(distress.amnt))**(1/np.mean(distress.term))
# print(drr)

In [None]:
# # train_data[model] = {'proba' : model.predict_proba(x_train)[:, 1], 'pred' : model.predict(x_train), 'true_val' : y_train}
# # test_data = {'proba' : rf_complex.predict_proba(x_test)[:, 1], 'pred' : rf_complex.predict(x_test),  'true_val' : y_test}

# df_train = create_df(train_data, data_train)
# df_test =  create_df(test_data, data_test)

In [None]:
# baseline_train = np.mean(df_train.ROI)
# baseline_test = np.mean(df_test.ROI)
# print(baseline_train, baseline_test)
# alpha_low = 0
# alpha_high = -np.inf

In [None]:
# print(roc_auc_score(df_train.true_val, df_train.pred))
# print(roc_auc_score(df_test.true_val, df_test.pred))

In [None]:
# print(np.sum(df_train.true_val)/df_train.shape[0])
# print(1 - np.sum(df_test.true_val)/df_test.shape[0])

In [None]:
# loans_to_buy_train = df_train[(df_train.ROI > alpha_low*baseline_train) & (df_train.ROI < alpha_high*baseline_train)]
# loans_to_buy_test = df_test[(df_test.ROI > alpha_low*baseline_test) & (df_test.ROI < alpha_high*baseline_test)]

In [None]:
# def return_performance(df_sub, df_main):
#     perf_subset = np.sum(df_sub.Real_ROI)/np.sum(df_sub.annualized_amnt)
#     perf_mainset = np.sum(df_main.Real_ROI)/np.sum(df_main.annualized_amnt)
#     return (perf_subset, perf_mainset, perf_subset - perf_mainset)

In [None]:
# training_set_performance = return_performance(loans_to_buy_train, df_train)
# testing_set_performance = return_performance(loans_to_buy_test, df_test)

In [None]:
# fig = plt.figure(figsize=(15,6));
# ax = plt.subplot(1,1,1);

# ax.hist(df_test.loc[df_test.true_val == 0,'proba'], density = True, bins = 50, label='Paid Off', alpha=0.6);
# ax.hist(df_test.loc[df_test.true_val == 1,'proba'], density = True, bins = 50, label='Defaulted', alpha=0.5);
# ax.set_title('Initial Predicted ROI for Defaulted & Non-Defaulted Customers with Higher Order Random Forest')
# ax.legend(loc='best')

# plt.show()

In [None]:
# fig = plt.figure(figsize=(15,6));
# ax = plt.subplot(1,1,1);

# ax.hist(df_train.loc[df_train.true_val == 0,'proba'], bins = 50, label='Paid Off', alpha=0.6);
# ax.hist(df_train.loc[df_train.true_val == 1,'proba'], bins = 50, label='Defaulted', alpha=0.5);
# ax.set_title('Initial Predicted ROI for Defaulted & Non-Defaulted Customers with Higher Order Random Forest')
# ax.legend(loc='best')

# plt.show()

In [None]:
# sns.kdeplot(df_train.loc[df_train.true_val == 0,'proba'], label = 'Paid Out');
# sns.kdeplot(df_train.loc[df_train.true_val == 1,'proba'], label = 'Defaulted');

In [None]:
# sns.set(rc={'figure.figsize':(16,8)})
# sns.kdeplot(df_test.loc[df_test.true_val == 0,'proba'], label = 'Paid Out');
# sns.kdeplot(df_test.loc[df_test.true_val == 1,'proba'], label = 'Defaulted', linestyle='--');
# plt.ylim(0, 3)
# plt.xlim(0, 1);
# plt.xlabel('Probability of Default', size = 15);
# plt.ylabel('Density of Distributions', size = 15);
# plt.legend(prop={'size': 13});
# plt.savefig('default_probability_distribution.png', bbox_inches='tight')

In [None]:
# sns.kdeplot(df_test['ROI'], label = '2015 Data');
# sns.kdeplot(df_train['ROI'], label = '2014 Data', linestyle='--');
# plt.ylim(0, 5)
# plt.xlim(-0.5, 0.1);
# plt.xlabel('Predicted Return on Investment', size = 15);
# plt.ylabel('Density of Distributions', size = 15);
# plt.legend(prop={'size': 13});
# plt.savefig('ROI_distribution.png', bbox_inches='tight')