In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_selection as fs
from sklearn import preprocessing, ensemble

%reload_ext autoreload
%autoreload 2
from function_library import label_encoder

In [2]:
## read the gzip-compressed sample, then order the rows by issue date (ascending)
file_path = '../Cleaned Datasets/Sampled Dataset_Balanced.csv.gz'
data = pd.read_csv(file_path, compression = 'gzip')
data = data.sort_values(by = 'issue_d', ascending = True)

## let pandas display every column of the frame
pd.set_option("display.max_columns", data.shape[1])

## features retained by the feature-selection step
lst_final = [
    'mths_since_last_delinq',
    'grade',     ## to be dropped later, manually added only for weighting purpose
    'sub_grade',
    'home_ownership',
    'total_rev_hi_lim',
    'annual_inc',
    'term',
    'active_ins_acct_frac',
    'fico_score',
    'funded_amnt',
    'all_util',
    'emp_length',
    'revol_util',
    'treasury_rate',
    'credit_hist',
    'int_rate',
    'purpose',
    'dti',
    'tot_hi_cred_lim',
    'total_acc',
    'tot_cur_bal',
    'total_il_high_credit_limit',
    'il_util',
    'addr_state',
    'total_bc_limit',
    'unemp_rate_3mon',
    'num_il_tl',
]

In [14]:
## index the loans by issue date so they can be sliced per time window
# set_index already returns a new frame, so an extra data.copy() is not needed
df = data.set_index('issue_d', drop = True)

## create dataframes for classification and regression separately
# explicit copies: a 'weight' column is added to these frames further down, and
# writing into a plain selection of df would be a chained assignment
df_class = df[lst_final + ['loan_status']].copy()

## include the loan status column to split dataset into default and fully paid
df_regress = df[lst_final + ['return_rate', 'loan_status']].copy()
## default subset
df_regress_d = df_regress[df_regress.loan_status == 'Default'].drop('loan_status', axis = 1)
## fully paid subset
df_regress_fp = df_regress[df_regress.loan_status == 'Fully Paid'].drop('loan_status', axis = 1)

In [15]:
# size check of the regression frame: (rows, columns)
df_regress.shape

(132894, 29)

## Grid Search with Customized Weighted Score Metric On Random Forest with Time Series

In [25]:
# from sklearn.metrics import accuracy_score, r2_score
# make_scorer is imported from the public sklearn.metrics namespace; the private
# sklearn.metrics.scorer module was deprecated and is absent from newer releases
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
import pickle

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas / scikit-learn deprecation notices
warnings.filterwarnings('ignore')

In [26]:
from sklearn.metrics import accuracy_score, r2_score

## define an accuracy scorer with grade weight
def grade_weighted_accuracy(y_true, y_pred, sample_weight):
    """Accuracy in which each sample counts in proportion to its grade weight.

    Parameters
    ----------
    y_true : pandas object exposing ``.index``
        True labels; its index labels select the matching rows of ``sample_weight``.
    y_pred : array-like
        Predicted labels, handed unchanged to ``accuracy_score``.
    sample_weight : pandas object supporting ``.loc``
        Weights keyed by the same index labels as ``y_true``.

    Returns
    -------
    The result of ``accuracy_score`` (normalized) computed with those weights.
    """
    # weights of exactly the rows being scored, flattened to one dimension
    row_labels = y_true.index.values
    weights_1d = sample_weight.loc[row_labels].values.reshape(-1)
    return accuracy_score(y_true, y_pred, normalize = True, sample_weight = weights_1d)

## define R^2 scorer with grade weight
def grade_weighted_r2(y_true,y_pred,sample_weight):
    """R^2 in which each sample counts in proportion to its grade weight.

    Parameters
    ----------
    y_true : pandas object exposing ``.index``
        True target values; its index labels select the rows of ``sample_weight``.
    y_pred : array-like
        Predicted target values, handed unchanged to ``r2_score``.
    sample_weight : pandas object supporting ``.loc``
        Weights keyed by the same index labels as ``y_true``.

    Returns
    -------
    The result of ``r2_score`` computed with those weights.
    """
    # align the weights with the scored rows and flatten them to one dimension
    scored_rows = y_true.index.values
    aligned = sample_weight.loc[scored_rows]
    return r2_score(y_true, y_pred, sample_weight=aligned.values.reshape(-1))

In [18]:
## grade weight dictionary
## one dictionary per emphasised grade group: grades in the group weigh 100, all others 1
grade_weight_ab = {grade: (100 if grade in ('A', 'B') else 1) for grade in 'ABCDEFG'}

grade_weight_cd = {grade: (100 if grade in ('C', 'D') else 1) for grade in 'ABCDEFG'}

grade_weight_efg = {grade: (100 if grade in ('E', 'F', 'G') else 1) for grade in 'ABCDEFG'}

### Find the best RF model in each time window (non-overlapping) with Grid Search

- Classifier: combined dataframe
- Regressor: default only
- Regressor: fully paid only

In [21]:
## distinct issue_d values, in the order they appear in the (issue_d-sorted) data
lst_time = data['issue_d'].unique().tolist()

## number of entries per time window; also used as the step -> windows never overlap
window = 3

##### Classifier Models with Time Series

In [20]:
%%time

## incorporate the weights into the dataframes
df_class['weight'] = df_class.grade.map(grade_weight_efg)

## label encode non-numeric features
# df_class_le = df_class.copy()
df_class_le = label_encoder(df_class)

## split feature and target dataframes - note: weight is the last column
# classifier
df_c_x = df_class_le[set(df_class_le.columns) - set(['loan_status'])].drop('grade', axis = 1)
df_c_y = df_class_le.loan_status

## open and create a pickle file to store models
filename_class = './Classifier_EFG.pkl'
file = open(filename_class, 'wb')

## loop through each time window
for i in range(0, len(lst_time) - window + 1, window):
    # months in the ith time window
    lst_window = lst_time[i : i + window]
    # subset of dataframe
    df_window_x = df_c_x.loc[lst_window].reset_index(drop = True)
    df_window_y = df_c_y.loc[lst_window].reset_index(drop = True)
    
    # a dataframe of the grade weights
    df_grade_weight = pd.DataFrame(df_window_x.weight, index = df_window_x.index)
    
    score_params = {"sample_weight": df_grade_weight}

    grade_weighted_scorer = make_scorer(score_func = grade_weighted_accuracy,
                                        greater_is_better = True,
                                        needs_proba = False,
                                        needs_threshold = False,
                                        **score_params)
    grid_para_rf = {
        'n_estimators': [5, 10, 15, 20, 25, 30],
        'max_depth': [None],
        'max_features': ['auto'],
        'min_samples_leaf': range(5, 15),
        'warm_start': [False, True]}
     
    gs_rf = GridSearchCV(estimator = ensemble.RandomForestClassifier(),\
                                      param_grid = grid_para_rf,\
                                      scoring = grade_weighted_scorer,\
                                      n_jobs = -1, cv = 5, return_train_score = False)
    
    # fit the grid search
    gs_rf.fit(df_window_x, df_window_y)
    
    # save the model using pickle
    pickle.dump(gs_rf, file)
    
    # status report
    print('Saved the model for period {} to {}. The best score is {}.'.\
          format(lst_window[0], lst_window[-1], np.round(gs_rf.best_score_, 6)))
    print('-' * 88)
    
file.close()

Saved the model for period 2007-10-01 to 2007-12-01. The best score is 0.778878.
----------------------------------------------------------------------------------------
Saved the model for period 2008-01-01 to 2008-03-01. The best score is 0.66134.
----------------------------------------------------------------------------------------
Saved the model for period 2008-04-01 to 2008-06-01. The best score is 0.675306.
----------------------------------------------------------------------------------------
Saved the model for period 2008-07-01 to 2008-10-01. The best score is 0.85.
----------------------------------------------------------------------------------------
Saved the model for period 2008-11-01 to 2009-01-01. The best score is 0.672449.
----------------------------------------------------------------------------------------
Saved the model for period 2009-02-01 to 2009-04-01. The best score is 0.774957.
--------------------------------------------------------------------------

##### Regressor Models with Time Series

In [30]:
# NOTE(review): df_r_d_x is only assigned in the regressor cell further down, so this
# preview fails on a fresh kernel (Restart & Run All) - move it below that cell
df_r_d_x.head(3)

Unnamed: 0_level_0,weight,dti,total_il_high_credit_limit,total_bc_limit,purpose,addr_state,il_util,unemp_rate_3mon,all_util,mths_since_last_delinq,funded_amnt,int_rate,total_acc,total_rev_hi_lim,tot_cur_bal,num_il_tl,term,treasury_rate,annual_inc,tot_hi_cred_lim,emp_length,sub_grade,fico_score,credit_hist,active_ins_acct_frac,home_ownership,revol_util
issue_d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2007-10-01,1,3.26,-999.0,-999.0,9,24,-999.0,5.2,-999.0,22.0,4650.0,10.28,25.0,-999.0,-999.0,-999.0,36,4.01,45600.0,-999.0,10,10,687.0,17.926027,-999.0,1,35.0
2007-10-01,1,3.01,-999.0,-999.0,2,9,-999.0,4.47,-999.0,0.0,4900.0,13.75,5.0,-999.0,-999.0,-999.0,36,4.01,43500.0,-999.0,4,21,647.0,3.50137,-999.0,5,90.1
2007-11-01,1,16.97,-999.0,-999.0,2,47,-999.0,4.37,-999.0,0.0,15450.0,10.91,39.0,-999.0,-999.0,-999.0,36,3.35,120000.0,-999.0,7,12,697.0,26.520548,-999.0,5,44.5


In [37]:
%%time

## incorporate the weights into the dataframes
df_regress_d['weight'] = df_regress_d.grade.map(grade_weight_efg)
df_regress_fp['weight'] = df_regress_fp.grade.map(grade_weight_efg)

## label encode non-numeric features
df_regress_d_le = label_encoder(df_regress_d)
df_regress_fp_le = label_encoder(df_regress_fp)

## split feature and target dataframes - note: weight is the last column
# regressor - default
df_r_d_x = df_regress_d_le[set(df_regress_d.columns) - set(['return_rate'])].drop('grade', axis = 1)
df_r_d_y = df_regress_d_le.return_rate

# regressor - fully paid
df_r_fp_x = df_regress_fp_le[set(df_regress_fp.columns) - set(['return_rate'])].drop('grade', axis = 1)
df_r_fp_y = df_regress_fp_le.return_rate

## open and create a pickle file to store models
filename_regress_def = './Regressor_Default_EFG.pkl'
filename_regress_fp = './Regressor_Fully Paid_EFG.pkl'


#### search for the best classifiers
file = open(filename_regress_def, 'wb')

## loop through each time window
for i in range(0, len(lst_time) - window + 1, window):
    # months in the ith time window
    lst_window = lst_time[i : i + window]
    # subset of dataframe
    df_window_x = df_r_d_x.loc[lst_window].reset_index(drop = True)
    df_window_y = df_r_d_y.loc[lst_window].reset_index(drop = True)
    
    # a dataframe of the grade weights
    df_grade_weight = pd.DataFrame(df_window_x.weight, index = df_window_x.index)
    
    score_params = {"sample_weight": df_grade_weight}

    grade_weighted_scorer = make_scorer(score_func = grade_weighted_r2,
                                        greater_is_better = True,
                                        needs_proba = False,
                                        needs_threshold = False,
                                        **score_params)
    grid_para_rf = {
        'n_estimators': [3, 5, 10, 15, 20, 25, 30, 35, 40],
        'max_depth': [None],
        'max_features': ['auto'],
        'min_samples_leaf': range(2, 20),
        'warm_start': [False, True]}
     
    gs_rf = GridSearchCV(estimator = ensemble.RandomForestRegressor(),\
                                      param_grid = grid_para_rf,\
                                      scoring = grade_weighted_scorer,\
                                      n_jobs = -1, cv = 5, return_train_score = False)
    
    # fit the grid search
    gs_rf.fit(df_window_x, df_window_y)
    
    # save the model using pickle
    pickle.dump(gs_rf, file)
    
    # status report
    print('Saved the default model for period {} to {}. The best score is {}.'.\
          format(lst_window[0], lst_window[-1], np.round(gs_rf.best_score_, 6)))
    print(gs_rf.best_params_)
    print('-' * 100)
    
file.close()


#### search for the best regressors
file = open(filename_regress_fp, 'wb')

## loop through each time window
for i in range(0, len(lst_time) - window + 1, window):
    # months in the ith time window
    lst_window = lst_time[i : i + window]
    # subset of dataframe
    df_window_x = df_r_fp_x.loc[lst_window].reset_index(drop = True)
    df_window_y = df_r_fp_y.loc[lst_window].reset_index(drop = True)
    
    # a dataframe of the grade weights
    df_grade_weight = pd.DataFrame(df_window_x.weight, index = df_window_x.index)
    
    score_params = {"sample_weight": df_grade_weight}

    grade_weighted_scorer = make_scorer(score_func = grade_weighted_r2,
                                        greater_is_better = True,
                                        needs_proba = False,
                                        needs_threshold = False,
                                        **score_params)
    grid_para_rf = {
        'n_estimators': [3, 5, 10, 15, 20, 25, 30, 35, 40],
        'max_depth': [None],
        'max_features': ['auto'],
        'min_samples_leaf': range(2, 20),
        'warm_start': [False, True]}
     
    gs_rf = GridSearchCV(estimator = ensemble.RandomForestRegressor(),\
                                      param_grid = grid_para_rf,\
                                      scoring = grade_weighted_scorer,\
                                      n_jobs = -1, cv = 5, return_train_score = False)
    
    # fit the grid search
    gs_rf.fit(df_window_x, df_window_y)
    
    # save the model using pickle
    pickle.dump(gs_rf, file)
    
    # status report
    print('Saved the fully-paid model for period {} to {}. The best score is {}.'.\
          format(lst_window[0], lst_window[-1], np.round(gs_rf.best_score_, 6)))
    print(gs_rf.best_params_)
    print('-' * 100)
    
file.close()

Saved the default model for period 2007-10-01 to 2007-12-01. The best score is nan.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 3, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the default model for period 2008-01-01 to 2008-03-01. The best score is -1.187848.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 18, 'n_estimators': 3, 'warm_start': True}
----------------------------------------------------------------------------------------------------
Saved the default model for period 2008-04-01 to 2008-06-01. The best score is -13.429246.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 20, 'warm_start': True}
----------------------------------------------------------------------------------------------------
Saved the default model for period 2008-07-01 to 2008-10-01. The best score is -380894.774777.
{'max_depth': No

Saved the default model for period 2014-11-01 to 2015-01-01. The best score is 0.024507.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 19, 'n_estimators': 10, 'warm_start': True}
----------------------------------------------------------------------------------------------------
Saved the default model for period 2015-02-01 to 2015-04-01. The best score is -0.007754.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 16, 'n_estimators': 40, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the default model for period 2015-05-01 to 2015-07-01. The best score is -0.016005.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 16, 'n_estimators': 15, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the default model for period 2015-08-01 to 2015-10-01. The best score is 0.041993.
{'max_depth':

Saved the fully-paid model for period 2010-11-01 to 2011-01-01. The best score is 0.438775.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 12, 'n_estimators': 5, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the fully-paid model for period 2011-02-01 to 2011-04-01. The best score is 0.240068.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'n_estimators': 3, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the fully-paid model for period 2011-05-01 to 2011-07-01. The best score is 0.282607.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 15, 'n_estimators': 20, 'warm_start': True}
----------------------------------------------------------------------------------------------------
Saved the fully-paid model for period 2011-08-01 to 2011-10-01. The best score is 0.246349.
{'max_

Saved the fully-paid model for period 2017-11-01 to 2018-01-01. The best score is 0.310837.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 10, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the fully-paid model for period 2018-02-01 to 2018-04-01. The best score is 0.241041.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 10, 'n_estimators': 3, 'warm_start': False}
----------------------------------------------------------------------------------------------------
Saved the fully-paid model for period 2018-05-01 to 2018-07-01. The best score is -0.739916.
{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 16, 'n_estimators': 3, 'warm_start': True}
----------------------------------------------------------------------------------------------------
Saved the fully-paid model for period 2018-08-01 to 2018-10-01. The best score is 0.166426.
{'max

### Load the pickle file and predict default probabilities with the saved models - for current loans issued in Nov. and Dec. 2018

In [None]:
# scratch pickle file for a single test model; the handle is opened in binary
# write mode and stays open until a later cell calls file.close()
filename_pkl = './model_test.pkl'
file = open(filename_pkl, 'wb')

In [None]:
# NOTE(review): `grid_search_rf` is not assigned in any cell shown in this notebook
# (the search loops bind `gs_rf`) - confirm which object is meant before running
pickle.dump(grid_search_rf, file)

In [None]:
# release the write handle before the file is re-opened for reading
file.close()

In [None]:
def pickleLoader(pklFile):
    """Yield, one by one, every object pickled back-to-back into ``pklFile``.

    ``pklFile`` is an open binary file object positioned at the first pickle.
    Iteration ends quietly when ``pickle.load`` raises ``EOFError``.

    Only use this on files you created yourself: unpickling can execute
    arbitrary code.
    """
    try:
        while True:
            next_obj = pickle.load(pklFile)
            yield next_obj
    except EOFError:
        # no further pickles in the stream
        return

In [None]:
# re-open the scratch pickle file in binary read mode; closed again in a later cell
file = open('./model_test.pkl', 'rb')

In [None]:
# df_window_x is whatever the last executed search loop left behind, so the rows
# scored here depend on cell execution order - TODO pass an explicit frame
for model in pickleLoader(file):
    print(model.predict_proba(df_window_x))

In [None]:
# release the read handle
file.close()

In [None]:
# target values of the last window processed by whichever search loop ran last
df_window_y

In [None]:
# NOTE(review): `model1` is not assigned in any cell shown in this notebook
model1.best_params_

In [None]:
# NOTE(review): `model2` is not assigned in any cell shown in this notebook
model2.best_params_

In [None]:
# `model` is the last object yielded by the pickleLoader loop above
model.best_params_