In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from HelperFunctions import minibatch 
%reload_ext autoreload
%autoreload 2
from HelperFunctions import minibatch, dummify_columns, undummify, feature_standardize, label_encode_column, columns_of_type

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# For Bayesian Optimizer
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    GradientBoostingRegressor,
    AdaBoostRegressor,
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# One default-configured instance of each candidate regressor.
randomForest = RandomForestRegressor()
gbm = GradientBoostingRegressor()
abr = AdaBoostRegressor()
xgb = XGBRegressor()
lgb = LGBMRegressor()

In [3]:
# Load the loan table from a CSV in the working directory.
temp_df = pd.read_csv('down_sampled_df_v2.csv')

In [4]:
# Keep only the rows whose loan_status is 'Default' and renumber them 0..n-1,
# so later positional/label lookups start from a clean RangeIndex.
df = temp_df[temp_df['loan_status'] == 'Default'].reset_index(drop=True)

# Define Grade Weights HERE:

In [5]:
# Weight applied to each loan grade; edit the values here to re-weight.
grade_weight_dict = {
    'A': 1,
    'B': 1,
    'C': 1,
    'D': 1,
    'E': 1,
    'F': 1,
    'G': 1,
}

# Attach the per-row weight to df, then keep a copy of it in a one-column
# frame that shares df's index.
df['weight'] = df['grade'].map(grade_weight_dict)
grade_weight_array = df['weight'].values
index = df.index
grade_weight_frame = pd.DataFrame(grade_weight_array, index=index)

# Dropping features not needed for modeling

In [6]:
# Columns removed before modeling ('weight' was already captured in
# grade_weight_frame above).
drop_list = [
    'sub_grade',
    'issue_d',
    'zip_code',
    'RANDOM',
    'id',
    'weight',
    'loan_duration',
]
df.drop(columns=drop_list, inplace=True)

In [7]:
# Features: everything except the status label and the regression target.
x_train = df.drop(columns=['loan_status', 'return_rate'])
# Target: the loan's return rate.
y_train = df['return_rate']

In [8]:
# Split the feature names by kind using the project helper.
# NOTE(review): presumably 'string' selects categorical columns and 'number'
# numeric ones -- confirm against HelperFunctions.columns_of_type.
cat_list = columns_of_type(x_train, 'string')
cont_list = columns_of_type(x_train, 'number')

# Label Encode ALL Categoricals

In [9]:
# Label-encode the columns named in cat_list via the project helper.
label_encoded_df = label_encode_column(x_train, cat_list)

# Standardize ALL Features

In [10]:
# Run the project's feature_standardize helper on every column (axis=0 hands
# the lambda one column at a time).
final_train_df = label_encoded_df.apply(
    lambda column: feature_standardize(column, scaleType='standardize'),
    axis=0,
)

# CUSTOM SCORING FUNCTION: WEIGHTED R2

In [11]:
# `make_scorer` is imported from the public `sklearn.metrics` namespace; the
# private `sklearn.metrics.scorer` module was deprecated and later removed.
from sklearn.metrics import make_scorer, r2_score


def grade_weighted_r2(y_true, y_pred, sample_weight):
    """Return the R^2 of ``y_pred`` against ``y_true`` using per-row weights.

    Parameters
    ----------
    y_true : pandas.Series
        True target values. Its index labels are used to look up the weights,
        so it must keep the labels of the original training target (a bare
        ndarray has no ``.index`` and will raise ``AttributeError``).
    y_pred : array-like
        Predictions, positionally aligned with ``y_true``.
    sample_weight : pandas.DataFrame
        Weights for the whole training set, indexed by the same labels as the
        training target. The selected rows are flattened to 1-D, so a single
        weight column is expected.

    Returns
    -------
    float
        Weighted coefficient of determination from ``sklearn.metrics.r2_score``.
    """
    # Pick the weights belonging to the rows of this fold, by index label.
    fold_weights = sample_weight.loc[y_true.index].values.reshape(-1)
    return r2_score(y_true, y_pred, sample_weight=fold_weights)


# Extra keyword arguments that the scorer forwards to grade_weighted_r2 on
# every call: the full weight frame, sliced per fold inside the metric.
score_params = {"sample_weight": grade_weight_frame}

# needs_proba / needs_threshold are omitted: False is already their default and
# both are deprecated in newer scikit-learn releases. The scorer still scores
# the output of estimator.predict().
grade_weighted_scorer = make_scorer(score_func=grade_weighted_r2,
                                    greater_is_better=True,
                                    **score_params)

In [12]:
# Hyper-parameter grid for the random forest: 3 * 4 * 4 * 3 * 4 = 576
# combinations, each fitted once per CV fold by the search below.
# NOTE(review): every min_samples_split candidate (2-5) is smaller than
# 2 * min_samples_leaf for all leaf sizes tried (>= 3), so this axis likely
# cannot change the fitted trees -- confirm and consider trimming it to cut
# the search time.
grid_para_forest = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 7, 9, 12],
    'max_features': [5, 10, 15, 20],
    'min_samples_leaf': [3, 5, 7],
    'min_samples_split': [2, 3, 4, 5],
}

# A fixed random_state makes the bootstrap/feature sampling, and therefore
# best_params_, best_score_ and the exported predictions, reproducible across
# runs; without it every re-run of this multi-hour search gives different
# numbers.
grid_search_forest = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=grid_para_forest,
    scoring=grade_weighted_scorer,  # grade-weighted R^2 defined above
    cv=5,
    n_jobs=-1,  # use all available cores
)

In [13]:
# Fit the full grid search on the standardized training frame and time it
# (the recorded run below took close to ten hours of wall time), then report
# the winning parameter combination and its mean cross-validated score.
%time grid_search_forest.fit(final_train_df,y_train)
print('best parameters:', grid_search_forest.best_params_)
print('best score:', grid_search_forest.best_score_)

CPU times: user 1min 53s, sys: 1.13 s, total: 1min 54s
Wall time: 9h 52min 37s
best parameters: {'max_depth': 12, 'max_features': 10, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 500}
best score: 0.11801404992706574


In [14]:
# Predict with the fitted search object on the same frame it was fitted on
# (in-sample predictions), then save them as a single 'r_d' column.
results = grid_search_forest.predict(final_train_df)
r_d = pd.DataFrame({'r_d': results})
r_d.to_csv('r_d.csv')

#### Testing without weighted r2

In [15]:
# grid_search_forest = GridSearchCV(estimator = RandomForestRegressor(), param_grid = grid_para_forest,
#                                   cv = 5, n_jobs = -1)
# grid_search_forest.fit(final_train_df,y_train)
# print('best parameters:', grid_search_forest.best_params_)
# print('best score:', grid_search_forest.best_score_)

In [21]:
# Show which timing, parameter and per-split score entries the search recorded.
grid_search_forest.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_max_features', 'param_min_samples_leaf', 'param_min_samples_split', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [22]:
# Display the raw cv_results_ dict of the fitted search.
# NOTE(review): this prints every array in full (one entry per grid
# combination); wrapping it in pd.DataFrame(...) and showing the top rows by
# 'rank_test_score' would be far easier to read.
grid_search_forest.cv_results_

{'mean_fit_time': array([  2.80736203,   8.23165932,  33.46865482,   6.84551454,
         16.32465754,  62.36403894,  12.43504558,  25.87836046,
         59.19541154,  11.31017232,  24.26667061,  58.61549072,
         11.5085804 ,  23.86749587,  69.30030341,  14.48430047,
         28.29509449,  62.49385176,  12.35988307,  24.22065029,
         60.86840148,  12.3487741 ,  24.59010892,  61.13112478,
         12.24302373,  25.17517743,  62.64814439,  12.41745286,
         25.21253533,  63.04677114,  12.63350925,  24.57954922,
         61.10269771,  12.38490381,  23.69877138,  54.53718433,
         19.48438044,  43.86411958, 110.62178984,  22.40503721,
         42.29212804, 101.12464871,  19.00661545,  42.59922152,
        107.576753  ,  21.93184934,  43.07890983, 110.13877382,
         22.86009574,  43.70272727, 107.22153893,  21.39063339,
         41.8823504 , 107.09597468,  21.67048998,  42.29803648,
        106.18056378,  21.31309876,  42.44771218, 105.44929852,
         21.31072497,  