In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from HelperFunctions import minibatch 
%reload_ext autoreload
%autoreload 2
from HelperFunctions import minibatch, dummify_columns, undummify, feature_standardize, label_encode_column, columns_of_type

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# For Bayesian Optimizer
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, AdaBoostRegressor
randomForest = RandomForestRegressor()
gbm = GradientBoostingRegressor()
abr = AdaBoostRegressor()
from xgboost import XGBRegressor
xgb = XGBRegressor()
from lightgbm import LGBMRegressor
lgb = LGBMRegressor()

In [3]:
df=pd.read_csv('down_sampled_df_v2.csv')

# Define Grade Weights HERE:

In [4]:
grade_weight_dict={'A':1,
                  'B':2,
                  'C':3,
                  'D':4,
                  'E':5,
                  'F':6,
                  'G':7}

df['weight']=df['grade'].map(grade_weight_dict)
grade_weight_array=df['weight'].values
index = df.index
grade_weight_frame = pd.DataFrame(grade_weight_array, index=index)

# Dropping features not needed for modeling

In [5]:
drop_list=['sub_grade','issue_d','zip_code','RANDOM','id','weight']
df.drop(drop_list,axis=1,inplace=True)

In [6]:
x_train=df.drop(['loan_status','return_rate'],axis=1)
y_train=df.return_rate

In [7]:
cat_list=columns_of_type(x_train,'string')
cont_list=columns_of_type(x_train,'number')

# Label Encode ALL Categoricals

In [8]:
label_encoded_df=label_encode_column(x_train,cat_list)

# Standardize ALL Features

In [9]:
final_train_df=label_encoded_df.apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

# CUSTOM SCORING FUNCTION: WEIGHTED R2

In [10]:
from sklearn.metrics import r2_score
from sklearn.metrics.scorer import make_scorer
def grade_weighted_r2(y_true,y_pred,sample_weight):

    weighted_r2=r2_score(y_true,y_pred,sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1))
    return weighted_r2    

score_params = {"sample_weight": grade_weight_frame}

grade_weighted_scorer = make_scorer(score_func=grade_weighted_r2,
                                    greater_is_better=True,
                                    needs_proba=False,
                                    needs_threshold=False,
                                    **score_params)

In [11]:
grid_para_forest = {
    'n_estimators': range(20, 80, 2),
    'max_depth':[2,3],
    'max_features':[1, 2],
    'min_samples_leaf':[1,2],
    'min_samples_split':[2,3]
}

grid_search_forest = GridSearchCV(estimator = RandomForestRegressor(), param_grid = grid_para_forest,
                                  scoring=grade_weighted_scorer,
                                  cv = 5, n_jobs = -1)

In [12]:
grid_search_forest.fit(final_train_df,y_train)
print('best parameters:', grid_search_forest.best_params_)
print('best score:', grid_search_forest.best_score_)

best parameters: {'max_depth': 2, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 22}
best score: -0.17592313508539406


#### Testing without weighted r2

In [14]:
grid_search_forest = GridSearchCV(estimator = RandomForestRegressor(), param_grid = grid_para_forest,
                                  cv = 5, n_jobs = -1)
grid_search_forest.fit(final_train_df,y_train)
print('best parameters:', grid_search_forest.best_params_)
print('best score:', grid_search_forest.best_score_)

best parameters: {'max_depth': 2, 'max_features': 2, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 26}
best score: -0.10625887293477396
