In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from HelperFunctions import minibatch 
%reload_ext autoreload
%autoreload 2
from HelperFunctions import minibatch, dummify_columns, undummify, feature_standardize, label_encode_column, columns_of_type

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# For Bayesian Optimizer
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, AdaBoostRegressor
randomForest = RandomForestRegressor()
gbm = GradientBoostingRegressor()
abr = AdaBoostRegressor()
from xgboost import XGBRegressor
xgb = XGBRegressor()
from lightgbm import LGBMRegressor
lgb = LGBMRegressor()

In [3]:
temp_df=pd.read_csv('down_sampled_df_v2.csv')

In [4]:
df=temp_df[temp_df.loan_status=='Fully Paid'].copy()
df.reset_index(drop=True,inplace=True)

# Define Grade Weights HERE:

In [5]:
grade_weight_dict={'A':1,
                  'B':1,
                  'C':1,
                  'D':1,
                  'E':1,
                  'F':1,
                  'G':1}

df['weight']=df['grade'].map(grade_weight_dict)
grade_weight_array=df['weight'].values
index = df.index
grade_weight_frame = pd.DataFrame(grade_weight_array, index=index)

# Dropping features not needed for modeling

In [6]:
drop_list=['sub_grade','issue_d','zip_code','RANDOM','id','weight','loan_duration']
df.drop(drop_list,axis=1,inplace=True)

In [7]:
x_train=df.drop(['loan_status','return_rate'],axis=1)
y_train=df.return_rate

In [8]:
cat_list=columns_of_type(x_train,'string')
cont_list=columns_of_type(x_train,'number')

# Label Encode ALL Categoricals

In [9]:
label_encoded_df=label_encode_column(x_train,cat_list)

# Standardize ALL Features

In [10]:
final_train_df=label_encoded_df.apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

# CUSTOM SCORING FUNCTION: WEIGHTED R2

In [11]:
from sklearn.metrics import r2_score
from sklearn.metrics.scorer import make_scorer
def grade_weighted_r2(y_true,y_pred,sample_weight):

    weighted_r2=r2_score(y_true,y_pred,sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1))
    return weighted_r2    

score_params = {"sample_weight": grade_weight_frame}

grade_weighted_scorer = make_scorer(score_func=grade_weighted_r2,
                                    greater_is_better=True,
                                    needs_proba=False,
                                    needs_threshold=False,
                                    **score_params)

In [12]:
grid_para_forest = {
    'n_estimators': [100,200,500],
    'max_depth':[5,7,9,12],
    'max_features':[5,10,15,20],
    'min_samples_leaf':[3,5,7],
    'min_samples_split':[2,3,4,5]
}

grid_search_forest = GridSearchCV(estimator = RandomForestRegressor(), param_grid = grid_para_forest,
                                  scoring=grade_weighted_scorer,
                                  cv = 5, n_jobs = -1)

In [13]:
%time grid_search_forest.fit(final_train_df,y_train)
print('best parameters:', grid_search_forest.best_params_)
print('best score:', grid_search_forest.best_score_)



CPU times: user 1min 30s, sys: 1.21 s, total: 1min 31s
Wall time: 9h 48min 23s
best parameters: {'max_depth': 12, 'max_features': 20, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 200}
best score: 0.30090516325222355


In [14]:
results=grid_search_forest.predict(final_train_df)
r_nd=pd.DataFrame(results,columns=['r_nd'])
r_nd.to_csv('r_nd.csv')

#### Testing without weighted r2

In [15]:
# grid_search_forest = GridSearchCV(estimator = RandomForestRegressor(), param_grid = grid_para_forest,
#                                   cv = 5, n_jobs = -1)
# grid_search_forest.fit(final_train_df,y_train)
# print('best parameters:', grid_search_forest.best_params_)
# print('best score:', grid_search_forest.best_score_)