In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from HelperFunctions import minibatch 
%reload_ext autoreload
%autoreload 2
from HelperFunctions import minibatch, dummify_columns, undummify, feature_standardize, label_encode_column, columns_of_type

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix

# For Bayesian Optimizer
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, AdaBoostRegressor
randomForest = RandomForestRegressor()
gbm = GradientBoostingRegressor()
abr = AdaBoostRegressor()
from xgboost import XGBRegressor
xgb = XGBRegressor()
from lightgbm import LGBMRegressor
lgb = LGBMRegressor()

# Helper Function

In [3]:
def adjust_prob(unsampled_df,sampled_df,sampled_prob): # sampled_df, sampled_prob can also be test_df, test_prob
    # turning the predicted probability into a dataframe with column name default_prob
    sample_prob_df=pd.DataFrame(sampled_prob[:,0],columns=['sampled_prob'])
    # find actual default rate for each class
    grade_rate=unsampled_df.groupby('grade')['loan_status'].apply(lambda x:(x=='Default').sum()/x.count())
    grade_rate_dict=grade_rate.to_dict()
    # mapping the unsampled_df default rates to the test_df=sampled_df
    # and then getting the array of default_rates in the test_df
    sampled_df['default_rate']=sampled_df['grade'].map(grade_rate_dict)
    sampled_df.reset_index(drop=True, inplace=True)
    sample_prob_df.reset_index(drop=True, inplace=True)
    pre_adjust_df=pd.concat([sampled_df,sample_prob_df],axis=1)
    # Adjusting the default_probability to the true probability (accounting for down/up sampling)    
    sampled_frac=0.5
    real_prob=[]
    for row in pre_adjust_df.loc[:,['default_rate','sampled_prob']].iterrows():
        beta=sampled_frac/(1-row[1]['default_rate'])
        real_prob.append(beta*row[1]['sampled_prob']/((beta-1)*row[1]['sampled_prob']+1))
        #     prob=1/(1+(1/original_fraction-1)/(1/sampled_fraction-1)*(1/sampled_prob-1))
    a=pd.DataFrame(real_prob,columns=['actual_prob'])
    b=pd.DataFrame(sampled_prob[:,0],columns=['downsampled_prob'])
    return pd.concat([a,b],axis=1)

In [4]:
df=pd.read_csv('down_sampled_df_v2.csv')
pre_df=pd.read_csv('pre_downsample_df.csv')

# Define Grade Weights HERE:

In [5]:
grade_weight_dict={'A':1,
                  'B':2,
                  'C':3,
                  'D':4,
                  'E':5,
                  'F':6,
                  'G':7}

df['weight']=df['grade'].map(grade_weight_dict)
grade_weight_array=df['weight'].values
index = df.index
grade_weight_frame = pd.DataFrame(grade_weight_array, index=index)

# Dropping features not needed for modeling

In [6]:
drop_list=['sub_grade','issue_d','zip_code','RANDOM','id','weight']
df.drop(drop_list,axis=1,inplace=True)

In [7]:
x_train=df.drop(['loan_status','return_rate'],axis=1)
y_train=df.loan_status

In [8]:
cat_list=columns_of_type(x_train,'string')
cont_list=columns_of_type(x_train,'number')

# Label Encode ALL Categoricals

In [9]:
label_encoded_df=label_encode_column(x_train,cat_list)

# Standardize ALL Features

In [10]:
final_train_df=label_encoded_df.apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

# CUSTOM SCORING FUNCTION: WEIGHTED ACCURACY

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics.scorer import make_scorer
def grade_weighted_accuracy(y_true,y_pred,sample_weight):

    weighted_acc=accuracy_score(y_true,y_pred,
                                sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                                normalize=True)
    return weighted_acc    

score_params = {"sample_weight": grade_weight_frame}

grade_weighted_scorer = make_scorer(score_func=grade_weighted_accuracy,
                                    greater_is_better=True,
                                    needs_proba=False,
                                    needs_threshold=False,
                                    **score_params)

In [12]:
# grid_para_forest = {
#     'n_estimators': range(20, 80, 2),
#     'max_depth':[2,3],
#     'max_features':[1, 2],
#     'min_samples_leaf':[1,2],
#     'min_samples_split':[2,3]
# }

grid_para_forest = {
    'min_samples_leaf':[1,2],
    'min_samples_split':[2,3]
}

grid_search_forest = GridSearchCV(estimator=RandomForestClassifier(n_estimators=10,max_depth=2,max_features=2),\
                                  param_grid=grid_para_forest,\
                                  n_jobs=-1,\
                                  scoring=grade_weighted_scorer,cv=5,\
                                  return_train_score=False)

In [13]:
grid_search_forest.fit(final_train_df,y_train)
print('best parameters:', grid_search_forest.best_params_)
print('best score:', grid_search_forest.best_score_)
confusion_matrix(y_train, grid_search_forest.predict(final_train_df))

best parameters: {'min_samples_leaf': 1, 'min_samples_split': 3}
best score: 0.5963810813453226


array([[19801,  6269],
       [13994, 11808]])

# Adjusting Probabilities to account for downsampling

In [14]:
sampled_prob=grid_search_forest.predict_proba(final_train_df)
adjust_prob(pre_df,df,sampled_prob)

Unnamed: 0,actual_prob,downsampled_prob
0,0.332095,0.483204
1,0.273819,0.414882
2,0.317502,0.466610
3,0.303558,0.450439
4,0.277691,0.419597
5,0.363830,0.518176
6,0.349545,0.502619
7,0.349853,0.502958
8,0.365103,0.519548
9,0.272661,0.413468


#### Testing without weighted accuracy

In [15]:
# grid_search_forest = GridSearchCV(estimator=RandomForestClassifier(n_estimators=10,max_depth=2,max_features=2),\
#                                   param_grid=grid_para_forest,\
#                                   n_jobs=-1,\
#                                   cv=5,\
#                                   return_train_score=False)
# grid_search_forest.fit(final_train_df,y_train)
# print('best parameters:', grid_search_forest.best_params_)
# print('best score:', grid_search_forest.best_score_)
# confusion_matrix(y_train, grid_search_forest.predict(final_train_df))

In [16]:
# sampled_prob=grid_search_forest.predict_proba(final_train_df)
# adjust_prob(pre_df,df,sampled_prob)