In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# from HelperFunctions import minibatch 
%reload_ext autoreload
%autoreload 2
from HelperFunctions import minibatch, dummify_columns, undummify, feature_standardize, label_encode_column, columns_of_type

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import metrics

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, AdaBoostRegressor
randomForest = RandomForestRegressor()
gbm = GradientBoostingRegressor()
abr = AdaBoostRegressor()
from xgboost import XGBRegressor
xgb = XGBRegressor()
from lightgbm import LGBMRegressor
lgb = LGBMRegressor()

# Helper Function

In [72]:
def adjust_prob(unsampled_df,sampled_df,sampled_prob): # sampled_df, sampled_prob can also be test_df, test_prob
    # turning the predicted probability into a dataframe with column name default_prob
    sample_prob_df=pd.DataFrame(sampled_prob[:,0],columns=['sampled_prob'])
    # find actual default rate for each class
    grade_rate=unsampled_df.groupby('grade')['loan_status'].apply(lambda x:(x=='Default').sum()/x.count())
    grade_rate_dict=grade_rate.to_dict()
    # mapping the unsampled_df default rates to the test_df=sampled_df
    # and then getting the array of default_rates in the test_df
    sampled_df['default_rate']=sampled_df['grade'].map(grade_rate_dict)
    sampled_df.reset_index(drop=True, inplace=True)
    sample_prob_df.reset_index(drop=True, inplace=True)
    pre_adjust_df=pd.concat([sampled_df,sample_prob_df],axis=1)
    # Adjusting the default_probability to the true probability (accounting for down/up sampling)    
    sampled_frac=0.5
    real_prob=[]
    for row in pre_adjust_df.loc[:,['default_rate','sampled_prob']].iterrows():
        beta=sampled_frac/(1-row[1]['default_rate'])
        real_prob.append(beta*row[1]['sampled_prob']/((beta-1)*row[1]['sampled_prob']+1))
        #     prob=1/(1+(1/original_fraction-1)/(1/sampled_fraction-1)*(1/sampled_prob-1))
    a=pd.DataFrame(real_prob,columns=['actual_prob'])
    b=pd.DataFrame(sampled_prob[:,0],columns=['downsampled_prob'])
    return pd.concat([a,b],axis=1)

In [4]:
df=pd.read_csv('down_sampled_df_v2.csv',index_col='id')
pre_df=pd.read_csv('pre_downsample_df.csv',index_col='id')

  mask |= (ar1 == a)


In [5]:
drop_list=['sub_grade','issue_d','zip_code','addr_state','RANDOM']
df.drop(drop_list,axis=1,inplace=True)

In [6]:
x_train=df.drop(['loan_status','return_rate'],axis=1)
y_train=df.loan_status

In [7]:
cat_list=columns_of_type(x_train,'string')
cont_list=columns_of_type(x_train,'number')

In [8]:
dum_df=dummify_columns(x_train,cat_list)

# Standardize Features

In [9]:
dum_df=dum_df.apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

In [10]:
dum_df.head()

Unnamed: 0_level_0,funded_amnt,term,int_rate,emp_length,annual_inc,dti,delinq_2yrs,credit_age_years,mths_since_last_delinq,mths_since_last_derog_record,derog_records,revol_util,total_acc,collections_12_mths_ex_med,mths_since_last_major_derog,acc_now_delinq,collection_amt,install_util,all_util,rev_credit,inq_last_12m,chargeoff_within_12_mths,delinq_amnt,mths_since_RecentAcc_opened,all_accs_120days+_PastDue_ever,accs_90days+_PastDue_24m,accs_opened_past_12m,pct_acc_nvr_dlq,pub_rec_bankruptcies,tax_liens,total_credit,install_credit,fico,Outstanding_mortgage_debt,revol_frac,install_frac,mort_frac,card_frac,active_card_frac,active_revol_frac,active_install_frac,open_revol_frac,good_acc_frac,loan_duration,grade__B,grade__C,grade__D,grade__E,grade__F,grade__G,home_ownership__MORTGAGE,home_ownership__NONE,home_ownership__OTHER,home_ownership__OWN,home_ownership__RENT,verification_status__Source Verified,verification_status__Verified,purpose__credit_card,purpose__debt_consolidation,purpose__educational,purpose__home_improvement,purpose__house,purpose__major_purchase,purpose__medical,purpose__moving,purpose__other,purpose__renewable_energy,purpose__small_business,purpose__vacation,purpose__wedding,initial_list_status__w,application_type__Joint App
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
65426,-1.2507,-0.742457,-0.923823,-0.815739,-0.245301,-0.762739,-0.375378,-0.230649,0.905157,1.879737,-0.346692,0.382824,0.72601,-0.125225,-0.62855,-0.068812,-1.269208,-0.748211,-0.83501,-1.277651,-0.835322,-0.087661,-0.020591,-4.513523,-4.516475,-4.51655,-4.516381,-4.505566,-0.37928,-0.115042,-1.041822,-1.092842,1.199346,-0.647401,-4.516561,-4.516561,-5.407963,-4.515584,-4.37633,-4.515584,-0.814373,-4.515584,-5.005319,0.243424,2.033927,-0.684839,-0.544853,-0.394054,-0.239771,-0.13152,1.095325,-0.00621,-0.01242,-0.348754,-0.88026,-0.805821,-0.756006,-0.478773,-1.247711,-0.014564,-0.249595,-0.080503,-0.145336,-0.112298,-0.092599,-0.258114,-0.028125,-0.129765,-0.084875,-0.041921,-1.139686,-0.147192
67503,-0.567841,-0.742457,-1.444537,-0.815739,-0.350907,-0.363714,-0.375378,0.292931,-1.028789,-0.488236,-0.346692,-0.989701,2.077014,-0.125225,-0.62855,-0.068812,-1.269208,-0.748211,-0.83501,-1.277651,-0.835322,-0.087661,-0.020591,-4.513523,-4.516475,-4.51655,-4.516381,-4.505566,-0.37928,-0.115042,-1.041822,-1.092842,2.908357,-0.647401,-4.516561,-4.516561,-5.407963,-4.515584,-4.37633,-4.515584,-0.814373,-4.515584,-5.005319,0.736064,-0.49166,-0.684839,-0.544853,-0.394054,-0.239771,-0.13152,1.095325,-0.00621,-0.01242,-0.348754,-0.88026,-0.805821,-0.756006,-0.478773,0.801468,-0.014564,-0.249595,-0.080503,-0.145336,-0.112298,-0.092599,-0.258114,-0.028125,-0.129765,-0.084875,-0.041921,-1.139686,-0.147192
69550,-0.43127,1.346879,-0.068218,-0.46693,-0.614923,-0.209895,-0.375378,-1.517537,-1.028789,-0.488236,-0.346692,0.398692,-1.131621,-0.125225,-0.62855,-0.068812,-1.269208,-0.748211,-0.83501,-1.277651,-0.835322,-0.087661,-0.020591,-4.513523,-4.516475,-4.51655,-4.516381,-4.505566,-0.37928,-0.115042,-1.041822,-1.092842,-0.129885,-0.647401,-4.516561,-4.516561,-5.407963,-4.515584,-4.37633,-4.515584,-0.814373,-4.515584,-5.005319,2.610238,-0.49166,-0.684839,1.835357,-0.394054,-0.239771,-0.13152,-0.912971,-0.00621,-0.01242,-0.348754,1.136028,-0.805821,-0.756006,-0.478773,0.801468,-0.014564,-0.249595,-0.080503,-0.145336,-0.112298,-0.092599,-0.258114,-0.028125,-0.129765,-0.084875,-0.041921,-1.139686,-0.147192
70348,-0.613365,-0.742457,-0.688991,0.230688,0.229927,-0.969037,-0.375378,-0.732165,1.036797,2.029068,1.06445,0.105146,0.219383,-0.125225,-0.62855,-0.068812,-1.269208,-0.748211,-0.83501,-1.277651,-0.835322,-0.087661,-0.020591,-4.513523,-4.516475,-4.51655,-4.516381,-4.505566,2.107324,-0.115042,-1.041822,-1.092842,-0.699555,-0.647401,-4.516561,-4.516561,-5.407963,-4.515584,-4.37633,-4.515584,-0.814373,-4.515584,-5.005319,0.570066,-0.49166,1.460198,-0.544853,-0.394054,-0.239771,-0.13152,-0.912971,-0.00621,-0.01242,-0.348754,1.136028,-0.805821,-0.756006,-0.478773,0.801468,-0.014564,-0.249595,-0.080503,-0.145336,-0.112298,-0.092599,-0.258114,-0.028125,-0.129765,-0.084875,-0.041921,-1.139686,-0.147192
102823,-1.193795,-0.742457,-0.648151,-0.815739,-1.010947,-0.900271,-0.375378,-1.847393,0.905157,1.879737,-0.346692,0.583811,-1.55381,-0.125225,-0.62855,-0.068812,-1.269208,-0.748211,-0.83501,-1.277651,-0.835322,-0.087661,-0.020591,-4.513523,-4.516475,-4.51655,-4.516381,-4.505566,-0.37928,-0.115042,-1.041822,-1.092842,-1.079335,-0.647401,-4.516561,-4.516561,-5.407963,-4.515584,-4.37633,-4.515584,-0.814373,-4.515584,-5.005319,1.38667,-0.49166,-0.684839,1.835357,-0.394054,-0.239771,-0.13152,-0.912971,-0.00621,-0.01242,-0.348754,1.136028,-0.805821,-0.756006,-0.478773,0.801468,-0.014564,-0.249595,-0.080503,-0.145336,-0.112298,-0.092599,-0.258114,-0.028125,-0.129765,-0.084875,-0.041921,-1.139686,-0.147192


# Stochastic Gradient Descent: Not good

In [20]:
# def acc_model(params):
#     clf = SGDClassifier(loss='log',max_iter=1000,tol=1e-3,random_state=0,**params)
#     return cross_val_score(clf, dum_df, y_train,cv=5,n_jobs=-1,scoring='accuracy').mean()

# alpha_range=np.logspace(-5,4,10)
# penalty_range=['l1','l2']

# param_space = {
#     'alpha':hp.choice('alpha',alpha_range),
#     'penalty': hp.choice('penalty', penalty_range)}

# best = acc_model({'alpha':alpha_range[0],'penalty':penalty_range[0]})
# def f(params):
#     global best
#     acc_score = acc_model(params)
#     if acc_score > best:
#         best = acc_score
#     print ('new best:', best, params)
#     return {'loss': -1*acc_score, 'status': STATUS_OK}

# trials = Trials()
# best = fmin(f, param_space, algo=tpe.suggest, max_evals=100, trials=trials)
# print ('best:')
# print (best)

new best:                                            
0.5025832830454263                                   
{'alpha': 0.1, 'penalty': 'l1'}                      
new best:                                                                      
0.5025832830454263                                                             
{'alpha': 10000.0, 'penalty': 'l2'}                                            
new best:                                                                      
0.5025832830454263                                                             
{'alpha': 0.001, 'penalty': 'l2'}                                              
new best:                                                                      
0.5103134592737637                                                             
{'alpha': 0.1, 'penalty': 'l2'}                                                
new best:                                                                      
0.5103134592737637                    

new best:                                                                       
0.5103134592737637                                                              
{'alpha': 1e-05, 'penalty': 'l2'}                                               
new best:                                                                       
0.5103134592737637                                                              
{'alpha': 0.1, 'penalty': 'l2'}                                                 
new best:                                                                       
0.5103134592737637                                                              
{'alpha': 1000.0, 'penalty': 'l2'}                                              
new best:                                                                       
0.5217456560771716                                                              
{'alpha': 1.0, 'penalty': 'l2'}                                                 
new best:                   

{'alpha': 1.0, 'penalty': 'l2'}                                                 
new best:                                                                       
0.5217456560771716                                                              
{'alpha': 1.0, 'penalty': 'l2'}                                                 
new best:                                                                       
0.5217456560771716                                                              
{'alpha': 1000.0, 'penalty': 'l2'}                                              
new best:                                                                       
0.5217456560771716                                                              
{'alpha': 0.01, 'penalty': 'l2'}                                                
new best:                                                                       
0.5217456560771716                                                              
{'alpha': 1.0, 'penalty': 'l

# Logit Classifier: NOT GOOD FOR -999 IMPUTATION! because analytical equation. Also, the data is highly likely to be non-linear. Stick with tree!

In [22]:
# def acc_model(params):
#     clf = LogisticRegression(**params)
#     return cross_val_score(clf, dum_df, y_train,cv=5,n_jobs=-1,scoring='accuracy').mean()

# C_range=np.logspace(0,5,3)
# penalty_range=['l1','l2']

# param_space = {
#     'C': hp.choice('C',C_range),
#     'penalty': hp.choice('penalty', penalty_range)}

# best = acc_model({'C':C_range[0],'penalty':penalty_range[0]})
# def f(params):
#     global best
#     acc_score = acc_model(params)
#     if acc_score > best:
#         best = acc_score
#     print ('new best:', best, params)
#     return {'loss': -1*acc_score, 'status': STATUS_OK}

# trials = Trials()
# best = fmin(f, param_space, algo=tpe.suggest, max_evals=15, trials=trials)
# print ('best:')
# print (best)

new best:                                           
0.4964717112521805                                  
{'C': 1.0, 'penalty': 'l2'}                         
new best:                                                                     
0.4964717112521805                                                            
{'C': 1.0, 'penalty': 'l1'}                                                   
new best:                                                                     
0.4964717112521805                                                            
{'C': 316.22776601683796, 'penalty': 'l2'}                                    
new best:                                                                     
0.4964717112521805                                                            
{'C': 1.0, 'penalty': 'l1'}                                                   
new best:                                                                      
0.4964717112521805                                

In [71]:
# logit=LogisticRegression(C=C_range[best['C']],penalty=penalty_range[best['penalty']])
# logit.fit(dum_df,y_train)
# sampled_prob=logit.predict_proba(dum_df) #[Default,Fully Paid]

# print('best cv score:', logit.score(dum_df,y_train))
# from sklearn.metrics import confusion_matrix
# confusion_matrix(y_train, logit.predict(dum_df))



best cv score: 0.5644471005552129


array([[15644, 10426],
       [12167, 13635]])

In [70]:
adjust_prob(pre_df,df,sampled_prob)

Unnamed: 0,actual_prob,downsampled_prob
0,0.166161,0.272580
1,0.121002,0.205632
2,0.105944,0.182225
3,0.147239,0.245102
4,0.131326,0.221357
5,0.122586,0.208062
6,0.154080,0.255130
7,0.151165,0.250871
8,0.000081,0.000153
9,0.111492,0.190916
