In [13]:
def _resolve_grade_weights(grade_weight_kwargs):
    '''Return a flat {grade: weight} dict from LC_Classifier's keyword arguments.'''
    wrapped = grade_weight_kwargs.get('grade_weight_dict')
    # Callers sometimes pass the whole mapping as ONE keyword
    # (grade_weight_dict={...}) instead of unpacking it (**{...}); without
    # unwrapping, no grade would ever find its weight.
    if len(grade_weight_kwargs) == 1 and isinstance(wrapped, dict):
        return dict(wrapped)
    return dict(grade_weight_kwargs)


def LC_Classifier(df,col_to_drop,target,**grade_weight_dict):
    '''
    Fit a grid-searched random forest and return its class probabilities for df.

    df: dataframe of interest (for x_train); must contain a 'grade' column.
        It is NOT modified: all work happens on a copy, so the call can be
        re-run on the same frame.
    col_to_drop: accepts a list of column names (strings) that will be dropped from the model
    target: the target variable, in the format of a string
    **grade_weight_dict: the weights for the different grades, given either as
        unpacked keywords (A=1, B=1, ...) or as a single keyword
        grade_weight_dict={...} holding a dict of the form:
    grade_dict={'A':1,
                'B':1,
                'C':1,
                'D':1,
                'E':1,
                'F':1,
                'G':1}

    Returns the array produced by predict_proba on the same rows the model was
    fitted on (in-sample probabilities), one row per row of df.

    Raises ValueError if some grade present in df has no weight.
    '''
    weights_by_grade = _resolve_grade_weights(grade_weight_dict)
    grade_weights = df['grade'].map(weights_by_grade)

    # Fail fast (before the heavy imports and the fit): a grade without a
    # weight would turn every cross-validation score into NaN.
    unweighted = sorted(set(df.loc[grade_weights.isna(), 'grade'].astype(str)))
    if unweighted:
        raise ValueError('no weight supplied for grade(s): ' + ', '.join(unweighted))

    from HelperFunctions import feature_standardize, label_encode_column, columns_of_type

    from sklearn.ensemble import RandomForestClassifier
    # make_scorer lives in sklearn.metrics; the private sklearn.metrics.scorer
    # module no longer exists in current scikit-learn releases.
    from sklearn.metrics import accuracy_score, make_scorer
    from sklearn.model_selection import GridSearchCV

    # Work on a copy so the caller's frame keeps all of its columns.
    working = df.copy()
    # The weight column is attached before dropping so that col_to_drop may
    # list 'weight'; if it does not, the column stays in the feature matrix.
    working['weight'] = grade_weights
    working = working.drop(col_to_drop, axis=1)
    x_train = working.drop([target], axis=1)
    y_train = working.loc[:, target]

    cat_list = columns_of_type(x_train, 'string')
    label_encoded_df = label_encode_column(x_train, cat_list)
    final_train_df = label_encoded_df.apply(
        lambda x: feature_standardize(x, scaleType='standardize'), axis=0)

    def grade_weighted_accuracy(y_true, y_pred):
        # y_true is one CV fold of y_train and keeps df's index labels, which
        # is what lets us look up that fold's weights.
        fold_weights = grade_weights.loc[y_true.index].to_numpy()
        return accuracy_score(y_true, y_pred,
                              sample_weight=fold_weights,
                              normalize=True)

    grade_weighted_scorer = make_scorer(grade_weighted_accuracy,
                                        greater_is_better=True)

    grid_para_forest = {
        'n_estimators': [500],
        'max_depth': [15],
        'max_features': [10],
        'min_samples_leaf': [3],
        'min_samples_split': [3],
    }

    grid_search_forest = GridSearchCV(estimator=RandomForestClassifier(),
                                      param_grid=grid_para_forest,
                                      n_jobs=-1,
                                      scoring=grade_weighted_scorer,
                                      cv=5,
                                      return_train_score=False)
    grid_search_forest.fit(final_train_df, y_train)
    sampled_prob = grid_search_forest.predict_proba(final_train_df)
    return sampled_prob

In [14]:
import numpy as np
import pandas as pd
# Down-sampled loan data; this frame is what the classifier is trained on.
df = pd.read_csv(filepath_or_buffer='down_sampled_df_v2.csv')

In [15]:
# Loan data as it was before down-sampling; used later for the per-grade default rates.
pre_df = pd.read_csv(filepath_or_buffer='pre_downsample_df.csv')

In [16]:
# Columns removed before modelling (passed to LC_Classifier as col_to_drop).
drop_list = [
    'sub_grade',
    'issue_d',
    'zip_code',
    'RANDOM',
    'id',
    'weight',
    'loan_duration',
    'return_rate',
]

In [17]:
# Every grade from A to G gets the same weight of 1 (i.e. unweighted scoring).
grade_weight_dict = dict.fromkeys('ABCDEFG', 1)

In [18]:
# Unpack the weights with ** so each grade arrives as its own keyword;
# passing grade_weight_dict=... would nest the dict under a single key and
# no grade would be matched to a weight.
sampled_prob = LC_Classifier(df, drop_list, 'loan_status', **grade_weight_dict)



# Helper Function

In [19]:
def adjust_prob(unsampled_df,sampled_df,sampled_prob,sampled_frac=0.5): # sampled_df, sampled_prob can also be test_df, test_prob
    '''
    Rescale probabilities predicted on re-sampled data using per-grade default rates.

    unsampled_df: frame with 'grade' and 'loan_status' columns; the share of
        'Default' rows per grade is taken from it.
    sampled_df: frame with a 'grade' column, one row per row of sampled_prob
        (matched by position, not by index label). It is not modified.
    sampled_prob: 2-D array of class probabilities; column 0 is the one that
        gets adjusted.
    sampled_frac: fraction used in the correction factor (default 0.5, the
        value that used to be hard-coded).

    Returns a DataFrame with columns 'actual_prob' (adjusted) and
    'downsampled_prob' (column 0 of sampled_prob), indexed 0..n-1.

    Raises ValueError if sampled_df and sampled_prob have different lengths.
    '''
    downsampled = pd.Series(sampled_prob[:, 0], name='downsampled_prob')
    if len(sampled_df) != len(downsampled):
        raise ValueError(
            'sampled_df has %d rows but sampled_prob has %d'
            % (len(sampled_df), len(downsampled)))

    # find actual default rate for each class
    grade_rate = unsampled_df.groupby('grade')['loan_status'].apply(
        lambda x: (x == 'Default').sum() / x.count())
    grade_rate_dict = grade_rate.to_dict()

    # Map each row's grade to its default rate; the index is reset on the new
    # Series so it lines up positionally with the probabilities.
    default_rate = sampled_df['grade'].map(grade_rate_dict).reset_index(drop=True)

    # Adjusting the default_probability to the true probability (accounting for down/up sampling)
    # NOTE(review): the formula noted below corresponds to
    # beta = rate/(1-rate) * (1-sampled_frac)/sampled_frac, whereas the code
    # uses beta = sampled_frac/(1-rate). Behaviour is kept unchanged here;
    # confirm which one matches how the data was actually re-sampled.
    #     prob=1/(1+(1/original_fraction-1)/(1/sampled_fraction-1)*(1/sampled_prob-1))
    beta = sampled_frac / (1 - default_rate)
    actual = beta * downsampled / ((beta - 1) * downsampled + 1)

    return pd.DataFrame({'actual_prob': actual.to_numpy(),
                         'downsampled_prob': downsampled.to_numpy()})

# Adjusting Probabilities to account for downsampling

In [20]:
# Per-row adjusted probabilities alongside the raw model output.
prob_df = adjust_prob(unsampled_df=pre_df, sampled_df=df, sampled_prob=sampled_prob)

In [21]:
# Show only the first rows instead of rendering the whole frame.
prob_df.head(10)

Unnamed: 0,actual_prob,downsampled_prob
0,0.238310,0.370411
1,0.166306,0.272788
2,0.186997,0.301929
3,0.192358,0.309332
4,0.166010,0.272365
5,0.232834,0.363347
6,0.209272,0.332300
7,0.229440,0.358942
8,0.244386,0.378183
9,0.192216,0.309136
