In [3]:
def LC_Regressor(df, col_to_drop, target, random_state=None, **grade_weight_dict):
    '''
    Fit a random-forest regressor scored by grade-weighted R^2 and return
    its predictions on the training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Must contain a 'grade' column and the `target`
        column. The caller's frame is NOT modified; all work happens on a copy.
    col_to_drop : list of str
        Column names removed before modelling. A helper 'weight' column is
        added to the working copy first, so 'weight' may be listed here even
        when `df` itself has no such column.
    target : str
        Name of the target column.
    random_state : int or None, optional
        Seed forwarded to RandomForestRegressor. None (the default) keeps
        the unseeded behaviour.
    **grade_weight_dict
        Weight for each grade, used as the sample weight when scoring.
        Both call styles are accepted:

            LC_Regressor(df, cols, 'y', **{'A': 1, 'B': 1, ...})
            LC_Regressor(df, cols, 'y', grade_weight_dict={'A': 1, 'B': 1, ...})

        If no weights are given, every row gets weight 1.0.

    Returns
    -------
    numpy.ndarray
        Predictions of the fitted grid search on the training features
        (in-sample predictions, not held-out ones).

    Raises
    ------
    ValueError
        If weights were supplied but some value in df['grade'] has none.
    '''
    import pandas as pd
    from HelperFunctions import feature_standardize, label_encode_column, columns_of_type

    from sklearn.ensemble import RandomForestRegressor
    # Public import path; `sklearn.metrics.scorer` was a private module and
    # no longer exists in current scikit-learn releases.
    from sklearn.metrics import make_scorer, r2_score
    from sklearn.model_selection import GridSearchCV

    # `**grade_weight_dict` collects keyword arguments, so a call written as
    # `grade_weight_dict={...}` arrives here as {'grade_weight_dict': {...}}.
    # Unwrap that case; otherwise every grade would map to NaN.
    weight_by_grade = grade_weight_dict
    nested = grade_weight_dict.get('grade_weight_dict')
    if len(grade_weight_dict) == 1 and isinstance(nested, dict):
        weight_by_grade = nested

    # Per-row weights, indexed like `df` so the scorer can look them up by label.
    if weight_by_grade:
        weights = df['grade'].map(weight_by_grade)
        unmapped = weights.isna()
        if unmapped.any():
            missing = sorted(set(str(g) for g in df.loc[unmapped, 'grade']))
            raise ValueError(
                'no weight supplied for grade(s): {}'.format(', '.join(missing)))
    else:
        weights = pd.Series(1.0, index=df.index)

    # Work on a copy so re-running the calling cell does not fail on
    # already-dropped columns.
    data = df.copy()
    data['weight'] = weights
    data = data.drop(col_to_drop, axis=1)
    # The weight is only for scoring; keep it out of the feature matrix even
    # when the caller did not list it in `col_to_drop`.
    data = data.drop(['weight'], axis=1, errors='ignore')

    x_train = data.drop([target], axis=1)
    y_train = data.loc[:, target]

    # Encode the string-typed columns, then standardise every column.
    cat_list = columns_of_type(x_train, 'string')
    label_encoded_df = label_encode_column(x_train, cat_list)
    final_train_df = label_encoded_df.apply(
        lambda col: feature_standardize(col, scaleType='standardize'), axis=0)

    def grade_weighted_r2(y_true, y_pred, grade_weights):
        # Relies on y_true being a pandas Series that keeps the original row
        # labels within each CV fold; the fold's weights are looked up by label.
        # NOTE(review): a non-unique index would return extra rows here.
        fold_weights = grade_weights.loc[y_true.index].values.reshape(-1)
        return r2_score(y_true, y_pred, sample_weight=fold_weights)

    # The extra keyword is deliberately not named `sample_weight`, so it cannot
    # collide with the argument scikit-learn scorers reserve for themselves.
    grade_weighted_scorer = make_scorer(grade_weighted_r2,
                                        greater_is_better=True,
                                        grade_weights=weights)

    grid_para_forest = {
        'n_estimators': [500],
        'max_depth': [15],
        'max_features': [10],
        'min_samples_leaf': [3],
        'min_samples_split': [3],
    }

    grid_search_forest = GridSearchCV(
        estimator=RandomForestRegressor(random_state=random_state),
        param_grid=grid_para_forest,
        scoring=grade_weighted_scorer,
        cv=5,
        n_jobs=-1)
    grid_search_forest.fit(final_train_df, y_train)

    return grid_search_forest.predict(final_train_df)

In [2]:
import numpy as np
import pandas as pd
# Read 'down_sampled_df_v2.csv' (path relative to the working directory) into `df`.
df=pd.read_csv('down_sampled_df_v2.csv')

In [4]:
# Columns to remove before modelling; passed to LC_Regressor as `col_to_drop`.
drop_list = [
    'sub_grade',
    'issue_d',
    'zip_code',
    'RANDOM',
    'id',
    'weight',
    'loan_duration',
    'loan_status',
]

In [8]:
# Every grade gets the same weight of 1.
grade_weight_dict = dict.fromkeys(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 1)

In [10]:
# Unpack the weights with ** so each grade becomes its own keyword; passing
# `grade_weight_dict=...` would nest the mapping under a single key.
# A copy of `df` is passed so this cell can be re-run without reloading the data.
LC_Regressor(df.copy(), drop_list, 'return_rate', **grade_weight_dict)



array([-0.07601403, -0.09266711, -0.09288768, ..., -0.29070041,
       -0.42087286, -0.17200294])