In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_selection as fs
from sklearn import preprocessing, ensemble

In [2]:
file_path = '../Cleaned Datasets/Sampled Dataset_Balanced.csv.gz'
data = pd.read_csv(file_path, compression = 'gzip')

## show every column when a frame is displayed
pd.set_option("display.max_columns", len(data.columns))

## keep the classifier / regressor targets in separate frames indexed by issue date
## (issue_d also stays as a regular column in df)
df = data.copy().set_index('issue_d', drop = False)
df_status = df.loc[:, ['loan_status']]
df_return = df.loc[:, ['return_rate']]

## the targets are unknown at prediction time, so they are removed before the selection process
lst_targets = ['loan_status', 'total_payout', 'duration', 'return_rate']
data = data.drop(columns = lst_targets)

## only numeric features go through the variance and F-tests
## NOTE(review): 'Unnamed: 0' looks like a saved row index and is numeric, so it is
## treated as a feature here - confirm whether that is intended
data_numeric = data.select_dtypes(include = ['float', 'int64'])

data.head(3)

Unnamed: 0,acc_now_delinq,addr_state,all_util,annual_inc,application_type,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,delinq_amnt,dti,emp_length,funded_amnt,grade,home_ownership,il_util,initial_list_status,inq_last_12m,int_rate,issue_d,mo_sin_rcnt_tl,mths_since_last_delinq,mths_since_last_major_derog,mths_since_last_record,num_accts_ever_120_pd,num_rev_accts,num_tl_90g_dpd_24m,num_tl_op_past_12m,open_acc,num_il_tl,pct_tl_nvr_dlq,pub_rec,pub_rec_bankruptcies,purpose,revol_util,sub_grade,tax_liens,term,tot_coll_amt,tot_cur_bal,tot_hi_cred_lim,total_acc,total_bc_limit,total_il_high_credit_limit,total_rev_hi_lim,verification_status,fico_score,active_rev_acct_frac,active_ins_acct_frac,credit_hist,treasury_rate,unemp_rate_3mon
0,0.0,MO,-999.0,45600.0,Individual,0.0,0.0,1.0,0.0,3.26,10,4650.0,C,MORTGAGE,-999.0,f,-999.0,10.28,2007-10-01,-999.0,22.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,3.0,-999.0,-999.0,0.0,-999.0,other,35.0,C1,0.0,36,-999.0,-999.0,-999.0,25.0,-999.0,-999.0,-999.0,Not Verified,687.0,-999.0,-999.0,17.926027,4.01,5.2
1,0.0,CO,-999.0,14400.0,Individual,0.0,0.0,0.0,0.0,1.67,1,4000.0,C,RENT,-999.0,f,-999.0,11.22,2007-10-01,-999.0,0.0,-999.0,0.0,-999.0,-999.0,-999.0,-999.0,1.0,-999.0,-999.0,0.0,-999.0,car,37.0,C4,0.0,36,-999.0,-999.0,-999.0,2.0,-999.0,-999.0,-999.0,Not Verified,672.0,-999.0,-999.0,3.50137,4.01,3.53
2,0.0,FL,-999.0,43500.0,Individual,0.0,0.0,0.0,0.0,3.01,4,4900.0,E,RENT,-999.0,f,-999.0,13.75,2007-10-01,-999.0,0.0,-999.0,63.0,-999.0,-999.0,-999.0,-999.0,5.0,-999.0,-999.0,1.0,-999.0,debt_consolidation,90.1,E2,0.0,36,-999.0,-999.0,-999.0,5.0,-999.0,-999.0,-999.0,Not Verified,647.0,-999.0,-999.0,3.50137,4.01,4.47


### 1. Check Variance of Features

In [3]:
## fit a variance filter: features whose variance does not exceed 0.5 are deselected
var_test = fs.VarianceThreshold(threshold = 0.5)
var_test.fit(data_numeric)

## split the numeric columns into kept and dropped features
mask_var = var_test.get_support()
lst_selected_var = data_numeric.columns[mask_var].tolist()
lst_dropped_var = list(set(data_numeric.columns).difference(set(lst_selected_var)))
lst_dropped_var

['pub_rec',
 'tax_liens',
 'collections_12_mths_ex_med',
 'treasury_rate',
 'acc_now_delinq',
 'chargeoff_within_12_mths']

### 2. Univariate Test

- Skip the $\chi^2$ test: it requires non-negative feature values, but missing values were imputed as -999

- For classifier: $F$ test
- For regressor: $F$ test 

In [17]:
## percentile passed to fs.SelectPercentile in the two F-test cells below
percentile = 90

In [18]:
%%time
## perform the F-test for classification with a percentile threshold
f_class = fs.SelectPercentile(fs.f_classif, percentile).fit(data_numeric, df_status.loan_status)

## get the lists of selected and deselected features
lst_selected_fc = data_numeric.columns[f_class.get_support(indices = True)].tolist()
lst_dropped_fc = list(set(data_numeric.columns) - set(lst_selected_fc))
lst_dropped_fc

Wall time: 259 ms


['pct_tl_nvr_dlq',
 'num_accts_ever_120_pd',
 'num_tl_op_past_12m',
 'active_rev_acct_frac',
 'num_tl_90g_dpd_24m']

In [19]:
%%time
## perform the F-test for regression with a percentile of 90
f_reg = fs.SelectPercentile(fs.f_regression, percentile).fit(data_numeric, df_return.return_rate)

## get the lists of selected and deselected features
lst_selected_fr = data_numeric.columns[f_reg.get_support(indices = True)].tolist()
lst_dropped_fr = list(set(data_numeric.columns) - set(lst_selected_fr))
lst_dropped_fr

Wall time: 75 ms


['tax_liens',
 'pub_rec_bankruptcies',
 'mths_since_last_record',
 'tot_coll_amt',
 'chargeoff_within_12_mths']

### 3. Random Forest Selection

In [7]:
## define a label encoder function
def label_encoder(df):
    """Return a copy of ``df`` with every object-dtype column label encoded.

    Each column of dtype ``object`` is replaced by the integer codes produced
    by ``sklearn.preprocessing.LabelEncoder.fit_transform`` fitted on that
    column alone. All other columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified, so re-running a
        cell that calls this function always starts from the same input.
    """
    # work on a copy so the caller's frame (often a slice of another frame) is never mutated
    encoded = df.copy()

    # names of the categorical (object-dtype) columns
    cat_cols = encoded.select_dtypes(include = 'O').columns.tolist()

    # fit a fresh encoder on each column so codes are independent between columns
    for col in cat_cols:
        encoded[col] = preprocessing.LabelEncoder().fit_transform(encoded[col])

    return encoded

In [40]:
## Define a feature importance selector using random forest

def feature_selector_rf(x, y, class_or_reg, random_col_name, threshold = None,
                        leaf_size = 30, n_estimators = 100, random_state = 10, plot = True):
    """Select features whose random-forest importance beats a benchmark column.

    A random forest is fitted on ``x`` / ``y``. The importance cutoff is the
    larger of the importance of the column ``random_col_name`` and
    ``threshold`` (treated as 0 when ``None``). Features with importance
    >= cutoff are selected, the rest are dropped.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; must contain the column ``random_col_name``.
    y : pandas.DataFrame or pandas.Series
        Target. A single-column frame is reduced to that column before fitting.
    class_or_reg : str
        Contains 'class' (case-insensitive) for a classifier or 'regress'
        for a regressor.
    random_col_name : str
        Name of the benchmark column in ``x``.
    threshold : float, optional
        Minimum importance; the benchmark importance is used if it is larger.
    leaf_size : int
        Passed to the forest as ``min_samples_leaf``.
    n_estimators : int
        Number of trees.
    random_state : int
        Seed passed to the forest.
    plot : bool
        When true, print the scores and selected features and draw a
        horizontal bar chart of the selected importances.

    Returns
    -------
    tuple
        ``(lst_selected, lst_dropped, score, oob_score)`` where ``score`` is
        ``rf.score(x, y)`` on the fitting data and ``oob_score`` is
        ``rf.oob_score_``.

    Raises
    ------
    TypeError
        If ``class_or_reg`` matches neither 'class' nor 'regress'.
    """
    # initiate the model as a classifier or a regressor
    mode = str.lower(class_or_reg)
    if 'class' in mode:
        rf = ensemble.RandomForestClassifier()
    elif 'regress' in mode:
        rf = ensemble.RandomForestRegressor()
    else:
        raise TypeError('class_or_reg has to be either class(*) or regress(*)')

    # fixed settings: use all CPUs, warm start, compute the out-of-bag score
    rf.set_params(n_estimators = n_estimators, random_state = random_state, min_samples_leaf = leaf_size,
                  oob_score = True, n_jobs = -1, warm_start = True)

    # a single-column frame is passed as a 1-d target so sklearn does not have to
    # flatten a column vector itself; the target name is kept for the plot title
    if isinstance(y, pd.DataFrame):
        target_name = y.columns[0]
        y_fit = y.iloc[:, 0] if y.shape[1] == 1 else y
    else:
        target_name = getattr(y, 'name', None)
        y_fit = y

    # fit the rf model and score it once on the fitting data
    rf.fit(x, y_fit)
    score = rf.score(x, y_fit)

    # create a dataframe for feature importances, most important first
    df_fi = pd.DataFrame(rf.feature_importances_, index = x.columns, columns = ['Importance'])
    df_fi = df_fi.sort_values(by = 'Importance', ascending = False)

    # the cutoff feature importance is the larger of random column or defined threshold
    fi_random_col = df_fi.loc[random_col_name, 'Importance']
    fi_threshold = 0 if threshold is None else threshold
    fi_cutoff = max(fi_random_col, fi_threshold)

    # selected features have importance >= cutoff; everything else is dropped
    df_selected = df_fi[df_fi.Importance >= fi_cutoff]
    df_dropped = df_fi[df_fi.Importance < fi_cutoff]

    lst_selected = df_selected.index.tolist()
    lst_dropped = df_dropped.index.tolist()

    if plot:
        # imported here because the notebook's import cell does not import matplotlib;
        # without this the plot branch raises NameError on `plt`
        import matplotlib.pyplot as plt

        # print RF score & oob score
        print('RF score: %.4f'%score)
        print('RF oob score: %.4f'%rf.oob_score_)
        print('-' * 88)
        print('Important features by FI over cutoff or threshold:\n', lst_selected)

        # plot the selected features (importance >= cutoff)
        plt.figure(figsize = (10, 5))
        plt.barh(df_selected.index, df_selected.Importance, color = 'crimson', alpha = 0.5)
        plt.title('Top {} Feature Importances (>= {}%) - {}'.format(df_selected.shape[0], np.round(fi_cutoff * 100, 2), target_name))

    # return selected / dropped features and both scores
    return lst_selected, lst_dropped, score, rf.oob_score_

In [None]:
## benchmark feature: a seeded column of uniform random numbers
np.random.seed(10)
data['random'] = np.random.random(len(data))

## label encode categorical (object-dtype) features
## NOTE(review): this presumably also encodes issue_d, which is read from the CSV as text - confirm
data = label_encoder(data)

In [None]:
## standardize each column: subtract the column mean, divide by the column standard deviation
## -> normalization makes no difference for RF
data_normalized = data.sub(data.mean()).div(data.std())

In [None]:
%%time
lst_selected_rf_c, lst_dropped_rf_c = feature_selector_rf(data, label_encoder(df_status), 'class', 'random')

In [None]:
%%time
# lst_selected_rf_r, lst_dropped_rf_r = feature_selector_rf(data.drop('issue_d', axis = 1), df_return, 'regressor', 'random')
lst_selected_rf_r, lst_dropped_rf_r = feature_selector_rf(data, df_return, 'regressor', 'random')

In [70]:
## define a time series function to apply feature_selector_rf function on a moving window basis
def fs_rf_moving_window(df, time_col, df_tgt_class = None, df_tgt_regress = None, time_window = 12, time_increment = 3):
    """Run ``feature_selector_rf`` over moving time windows and pool the results.

    The distinct values of ``df[time_col]`` are taken in order of first
    appearance. Windows of ``time_window`` consecutive periods are formed,
    starting every ``time_increment`` periods, and for each window a random
    forest selection is run for the classifier target and/or the regressor
    target. A seeded 'random' column is added as the importance benchmark.

    Parameters
    ----------
    df : pandas.DataFrame
        Features including ``time_col``. The frame passed in is not modified.
    time_col : str
        Column holding the period label of each row.
    df_tgt_class, df_tgt_regress : pandas.DataFrame, optional
        Targets indexed by the same period labels as ``df[time_col]``;
        each is skipped when ``None``.
    time_window : int
        Number of periods per window.
    time_increment : int
        Step, in periods, between the starts of consecutive windows.

    Returns
    -------
    tuple
        ``(features, score_c, score_r, oob_c, oob_r)``: the union of features
        selected in any window (unordered), followed by the highest score and
        highest out-of-bag score seen across windows for the classifier and
        the regressor. A score is 0 when the corresponding model was never run.
    """
    # periods in order of first appearance
    # NOTE(review): windows are chronological only if df is sorted by time_col - confirm
    lst_time = df[time_col].unique().tolist()

    # index by time for window slicing and drop the column so it cannot be picked
    # as an important feature (misleading, as the data is already sliced on time);
    # set_index returns a new frame, so the caller's frame stays untouched
    df = df.set_index(time_col, drop = True)

    # add a column of randomly generated numbers to dataframe as the benchmark
    np.random.seed(10)
    df['random'] = np.random.random(df.shape[0])

    # label encode categorical features
    df = label_encoder(df)

    # accumulators for the final output
    important = set()
    scores_c, oobs_c = [], []
    scores_r, oobs_r = [], []

    # loop through each time window to perform feature selection using random forest
    for i in range(0, len(lst_time) - time_window + 1, time_increment):
        # periods of this window and the matching rows
        lst_window = lst_time[i : i + time_window]
        df_window = df.loc[lst_window]

        if df_tgt_class is not None:
            df_tgt_class_window = df_tgt_class.loc[lst_window]
            lst_selected_rf, _, score, oob = feature_selector_rf(df_window, label_encoder(df_tgt_class_window), 'class', 'random', plot = False)
            important.update(lst_selected_rf)
            scores_c.append(score)
            oobs_c.append(oob)
            print('{} to {} is done for classifier.'.format(lst_window[0], lst_window[-1]))

        if df_tgt_regress is not None:
            df_tgt_regress_window = df_tgt_regress.loc[lst_window]
            lst_selected_rf, _, score, oob = feature_selector_rf(df_window, df_tgt_regress_window, 'regress', 'random', plot = False)
            important.update(lst_selected_rf)
            scores_r.append(score)
            oobs_r.append(oob)
            print('{} to {} is done for regressor.'.format(lst_window[0], lst_window[-1]))
            print('-' * 55)

    # higher is better for both models' scores, so the best is the maximum
    # (the regressor previously used min() seeded with 0, which could never exceed 0)
    return (list(important),
            max(scores_c, default = 0), max(scores_r, default = 0),
            max(oobs_c, default = 0), max(oobs_r, default = 0))

In [64]:
import warnings
warnings.filterwarnings('ignore')

## work on a copy of the feature frame
data_train = data.copy()

## restore the raw issue dates from the target index (same rows, same order as data):
## if the label-encoding cell above has run, data['issue_d'] holds integer codes that
## cannot be used to slice df_status / df_return, which are indexed by the raw dates
data_train['issue_d'] = df_status.index.values

lst_important_1mon, best_score_c, best_score_r, best_oob_c, best_oob_r = fs_rf_moving_window(data_train, 'issue_d', df_status, df_return, time_increment = 1)

2007-10-01 to 2008-10-01 is done for classifier.
2007-10-01 to 2008-10-01 is done for regressor.
-------------------------------------------------------
2007-11-01 to 2008-11-01 is done for classifier.
2007-11-01 to 2008-11-01 is done for regressor.
-------------------------------------------------------
2007-12-01 to 2008-12-01 is done for classifier.
2007-12-01 to 2008-12-01 is done for regressor.
-------------------------------------------------------
2008-01-01 to 2009-01-01 is done for classifier.
2008-01-01 to 2009-01-01 is done for regressor.
-------------------------------------------------------
2008-02-01 to 2009-02-01 is done for classifier.
2008-02-01 to 2009-02-01 is done for regressor.
-------------------------------------------------------
2008-03-01 to 2009-03-01 is done for classifier.
2008-03-01 to 2009-03-01 is done for regressor.
-------------------------------------------------------
2008-04-01 to 2009-04-01 is done for classifier.
2008-04-01 to 2009-04-01 is done 

2012-05-01 to 2013-04-01 is done for classifier.
2012-05-01 to 2013-04-01 is done for regressor.
-------------------------------------------------------
2012-06-01 to 2013-05-01 is done for classifier.
2012-06-01 to 2013-05-01 is done for regressor.
-------------------------------------------------------
2012-07-01 to 2013-06-01 is done for classifier.
2012-07-01 to 2013-06-01 is done for regressor.
-------------------------------------------------------
2012-08-01 to 2013-07-01 is done for classifier.
2012-08-01 to 2013-07-01 is done for regressor.
-------------------------------------------------------
2012-09-01 to 2013-08-01 is done for classifier.
2012-09-01 to 2013-08-01 is done for regressor.
-------------------------------------------------------
2012-10-01 to 2013-09-01 is done for classifier.
2012-10-01 to 2013-09-01 is done for regressor.
-------------------------------------------------------
2012-11-01 to 2013-10-01 is done for classifier.
2012-11-01 to 2013-10-01 is done 

2016-11-01 to 2017-10-01 is done for classifier.
2016-11-01 to 2017-10-01 is done for regressor.
-------------------------------------------------------
2016-12-01 to 2017-11-01 is done for classifier.
2016-12-01 to 2017-11-01 is done for regressor.
-------------------------------------------------------
2017-01-01 to 2017-12-01 is done for classifier.
2017-01-01 to 2017-12-01 is done for regressor.
-------------------------------------------------------
2017-02-01 to 2018-01-01 is done for classifier.
2017-02-01 to 2018-01-01 is done for regressor.
-------------------------------------------------------
2017-03-01 to 2018-02-01 is done for classifier.
2017-03-01 to 2018-02-01 is done for regressor.
-------------------------------------------------------
2017-04-01 to 2018-03-01 is done for classifier.
2017-04-01 to 2018-03-01 is done for regressor.
-------------------------------------------------------
2017-05-01 to 2018-04-01 is done for classifier.
2017-05-01 to 2018-04-01 is done 

In [68]:
## features with importances > random numbers on a 1-mon moving window
## hard-coded list: this assignment replaces whatever fs_rf_moving_window returned above;
## the 'random' benchmark column itself is commented out of the list
lst_important_1mon = ['active_rev_acct_frac',
 'credit_hist',
#  'random',
 'mths_since_last_delinq',
 'total_bc_limit',
 'tot_cur_bal',
 'total_rev_hi_lim',
 'revol_util',
 'num_il_tl',
 'sub_grade',
 'tot_hi_cred_lim',
 'total_acc',
 'annual_inc',
 'int_rate',
 'purpose',
 'funded_amnt',
 'fico_score',
 'treasury_rate',
 'all_util',
 'verification_status',
 'term',
 'unemp_rate_3mon',
 'home_ownership',
 'active_ins_acct_frac',
 'dti',
 'total_il_high_credit_limit',
 'addr_state',
 'il_util',
 'emp_length']

len(lst_important_1mon)

28

In [7]:
## features with importances > random numbers on a 3-mon moving window
## NOTE(review): hard-coded list; no cell visible here computes it - presumably pasted
## from a fs_rf_moving_window run with time_increment = 3, confirm before relying on it
## the 'random' benchmark column itself is commented out of the list
lst_important_3mon = ['active_rev_acct_frac',
  'credit_hist',
#   'random',
  'mths_since_last_delinq',
  'total_bc_limit',
  'tot_cur_bal',
  'total_rev_hi_lim',
  'revol_util',
  'num_il_tl',
  'sub_grade',
  'tot_hi_cred_lim',
  'total_acc',
  'annual_inc',
  'int_rate',
  'purpose',
  'funded_amnt',
  'fico_score',
  'treasury_rate',
  'all_util',
  'term',
  'unemp_rate_3mon',
  'home_ownership',
  'active_ins_acct_frac',
  'dti',
  'total_il_high_credit_limit',
  'addr_state',
  'il_util',
  'emp_length']
len(lst_important_3mon)

27

In [69]:
## features selected on the 1-mon window but not on the 3-mon window
set(lst_important_1mon).difference(set(lst_important_3mon))

{'verification_status'}

In [35]:
## union of the features dropped by the variance test and the two F-tests
lst_dropped = list(set().union(lst_dropped_var, lst_dropped_fc, lst_dropped_fr))

## features that are in the drop list but also in the 3-mon important list
list(set(lst_dropped).intersection(set(lst_important_3mon)))

['treasury_rate', 'active_rev_acct_frac']

In [36]:
## remove treasury rate from drop list as it should be important
## (filtering instead of list.remove keeps this cell safe to re-run:
##  remove() raises ValueError once the entry is already gone)
lst_dropped = [feature for feature in lst_dropped if feature != 'treasury_rate']

In [41]:
## final selection: 3-mon important features that are not in the drop list
lst_final = list(set(lst_important_3mon).difference(set(lst_dropped)))
lst_final

['mths_since_last_delinq',
 'sub_grade',
 'home_ownership',
 'total_rev_hi_lim',
 'annual_inc',
 'term',
 'active_ins_acct_frac',
 'fico_score',
 'funded_amnt',
 'all_util',
 'emp_length',
 'revol_util',
 'treasury_rate',
 'credit_hist',
 'int_rate',
 'purpose',
 'dti',
 'tot_hi_cred_lim',
 'total_acc',
 'tot_cur_bal',
 'total_il_high_credit_limit',
 'il_util',
 'addr_state',
 'total_bc_limit',
 'unemp_rate_3mon',
 'num_il_tl']

In [40]:
## number of features in the final selection
len(lst_final)

26