# Select a smaller set of important features.

This step may be skipped depending on what you are using for prediction. 

In [2]:
# feature selection on imputed data
import scipy.stats as st
from scipy.stats import norm
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn import preprocessing
import numpy as np
import pandas as pd
import sys

# Input files and output directory.
train = "data/train.csv"
prediction = "data/prediction.csv"
output_dir = "output"

# Master skip vector: one flag per feature-selection stage, in this order:
#   (correlation selection,
#    tree classifier/regression importance selection,
#    variance feature selection)
# A True entry is meant to skip the corresponding stage.
skip_vector=(False,False,False)

# ---------------------------------------------------------------------------
# Feature selection settings
# ---------------------------------------------------------------------------

# Correlation selection:
#   if correlation p-value > p_val_threshold, remove column
#   set p_val_threshold to 1 to retain all columns
p_val_threshold = 0.05

# Tree classifier/regression importance selection:
#   thold: remove all features that are less than this importance threshold;
#          by default (None), only the least important features are removed,
#          i.e. those with 0 importance
#   keep:  remove all but <keep> features (overrides thold when not None)
thold = None
keep = 100

# Variance feature selection:
#   bi_var_threshold is p*(1-p) evaluated at p = bi_var_p; the variance stage
#   compares binary columns against it.
#   normal_var_threshold is the cutoff the variance stage uses for
#   continuous columns.
bi_var_p = 0.95
bi_var_threshold = bi_var_p*(1-bi_var_p)
normal_var_threshold = 0.005


# Outcomes: discrete ones are modelled with a classifier, continuous ones
# with a regressor.
disout = ['eviction','layoff','jobTraining']
conout = ['gpa','grit','materialHardship']
allout = disout+conout

# Load the imputed background data last, so that a missing file does not
# abort this cell before the settings above have been defined.
bg = pd.read_csv(output_dir+'/imputed_bg.csv',index_col=0)

IOError: File output/imputed_bg.csv does not exist

In [None]:
def _split_binary_columns(frame):
    """Partition the columns of ``frame`` into binary-like and continuous.

    A column counts as binary-like when its minimum is non-negative and its
    value range (max - min) is at most 1; everything else is continuous.

    Returns a ``(binary_cols, continuous_cols)`` pair of column-name lists,
    each in the column order of ``frame``.
    """
    binary_cols = []
    continuous_cols = []
    for col_name in frame:
        col = frame[col_name]
        low = col.min()
        if low >= 0 and col.max() - low <= 1:
            binary_cols.append(col_name)
        else:
            continuous_cols.append(col_name)
    return binary_cols, continuous_cols


def _standardize_columns(frame, columns):
    """Z-score ``columns`` of ``frame`` in place (population std, ddof=0)."""
    if not columns:
        return
    block = frame[columns]
    center = block.mean()
    spread = block.std(ddof=0)
    # A constant column has zero spread; dividing by 1 instead leaves it at
    # all zeros rather than NaN, so the later stages can still process it
    # (the variance stage then removes it).
    spread = spread.replace(0, 1)
    frame[columns] = (block - center).divide(spread)


def _uncorrelated_columns(frame, target, columns, max_p_value):
    """Return the names in ``columns`` that are not significantly correlated.

    A column is returned when the p-value of its Pearson correlation with
    ``target`` is greater than ``max_p_value``. Only rows where ``target``
    is not NaN take part. ``target`` is a float array matched to the rows
    of ``frame`` by position.
    """
    observed = ~np.isnan(target)
    y = target[observed]
    rejected = []
    for col_name in columns:
        p_value = st.pearsonr(frame[col_name].values[observed], y)[1]
        if p_value > max_p_value:
            rejected.append(col_name)
    return rejected


def _important_column_positions(frame, target, is_continuous,
                                min_importance, max_features):
    """Return the positions of the columns to retain, ranked by a tree model.

    An extra-trees regressor (``is_continuous`` true) or classifier is fitted
    on the rows where ``target`` is not NaN, and its feature importances
    drive the selection:

    * ``max_features`` not None: retain the ``max_features`` most important
      columns (all of them if ``frame`` has no more than that).
    * otherwise: retain columns whose importance is >= ``min_importance``;
      when that is None too, only the least important columns are dropped.

    The returned positions are sorted, so column order is preserved.
    """
    nb_cols = frame.shape[1]
    observed = ~np.isnan(target)
    if is_continuous:
        model = ExtraTreesRegressor()
        y_train = np.asarray(target[observed], dtype="float")
    else:
        model = ExtraTreesClassifier()
        y_train = np.asarray(target[observed], dtype="i4")
    model.fit(frame.values[observed, :], y_train)
    importances = model.feature_importances_

    if max_features is not None:
        if nb_cols <= max_features:
            return np.arange(nb_cols)
        # Rank with a stable sort and take exactly max_features positions;
        # thresholding on the cutoff value would retain extra columns
        # whenever several importances tie at the cutoff.
        top = np.argsort(importances, kind="mergesort")[-max_features:]
        return np.sort(top)

    if min_importance is None:
        levels = np.unique(importances)
        if levels.size < 2:
            # Every column is equally important: nothing to remove.
            return np.arange(nb_cols)
        # Second-smallest distinct importance: drops only the bottom level.
        min_importance = levels[1]
    return np.where(importances >= min_importance)[0]


def _low_variance_columns(frame, binary_cols, binary_floor, continuous_floor):
    """Return the columns of ``frame`` whose variance is below their floor.

    Columns named in ``binary_cols`` are compared against ``binary_floor``
    and all others against ``continuous_floor``. The population variance is
    used; for a 0/1 column that equals p*(1-p), the same form as a threshold
    built from a proportion.
    """
    binary_names = set(binary_cols)
    rejected = []
    for col_name in frame:
        floor = binary_floor if col_name in binary_names else continuous_floor
        if np.var(frame[col_name].values) < floor:
            rejected.append(col_name)
    return rejected


# The inputs do not depend on the outcome, so they are loaded once; each
# outcome then works on its own copy of the background data.
bg_source = pd.read_csv(output_dir+'/imputed_bg.csv',index_col=0)
tr = pd.read_csv(train, low_memory=False).set_index('challengeID')
pr = pd.read_csv(prediction, low_memory=False).set_index('challengeID')

# Blank every value of the prediction table, then copy in the non-missing
# values of the training table (aligned on challengeID).
pr[:]=np.nan
pr.update(tr)

for outcome in allout:
    print("processing " + str(outcome))

    bg = bg_source.copy()
    # Outcome values as floats, NaN where the label is unknown. Rows of pr
    # and bg are matched by position in every stage below.
    y = np.asarray(pr[outcome].values, dtype="float")

    # figure out distribution for columns, then normalize the continuous ones
    bi_cols, norm_cols = _split_binary_columns(bg)
    _standardize_columns(bg, norm_cols)

    if not skip_vector[0]:
        # correlation feature selection, only continuous vs continuous
        # implemented now: discrete outcomes and binary-like columns are
        # left untouched
        print("running correlation based feature selection")
        col_names_to_remove = []
        if outcome not in disout: # dependent variable continuous
            col_names_to_remove = _uncorrelated_columns(
                bg, y, norm_cols, p_val_threshold)

        print('{}/{} features kept'.format(bg.shape[1] - len(col_names_to_remove),
                                           bg.shape[1]))
        bg = bg.drop(col_names_to_remove, axis=1)

    if not skip_vector[1]:
        # tree classifier/regression importance selection
        print("running tree classification/regression importance based feature selection")
        bs = bg.shape
        # thold is passed through unchanged, so a threshold derived for one
        # outcome is never reused for the next one.
        kept_positions = _important_column_positions(
            bg, y, outcome in conout, thold, keep)
        bg = bg.iloc[:, kept_positions]

        print('{}/{} features kept'.format(bg.shape[1],bs[1]))

    if not skip_vector[2]:
        # Variance feature selection
        print("running variance based feature selection (remove columns with small variance)")
        col_names_to_remove = _low_variance_columns(
            bg, bi_cols, bi_var_threshold, normal_var_threshold)

        print('{}/{} features kept\n'.format(bg.shape[1] - len(col_names_to_remove),
                                           bg.shape[1]))
        bg = bg.drop(col_names_to_remove, axis=1)

    bg.to_csv(output_dir+'/' + outcome + '_fselected_bg.csv')