This feature selection notebook does a filter followed by a wrapper for a binary dependent variable (binary classification). It's capable of doing the filter on more than one file. The variable files are called vars1.csv, vars2.csv ... Or you can make the input file name(s) anything you want.

The filter runs separately on each vars file and keeps the top num_filter variables from each file. If there is more than one vars file, we then select the top num_filter variables again across all the vars.csv files.

If balance = 0 the entire files are used. If balance != 0 then balance is the NUMBER OF GOODS RETAINED PER BAD (the goods-to-bads ratio) for the rest of the feature selection. We keep all of the rare class (bads) and downsample the goods to at most balance times the number of bads. I think in general it's better to keep balance = 0.

I've got an annoying warning message from the wrapper and I can't figure out how to get rid of it. If anybody figures this out please send a message to stevecoggeshall@gmail.com

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sps
import matplotlib.pyplot as plt
import datetime as dt
import gc
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from lightgbm import LGBMClassifier
%matplotlib inline
start_time = dt.datetime.now()

In [2]:
# set some parameters
# Number of input files the filter cell loops over.
num_files = 1
# I recommend setting num_filter to about 10 to 20% of the original number of variables.
# It is how many top-ranked variables the filter keeps.
num_filter = 250
# I recommend setting num_wrapper to about 50, then looking for a saturation of the model performance as variables are added.
# Then you can run it again with num_wrapper just a bit above this saturation point, not more than about twice this saturation number.
num_wrapper = 30
# 0 keeps every record; otherwise the goods are downsampled to at most balance * (number of bads).
balance = 0
# Default cutoff of the FDR score: the top-scoring fraction of records that is examined.
detect_rate = .03
# Column of the input file that becomes the record index.
index_name = 'Recnum'
# Column holding the binary dependent variable.
y_name = 'Fraud'
# Values of y_name that identify the two classes.
good_label = 0
bad_label = 1

## Run a filter on all the files

In [3]:
%%time
filter_score_df_list = []
for i in range(num_files):
#     file_name = "vars"+str(i+1)+'.csv'
    file_name = 'card_vars no BL.csv'
    df = pd.read_csv(file_name)
    df.drop(columns=['Unnamed: 0'], inplace=True)
    print(df.shape)
    print("********** working on",file_name,"size is",df.shape)
    df = df.set_index(index_name) 
    df = df[df.index <= 83969] # remove the last two months as the out-of-time data (OOT)
    df = df[df.index >= 3338] # remove the first 2 weeks of records since their variables aren't well formed
    print(df.shape)
    df['RANDOM'] = np.random.ranf(len(df)) # add a random number variable to make sure it doesn't come up as important
    goods = df[df[y_name] == good_label]
    bads = df[df[y_name] == bad_label]
    del df # don't need this file anymore
    num_goods = len(goods)
    num_bads = len(bads)
    num_vars = len(bads.columns)-2
    if(balance != 0):
        if(i == 0):
            num_goods_desired = int(min(num_goods,num_bads*balance))
            goods = goods.sample(n=num_goods_desired,random_state=1)
            goods_keep = list(goods.index)
            goods_keep.sort()
    
        if(i > 0):
            goods = goods.loc[goods_keep] 
            
    df_sampled = pd.concat([goods,bads])
    df_sampled.sort_index(inplace=True)
    filter_score = pd.DataFrame(np.zeros((num_vars+1,2)))
    filter_score.columns = ['variable','filter score']   
    j = 0
    for column in df_sampled:
        filter_score.loc[j,'variable'] = column
        filter_score.loc[j,'filter score'] = sps.ks_2samp(goods[column],bads[column])[0]
        j = j+1
        if j%100 == 0:
            print(j)

    filter_score.sort_values(by=['filter score'], ascending=False, inplace=True)
    vars_keep = list(filter_score['variable'][1:num_filter+1]) 
    print(file_name,filter_score.head(20))
    if(i == 0): # if first time through need to initialize some stuff
        Y = pd.DataFrame(df_sampled[y_name], index=df_sampled.index)
        df_top = df_sampled.filter(vars_keep, axis=1)
            
    if(i > 0): # if more than one variable file we use this loop
        data_new_top = df_sampled.filter(vars_keep, axis=1)
        df_top = pd.concat([df_top,data_new_top], axis=1)

    filter_score_df_list.append(filter_score)
    
    del goods # delete these before starting the next file, if any
    del bads
    gc.collect()
filter_score = pd.concat(filter_score_df_list)

(96397, 1528)
********** working on card_vars no BL.csv size is (96397, 1528)
(80309, 1527)
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
card_vars no BL.csv                       variable  filter score
8                        Fraud      1.000000
151  Merch description_total_3      0.625512
536        merch_state_total_3      0.614292
316         card_merch_total_3      0.614279
96            Merchnum_total_3      0.614279
142  Merch description_total_1      0.611944
131    Merch description_max_0      0.609286
481          merch_zip_total_3      0.609191
516          merch_state_max_0      0.607813
76              Merchnum_max_0      0.607800
296           card_merch_max_0      0.607800
527        merch_state_total_1      0.607366
307         card_merch_total_1      0.607329
87            Merchnum_total_1      0.607329
472          merch_zip_total_1      0.605188
461            merch_zip_max_0      0.603673
371           card_zip_total_3      0.602817
261         

In [4]:
# Rank every scored variable from the highest to the lowest filter score and
# renumber the rows 0..n-1 in that order.
filter_score = filter_score.sort_values(by='filter score', ascending=False).reset_index(drop=True)

In [5]:
# Display the first 30 rows of the score table (highest filter scores after the sort above).
filter_score.head(30)

Unnamed: 0,variable,filter score
0,Fraud,1.0
1,Merch description_total_3,0.625512
2,merch_state_total_3,0.614292
3,Merchnum_total_3,0.614279
4,card_merch_total_3,0.614279
5,Merch description_total_1,0.611944
6,Merch description_max_0,0.609286
7,merch_zip_total_3,0.609191
8,merch_state_max_0,0.607813
9,Merchnum_max_0,0.6078


In [6]:
# Display the last 10 rows of the score table (lowest filter scores after the sort above).
filter_score.tail(10)

Unnamed: 0,variable,filter score
1518,Merch state_unique_count_for_card_state_3,0.0
1519,Merch state_unique_count_for_card_state_7,0.0
1520,Merch state_unique_count_for_card_state_14,0.0
1521,Merch state_unique_count_for_card_state_30,0.0
1522,Merch zip_card_zip_nunique,0.0
1523,merch_state_unique_count_for_card_state_7,0.0
1524,merch_state_unique_count_for_Merchnum_60,0.0
1525,merch_state_unique_count_for_card_state_30,0.0
1526,card_zip_unique_count_for_Merch zip_7,0.0
1527,merch_zip_unique_count_for_card_zip_14,0.0


In [7]:
# (rows, columns) of the combined score table.
filter_score.shape

(1528, 2)

In [8]:
filter_score.head(80).to_csv('filter_top.csv')
# Final filter selection: walk the global ranking from best to worst and keep the first
# num_filter variables that (a) are not the dependent variable and (b) actually survived
# the per-file filter, i.e. exist as columns of df_top. Selecting by membership instead of
# by a padded positional slice guarantees that vars_keep, vars_keep_filter.csv and df_keep
# all describe exactly the same set of variables.
ranked_vars = filter_score.loc[filter_score['variable'] != y_name, 'variable']
available_vars = set(df_top.columns)
# dict.fromkeys removes repeated names (possible with several input files) while preserving rank order.
vars_keep = [name for name in dict.fromkeys(ranked_vars) if name in available_vars][:num_filter]
print(i,' vars_keep:',vars_keep)

0  vars_keep: ['Merch description_total_3', 'merch_state_total_3', 'Merchnum_total_3', 'card_merch_total_3', 'Merch description_total_1', 'Merch description_max_0', 'merch_zip_total_3', 'merch_state_max_0', 'Merchnum_max_0', 'card_merch_max_0', 'merch_state_total_1', 'card_merch_total_1', 'Merchnum_total_1', 'merch_zip_total_1', 'merch_zip_max_0', 'card_zip_total_3', 'Merch zip_total_3', 'Cardnum_total_3', 'Cardnum_total_7', 'card_zip_total_1', 'Merch zip_total_1', 'Merch description_max_1', 'Merch state_actual/avg_30', 'card_state_actual/avg_30', 'card_state_variability_avg_30', 'Merch state_variability_avg_30', 'merch_state_max_1', 'card_merch_max_1', 'Merchnum_max_1', 'merch_zip_max_1', 'Merch description_max_3', 'Merch zip_max_0', 'card_zip_max_0', 'Cardnum_max_0', 'merch_state_total_7', 'Merchnum_total_7', 'card_merch_total_7', 'merch_state_avg_0', 'card_merch_avg_0', 'Merchnum_avg_0', 'Merch state_actual/med_30', 'card_state_actual/med_30', 'merch_state_total_0', 'Merchnum_total_

In [9]:
# Write the filter survivors to disk as a one-column table named 'col',
# then cut df_top down to those variables and preview the result.
vars_keep_df = pd.DataFrame(vars_keep, columns=['col'])
vars_keep_df.to_csv('vars_keep_filter.csv', index=False)
df_keep = df_top.filter(items=vars_keep)
df_keep.head()

Unnamed: 0_level_0,Merch description_total_3,merch_state_total_3,Merchnum_total_3,card_merch_total_3,Merch description_total_1,Merch description_max_0,merch_zip_total_3,merch_state_max_0,Merchnum_max_0,card_merch_max_0,...,card_merch_variability_med_30,Merch description_med_14,Merch state_variability_med_1,card_state_variability_med_1,merch_zip_med_30,Merchnum_med_30,card_merch_med_30,merch_state_med_30,merch_zip_variability_med_30,card_state_med_0
Recnum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3338,150.75,150.75,150.75,150.75,36.9,36.9,150.75,36.9,36.9,36.9,...,38.475,75.375,158.99,158.99,75.375,75.375,75.375,75.375,38.475,220.0
3339,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,590.0,...,0.0,590.0,-302.11,-302.11,590.0,590.0,590.0,590.0,0.0,344.975
3340,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,...,0.0,120.0,83.945,83.945,120.0,120.0,120.0,120.0,0.0,120.0
3341,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,...,0.0,545.0,-265.0,-265.0,545.0,545.0,545.0,545.0,0.0,290.0
3342,27442.5,27442.5,27442.5,27442.5,27442.5,2500.0,27442.5,2500.0,2500.0,2500.0,...,-1551.0,925.0,-2221.0,-2221.0,925.0,925.0,925.0,925.0,-1551.0,575.605


In [10]:
# (rows, columns) of the variable matrix that survived the filter.
df_keep.shape

(80309, 250)

In [11]:
# Preview the first rows of the dependent-variable frame built in the filter cell.
Y.head()

Unnamed: 0_level_0,Fraud
Recnum,Unnamed: 1_level_1
3338,0
3339,0
3340,0
3341,0
3342,0


In [12]:
# Replace the label DataFrame by its underlying NumPy array.
# NOTE: this cell cannot be re-run on its own, because an ndarray has no .values attribute.
Y = Y.values
# Keep an independent copy of the label array.
Y_save = Y.copy()

In [13]:
# Y = np.array(Y)
# X is the variable matrix handed to the wrapper; it is the same object as df_keep, not a copy.
X = df_keep
print(Y)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [14]:
# Elapsed wall-clock time since start_time was recorded in the import cell.
print('time to here:', dt.datetime.now() - start_time)

time to here: 0:00:48.983609


In [15]:
# Sanity check: X and Y must have the same number of rows.
print(X.shape,Y.shape)

(80309, 250) (80309, 1)


In [16]:
# Show the container types that will be passed to the wrapper.
print(type(X),type(Y))

<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>


In [17]:
# I'd like to define a scoring for the wrapper that's KS, but I haven't gotten around to this yet.
# def KSscore(classifier, x,y)
#     goods = 

In [18]:
def fdr(classifier, x, y, cutoff=detect_rate):
    """Return the fraud detection rate (FDR) of a fitted classifier.

    Scorer with the (estimator, X, y) call signature: it scores ``x`` with
    ``classifier.predict_proba`` and delegates to ``fdr_prob``.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``.
    x : variable matrix to score.
    y : true labels for ``x`` (1-D, or a single column).
    cutoff : fraction of the highest-scoring records that is examined.
    """
    return fdr_prob(y, classifier.predict_proba(x), cutoff)


def fdr_prob(y, y_prob, cutoff=detect_rate):
    """Return the share of all frauds found among the top ``cutoff`` fraction of scores.

    Parameters
    ----------
    y : true labels, 1-D or a single column; frauds are the entries equal to 1.
    y_prob : scores, either 1-D or 2-D with one column per class. For 2-D input
        only the last column is used.
    cutoff : fraction of the records, taken from the highest score downwards,
        that is examined.

    Returns
    -------
    float
        (# frauds inside the cutoff) / (# frauds in total), or 0.0 when ``y``
        contains no fraud at all, so an empty class cannot raise a
        ZeroDivisionError.
    """
    # Flatten the labels so that (n,) and (n, 1) inputs are treated alike.
    y_true = np.asarray(y).ravel()
    scores = np.asarray(y_prob)
    if scores.ndim != 1:          # sometimes the proba list contains one column per category
        scores = scores[:, -1]    # only the last one is used here
    num_fraud = np.count_nonzero(y_true == 1)   # total number of frauds
    if num_fraud == 0:
        return 0.0
    # Descending order of score; the stable sort keeps tied scores in their original order.
    order = np.argsort(-scores, kind='stable')
    num_examined = int(len(y_true) * cutoff)
    frauds_found = np.count_nonzero(y_true[order[:num_examined]] == 1)
    # FDR score (#fraud_in_cutoff / #total_fraud)
    return frauds_found / num_fraud

## Run a wrapper on the remaining top variables

In [None]:
%%time
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# If you're doing forward selection it's enough to stop at num_wrapper variables. 
# If you're doing backward selection you need to go through all the variables to get a sorted list of num_wrapper variables.

# I can't figure out how to get rid of this annoying warning! I don't know what I'm doing wrong...

nfeatures = len(X.columns)
# clf = RandomForestClassifier(n_estimators=5) # simple, fast nonlinear model for the wrapper
clf = LGBMClassifier(n_estimators=20,num_leaves=3) # simple, fast nonlinear model for the wrapper
# sfs = SFS(clf,k_features=num_wrapper,forward=True,verbose=0,scoring=fdr,cv=3,n_jobs=-1) # use for forward selection
sfs = SFS(clf,k_features=1,forward=False,verbose=0,scoring=fdr,cv=3,n_jobs=-1) # use for backward selection
sfs.fit(X,Y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [None]:
# Elapsed wall-clock time since start_time, now including the wrapper run.
print('time to here:', dt.datetime.now() - start_time)

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
# Plot the wrapper's metric dict (presumably score versus number of selected variables; confirm in the mlxtend docs).
fig1 = plot_sfs(sfs.get_metric_dict(),kind='std_dev', figsize=(15, 6))
# Tick every 5 variables on x and every 0.1 on y.
plt.xticks(np.arange(0, len(X.columns), step=5))
plt.yticks(np.arange(0,1,step=.1))
plt.ylim([0., 1])
# Only show the first num_wrapper variables, where the saturation point is expected.
plt.xlim(0,num_wrapper)
plt.title('Stepwise Selection')
plt.grid()
# Save before show(), so the figure is written while it is still the current figure.
plt.savefig('performance_nvars.png')
plt.show()

In [None]:
# Turn the selector's metric dict into a table with one row per dict key
# (presumably the number of variables in the subset; confirm against mlxtend).
vars_FS = pd.DataFrame.from_dict(sfs.get_metric_dict()).T

In [None]:
# Derive, for every subset size k, the single variable that distinguishes the k-variable
# subset from the (k-1)-variable subset, i.e. the order in which to add variables.
# Rows are sorted by subset size so that row order equals add order for both forward and
# backward selection (backward selection presumably records the subsets from the largest
# size down; confirm against mlxtend).
# Assumes the index holds every subset size from 1 to len(vars_FS), as the original loop did.
ordered_vars_FS = vars_FS.copy().sort_index()
for k in range(1, len(ordered_vars_FS) + 1):
    ordered_vars_FS.loc[k,'add variables in this order'] = int(k)
    current_names = ordered_vars_FS.loc[k,'feature_names']
    if k == 1:
        added_name = list(current_names)[0]
    else:
        previous_names = set(ordered_vars_FS.loc[k-1,'feature_names'])
        # Consecutive subsets differ by exactly one variable; take it as a plain string so
        # that no follow-up "[0]" is needed (which cut names that were already strings down
        # to their first character).
        added_name = [name for name in current_names if name not in previous_names][0]
    ordered_vars_FS.loc[k,'variable name'] = added_name

In [None]:
# Display the wrapper results together with the derived add order and variable names.
ordered_vars_FS

In [None]:
# Write the wrapper table to disk; the index is not written.
ordered_vars_FS.to_csv('Wrapper_selection_info.csv', index=False)

In [None]:
# From here on vars_keep is the Series of wrapper-selected variable names (it replaces the filter-stage list).
vars_keep = ordered_vars_FS['variable name']
# The same names as a plain Python list, used by the following cells.
vars_keep_list = ordered_vars_FS['variable name'].tolist()
vars_keep.to_csv('final_vars_list.csv',index=False)
vars_keep

In [None]:
# Index the filter scores by variable name and remove the dependent variable.
# It is dropped by name, so the result stays correct when the label is not the
# first row or occurs once per input file.
filter_score = filter_score.set_index('variable', drop=True)
filter_score = filter_score.drop(index=y_name, errors='ignore')
filter_score

In [None]:
# Start a frame that has no columns and whose index, named 'variable',
# lists the wrapper-selected variables in their selection order.
vars_keep_sorted = pd.DataFrame({'variable': vars_keep_list}).set_index('variable', drop=True)
vars_keep_sorted.head()

In [None]:
# Attach each selected variable's filter score; the inner join keeps only names present in both frames.
vars_keep_sorted = pd.concat([vars_keep_sorted,filter_score],axis=1,join='inner')

In [None]:
# Move the variable names back into an ordinary column, put a 1-based
# 'wrapper order' column in front of it, save the table and display it.
vars_keep_sorted = vars_keep_sorted.reset_index()
wrapper_rank = np.arange(1, len(vars_keep_sorted) + 1, dtype='int64')
vars_keep_sorted.insert(0, 'wrapper order', wrapper_rank)
vars_keep_sorted.to_csv('vars_keep_sorted.csv', index=False)
vars_keep_sorted

In [None]:
# Also keep the record-id and label columns in the final extract.
# NOTE: append mutates the list in place, so re-running this cell adds the two names again.
vars_keep_list.append(index_name)
vars_keep_list.append(y_name)
vars_keep_list

In [None]:
# Display the filter-score table as it stands after the cells above.
filter_score

In [None]:
%%time
# Re-read the input file in full; file_name still holds the value assigned in the filter loop.
df = pd.read_csv(file_name)
df.shape

In [None]:
# Keep only the columns named in vars_keep_list (selected variables plus record id and label).
df_keep = df.filter(vars_keep_list, axis=1)
# df_keep = df[df.index.isin(vars_keep_list)]
print(df_keep.shape)

In [None]:
# Write the final extract to disk; the index is not written.
df_keep.to_csv('vars_final.csv',index=False)

In [None]:
# Total wall-clock time of the notebook run since start_time.
print("duration: ", dt.datetime.now() - start_time)