https://www.kaggle.com/sz8416/6-ways-for-feature-selection

In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings("ignore")


In [2]:
import os
import configparser

# Load external config file
config = configparser.ConfigParser()
config.read("../resources/config.ini")

PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "02_baseline_models" # for Telegram messages

# Set global randome state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 16


In [3]:
application_train = pd.read_pickle(PATH_DATA_INT + "train-opt.pkl")

In [4]:
application_train.shape

(1000000, 287)

In [5]:
# Reducing the samples and working with a fractions of it
application_sample1 = application_train.loc[application_train.target==1].sample(frac=0.2, replace=False)
print('label 1 sample size:', str(application_sample1.shape[0]))

application_sample0 = application_train.loc[application_train.target==0].sample(frac=0.2, replace=False)
print('label 0 sample size:', str(application_sample0.shape[0]))

application = pd.concat([application_sample1, application_sample0], axis=0).sort_values('id')

label 1 sample size: 100097
label 0 sample size: 99903


In [6]:
application.shape

(200000, 287)

In [7]:
X = application.drop(['id', 'target'], axis=1)
y = application.target
feature_name = X.columns.tolist()

# Feature Selection

## Pearson Correlation

In [8]:
def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)

    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]

    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()

    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [9]:
cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

100 selected features


## Chi-2

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)

SelectKBest(k=100, score_func=<function chi2 at 0x0000026180AE6550>)

In [11]:
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features


## Wrapper

In [12]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

Fitting estimator with 285 features.
Fitting estimator with 275 features.
Fitting estimator with 265 features.
Fitting estimator with 255 features.
Fitting estimator with 245 features.
Fitting estimator with 235 features.
Fitting estimator with 225 features.
Fitting estimator with 215 features.
Fitting estimator with 205 features.
Fitting estimator with 195 features.
Fitting estimator with 185 features.
Fitting estimator with 175 features.
Fitting estimator with 165 features.
Fitting estimator with 155 features.
Fitting estimator with 145 features.
Fitting estimator with 135 features.
Fitting estimator with 125 features.
Fitting estimator with 115 features.
Fitting estimator with 105 features.


RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10,
    verbose=5)

In [13]:
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features


## Embeded

### Logistic Regression

In [14]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(estimator=LogisticRegression(penalty="l2"), threshold='1.25*median')
embeded_lr_selector.fit(X_norm, y)


SelectFromModel(estimator=LogisticRegression(), threshold='1.25*median')

In [15]:
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

129 selected features


### Random Forest

In [16]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(), threshold='1.25*median')

In [17]:
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

13 selected features


### LightGBM

In [18]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                    reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(colsample_bytree=0.2,
                                         learning_rate=0.05,
                                         min_child_weight=40,
                                         min_split_gain=0.01, n_estimators=500,
                                         num_leaves=32, reg_alpha=3,
                                         reg_lambda=1),
                threshold='1.25*median')

In [19]:
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

120 selected features


# Summary

In [20]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                     'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(100)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,f92,True,True,True,True,True,True,6
2,f8,True,True,True,True,True,True,6
3,f78,True,True,True,True,True,True,6
4,f77,True,True,True,True,True,True,6
5,f69,True,True,True,True,True,True,6
6,f58,True,True,True,True,True,True,6
7,f3,True,True,True,True,True,True,6
8,f22,True,True,True,True,True,True,6
9,f214,True,True,True,True,True,True,6
10,f179,True,True,True,True,True,True,6


In [21]:
feature_selection_df['Feature'].to_csv('features_selected_6way_all.csv', index=False)

In [24]:
feature_selection_df['Feature'].head(47).to_csv('features_selected_6way_47.csv', index=False)

In [25]:
#feature_selection_df['Feature'].head(47).tolist()

['f92',
 'f8',
 'f78',
 'f77',
 'f69',
 'f58',
 'f3',
 'f22',
 'f214',
 'f179',
 'f156',
 'f136',
 'f12',
 'f95',
 'f90',
 'f73',
 'f72',
 'f63',
 'f56',
 'f52',
 'f48',
 'f43',
 'f4',
 'f247',
 'f241',
 'f227',
 'f213',
 'f211',
 'f201',
 'f200',
 'f2',
 'f199',
 'f195',
 'f192',
 'f191',
 'f174',
 'f17',
 'f169',
 'f163',
 'f162',
 'f150',
 'f134',
 'f127',
 'f125',
 'f112',
 'f103',
 'f1',
 'f99',
 'f98',
 'f96']