# Feature Selection

In this notebook (short compared to the Feature Engineering one), I'll continue the data preparation process. Here I'll try to "filter down" the bloated dataset that we got at the end of the previous notebook. Various techniques will be applied to select the features that are most useful for prediction.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyod.models.mad import MAD
from scipy.stats import normaltest
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 150)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
sns.set()
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
import gc

In [2]:
def custom_lgbm_cv(features, target, col_tran, k=5):
    """Cross-validate a LightGBM binary classifier and average its feature importances.

    For each of ``k`` stratified folds, ``col_tran`` is fitted on the training
    part and applied to the held-out part, a booster is trained with early
    stopping, and the train/test AUC at the best iteration is recorded.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    target : pandas.Series
        Binary target (0 = negative, 1 = positive), positionally aligned with
        ``features``.
    col_tran : transformer
        Object exposing ``fit_transform``, ``transform`` and
        ``get_feature_names_out`` (e.g. a ``ColumnTransformer``). It is refitted
        on every fold, so it is left fitted on the last fold's training part.
    k : int, default 5
        Number of stratified folds.

    Returns
    -------
    metric_df : pandas.DataFrame
        Columns ``'Train AUC'`` and ``'Test AUC'``; one row per fold plus an
        ``'Avg'`` row holding the column means.
    feat_importances_df : pandas.DataFrame
        Columns ``'feature'`` and ``'importance'`` (gain importance averaged
        over the folds).

    Notes
    -----
    * The held-out fold is used both for early stopping and for the reported
      test AUC, so that score is somewhat optimistic.
    * Feature names come from the transformer as fitted on the last fold; this
      assumes every fold yields the same transformed columns.
    """
    metric_df=pd.DataFrame(columns=['Train AUC', 'Test AUC'])
    feat_importances=[]
    kfold=StratifiedKFold(k)

    # Fold-independent hyper-parameters; only the class weight changes per fold.
    # NOTE(review): per the LightGBM parameter docs, 'subsample' (bagging_fraction)
    # only takes effect when 'subsample_freq' (bagging_freq) is > 0, so as written
    # no row subsampling happens. Left unchanged to keep results comparable.
    base_params={'objective': 'binary',
                 'metric': 'auc',
                 'learning_rate': 0.05,
                 'reg_alpha': 0.1,
                 'reg_lambda': 0.1,
                 'subsample': 0.8,
                 'n_jobs': -1,
                 'random_state': 5,
                 'verbose': -1}

    for f, (tr, te) in enumerate(kfold.split(features, y=target)):
        X_train, y_train=features.iloc[tr, :], target.iloc[tr]
        X_test, y_test=features.iloc[te, :], target.iloc[te]

        # Fit the transformer on the training part only, then apply it to the held-out part.
        X_train_tr=col_tran.fit_transform(X_train)
        X_test_tr=col_tran.transform(X_test)

        # Negative-to-positive ratio of this fold, used to up-weight the positive class.
        weight=np.count_nonzero(y_train==0)/np.count_nonzero(y_train==1)
        params=dict(base_params, scale_pos_weight=weight)

        dtrain=lgb.Dataset(X_train_tr, label=y_train)
        dval=lgb.Dataset(X_test_tr, label=y_test)

        # * `num_boost_round` is passed as an argument instead of through `params`
        #   (same value, avoids LightGBM's "found in params" warning).
        # * `verbose_eval` is not passed: it was deprecated in LightGBM 3.3 and removed
        #   in 4.0. With `callbacks` supplied, 3.3+ does not log per-iteration metrics.
        # * `early_stopping(verbose=...)` expects a bool. The previous `-1` was truthy,
        #   so `True` keeps the same early-stopping messages.
        model=lgb.train(
                params=params,
                train_set=dtrain,
                num_boost_round=10000,
                valid_sets=[dtrain, dval],
                valid_names=['train', 'test'],
                callbacks=[lgb.early_stopping(100, verbose=True)])

        test_score, train_score=model.best_score['test']['auc'], model.best_score['train']['auc']
        metric_df.loc[f]=[train_score, test_score]

        feat_importances.append(model.feature_importance(importance_type='gain'))

    # Average the per-fold gain importances feature by feature.
    feat_importances=np.array(feat_importances).mean(axis=0)
    feat_importances_df=pd.DataFrame({'feature': col_tran.get_feature_names_out(),
                                        'importance': feat_importances})
    metric_df.loc['Avg']=[metric_df['Train AUC'].mean(), metric_df['Test AUC'].mean()]
    return metric_df, feat_importances_df

In [14]:
def custom_lgbm_cv_small(features, target, k=5):
    """Cross-validate a LightGBM binary classifier on the raw feature frame.

    Same procedure as ``custom_lgbm_cv`` but without a column transformer: each
    fold's rows are handed to ``lgb.Dataset`` as they are.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc`` and
        its column names label the returned importances.
    target : pandas.Series
        Binary target (0 = negative, 1 = positive), positionally aligned with
        ``features``.
    k : int, default 5
        Number of stratified folds.

    Returns
    -------
    metric_df : pandas.DataFrame
        Columns ``'Train AUC'`` and ``'Test AUC'``; one row per fold plus an
        ``'Avg'`` row holding the column means.
    feat_importances_df : pandas.DataFrame
        Columns ``'feature'`` and ``'importance'`` (gain importance averaged
        over the folds).

    Notes
    -----
    The held-out fold is used both for early stopping and for the reported test
    AUC, so that score is somewhat optimistic.
    """
    metric_df=pd.DataFrame(columns=['Train AUC', 'Test AUC'])
    feat_importances=[]
    kfold=StratifiedKFold(k)

    # Fold-independent hyper-parameters; only the class weight changes per fold.
    # NOTE(review): per the LightGBM parameter docs, 'subsample' (bagging_fraction)
    # only takes effect when 'subsample_freq' (bagging_freq) is > 0, so as written
    # no row subsampling happens. Left unchanged to keep results comparable.
    base_params={'objective': 'binary',
                 'metric': 'auc',
                 'learning_rate': 0.05,
                 'reg_alpha': 0.1,
                 'reg_lambda': 0.1,
                 'subsample': 0.8,
                 'n_jobs': -1,
                 'random_state': 5,
                 'verbose': -1}

    for f, (tr, te) in enumerate(kfold.split(features, y=target)):
        X_train, y_train=features.iloc[tr, :], target.iloc[tr]
        X_test, y_test=features.iloc[te, :], target.iloc[te]

        # Negative-to-positive ratio of this fold, used to up-weight the positive class.
        weight=np.count_nonzero(y_train==0)/np.count_nonzero(y_train==1)
        params=dict(base_params, scale_pos_weight=weight)

        dtrain=lgb.Dataset(X_train, label=y_train)
        dval=lgb.Dataset(X_test, label=y_test)

        # * `num_boost_round` is passed as an argument instead of through `params`
        #   (same value, avoids LightGBM's "found in params" warning).
        # * `verbose_eval` is not passed: it was deprecated in LightGBM 3.3 and removed
        #   in 4.0. With `callbacks` supplied, 3.3+ does not log per-iteration metrics.
        # * `early_stopping(verbose=...)` expects a bool. The previous `-1` was truthy,
        #   so `True` keeps the same early-stopping messages.
        model=lgb.train(
                params=params,
                train_set=dtrain,
                num_boost_round=10000,
                valid_sets=[dtrain, dval],
                valid_names=['train', 'test'],
                callbacks=[lgb.early_stopping(100, verbose=True)])

        test_score, train_score=model.best_score['test']['auc'], model.best_score['train']['auc']
        metric_df.loc[f]=[train_score, test_score]

        feat_importances.append(model.feature_importance(importance_type='gain'))

    # Average the per-fold gain importances feature by feature.
    feat_importances=np.array(feat_importances).mean(axis=0)
    feat_importances_df=pd.DataFrame({'feature': features.columns,
                                        'importance': feat_importances})
    metric_df.loc['Avg']=[metric_df['Train AUC'].mean(), metric_df['Test AUC'].mean()]
    return metric_df, feat_importances_df

In [15]:
import sys

def return_size(df):
    """Size of ``df`` in gigabytes (1e9 bytes), rounded to two decimals.

    The byte count is whatever ``sys.getsizeof`` reports for the object.
    """
    n_bytes = sys.getsizeof(df)
    gigabytes = n_bytes / 1e9
    return round(gigabytes, 2)

def convert_types(df, print_info = False):
    """Shrink a DataFrame's memory footprint by converting column dtypes.

    Columns are processed one by one and the first matching rule wins:

    1. name contains ``'SK_ID'``      -> missing values filled with 0, cast to int32
    2. object dtype, repeated values  -> ``category``
    3. distinct values exactly [1, 0] -> ``bool``
    4. float64                        -> float32 (lossy: reduced precision)
    5. int64                          -> int32, only when every value fits in int32

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    print_info : bool, default False
        When true, print the (shallow) memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    original_memory = df.memory_usage().sum()
    int32_limits = np.iinfo(np.int32)

    # Iterate through each column
    for c in df:
        col = df[c]

        # Identifier columns: missing ids become 0, then store as 32-bit integers
        if ('SK_ID' in c):
            df[c] = col.fillna(0).astype(np.int32)

        # Convert objects to category (skipped when every value is distinct)
        elif (col.dtype == 'object') and (col.nunique() < df.shape[0]):
            df[c] = col.astype('category')

        # 0/1 flags stored as numbers are mapped to booleans.
        # NOTE(review): this comparison is order-sensitive. It only matches when the
        # first distinct value encountered is 1 and the second is 0, so a flag column
        # that starts with 0 is not converted. Kept as is because changing it would
        # alter which columns downstream dtype-based selection picks up.
        elif list(col.unique()) == [1, 0]:
            df[c] = col.astype(bool)

        # Float64 to float32
        elif col.dtype == np.float64:
            df[c] = col.astype(np.float32)

        # Int64 to int32. The width is spelled out (np.int64) because `dtype == int`
        # depends on the platform's default integer. The cast is skipped when a value
        # falls outside the int32 range, since it would silently wrap around.
        elif col.dtype == np.int64:
            if col.empty or (int32_limits.min <= col.min() and col.max() <= int32_limits.max):
                df[c] = col.astype(np.int32)

    new_memory = df.memory_usage().sum()

    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df

In [5]:
# Read the engineered training set and compact its dtypes right away,
# printing the memory usage before and after the conversion.
train=convert_types(pd.read_csv('data/train_previous_raw.csv'), print_info=True)
train.shape

Original Memory Usage: 2.75 gb.
New Memory Usage: 1.41 gb.


(307511, 1125)

## Correlation

One of the simplest methods of feature selection is to remove highly correlated features. The presence of such features can negatively affect a model's ability to learn and generalize, so we should remove them. The threshold is a judgment call, but it's usually set to 0.8-0.9.

In [4]:
# corr_mat=train.corr().abs()

In [5]:
# upper=corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(bool))
# upper.head()

In [6]:
# upper.to_csv('data/corr_mat.csv')

In [6]:
# Load the cached upper-triangular matrix of absolute correlations produced by the
# commented-out cells above. `.iloc[:, 1:]` discards the first CSV column
# (presumably the row labels written by `to_csv` -- verify against the file).
upper=pd.read_csv('data/corr_mat.csv').iloc[:, 1:]

I chose 90% collinearity as the threshold; you can try other values.

In [7]:
# Flag every column that has at least one correlation above 0.9 in the cached matrix.
too_correlated=(upper > 0.9).any()
to_drop=upper.columns[too_correlated].tolist()
print(f'{len(to_drop)} will be removed')

533 will be removed


In [8]:
# Remove the columns flagged as highly correlated.
train=train.drop(columns=to_drop)

In [9]:
# Shape of the training frame after dropping the columns in `to_drop`.
train.shape

(307511, 592)

In [16]:
import re
# Keep only letters, digits and underscores in the column names
# (presumably because LightGBM rejects feature names with special characters -- confirm).
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X, y=train.drop('TARGET', axis=1), train['TARGET']
# Not used by custom_lgbm_cv_small; kept so the globals stay defined as before.
cat_cols, num_cols=X.select_dtypes(include=['category', 'object']).columns, X.select_dtypes('number').columns

# Baseline CV on the untransformed frame after the correlation filter.
res, importances=custom_lgbm_cv_small(X, y)
res

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[230]	train's auc: 0.840007	test's auc: 0.776108
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[317]	train's auc: 0.856136	test's auc: 0.778147
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[248]	train's auc: 0.844401	test's auc: 0.771501
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[267]	train's auc: 0.847285	test's auc: 0.777417
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[434]	train's auc: 0.874982	test's auc: 0.777167


Unnamed: 0,Train AUC,Test AUC
0,0.84,0.7761
1,0.8561,0.7781
2,0.8444,0.7715
3,0.8473,0.7774
4,0.875,0.7772
Avg,0.8526,0.7761


In [18]:
X, y=train.drop('TARGET', axis=1), train['TARGET']
# NOTE(review): select_dtypes('number') does not include bool columns, and the
# ColumnTransformer below drops every column it is not given (remainder defaults
# to 'drop'). Any column stored as bool is therefore left out -- confirm this is intended.
cat_cols, num_cols=X.select_dtypes(include=['category', 'object']).columns, X.select_dtypes('number').columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    ohe=OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe=OneHotEncoder(sparse=False, handle_unknown='ignore')
col_tr=ColumnTransformer([
    ('cat', ohe, cat_cols),
    ('num', 'passthrough', num_cols)
])

res, importances=custom_lgbm_cv(X, y, col_tr)
res

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[314]	train's auc: 0.847446	test's auc: 0.778629
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[267]	train's auc: 0.839317	test's auc: 0.778186
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[317]	train's auc: 0.849113	test's auc: 0.773067
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[315]	train's auc: 0.848085	test's auc: 0.778294
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[438]	train's auc: 0.866888	test's auc: 0.775472


Unnamed: 0,Train AUC,Test AUC
0,0.8474,0.7786
1,0.8393,0.7782
2,0.8491,0.7731
3,0.8481,0.7783
4,0.8669,0.7755
Avg,0.8502,0.7767


## "Empty" features

Now I'll remove mostly empty features, i.e., features in which 70-90% of the values are NaN. I'll re-use the `miss_table` function from the EDA notebook.

In [15]:
def miss_table(data):
    """Summarise the columns of ``data`` that contain missing values.

    Prints how many columns have at least one missing value and how those
    columns are distributed by dtype, then returns a frame indexed by column
    name with ``'Count'`` (number of NaNs), ``'Percent'`` (share of rows, in %)
    and ``'Dtype'``, ordered from most to fewest missing values.
    """
    report = (
        data.isna().sum().to_frame(name='Count')
        .assign(
            Percent=lambda t: t['Count'] / len(data) * 100,
            Dtype=lambda t: data.dtypes[t.index],
            # Zero counts become NaN so that dropna() below discards complete columns.
            Count=lambda t: t['Count'].replace({0: np.nan}),
        )
        .dropna()
    )

    print(f"There are {len(report)}/{data.shape[1]} columns with missing values")
    print('Distribution by dtypes:')
    print(report['Dtype'].value_counts())

    return report.sort_values(by='Count', ascending=False)

# Summarise missing values in the current training frame; the returned table is
# sorted by missing count, so head() shows the emptiest columns.
mt=miss_table(train)
mt.head()

There are 539/592 columns with missing values
Distribution by dtypes:
float64    533
object       6
Name: Dtype, dtype: int64


Unnamed: 0,Count,Percent,Dtype
client_credit_AMT_PAYMENT_CURRENT_min_min,246451.0,80.1438,float64
client_credit_CNT_DRAWINGS_OTHER_CURRENT_max_min,246371.0,80.1178,float64
client_credit_AMT_DRAWINGS_POS_CURRENT_min_min,246371.0,80.1178,float64
client_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_min,246371.0,80.1178,float64
client_credit_CNT_DRAWINGS_ATM_CURRENT_max_min,246371.0,80.1178,float64


In [22]:
# Columns in which more than 75% of the values are missing.
mostly_empty=mt['Percent']>75
to_drop=mt.index[mostly_empty]
print(f'{len(to_drop)} features with 75% of NaNs will be removed')

15 features with 75% of NaNs will be removed


In [23]:
# Discard the mostly-empty columns and check the resulting shape.
train=train.drop(columns=to_drop)
train.shape

(307511, 577)

## Feature Importance

Feature importance that we get from tree-based models can be used for selecting features too. E.g., one can remove features with 0 importance. 

In [17]:
# https://towardsdatascience.com/boruta-explained-the-way-i-wish-someone-explained-it-to-me-4489d70e154a
# https://www.signifytechnology.com/blog/2018/07/a-feature-selection-tool-for-machine-learning-in-python-by-william-koehrsen?source=google.com

In [27]:
X, y=train.drop('TARGET', axis=1), train['TARGET']
# NOTE(review): select_dtypes('number') does not include bool columns, and the
# ColumnTransformer below drops every column it is not given (remainder defaults
# to 'drop'). Any column stored as bool is therefore left out -- confirm this is intended.
cat_cols, num_cols=X.select_dtypes(include=['category', 'object']).columns, X.select_dtypes('number').columns

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
# Try the current name first and fall back for older installations.
try:
    ohe=OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe=OneHotEncoder(sparse=False, handle_unknown='ignore')
col_tr=ColumnTransformer([
    ('cat', ohe, cat_cols),
    ('num', 'passthrough', num_cols)
])

# Re-run the CV after removing the mostly-empty columns.
res, importances=custom_lgbm_cv(X, y, col_tr)
res

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[291]	train's auc: 0.843049	test's auc: 0.778366
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[208]	train's auc: 0.828317	test's auc: 0.778082
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[285]	train's auc: 0.84419	test's auc: 0.773263
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[254]	train's auc: 0.837489	test's auc: 0.778917
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[387]	train's auc: 0.859407	test's auc: 0.776866


Unnamed: 0,Train AUC,Test AUC
0,0.843,0.7784
1,0.8283,0.7781
2,0.8442,0.7733
3,0.8375,0.7789
4,0.8594,0.7769
Avg,0.8425,0.7771


In [29]:
# Keep an independent copy of the importance table returned by the latest CV run.
importances_reserve=importances.copy()