## Basic methods plus correlation pipeline with Feature-engine

In this notebook, we apply basic methods to remove constant, quasi-constant and duplicated features, and then remove correlated features, all in a single step, by chaining Feature-engine selectors inside a Scikit-learn Pipeline.

In [1]:
# import libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
    SmartCorrelatedSelection 
)

In [2]:
# load data

# Forward slashes are used on purpose: the previous Windows-style literal
# contained '\p' and '\d', which are invalid escape sequences in a normal
# string (they trigger a warning on modern Python) and the backslash
# separators only resolved on Windows.
data = pd.read_csv('../precleaned-datasets/dataset_1.csv')
data.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
0,0,0,0.0,0.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
1,0,0,0.0,3.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
2,0,0,0.0,5.88,0.0,0,0,0,0,0,...,0.0,0,0,3,0,0,0,0.0,67772.7216,0
3,0,0,0.0,14.1,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
4,0,0,0.0,5.76,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0


In [3]:
# summarise the loaded frame: number of rows/columns, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 301 entries, var_1 to target
dtypes: float64(127), int64(174)
memory usage: 114.8 MB


In [4]:
# hold out 30% of the rows as a test set (fixed seed for reproducibility)
predictors = data.drop(columns=['target'])
target = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0,
)

# sanity check: both partitions keep all predictor columns
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [5]:
# assemble the feature-selection pipeline

# tol=0.98 also treats quasi-constant features as constant
# (see the Feature-engine docs for the exact threshold semantics)
constant_filter = DropConstantFeatures(tol=0.98)

# removes features that are exact copies of another feature
duplicate_filter = DropDuplicateFeatures()

# 'variance' picks which feature of each correlated group to keep by its
# variance; correlation method and threshold are left at the library defaults
correlation_filter = SmartCorrelatedSelection(selection_method='variance')

pipe = Pipeline(steps=[
    ('constant', constant_filter),
    ('duplication', duplicate_filter),
    ('correlation', correlation_filter),
])

# the selectors are learned from the train set only
pipe.fit(X_train)


Pipeline(steps=[('constant',
                 DropConstantFeatures(tol=0.98,
                                      variables=['var_1', 'var_2', 'var_3',
                                                 'var_4', 'var_5', 'var_6',
                                                 'var_7', 'var_8', 'var_9',
                                                 'var_10', 'var_11', 'var_12',
                                                 'var_13', 'var_14', 'var_15',
                                                 'var_16', 'var_17', 'var_18',
                                                 'var_19', 'var_20', 'var_21',
                                                 'var_22', 'var_23', 'var_24',
                                                 'var_25', 'var_26', 'var_27',
                                                 'var_28', 'var_29', 'var_30', ...])),
                ('duplication...
                ('correlation',
                 SmartCorrelatedSelection(selection_method='variance'

In [6]:
# inspect the fitted transformers, keyed by the step names given to the Pipeline
pipe.named_steps

{'constant': DropConstantFeatures(tol=0.98,
                      variables=['var_1', 'var_2', 'var_3', 'var_4', 'var_5',
                                 'var_6', 'var_7', 'var_8', 'var_9', 'var_10',
                                 'var_11', 'var_12', 'var_13', 'var_14',
                                 'var_15', 'var_16', 'var_17', 'var_18',
                                 'var_19', 'var_20', 'var_21', 'var_22',
                                 'var_23', 'var_24', 'var_25', 'var_26',
                                 'var_27', 'var_28', 'var_29', 'var_30', ...]),
 'duplication': DropDuplicateFeatures(variables=['var_4', 'var_15', 'var_17', 'var_18',
                                  'var_21', 'var_29', 'var_30', 'var_31',
                                  'var_35', 'var_37', 'var_38', 'var_41',
                                  'var_46', 'var_47', 'var_49', 'var_50',
                                  'var_52', 'var_55', 'var_57', 'var_58',
                                  'var_62',

In [7]:
# how many features the constant / quasi-constant step marked for removal
constant_step = pipe.named_steps['constant']
len(constant_step.features_to_drop_)

186

In [8]:
# how many features the duplication step marked for removal
duplication_step = pipe.named_steps['duplication']
len(duplication_step.features_to_drop_)

4

In [9]:
# how many features the correlation step marked for removal
correlation_step = pipe.named_steps['correlation']
len(correlation_step.features_to_drop_)

59

In [10]:
# apply the fitted selectors to both partitions
# NOTE: X_train / X_test are rebound here, so re-running this cell would feed
# the already-reduced data back into the pipeline; re-create the split first.
X_train, X_test = pipe.transform(X_train), pipe.transform(X_test)

# both partitions must end up with the same number of columns
X_train.shape, X_test.shape

((35000, 51), (15000, 51))

In [13]:
# helper: train a logistic regression and report ROC AUC on train and test

def logistic_reg(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and print its ROC AUC on both partitions.

    The model is fitted on (X_train, y_train) with a fixed random_state and
    max_iter=500. Scores are computed from the second column of
    predict_proba and printed; nothing is returned.

    Parameters
    ----------
    X_train, X_test : feature matrices passed to fit / predict_proba.
    y_train, y_test : target values matching the rows of X_train / X_test.
    """
    classifier = LogisticRegression(random_state=44, max_iter=500)
    classifier.fit(X_train, y_train)

    # column 1 of predict_proba holds the probability of the second class
    train_proba = classifier.predict_proba(X_train)[:, 1]
    test_proba = classifier.predict_proba(X_test)[:, 1]

    train_auc = roc_auc_score(y_train, train_proba)
    print('ROC AUC for train : {}'.format(train_auc))

    test_auc = roc_auc_score(y_test, test_proba)
    print('ROC AUC for test : {}'.format(test_auc))

In [15]:
# standardise the selected features; the scaler is fitted on the train set
# only and then applied unchanged to the test set
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [16]:
# evaluate the reduced, standardised feature set with the helper defined above
logistic_reg(X_train, X_test, y_train, y_test)

ROC AUC for train : 0.7895647194833147
ROC AUC for test : 0.7883434358848281
