# 4.1-Correlation-Pearson
# 4.2-Basic-methods-plus-correlation-pipeline
# 4.3-Correlation-with-Feature-engine
# 4.4-Pipeline-with-Feature-engine

In [1]:
# 4.3-Correlation-with-Feature-engine
# The DropCorrelatedFeatures class from Feature-engine does a similar job to the brute force approach that we described earlier.
# The SmartCorrelatedSelection class allows us to select one feature from each correlated group based on model performance, number of missing values, cardinality or variance.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection
import warnings
warnings.filterwarnings('ignore')



In [2]:
# load the first 50,000 rows of the dataset from the parent directory
data = pd.read_csv('../dataset_2.csv', nrows=50000)
# print the (rows, columns) shape, then show the first row as the cell output
print(data.shape)
data.head(1)

(50000, 109)


Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417


In [3]:
# hold out 30% of the rows as a test set; the 'target' column is the label
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((35000, 108), (15000, 108))

## Remove correlated

### Brute force approach with DropCorrelatedFeatures

In [4]:
# configure the selector: Pearson correlation with a 0.8 threshold
sel = DropCorrelatedFeatures(
    method='pearson',
    threshold=0.8,
    missing_values='ignore',
)
# learn the groups of correlated features from the training set only
sel.fit(X_train)

In [5]:
# each set in this list holds one group of correlated features found by the fitted selector
sel.correlated_feature_sets_

[{'var_3', 'var_80'},
 {'var_28', 'var_5', 'var_75'},
 {'var_11', 'var_33'},
 {'var_13', 'var_17'},
 {'var_15', 'var_57'},
 {'var_18', 'var_43'},
 {'var_19', 'var_29'},
 {'var_21', 'var_70', 'var_88'},
 {'var_22', 'var_24', 'var_32', 'var_39', 'var_42', 'var_76'},
 {'var_102', 'var_23'},
 {'var_26', 'var_59'},
 {'var_108', 'var_30'},
 {'var_35', 'var_87'},
 {'var_101', 'var_105', 'var_40', 'var_74', 'var_85'},
 {'var_46', 'var_94'},
 {'var_50', 'var_72'},
 {'var_52', 'var_66'},
 {'var_109', 'var_56'},
 {'var_104', 'var_60'},
 {'var_63', 'var_64', 'var_84', 'var_97'},
 {'var_106', 'var_77'},
 {'var_90', 'var_95'},
 {'var_100', 'var_98'}]

In [6]:
# In the output above, var_3 is correlated with var_80, and var_28, var_5 and var_75 are correlated with each other.
# The transformer keeps 1 feature from each group.
# The remaining features are the ones to remove; they are stored in the features_to_drop_ attribute.

len(sel.features_to_drop_)

34

In [7]:
# Drop the correlated features.
# The reduced frames get their own names instead of overwriting X_train / X_test:
# that keeps this cell idempotent (re-running it always transforms the original
# frames) and leaves the full feature set available to later cells.

X_train_uncorr = sel.transform(X_train)
X_test_uncorr = sel.transform(X_test)
X_train_uncorr.shape, X_test_uncorr.shape

((35000, 74), (15000, 74))

## SmartCorrelatedSelection

### Model Performance

We will keep a feature from each correlation group based on the performance of a random forest.

In [8]:
# re-create the train/test split from the full dataset so the selector below sees every feature
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

In [9]:
# estimator used to score the features: a small random forest
rf = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=20)

# selector that keeps one feature per correlated group, chosen by model performance
sel = SmartCorrelatedSelection(
    variables=None,  # None -> the selector examines all numerical variables
    method="pearson",
    threshold=0.8,
    missing_values="raise",
    selection_method="model_performance",
    estimator=rf,
    scoring="roc_auc",
    cv=3,
)

# fitting may take a while, because the random forest is trained
# with cross-validation (cv=3) as part of the selection
sel.fit(X_train, y_train)

In [10]:
# groups of correlated features found by the fitted selector (one set per group)
sel.correlated_feature_sets_

[{'var_3', 'var_80'},
 {'var_28', 'var_5', 'var_75'},
 {'var_11', 'var_33'},
 {'var_13', 'var_17'},
 {'var_15', 'var_57'},
 {'var_18', 'var_43'},
 {'var_19', 'var_29'},
 {'var_21', 'var_70', 'var_88'},
 {'var_22', 'var_24', 'var_32', 'var_39', 'var_42', 'var_76'},
 {'var_102', 'var_23'},
 {'var_26', 'var_59'},
 {'var_108', 'var_30'},
 {'var_35', 'var_87'},
 {'var_101', 'var_105', 'var_40', 'var_74', 'var_85'},
 {'var_46', 'var_94'},
 {'var_50', 'var_72'},
 {'var_52', 'var_66'},
 {'var_109', 'var_56'},
 {'var_104', 'var_60'},
 {'var_63', 'var_64', 'var_84', 'var_97'},
 {'var_106', 'var_77'},
 {'var_90', 'var_95'},
 {'var_100', 'var_98'}]

In [11]:
# Let's examine the performance of a random forest trained on each feature of the
# second correlated group, to understand what the transformer is doing.
group = sel.correlated_feature_sets_[1]

# cross-validate the forest on one feature at a time and print the mean ROC-AUC
for feature in group:
    cv_results = cross_validate(
        rf,
        X_train[[feature]],
        y_train,
        scoring='roc_auc',
        cv=3,
        return_estimator=False,
    )
    print(feature, cv_results["test_score"].mean())

var_28 0.506608912947481
var_5 0.4997681957013689
var_75 0.501355776590588


In [12]:
# var_28 has the highest mean ROC-AUC of the three, so it should be retained.
# The other 2 features should appear in the features_to_drop_ attribute.
# retained: expected to be absent from features_to_drop_
'var_28' in sel.features_to_drop_

False

In [13]:
# dropped: var_5 scored lower than var_28, so it is expected in features_to_drop_
'var_5' in sel.features_to_drop_

True

In [14]:
# dropped: var_75 scored lower than var_28, so it is expected in features_to_drop_
'var_75' in sel.features_to_drop_

True

### Variance

Alternatively, we can select the feature with the highest variance from each group.

In [15]:
# correlation selector that picks a feature from each group by variance
sel = SmartCorrelatedSelection(
    variables=None,
    method="pearson",
    threshold=0.8,
    missing_values="raise",
    selection_method="variance",
    estimator=None,  # no estimator is passed for this selection method
    scoring="roc_auc",
    cv=3,
)

sel.fit(X_train, y_train)

In [16]:
# Let's examine the spread of the features in the second correlated group.
# .std() returns the standard deviation, i.e. the square root of the variance,
# so the feature with the largest value here also has the largest variance.

group = sel.correlated_feature_sets_[1]
# The group is a set. Recent pandas versions refuse a set as a column indexer,
# so pass a sorted list, which also gives a deterministic row order.
X_train[sorted(group)].std()

var_28    1.024728
var_5     0.875302
var_75    3.539938
dtype: float64

In [17]:
# var_75 has the largest spread of the three, so it should be kept and the other two removed.
# var_28 is therefore expected in features_to_drop_
'var_28' in sel.features_to_drop_

True

In [18]:
# var_5 has the smallest spread in its group, so it is expected in features_to_drop_
'var_5' in sel.features_to_drop_

True

In [19]:
# var_75 is the retained feature of its group, so it is expected to be absent from features_to_drop_
'var_75' in sel.features_to_drop_

False

********************************
********************************
********************************

4.4-Pipeline-with-Feature-engine

## Basic methods plus correlation pipeline with Feature-engine

In this notebook, we will apply basic methods to remove constant, quasi-constant and duplicated features, followed by the removal of correlated features, all in a single step, using Feature-engine and the Scikit-learn Pipeline.

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from feature_engine.selection import ( DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection, )

In [21]:
# load the full dataset from the parent directory and show its (rows, columns) shape
data = pd.read_csv('../dataset_1.csv')
data.shape

(50000, 301)

In [22]:
# split the dataset: 70% of the rows for training, 30% for testing; 'target' is the label
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [23]:
# We stack all the selection methods inside a single pipeline.
# The steps run in the order listed, each one receiving the output of the previous step.

pipe = Pipeline([
    # step 1: constant / quasi-constant features
    # NOTE(review): tol=0.998 presumably means "one value covers at least 99.8% of the rows" - confirm in the Feature-engine docs
    ('constant', DropConstantFeatures(tol=0.998)),
    # step 2: duplicated features
    ('duplicated', DropDuplicateFeatures()),
    # step 3: correlated features, choosing which feature to keep by variance;
    # the correlation method and threshold are left at the library defaults
    ('correlation', SmartCorrelatedSelection(selection_method='variance')),
])

# the pipeline is fitted on the training set only (no target is passed)
pipe.fit(X_train)

In [24]:
# Remove the features selected for dropping by the fitted pipeline.
# NOTE: X_train / X_test are overwritten with their reduced versions, so this cell is
# not idempotent - re-running it would pass the already-reduced frames back into the pipeline.
# Re-run the train/test split cell first if this cell has to be executed again.
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)
X_train.shape, X_test.shape

((35000, 78), (15000, 78))

In [25]:
# create a function to build logistic regression and compare performance in train and test set

def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and print its ROC-AUC on the train and test sets.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for evaluation, respectively.
    y_train, y_test
        Targets aligned with the rows of X_train / X_test.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    classifier = LogisticRegression(random_state=44, max_iter=500)
    classifier.fit(X_train, y_train)

    # evaluate first on the data used for fitting, then on the held-out data
    for title, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(title)
        # the score is computed from the second column of predict_proba
        second_column = classifier.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(labels, second_column)))

In [26]:
# Evaluate the features kept by the filter pipeline: standardise them
# (scaler fitted on the training set only) and score a logistic regression.
scaler = StandardScaler()
scaler.fit(X_train)
run_logistic(
    scaler.transform(X_train),
    scaler.transform(X_test),
    y_train,
    y_test,
)

Train set
Logistic Regression roc-auc: 0.7919672937450547
Test set
Logistic Regression roc-auc: 0.7886201468896858
