## Feature selection with linear models, review
### Putting it all together

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score

In [2]:
# load the Santander customer satisfaction dataset from Kaggle

data = pd.read_csv('../dataset_1.csv')
data.shape

(50000, 301)

In [3]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),
    data['target'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [4]:
# I keep a copy of the dataset with all the variables
# to compare the performance of machine learning models
# at the end of the notebook

X_train_original = X_train.copy()
X_test_original = X_test.copy()

### Remove constant features

In [5]:
# a feature is constant when its standard deviation in the train set is zero
train_std = X_train.std()
constant_features = train_std[train_std == 0].index.tolist()

# the list is learnt on the train set only and then applied to both sets
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Remove quasi-constant features

In [6]:
# fit a variance filter (threshold 0.01) on the train set only
sel = VarianceThreshold(threshold=0.01).fit(X_train)

# number of features retained by the filter, i.e. not quasi-constant
sel.get_support().sum()

215

In [7]:
features_to_keep = X_train.columns[sel.get_support()]

In [8]:
# remove the features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 215), (15000, 215))

In [9]:
# sklearn transformers return numpy arrays, so we rebuild the dataframes
# and restore the names of the retained features in the same step

X_train = pd.DataFrame(X_train, columns=features_to_keep)
X_test = pd.DataFrame(X_test, columns=features_to_keep)

### Remove duplicated features

In [10]:
# Collect every column that is an exact copy of an earlier column.
# The first occurrence of each group of identical columns is kept.
duplicated_feat = []
for i in range(0, len(X_train.columns)):
    if i % 10 == 0:  # this helps me understand how the loop is going
        print(i)

    col_1 = X_train.columns[i]

    # col_1 is itself a copy of an earlier column: everything identical to it
    # was already recorded when that earlier column was the reference, so
    # scanning again would only append the same names a second time
    if col_1 in duplicated_feat:
        continue

    for col_2 in X_train.columns[i + 1:]:
        if X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)

# each duplicated column is now listed exactly once
len(duplicated_feat)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210


10

In [11]:
# remove duplicated features
X_train.drop(labels=duplicated_feat, axis=1, inplace=True)
X_test.drop(labels=duplicated_feat, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 205), (15000, 205))

In [12]:
# I keep a copy of the dataset except constant, quasi-constant and duplicated variables

X_train_basic_filter = X_train.copy()
X_test_basic_filter = X_test.copy()

### Remove correlated features

In [13]:
def correlation(dataset, threshold):
    """Return the names of the columns that are correlated with an earlier column.

    A column is flagged when the absolute value of its correlation (as given
    by ``dataset.corr()``) with at least one column positioned before it is
    strictly greater than ``threshold``. The first column is never flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose columns are compared pairwise.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    abs_corr = dataset.corr().abs()
    flagged = set()
    for position, name in enumerate(abs_corr.columns):
        # only the lower triangle is inspected: columns located before this one
        earlier = abs_corr.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(name)
    return flagged

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  93


In [14]:
# remove correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((35000, 112), (15000, 112))

In [15]:
# keep a copy of the dataset without correlated features
X_train_corr = X_train.copy()
X_test_corr = X_test.copy()

### Select features by the regression coefficients

In [16]:
scaler = StandardScaler()
scaler.fit(X_train)

In [17]:
# we use regularisation by setting a low value of C

sel_ = SelectFromModel(
    LogisticRegression(C=0.0005, random_state=10, max_iter=1000, penalty='l2'))

# the selector is fitted on the scaled train set
sel_.fit(scaler.transform(X_train), y_train)

# keep the features retained by the selector; with an l2 penalty and no
# explicit threshold, SelectFromModel is documented to keep the features
# whose absolute coefficient is above the mean absolute coefficient.
#
# the boolean support mask is applied directly to the dataframes: the
# selector was fitted on a plain array, so calling sel_.transform on a
# dataframe triggers a feature-name mismatch warning in recent sklearn
# versions, and the numpy output would have to be rebuilt as a dataframe.
# the unscaled values are kept on purpose (as before), and both sets share
# the same column order, so the same mask applies to train and test.
selected_mask = sel_.get_support()

X_train_coef = X_train.loc[:, selected_mask].copy()
X_test_coef = X_test.loc[:, selected_mask].copy()



In [18]:
X_train_coef.shape, X_test_coef.shape

((35000, 28), (15000, 28))

### Compare the performance in machine learning algorithms

In [19]:
# create a function to train a logistic regression 
# and compare its performance in the train and test sets

def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a scaled, l2-regularised logistic regression and print its ROC-AUC.

    The scaler and the model are fitted on the train set only. The ROC-AUC
    computed from column 1 of ``predict_proba`` is then printed for the
    train set and for the test set. Nothing is returned.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)

    logit = LogisticRegression(
        C=0.0005, random_state=10, max_iter=10000, penalty='l2')
    logit.fit(scaler.transform(X_train), y_train)

    # same report for both sets, train first and test second
    for title, features, target in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(title)
        proba = logit.predict_proba(scaler.transform(features))[:, 1]
        print('Logistic Regression roc-auc: {}'.format(
            roc_auc_score(target, proba)))

In [20]:
# original dataset - all variables
run_logistic(X_train_original,
             X_test_original,
             y_train,
             y_test)

Train set
Logistic Regression roc-auc: 0.7844213790053062
Test set
Logistic Regression roc-auc: 0.7825685631204591


In [21]:
# filter methods - basic
run_logistic(X_train_basic_filter,
             X_test_basic_filter,
             y_train,
             y_test)

Train set
Logistic Regression roc-auc: 0.7841215084117658
Test set
Logistic Regression roc-auc: 0.7823687162836174


In [22]:
# filter methods - correlation
run_logistic(X_train_corr,
             X_test_corr,
             y_train,
             y_test)

Train set
Logistic Regression roc-auc: 0.7775268841746482
Test set
Logistic Regression roc-auc: 0.7760748408318285


In [23]:
# embedded methods - Logistic regression coefficients
run_logistic(X_train_coef,
             X_test_coef,
             y_train,
             y_test)

Train set
Logistic Regression roc-auc: 0.7754621344319395
Test set
Logistic Regression roc-auc: 0.7719805239762513


We see that a model with 28 features performs almost as well as a model trained with all the 300 variables (test ROC-AUC of 0.772 vs 0.783)!

**Exercise**:
Change the value of C in the LogisticRegression model used to select features, and see how that affects the performance of the model trained on the features chosen by this selection procedure.

That is all for this lecture. I hope you enjoyed it!