# Feature Selection - Dropping Constant Features

In [30]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

VarianceThreshold - Feature selector that removes all low-variance features.

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.


In [31]:
# Toy frame for the demo: A and B vary from row to row, while C and D hold a
# single repeated value and therefore have zero variance.
toy_values = {
    "A": [1, 2, 4, 1, 2, 4],
    "B": [4, 5, 6, 7, 8, 9],
    "C": [0, 0, 0, 0, 0, 0],
    "D": [1, 1, 1, 1, 1, 1],
}
data = pd.DataFrame(toy_values)

data.head()

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1


In [32]:
# With threshold=0.0 only features whose variance is strictly greater than 0 are
# kept, so constant (zero-variance) columns are the ones that get removed.
var_thres = VarianceThreshold(threshold=0.0)
var_thres.fit(data)

0,1,2
,threshold,0.0


In [33]:
# Boolean mask aligned with the columns passed to fit(): True = feature is kept.
var_thres.get_support()

array([ True,  True, False, False])

In [34]:
# Names of the features the selector keeps (looked up by integer position).
data.columns[var_thres.get_support(indices=True)]

Index(['A', 'B'], dtype='object')

In [35]:
# The constant columns are exactly the ones the fitted selector rejected, so
# invert the support mask once instead of calling get_support() and rebuilding
# the kept-column Index for every column in a Python-level loop. Selecting by
# position also stays correct if two columns happen to share a label.
constant_columns = data.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

2


In [36]:
# Every name printed below is a zero-variance column that should be removed.
for column_name in constant_columns:
    print(column_name)

C
D


In [37]:
# Returns a new frame without the constant columns; `data` itself is unchanged.
data.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


## With a Bigger Dataset

Data source: Santander Customer Satisfaction competition on Kaggle (`train.csv`): https://www.kaggle.com/competitions/santander-customer-satisfaction/data?select=train.csv

In [38]:
# Load only the first 10,000 rows to keep the example fast.
df = pd.read_csv(
    filepath_or_buffer='../helpers/data/santander.csv',
    nrows=10000,
)
df.shape

(10000, 371)

In [39]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [40]:
# Separate the label column from the feature matrix.
y = df['TARGET']
X = df.drop(columns=['TARGET'])

In [41]:
# 70/30 hold-out split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

(X_train.shape, X_test.shape)

((7000, 370), (3000, 370))

In [42]:
# Fit the selector on the training split only; X_test is not passed to fit().
var_thres = VarianceThreshold(threshold=0.0)
var_thres.fit(X_train)

0,1,2
,threshold,0.0


In [43]:
# Count of True entries in the support mask = number of features that are kept.
var_thres.get_support().sum()

np.int64(284)

In [44]:
# The constant columns are exactly the ones the fitted selector rejected, so
# invert the support mask once instead of calling get_support() and rebuilding
# the kept-column Index for each of the training columns in a Python-level
# loop. Selecting by position also stays correct if two columns share a label.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

86


In [45]:
# Rebind to the reduced frame instead of mutating in place: X_train comes out of
# train_test_split, and in-place edits on such derived frames can emit pandas
# copy warnings (version-dependent) and make the cell's effect harder to follow.
X_train = X_train.drop(columns=constant_columns)

In [None]:
# Do not refit the selector on X_test: reuse the column list derived from the
# training split so no information from the test set influences feature
# selection (data leakage) and both splits end up with the same columns.
# Rebinding instead of inplace=True avoids mutating a frame derived from
# train_test_split.
X_test = X_test.drop(columns=constant_columns)

In [47]:
# Show the (rows, columns) shape of both splits after the column drop.
X_train.shape, X_test.shape

((7000, 284), (3000, 284))