## feature selection - dropping constant features

- Here we'll remove constant features: columns that hold the same value in every row and therefore carry no information that helps solve the problem.

In [1]:
import pandas as pd 

In [3]:
# Toy frame: columns A and B vary between rows, while C and D each repeat one value.
df = pd.DataFrame(dict(
    A=[1, 2, 4, 1, 2, 4],
    B=[4, 5, 6, 7, 8, 9],
    C=[0] * 6,
    D=[1] * 6,
))

In [4]:
# Display the toy frame; columns C and D hold a single repeated value.
df

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1
5,4,9,0,1


### We use `VarianceThreshold` from the scikit-learn library to deal with this.
> Feature selector that removes all low-variance features.
> 
> 
> This feature selection algorithm looks only at the features X (the independent variables), not at the target y (the dependent variable), so it can also be used for unsupervised learning.
>

In [6]:
from sklearn.feature_selection import VarianceThreshold
# threshold=0: only columns whose variance is zero (i.e. constant columns) are rejected.
var_thr = VarianceThreshold(threshold=0)
# fit() computes the per-column variances and returns the fitted selector,
# which is why the cell echoes the estimator's repr.
var_thr.fit(df)


VarianceThreshold(threshold=0)

In [45]:
# Boolean mask with one entry per column of df: True = kept, False = rejected as constant.
var_thr.get_support()

array([ True,  True, False, False])

In [10]:
# Invert the "kept" mask with ~ to select the names of the rejected (constant) columns.
df.columns[~var_thr.get_support()]

Index(['C', 'D'], dtype='object')

In [15]:
# Names of the columns the selector rejected, stored for the drop further down.
is_constant = ~var_thr.get_support()
constant_columns = df.columns[is_constant]

In [16]:
# Show the stored names of the constant columns.
constant_columns

Index(['C', 'D'], dtype='object')

In [17]:
# Build a copy of the toy frame without the constant columns; df itself is not modified.
df.drop(columns=constant_columns)

Unnamed: 0,A,B
0,1,4
1,2,5
2,4,6
3,1,7
4,2,8
5,4,9


### practising on a real dataset

In [48]:
import pandas as pd 
from sklearn.feature_selection import VarianceThreshold
import numpy as np

In [49]:
# Read only the first 10,000 rows of santander.csv.
# NOTE: this rebinds `df`, replacing the toy frame used in the section above.
df = pd.read_csv('santander.csv',nrows=10000)

In [50]:
# Separate the predictors (every column except TARGET) from the TARGET label column.
X, y = df.drop(columns=['TARGET']), df['TARGET']

In [51]:
from sklearn.model_selection import train_test_split

# Reuse the X / y pair built in the previous cell instead of re-deriving both from df;
# the resulting split is the same, and X / y are no longer dead variables.
# test_size=0.3 holds out 30% of the rows; random_state=0 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [53]:
# (rows, columns) of the test split first, then of the train split.
X_test.shape, X_train.shape

((3000, 370), (7000, 370))

### Now applying the variance threshold (fitted on the training split only)

In [54]:
# threshold=0: reject only the columns whose variance is zero.
var_t = VarianceThreshold(threshold=0)
# Fit on X_train only, so the held-out rows play no part in deciding which columns are constant.
var_t.fit(X_train)

VarianceThreshold(threshold=0)

In [55]:
# Count the True entries of the support mask, i.e. the number of features that are kept.
var_t.get_support().sum()

284

In [64]:
support_mask = var_t.get_support()
# Number of features the selector keeps (non-constant in X_train)
print(int(support_mask.sum()))
# Number of features the selector rejects (constant in X_train)
print(int((~support_mask).sum()))

284
86


In [65]:
# Names of the training columns that the fitted selector rejected as constant.
rejected_mask = ~var_t.get_support()
constant_col = X_train.columns[rejected_mask]

In [66]:
# Show the names of the constant columns found in the training split.
constant_col

Index(['ind_var2_0', 'ind_var2', 'ind_var13_medio_0', 'ind_var13_medio',
       'ind_var18_0', 'ind_var18', 'ind_var27_0', 'ind_var28_0', 'ind_var28',
       'ind_var27', 'ind_var34_0', 'ind_var34', 'ind_var41', 'ind_var46_0',
       'ind_var46', 'num_var13_medio_0', 'num_var13_medio', 'num_var18_0',
       'num_var18', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27',
       'num_var34_0', 'num_var34', 'num_var41', 'num_var46_0', 'num_var46',
       'saldo_var13_medio', 'saldo_var18', 'saldo_var28', 'saldo_var27',
       'saldo_var34', 'saldo_var41', 'saldo_var46',
       'delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3',
       'delta_imp_reemb_var17_1y3', 'delta_imp_reemb_var33_1y3',
       'delta_imp_trasp_var17_out_1y3', 'delta_imp_trasp_var33_out_1y3',
       'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3',
       'delta_num_trasp_var17_out_1y3', 'delta_num_trasp_var33_out_1y3',
       'imp_amort_var18_hace3', 'imp_amort_var18_ult1',
       'imp_amort_var34_

In [67]:
# Drop the constant columns found on the training split from BOTH splits so they
# keep an identical feature set, and bind the results to new names: drop() returns
# a new frame, so without an assignment the filtered data would be discarded.
X_train_reduced = X_train.drop(columns=constant_col)
X_test_reduced = X_test.drop(columns=constant_col)
# Display the reduced training frame (same output as before).
X_train_reduced

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
7681,15431,2,42,840.0,4477.02,4989.54,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37491.21
9031,18181,2,31,0.0,52.32,52.32,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106685.94
3691,7411,2,51,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66144.66
202,407,2,36,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92121.36
5625,11280,2,23,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74650.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,18564,2,33,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117547.89
4859,9723,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71050.83
3264,6557,2,24,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,141069.33
9845,19796,2,38,0.0,0.00,0.00,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86412.15
