## FILTER METHODS 

https://www.codecademy.com/paths/data-science/tracks/dsml-feature-engineeering-for-ds/modules/dsml-filter-methods/articles/fe-filter-methods

 In this article, we will use variance thresholds, correlation, and mutual information to rank and select the top features. 

In [6]:
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

In [7]:

 
# Toy data set with one row per student. `exam_score` is used as the target
# further down; the other columns are candidate features. Note that
# `grade_level` has the same value in every row.
student_records = {
    'edu_goal': ['bachelors'] * 3 + ['masters'] * 4 + ['phd'] * 3,
    'hours_study': [1, 2, 3, 3, 3, 4, 3, 4, 5, 5],
    'hours_TV': [4, 3, 4, 3, 2, 3, 2, 2, 1, 1],
    'hours_sleep': [10, 10, 8, 8, 6, 6, 8, 8, 10, 10],
    'height_cm': [155, 151, 160, 160, 156, 150, 164, 151, 158, 152],
    'grade_level': [8] * 10,
    'exam_score': [71, 72, 78, 79, 85, 86, 92, 93, 99, 100],
}
df = pd.DataFrame(student_records)

print(df)

    edu_goal  hours_study  hours_TV  hours_sleep  height_cm  grade_level  \
0  bachelors            1         4           10        155            8   
1  bachelors            2         3           10        151            8   
2  bachelors            3         4            8        160            8   
3    masters            3         3            8        160            8   
4    masters            3         2            6        156            8   
5    masters            4         3            6        150            8   
6    masters            3         2            8        164            8   
7        phd            4         2            8        151            8   
8        phd            5         1           10        158            8   
9        phd            5         1           10        152            8   

   exam_score  
0          71  
1          72  
2          78  
3          79  
4          85  
5          86  
6          92  
7          93  
8          99  
9         100  

In [8]:
# Feature matrix: every column of `df` except the `exam_score` column.
X = df.drop('exam_score', axis='columns')
X

Unnamed: 0,edu_goal,hours_study,hours_TV,hours_sleep,height_cm,grade_level
0,bachelors,1,4,10,155,8
1,bachelors,2,3,10,151,8
2,bachelors,3,4,8,160,8
3,masters,3,3,8,160,8
4,masters,3,2,6,156,8
5,masters,4,3,6,150,8
6,masters,3,2,8,164,8
7,phd,4,2,8,151,8
8,phd,5,1,10,158,8
9,phd,5,1,10,152,8


In [9]:
# Target vector: the `exam_score` column as a Series.
y = df.loc[:, 'exam_score']

In [18]:
# Numeric-only features: `edu_goal` holds string labels, so it is left out
# of the variance and correlation steps that follow.
X_num = X.drop('edu_goal', axis='columns')

In [19]:
# Display the numeric feature frame (still includes `grade_level` at this point).
X_num

Unnamed: 0,hours_study,hours_TV,hours_sleep,height_cm,grade_level
0,1,4,10,155,8
1,2,3,10,151,8
2,3,4,8,160,8
3,3,3,8,160,8
4,3,2,6,156,8
5,4,3,6,150,8
6,3,2,8,164,8
7,4,2,8,151,8
8,5,1,10,158,8
9,5,1,10,152,8


## Variance threshold: remove columns with zero variance
#### `grade_level` is constant (8 in every row), so it gets dropped

In [20]:
# Variance filter with threshold 0 (which is also the default): fit it on the
# numeric features and print the array of columns that survive.
selector = VarianceThreshold(threshold=0)
X_num_kept = selector.fit_transform(X_num)
print(X_num_kept)

[[  1   4  10 155]
 [  2   3  10 151]
 [  3   4   8 160]
 [  3   3   8 160]
 [  3   2   6 156]
 [  4   3   6 150]
 [  3   2   8 164]
 [  4   2   8 151]
 [  5   1  10 158]
 [  5   1  10 152]]


In [21]:
# Names of the columns the fitted variance filter kept, obtained by applying
# the selector's boolean support mask to the column index.
num_cols = X_num.columns[selector.get_support()].tolist()
num_cols

['hours_study', 'hours_TV', 'hours_sleep', 'height_cm']

In [22]:
# Narrow the numeric frame to the columns that passed the variance filter.
X_num = X_num.loc[:, num_cols]

In [23]:
# Display the numeric features after the variance filter.
X_num

Unnamed: 0,hours_study,hours_TV,hours_sleep,height_cm
0,1,4,10,155
1,2,3,10,151
2,3,4,8,160
3,3,3,8,160
4,3,2,6,156
5,4,3,6,150
6,3,2,8,164
7,4,2,8,151
8,5,1,10,158
9,5,1,10,152


In [24]:
# Rebuild the full feature frame: the categorical `edu_goal` column followed
# by the numeric columns that passed the variance filter.
X = X.loc[:, ['edu_goal', *num_cols]]

In [25]:
# Display the feature frame after the variance filter.
X

Unnamed: 0,edu_goal,hours_study,hours_TV,hours_sleep,height_cm
0,bachelors,1,4,10,155
1,bachelors,2,3,10,151
2,bachelors,3,4,8,160
3,masters,3,3,8,160
4,masters,3,2,6,156
5,masters,4,3,6,150
6,masters,3,2,8,164
7,phd,4,2,8,151
8,phd,5,1,10,158
9,phd,5,1,10,152


## Determine correlation between columns

In [26]:
import matplotlib.pyplot as plt

# Pairwise Pearson correlation between the numeric features
# (Pearson is the default method of DataFrame.corr).
# Optional visual check, left disabled:
#   import seaborn as sns
#   sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r'); plt.show()
corr_matrix = X_num.corr()
corr_matrix

Unnamed: 0,hours_study,hours_TV,hours_sleep,height_cm
hours_study,1.0,-0.780763,-0.067522,-0.133004
hours_TV,-0.780763,1.0,-0.13041,0.05427
hours_sleep,-0.067522,-0.13041,1.0,-0.041615
height_cm,-0.133004,0.05427,-0.041615,1.0


## Figure out which pairs have a high correlation 

In [27]:
# Walk the strictly-lower triangle of the correlation matrix so every feature
# pair is inspected exactly once, and print the pairs whose absolute
# correlation exceeds 0.7.
feature_names = corr_matrix.columns
for row_pos, row_name in enumerate(feature_names):
    for col_pos, col_name in enumerate(feature_names[:row_pos]):
        pair_corr = corr_matrix.iloc[row_pos, col_pos]
        if abs(pair_corr) > 0.7:
            print(row_name, col_name, pair_corr)

hours_TV hours_study -0.780763315142435


## Correlation with target variable 

In [29]:
# New frame holding the numeric features plus the target column, so that the
# feature/target correlations can be computed in one call. `X_num` is untouched.
X_y = X_num.assign(exam_score=y)

In [30]:
# Correlation matrix over the features and the target together.
# Note: this rebinds `corr_matrix`, which previously held feature-only values.
corr_matrix = X_y.corr()

# Keep just the `exam_score` column and remove the target's own row, leaving
# one correlation-with-target value per feature.
is_feature_row = corr_matrix.index != 'exam_score'
corr_target = corr_matrix.loc[is_feature_row, ['exam_score']]

# Optional visual check, left disabled:
#   sns.heatmap(corr_target, annot=True, fmt='.3', cmap='RdBu_r'); plt.show()

In [31]:
# Display each feature's correlation with `exam_score`.
corr_target

Unnamed: 0,exam_score
hours_study,0.9048441
hours_TV,-0.9008659
hours_sleep,1.197402e-16
height_cm,0.01458693


In [32]:
# `hours_TV` and `hours_study` are strongly correlated with each other
# (r about -0.78), so one of the two is redundant. `hours_study` is kept
# because its correlation with `exam_score` is marginally stronger
# (0.905 vs -0.901 for `hours_TV`).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column has already been removed.
X = X.drop(columns=['hours_TV'], errors='ignore')

In [34]:
# Display the feature frame after dropping `hours_TV`.
X

Unnamed: 0,edu_goal,hours_study,hours_sleep,height_cm
0,bachelors,1,10,155
1,bachelors,2,10,151
2,bachelors,3,8,160
3,masters,3,8,160
4,masters,3,6,156
5,masters,4,6,150
6,masters,3,8,164
7,phd,4,8,151
8,phd,5,10,158
9,phd,5,10,152


In [33]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode on a copy so that `X` keeps the original string labels.
X_enc = X.copy()
X_enc['edu_goal'] = le.fit_transform(X['edu_goal'])

# LabelEncoder numbers the classes in sorted (alphabetical) order, not by
# meaning. That order happens to coincide with bachelors < masters < phd for
# these labels; fail loudly if the labels ever change so the integer codes
# are not silently assigned in a different order.
assert list(le.classes_) == ['bachelors', 'masters', 'phd'], le.classes_

print(X_enc)

   edu_goal  hours_study  hours_sleep  height_cm
0         0            1           10        155
1         0            2           10        151
2         0            3            8        160
3         1            3            8        160
4         1            3            6        156
5         1            4            6        150
6         1            3            8        164
7         2            4            8        151
8         2            5           10        158
9         2            5           10        152


In [36]:
from sklearn.feature_selection import mutual_info_regression

# Baseline mutual information between each encoded feature and the target,
# with a fixed seed for repeatable scores.
# NOTE: `discrete_features` is left at its default here; per the scikit-learn
# docs the default ('auto') treats every column of a dense input as
# continuous, so `edu_goal` is not scored as a discrete feature in this call.
# The SelectKBest cell that follows passes discrete_features=[0] instead.
mi_scores = mutual_info_regression(X_enc, y, random_state=68)
print(mi_scores)

[0.50396825 0.40896825 0.06896825 0.        ]


In [37]:



from sklearn.feature_selection import SelectKBest
from functools import partial

# Mutual-information scorer with column 0 (`edu_goal`) flagged as discrete
# and a fixed seed so the scores are repeatable.
score_func = partial(
    mutual_info_regression,
    discrete_features=[0],
    random_state=68,
)

# Fit a selector that keeps the 3 features with the highest mutual
# information, then print the reduced feature array.
selection = SelectKBest(score_func=score_func, k=3).fit(X_enc, y)
print(selection.transform(X_enc))

[[ 0  1 10]
 [ 0  2 10]
 [ 0  3  8]
 [ 1  3  8]
 [ 1  3  6]
 [ 1  4  6]
 [ 1  3  8]
 [ 2  4  8]
 [ 2  5 10]
 [ 2  5 10]]


In [38]:
# Keep only the features SelectKBest chose. The column names are read from
# `X_enc` (the frame the selector was fitted on, which has the same column
# names as the un-encoded `X`) rather than from `X` itself, so the positional
# indices from get_support() always refer to the right columns, even if this
# cell is re-run after `X` has already been narrowed.
X = X[X_enc.columns[selection.get_support(indices=True)]]
X

Unnamed: 0,edu_goal,hours_study,hours_sleep
0,bachelors,1,10
1,bachelors,2,10
2,bachelors,3,8
3,masters,3,8
4,masters,3,6
5,masters,4,6
6,masters,3,8
7,phd,4,8
8,phd,5,10
9,phd,5,10
