# Selecting best features using sklearn

In [1]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_regression

X_california, y_california = fetch_california_housing(return_X_y=True)

X, y = X_california[:2000, :], y_california[:2000]

X.shape

(2000, 8)

In [2]:
select_k_best = SelectKBest(mutual_info_regression, k=3)
x_new = select_k_best.fit_transform(X, y)
x_new.shape

(2000, 3)

In [3]:
select_k_best.get_feature_names_out()

array(['x0', 'x6', 'x7'], dtype=object)

### Select percentile method

In [4]:
from sklearn.feature_selection import SelectPercentile

select_percentile = SelectPercentile(mutual_info_regression, percentile= 30)

X_new = select_percentile.fit_transform(X, y)
X_new.shape

(2000, 3)

In [5]:
select_percentile.get_feature_names_out() 

array(['x0', 'x6', 'x7'], dtype=object)

### Wrapper Based method
#### REF (Recursive features elemination)
* step1: fits a model
* step2 : ranks the features, afterwards it removes one or more features(depending upon step parameters)

In [6]:
from sklearn.datasets import make_friedman1 
from sklearn.feature_selection import RFE 
from sklearn.linear_model import LinearRegression

estimator = LinearRegression()
selector = RFE(estimator=estimator, n_features_to_select=3, step= 1)
selector = selector.fit(X, y)

print(selector.support_)

print(f'Rank of each feature is: {selector.ranking_}')


[ True False False False False False  True  True]
Rank of each feature is: [1 5 4 3 6 2 1 1]


In [7]:
x_new = selector.transform(X)
x_new.shape

(2000, 3)

### RFE-CV 
This adds another layer of cross validation of RFE

In [8]:
from sklearn.feature_selection import SelectFromModel

estimator = LinearRegression()
estimator.fit(X, y)

print(f'coefficent of features: {estimator.coef_}')
print(f'Indices of top {3} features: {np.argsort(estimator.coef_)[-3:]}')

t = np.argsort(np.abs(estimator.coef_))[-3:]

model = SelectFromModel(estimator, max_features=3, prefit=True)
X_new = model.transform(X)
print(f'Shape of features matrix after features selection :{X_new.shape}')


coefficent of features: [ 3.64048292e-01  5.56221906e-03  5.13591243e-02 -1.64474348e-01
  5.90411479e-05 -1.64573915e-01 -2.17724525e-01 -1.85343265e-01]
Indices of top 3 features: [1 2 0]
Shape of features matrix after features selection :(2000, 3)


In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components= 2)

pca.fit(X)

In [10]:
print(f'The {pca.n_components_} principal axis are\n', pca.components_)

The 2 principal axis are
 [[ 3.58746278e-04 -5.26626273e-03 -3.55739438e-04 -1.04636644e-04
   9.99985993e-01  1.07189337e-04 -3.72926795e-05 -3.18638902e-05]
 [ 2.83042309e-02 -9.96787172e-01  7.04618553e-02  1.11176863e-02
  -5.23215008e-03 -1.02817045e-03  1.45200590e-02  1.69252465e-02]]


In [11]:
print('VAR 1', pca.explained_variance_)


VAR 1 [9.32990794e+05 1.72499472e+02]


In [12]:
print('mean', pca.mean_)

mean [ 3.88541705e+00  3.13105000e+01  5.89404425e+00  1.15137723e+00
  1.24335450e+03  2.72214812e+00  3.80665650e+01 -1.21940045e+02]


### Chainging transformers

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

estimator = [
  ('simple_imputer', SimpleImputer()),
  ('standard_scaler', StandardScaler())
]

pipe = Pipeline(steps=estimator)
pipe

### Additional features of pipeline

In [14]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

estimators = [
  ('simple_imputer', SimpleImputer()),
  ('pca', PCA()),
  ('regressor', LinearRegression())
]
pipe_2 = Pipeline(steps= estimators)

In [15]:
pipe_2.steps

[('simple_imputer', SimpleImputer()),
 ('pca', PCA()),
 ('regressor', LinearRegression())]

### Grid Search with pipeline

In [16]:
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = dict(imupter=['passthrough',
                           SimpleImputer(),
                           KNNImputer()],
                           clf=[SVC(), LogisticRegression()],
                           clf__C=[0.1, 10, 100]
                           )

grid_search = GridSearchCV(pipe_2, param_grid=param_grid)
grid_search #TODO: we will see this in the next module

### Visualizing the parameters

In [17]:
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

num_pipe = Pipeline(
  [
    ('selector', ColumnTransformer(
      [('select_first_4', 'passthrough', slice(0,4))]
    )),
    ('impute', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
  ]
)

cat_pipe = ColumnTransformer([('label_binarizer', LabelBinarizer(), [4])])

full_pipe = FeatureUnion(transformer_list=[
  ('num_pipeline', num_pipe),
  ('cat_pipeline', cat_pipe)
])

In [18]:
from sklearn import set_config

set_config(display='diagram')

full_pipe