# Feature Selection Using SelectFromModel

In [84]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.linear_model import LassoCV
import pandas as pd

In [38]:
# Load the Iris dataset as a feature matrix X and a target vector y.
X, y = load_iris(return_X_y=True)

In [49]:
# Check the size of the feature matrix as (rows, columns).
X.shape

(150, 4)

In [50]:
# Display the raw feature matrix as a DataFrame for a quick visual check.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### 1. Logistic Regression

In [54]:
"""
Note : Terdapat warning lbfgs tidak konvergen, sehingga kurang cocok jika pakai regresi logistik.
"""
selector_logistic = SelectFromModel(estimator=LogisticRegression()).fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
"""
Treshold disini berguna sebagai batas variabel mana yang akan dipilih dan mana yang dibuang.
"""
selector_logistic.threshold_

3.1261240416741725

In [56]:
"""
mendapatkan feature mana yang akan digunakan
"""
selector_logistic.get_support()

array([False, False,  True,  True])

In [62]:
# Reduce X to the columns chosen by the logistic-regression selector and display them.
pd.DataFrame(selector_logistic.transform(X))

Unnamed: 0,0,1
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


### 2. Decision Tree

In [41]:
# Fit a decision tree and select features from its feature importances.
# random_state is pinned because the tree permutes the features at every split;
# when several splits are equally good the chosen one (and therefore the
# importances) can differ from run to run unless the seed is fixed.
selector_tree = SelectFromModel(
    estimator=DecisionTreeClassifier(random_state=0)
).fit(X, y)

In [42]:
"""
Feature importance berguna untuk melihat variabel atau feature mana yang paling penting
dalam pembentukan model.
"""
selector_tree.estimator_.feature_importances_

array([0.01333333, 0.01333333, 0.55072262, 0.42261071])

In [58]:
"""
Treshold disini berguna sebagai batas variabel mana yang akan dipilih dan mana yang dibuang.
"""

selector_tree.threshold_

0.25

In [61]:
"""
mendapatkan feature mana yang akan digunakan
"""
selector_tree.get_support()

array([False, False,  True,  True])

In [63]:
# Reduce X to the columns chosen by the decision-tree selector and display them.
pd.DataFrame(selector_tree.transform(X))

Unnamed: 0,0,1
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


### 3. Random Forest

In [13]:
# Fit a random forest and select features from its feature importances.
# random_state is pinned because the forest draws bootstrap samples and random
# feature subsets; without a fixed seed the importances change on every run.
selector_RnForest = SelectFromModel(
    estimator=RandomForestClassifier(random_state=0)
).fit(X, y)

In [68]:
# Print the random-forest selector's threshold, the per-feature importances of
# the fitted forest, and the boolean mask of selected features, one per line.
for label, value in (
    ("Treshold = ", selector_RnForest.threshold_),
    ("Feature Importance = ", selector_RnForest.estimator_.feature_importances_),
    ("Support = ", selector_RnForest.get_support()),
):
    print(label, value)

Treshold =  0.25
Feature Importance =  [0.08354414 0.02222794 0.44727326 0.44695467]
Support =  [False False  True  True]


In [94]:
# Reduce X to the columns chosen by the random-forest selector and display them.
pd.DataFrame(selector_RnForest.transform(X))

Unnamed: 0,0,1
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2
...,...,...
145,5.2,2.3
146,5.0,1.9
147,5.2,2.0
148,5.4,2.3


### 4. Lasso CV

In [104]:
# Fit a 5-fold cross-validated Lasso and select features from its coefficients.
# NOTE(review): LassoCV is a regressor, so the Iris class labels in y are treated
# as a numeric target here — confirm this is intended for a classification dataset.
selector_LassoCV = SelectFromModel(estimator=LassoCV(cv=5)).fit(X, y)

In [105]:
# Coefficients of the fitted Lasso model, one per input feature.
selector_LassoCV.estimator_.coef_

array([-0.10565944, -0.03380355,  0.23463406,  0.5892981 ])

In [106]:
# Boolean mask of the features kept by the Lasso-based selector (True = selected).
selector_LassoCV.get_support()

array([ True,  True,  True,  True])

In [107]:
# Reduce X to the columns chosen by the Lasso-based selector and display them.
pd.DataFrame(selector_LassoCV.transform(X))

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Reference

- https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html