In [39]:
import pandas as pd
import numpy as np

In [24]:
from sklearn.feature_selection import SelectFromModel

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Load the dataset from a CSV file. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("sample_dataset.csv")

In [28]:
# Show the column labels of the loaded frame.
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [30]:
# Restrict the frame to the ten "mean ..." columns plus the 'target' column,
# then discard every row that has a missing value in any of them.
# NOTE: this rebinds `df`, so the full frame is no longer reachable afterwards.
df = df[
    [
        'mean radius',
        'mean texture',
        'mean perimeter',
        'mean area',
        'mean smoothness',
        'mean compactness',
        'mean concavity',
        'mean concave points',
        'mean symmetry',
        'mean fractal dimension',
        'target',
    ]
].dropna()

In [31]:
# Split the frame into the feature matrix and the label vector by column
# name rather than by position, so the split does not silently depend on
# the column order or on there being exactly ten feature columns.
X = df.drop(columns="target")
y = df["target"]

In [32]:
# Random forest whose feature importances drive the selection below.
# The fixed random_state makes the fitted forest repeatable across runs.
model = RandomForestClassifier(random_state=0)

In [33]:
# No explicit threshold is passed, so SelectFromModel uses its default
# cutoff. Fitting and transforming are written as two separate steps.
selector = SelectFromModel(model).fit(X, y)

# Display X reduced to the selected features.
selector.transform(X)

array([[0.1859  , 0.09353 ],
       [0.1479  , 0.09498 ],
       [0.04568 , 0.0311  ],
       [0.02956 , 0.02076 ],
       [0.1063  , 0.05439 ],
       [0.09847 , 0.06158 ],
       [0.000692, 0.004167],
       [0.05988 , 0.0218  ],
       [0.1128  , 0.06873 ],
       [0.07789 , 0.05069 ],
       [0.2071  , 0.09601 ],
       [0.1138  , 0.08534 ],
       [0.1153  , 0.06847 ],
       [0.1659  , 0.07415 ],
       [0.01972 , 0.01963 ],
       [0.02531 , 0.01698 ],
       [0.001597, 0.002404],
       [0.0755  , 0.04079 ],
       [0.1682  , 0.06597 ],
       [0.1417  , 0.08811 ],
       [0.02638 , 0.02069 ],
       [0.1266  , 0.08353 ],
       [0.0802  , 0.05843 ],
       [0.02245 , 0.02763 ],
       [0.01288 , 0.01924 ],
       [0.01236 , 0.01369 ],
       [0.2197  , 0.1062  ],
       [0.0683  , 0.03099 ],
       [0.01765 , 0.02733 ],
       [0.007756, 0.008535],
       [0.03738 , 0.02098 ],
       [0.2914  , 0.1242  ],
       [0.02556 , 0.02031 ],
       [0.01271 , 0.01117 ],
       [0.0192

In [34]:
# Importances reported by the fitted estimator held by the selector
# (`estimator_`), as opposed to the `model` object passed in.
selector.estimator_.feature_importances_

array([0.05864691, 0.05857587, 0.08734768, 0.08573085, 0.03495926,
       0.09806245, 0.27767754, 0.26639098, 0.01451452, 0.01809393])

# Threshold: mean

In [35]:
# Keep only the features whose importance reaches the mean importance.
selector = SelectFromModel(model, threshold='mean').fit(X, y)

# Display X reduced to the selected features.
selector.transform(X)

array([[0.1859  , 0.09353 ],
       [0.1479  , 0.09498 ],
       [0.04568 , 0.0311  ],
       [0.02956 , 0.02076 ],
       [0.1063  , 0.05439 ],
       [0.09847 , 0.06158 ],
       [0.000692, 0.004167],
       [0.05988 , 0.0218  ],
       [0.1128  , 0.06873 ],
       [0.07789 , 0.05069 ],
       [0.2071  , 0.09601 ],
       [0.1138  , 0.08534 ],
       [0.1153  , 0.06847 ],
       [0.1659  , 0.07415 ],
       [0.01972 , 0.01963 ],
       [0.02531 , 0.01698 ],
       [0.001597, 0.002404],
       [0.0755  , 0.04079 ],
       [0.1682  , 0.06597 ],
       [0.1417  , 0.08811 ],
       [0.02638 , 0.02069 ],
       [0.1266  , 0.08353 ],
       [0.0802  , 0.05843 ],
       [0.02245 , 0.02763 ],
       [0.01288 , 0.01924 ],
       [0.01236 , 0.01369 ],
       [0.2197  , 0.1062  ],
       [0.0683  , 0.03099 ],
       [0.01765 , 0.02733 ],
       [0.007756, 0.008535],
       [0.03738 , 0.02098 ],
       [0.2914  , 0.1242  ],
       [0.02556 , 0.02031 ],
       [0.01271 , 0.01117 ],
       [0.0192

# Threshold: custom

In [36]:
# Keep only the features whose importance reaches the fixed cutoff 0.02.
selector = SelectFromModel(model, threshold=0.02).fit(X, y)

# Display X reduced to the selected features.
selector.transform(X)

array([[1.300e+01, 2.182e+01, 8.750e+01, 5.198e+02, 1.273e-01, 1.932e-01,
        1.859e-01, 9.353e-02],
       [1.981e+01, 2.215e+01, 1.300e+02, 1.260e+03, 9.831e-02, 1.027e-01,
        1.479e-01, 9.498e-02],
       [1.308e+01, 1.571e+01, 8.563e+01, 5.200e+02, 1.075e-01, 1.270e-01,
        4.568e-02, 3.110e-02],
       [9.504e+00, 1.244e+01, 6.034e+01, 2.739e+02, 1.024e-01, 6.492e-02,
        2.956e-02, 2.076e-02],
       [1.348e+01, 2.082e+01, 8.840e+01, 5.592e+02, 1.016e-01, 1.255e-01,
        1.063e-01, 5.439e-02],
       [1.328e+01, 2.028e+01, 8.732e+01, 5.452e+02, 1.041e-01, 1.436e-01,
        9.847e-02, 6.158e-02],
       [1.305e+01, 1.931e+01, 8.261e+01, 5.272e+02, 8.060e-02, 3.789e-02,
        6.920e-04, 4.167e-03],
       [9.173e+00, 1.386e+01, 5.920e+01, 2.609e+02, 7.721e-02, 8.751e-02,
        5.988e-02, 2.180e-02],
       [1.268e+01, 2.384e+01, 8.269e+01, 4.990e+02, 1.122e-01, 1.262e-01,
        1.128e-01, 6.873e-02],
       [1.380e+01, 1.579e+01, 9.043e+01, 5.841e+02, 1.0

# Threshold: median

In [37]:
# Keep only the features whose importance reaches the median importance.
selector = SelectFromModel(model, threshold='median').fit(X, y)

# Display X reduced to the selected features.
selector.transform(X)

array([[8.750e+01, 5.198e+02, 1.932e-01, 1.859e-01, 9.353e-02],
       [1.300e+02, 1.260e+03, 1.027e-01, 1.479e-01, 9.498e-02],
       [8.563e+01, 5.200e+02, 1.270e-01, 4.568e-02, 3.110e-02],
       [6.034e+01, 2.739e+02, 6.492e-02, 2.956e-02, 2.076e-02],
       [8.840e+01, 5.592e+02, 1.255e-01, 1.063e-01, 5.439e-02],
       [8.732e+01, 5.452e+02, 1.436e-01, 9.847e-02, 6.158e-02],
       [8.261e+01, 5.272e+02, 3.789e-02, 6.920e-04, 4.167e-03],
       [5.920e+01, 2.609e+02, 8.751e-02, 5.988e-02, 2.180e-02],
       [8.269e+01, 4.990e+02, 1.262e-01, 1.128e-01, 6.873e-02],
       [9.043e+01, 5.841e+02, 1.280e-01, 7.789e-02, 5.069e-02],
       [8.721e+01, 5.302e+02, 1.765e-01, 2.071e-01, 9.601e-02],
       [9.958e+01, 6.745e+02, 1.807e-01, 1.138e-01, 8.534e-02],
       [1.202e+02, 1.075e+03, 9.709e-02, 1.153e-01, 6.847e-02],
       [7.899e+01, 4.320e+02, 1.700e-01, 1.659e-01, 7.415e-02],
       [7.684e+01, 4.486e+02, 5.241e-02, 1.972e-02, 1.963e-02],
       [6.877e+01, 3.576e+02, 5.736e-02,

# Limiting the number of returned features

In [38]:
# Same median cutoff as before, but additionally cap the number of
# selected features at three.
selector = SelectFromModel(model, threshold='median', max_features=3).fit(X, y)

# Display X reduced to the selected features.
selector.transform(X)

array([[0.1932  , 0.1859  , 0.09353 ],
       [0.1027  , 0.1479  , 0.09498 ],
       [0.127   , 0.04568 , 0.0311  ],
       [0.06492 , 0.02956 , 0.02076 ],
       [0.1255  , 0.1063  , 0.05439 ],
       [0.1436  , 0.09847 , 0.06158 ],
       [0.03789 , 0.000692, 0.004167],
       [0.08751 , 0.05988 , 0.0218  ],
       [0.1262  , 0.1128  , 0.06873 ],
       [0.128   , 0.07789 , 0.05069 ],
       [0.1765  , 0.2071  , 0.09601 ],
       [0.1807  , 0.1138  , 0.08534 ],
       [0.09709 , 0.1153  , 0.06847 ],
       [0.17    , 0.1659  , 0.07415 ],
       [0.05241 , 0.01972 , 0.01963 ],
       [0.05736 , 0.02531 , 0.01698 ],
       [0.04695 , 0.001597, 0.002404],
       [0.09588 , 0.0755  , 0.04079 ],
       [0.1146  , 0.1682  , 0.06597 ],
       [0.1298  , 0.1417  , 0.08811 ],
       [0.05991 , 0.02638 , 0.02069 ],
       [0.1041  , 0.1266  , 0.08353 ],
       [0.05884 , 0.0802  , 0.05843 ],
       [0.0434  , 0.02245 , 0.02763 ],
       [0.08393 , 0.01288 , 0.01924 ],
       [0.04721 , 0.01236

# Selecting the N most important features

In [40]:
# With a threshold of -inf no feature is rejected on importance, so
# max_features alone decides the selection: the three most important
# features are kept.
selector = SelectFromModel(model, threshold=-np.inf, max_features=3).fit(X, y)

# Display X reduced to the selected features.
selector.transform(X)

array([[0.1932  , 0.1859  , 0.09353 ],
       [0.1027  , 0.1479  , 0.09498 ],
       [0.127   , 0.04568 , 0.0311  ],
       [0.06492 , 0.02956 , 0.02076 ],
       [0.1255  , 0.1063  , 0.05439 ],
       [0.1436  , 0.09847 , 0.06158 ],
       [0.03789 , 0.000692, 0.004167],
       [0.08751 , 0.05988 , 0.0218  ],
       [0.1262  , 0.1128  , 0.06873 ],
       [0.128   , 0.07789 , 0.05069 ],
       [0.1765  , 0.2071  , 0.09601 ],
       [0.1807  , 0.1138  , 0.08534 ],
       [0.09709 , 0.1153  , 0.06847 ],
       [0.17    , 0.1659  , 0.07415 ],
       [0.05241 , 0.01972 , 0.01963 ],
       [0.05736 , 0.02531 , 0.01698 ],
       [0.04695 , 0.001597, 0.002404],
       [0.09588 , 0.0755  , 0.04079 ],
       [0.1146  , 0.1682  , 0.06597 ],
       [0.1298  , 0.1417  , 0.08811 ],
       [0.05991 , 0.02638 , 0.02069 ],
       [0.1041  , 0.1266  , 0.08353 ],
       [0.05884 , 0.0802  , 0.05843 ],
       [0.0434  , 0.02245 , 0.02763 ],
       [0.08393 , 0.01288 , 0.01924 ],
       [0.04721 , 0.01236

In [42]:
# Index X's column labels with the selector's support mask to list the
# names of the features that were selected.
X.columns[selector.get_support()]

Index(['mean compactness', 'mean concavity', 'mean concave points'], dtype='object')