In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('PimaIndians.csv')
df.head(2)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive


In [3]:
X = df.iloc[:,:-1]
y = df.iloc[:, -1]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [5]:
rf = RandomForestClassifier(random_state=0)

### Fit the random forest model to the training data

In [6]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Calculate the accuracy

In [7]:
acc = accuracy_score(y_test, rf.predict(X_test))

In [8]:
print(dict(zip(X.columns, rf.feature_importances_.round(2))))
print("{0:.1%} accuracy on test set.".format(acc))

{'pregnant': 0.07, 'glucose': 0.25, 'diastolic': 0.09, 'triceps': 0.09, 'insulin': 0.14, 'bmi': 0.12, 'family': 0.12, 'age': 0.11}
77.6% accuracy on test set.


### Create a mask for features importances above the threshold

In [9]:
mask = rf.feature_importances_ > 0.15

In [10]:
reduced_X = X.loc[:,mask]
print(reduced_X.columns)

Index(['glucose'], dtype='object')


### Recursive Feature Elimination with random forests

In [11]:
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, step=2, verbose=1)
rfe.fit(X_train, y_train)

Fitting estimator with 8 features.
Fitting estimator with 6 features.
Fitting estimator with 4 features.


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=2, step=2, verbose=1)

In [12]:
mask = rfe.support_
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Index(['glucose', 'age'], dtype='object')
