In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the Pima Indians diabetes data from the working directory
# and preview the first two rows.
data_path = 'PimaIndians.csv'
df = pd.read_csv(data_path)
df.head(n=2)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive


In [3]:
# The last column holds the label; every column before it is a predictor.
label_pos = df.shape[1] - 1
X = df.iloc[:, :label_pos]
y = df.iloc[:, label_pos]

In [4]:
# Hold out 25% of the rows for evaluation.
# random_state pins the shuffle so that Restart & Run All always yields the
# same split; without it the RFE ranking and the test accuracy change on
# every run. Outputs captured from an unseeded run will differ after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [5]:
# Standardiser with its default settings written out explicitly:
# centre each feature on its mean and scale it to unit variance.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits (no information from the test
# rows reaches the scaler).
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

### Create the RFE with a LogisticRegression estimator and 3 features to select

In [7]:
# Recursive feature elimination wrapped around a default logistic
# regression; keep 3 features and log each elimination round.
base_estimator = LogisticRegression()
rfe = RFE(
    estimator=base_estimator,
    n_features_to_select=3,
    verbose=1,
)

In [8]:
# Run the elimination on the standardised training data; the fitted
# selector is the cell's last expression, so its repr is displayed.
rfe.fit(X=X_train_std, y=y_train)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.


RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='auto', n_jobs=None, penalty='l2',
                                 random_state=None, solver='lbfgs', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=3, step=1, verbose=1)

In [9]:
# Map each feature name to its RFE rank (rank 1 = kept by the selector).
feature_ranks = {name: rank for name, rank in zip(X.columns, rfe.ranking_)}
print(feature_ranks)

{'pregnant': 3, 'glucose': 1, 'diastolic': 4, 'triceps': 6, 'insulin': 5, 'bmi': 1, 'family': 2, 'age': 1}


In [10]:
# rfe.support_ is used as a mask over the columns to list the retained features.
selected_features = X.columns[rfe.support_]
print(selected_features)

Index(['glucose', 'bmi', 'age'], dtype='object')


In [11]:
# Score the fitted selector on the held-out, standardised test rows.
y_pred = rfe.predict(X_test_std)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
message = "{0:.1%} accuracy on test set.".format(acc)
print(message)

75.5% accuracy on test set.
