In [6]:
import pandas as pd
from pandas import read_csv
import numpy as np
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

### Feature Selection with Univariate chi2 test

In [1]:
# Path of the CSV file that is later read into `dataframe`.
diabetes = "pima-indians-diabetes.data.csv"

In [2]:
# The dataset above has no column names, so we create a list of column names ourselves

In [3]:
# Labels for the nine CSV columns, in file order; the last one ("class") is the target.
names = [
    "preg", "plas", "pres", "skin",
    "test", "mass", "pedi", "age",
    "class",
]

In [4]:
# Now we will attach this list of column names to the original data while reading it

In [7]:
# Load the CSV into a DataFrame, labelling its columns with the `names` list.
dataframe = read_csv(
    filepath_or_buffer=diabetes,
    names=names,
)

In [8]:
# Convert the frame to a 2-D NumPy array. `to_numpy()` is the accessor pandas
# recommends over the legacy `.values` attribute; it yields the same data here.
array = dataframe.to_numpy()

In [9]:
# Split the array column-wise: the first eight columns are the predictors,
# the ninth (index 8) is the target.
x, y = array[:, :8], array[:, 8]

In [10]:
# Feature selection

In [11]:
# Univariate selector. We have to state two things:
#   score_func - the basis on which features are scored (here the chi2 test)
#   k          - how many of the highest-scoring features to keep (here 4)
test = SelectKBest(k=4, score_func=chi2)

In [13]:
# Compute the chi2 scores on the data, then echo the fitted selector.
fit = test.fit(X=x, y=y)
fit

SelectKBest(k=4, score_func=<function chi2 at 0x000001E44C645A60>)

In [14]:
# As we want the scores printed to 3 decimal places, we will use set_printoptions

In [15]:
# Show NumPy floats with three digits after the decimal point from here on.
np.set_printoptions(precision=3)

In [16]:
# Finally we will transform our Input data which is stored in x based on the fitted values

In [17]:
# Reduce x to the columns kept by the fitted selector.
features = fit.transform(X=x)

In [43]:
# One chi2 score per column of x, in the order preg, plas, pres, skin, test, mass, pedi, age.
print(fit.scores_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


Inference: Our aim was to identify the features with the highest chi2 scores. From the scores above, test (2175.565) is the most useful feature for determining whether a patient has diabetes, followed by plas (1411.887), age (181.304) and mass (127.669); these are the four features kept by SelectKBest with k = 4. preg (111.52) ranks just below them, while skin (53.108), pres (17.605) and pedi (5.393) have much lower chi2 scores and can be neglected.

### Recursive Feature Elimination

In [8]:
# RFE needs a base estimator that exposes feature weights (coef_ or feature_importances_); logistic regression provides coef_, so we use it as the base model

In [9]:
# Base estimator for RFE. max_iter=400 caps the number of solver iterations
# allowed for a single fit to converge; it does not mean the model is fitted
# 400 times. RFE itself refits this estimator once per elimination round.
model = LogisticRegression(max_iter=400)

In [12]:
# RFE(estimator, n_features_to_select): repeatedly fits the estimator, drops the
# feature with the weakest coefficient, and stops when 3 features remain. It
# ranks features by the estimator's weights; it does not compare accuracies.
# n_features_to_select is passed by keyword because recent scikit-learn
# releases reject it as a positional argument.
# NOTE(review): coefficient magnitudes depend on feature scale and x is not
# standardised before this step - confirm that is intended.
rfe = RFE(estimator=model, n_features_to_select=3)

In [13]:
# Run the elimination on the data; this rebinds `fit` to the fitted RFE object.
fit = rfe.fit(X=x, y=y)

In [14]:
fit.n_features_                               # Number of features RFE kept (the 3 requested when rfe was created)

3

In [15]:
# We can see the support for each feature

In [16]:
fit.support_                  # Boolean mask over the columns of x: True marks a selected feature, so exactly three entries are True

array([ True, False, False, False, False,  True,  True, False])

Inference: From the support mask we can infer that preg, mass and pedi (columns 0, 5 and 6) are the three features RFE selected for predicting the class

In [18]:
# Rank of every column of x: the selected features all have rank 1.
# Presumably a larger rank means the feature was eliminated earlier - confirm against the RFE docs.
fit.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

### Tree Based methods

In [20]:
# Using CART method, so GINI Impurity index will be calculated

In [37]:
# With no criterion argument the classifier uses its default Gini impurity (CART-style tree).
# random_state is fixed so that the fitted tree, and therefore feature_importances_,
# is the same on every run; unseeded, tie-breaking between equally good splits is random.
model1 = DecisionTreeClassifier(random_state=0)

In [38]:
# Grow the tree on all eight predictors against the class labels.
model1.fit(X=x, y=y)

DecisionTreeClassifier()

In [39]:
# Extracting the important features: one importance score per column of x, in column order.
# The displayed scores add up to 1; a larger score means a more important feature.

model1.feature_importances_

array([0.06363197, 0.32425016, 0.07398163, 0.01499212, 0.04172211,
       0.23502093, 0.12829627, 0.11810481])

Inference: A higher importance score means a more useful feature, so from the scores above we can conclude that plas (0.324), mass (0.235), pedi (0.128) and age (0.118) are the most important variables for predicting the class variable, while skin (0.015) and test (0.042) contribute the least