In [9]:
import sys
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

dataset = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
print(dataset.shape)

(768, 9)


In [23]:
# Rescale - referred to as normalization of attributes are often rescaled into the range between 0 and 1.
# You can rescale your data using scikit-learn using the MinMaxScaler class
array = dataset.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
# summarize transformed data
set_printoptions(precision=3)
print(rescaledX[0:5,:])

[[ 0.353  0.744  0.59   0.354  0.     0.501  0.234  0.483]
 [ 0.059  0.427  0.541  0.293  0.     0.396  0.117  0.167]
 [ 0.471  0.92   0.525  0.     0.     0.347  0.254  0.183]
 [ 0.059  0.447  0.541  0.232  0.111  0.419  0.038  0.   ]
 [ 0.     0.688  0.328  0.354  0.199  0.642  0.944  0.2  ]]


In [21]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
# feature extraction
# Select 4 best attributes/features to get better accuracy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])

[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]
[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]


In [24]:
# Feature selection using Recursive feature elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, LinearRegression

model = LogisticRegression()

rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

model1 = LinearRegression()
rfe1 = RFE(model1, 3)
fit1 = rfe1.fit(X, Y)
print("Num Features: %d" % fit1.n_features_)
print("Selected Features: %s" % fit1.support_)
print("Feature Ranking: %s" % fit1.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]
Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 4 6 5 1 1 3]


In [26]:
# Principal Component Analysis - It is actually a data reduction technique
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [ 0.889  0.062  0.026]
[[ -2.022e-03   9.781e-02   1.609e-02   6.076e-02   9.931e-01   1.401e-02
    5.372e-04  -3.565e-03]
 [ -2.265e-02  -9.722e-01  -1.419e-01   5.786e-02   9.463e-02  -4.697e-02
   -8.168e-04  -1.402e-01]
 [ -2.246e-02   1.434e-01  -9.225e-01  -3.070e-01   2.098e-02  -1.324e-01
   -6.400e-04  -1.255e-01]]


In [30]:
# Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance
#of features. Following example uses ExtraTreesClassifier to estimate the importance of features
from sklearn.ensemble import ExtraTreesClassifier

array = dataset.values
# separate array into input and output components
X = array[:,0:8]
Y = array[:,8]
model = ExtraTreesClassifier()
model.fit(X, Y)
print(dataset.columns)
print(model.feature_importances_)


Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
[ 0.102  0.263  0.103  0.082  0.073  0.136  0.102  0.139]
