In [1]:
from pandas import read_csv
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
import os

In [2]:
# Load the diabetes dataset from the current working directory.
# os.path.join builds the path with the platform separator and avoids a
# doubled slash when the working directory already ends with one.
filename = os.path.join(os.getcwd(), 'datasets_228_482_diabetes.csv')
# Column labels are supplied explicitly; because `names` is passed, pandas
# reads every line of the file as data rather than consuming a header row.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = read_csv(filename, names=names)

In [3]:
# Display the loaded frame to confirm the columns and values look as expected.
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# Separate predictors from the target: the first eight columns are the
# features and the ninth column ('class') is the label.
array = df.values
X, Y = array[:, :8], array[:, 8]

#### 1. Select K Best

In [5]:
# Univariate selection: rank every feature with the ANOVA F-test (f_classif)
# against the class label and retain the four highest-scoring features.
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, Y)

In [6]:
# summarize scores
# One ANOVA F-score per feature, in the same order as the columns of X.
print(fit.scores_)

[ 39.67022739 213.16175218   3.2569504    4.30438091  13.28110753
  71.7720721   23.8713002   46.14061124]


In [7]:
# Summarize selected features: reduce X to the four columns kept by SelectKBest.
# The selector is referenced through `test` (the SelectKBest instance) rather
# than `fit`, because `fit` is rebound by every later section and would silently
# point at a different estimator if cells are run out of order.
features = test.transform(X)
print(features)

[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 ...
 [  5.  121.   26.2  30. ]
 [  1.  126.   30.1  47. ]
 [  1.   93.   30.4  23. ]]


#### 2. Recursive Feature Elimination

In [8]:
# Base estimator for recursive feature elimination: a logistic regression
# fitted with the liblinear solver.
model = LogisticRegression(solver="liblinear")

In [9]:
# Recursive feature elimination: repeatedly fit `model` and drop the weakest
# feature until three remain.
# n_features_to_select is keyword-only in current scikit-learn; passing it
# positionally (RFE(model, 3)) raises a TypeError.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

In [10]:
# Summarize selected features: reduce X to the three columns retained by RFE.
# The selector is referenced through `rfe` rather than `fit`, because `fit` is
# rebound by every section of this notebook and would point at a different
# estimator if cells are run out of order.
features = rfe.transform(X)
print(features)

[[ 6.    33.6    0.627]
 [ 1.    26.6    0.351]
 [ 8.    23.3    0.672]
 ...
 [ 5.    26.2    0.245]
 [ 1.    30.1    0.349]
 [ 1.    30.4    0.315]]


#### 3. Feature Importances based on Extremely Randomized Trees (Extra Trees)

In [11]:
# Extra-trees ensemble whose fitted feature importances are inspected below.
# A fixed random_state makes the importances reproducible across runs; without
# it every execution draws different random splits and reports different values.
model = ExtraTreesClassifier(n_estimators=10, random_state=42)

In [12]:
# Train the ensemble on the full feature matrix and labels.
fit = model.fit(X, Y)

In [13]:
# Importance score of each feature, in the column order of X (higher = more important).
fit.feature_importances_

array([0.11230582, 0.22668248, 0.09695742, 0.07798527, 0.07687186,
       0.14325024, 0.11465534, 0.15129156])

#### 4. Dimensionality Reduction with PCA

In [14]:
# Principal component analysis keeping the three leading components.
pca = PCA(n_components=3)

In [15]:
# PCA is unsupervised: its fit() ignores any target argument, so only X is passed.
# NOTE(review): X is not standardized before PCA, so columns with large numeric
# ranges will dominate the components — consider scaling first.
fit = pca.fit(X)

In [18]:
# Fraction of the total variance captured by each of the three components.
# Read from `pca` directly: `fit` is reassigned in every section of this
# notebook, so it only refers to the PCA model when cells run strictly in order.
pca.explained_variance_ratio_

array([0.88854663, 0.06159078, 0.02579012])

In [19]:
# Principal axes: one row per component, one column per original feature.
# Accessed via `pca` rather than the shared `fit` name, which other sections
# rebind to different estimators.
pca.components_

array([[-2.02176587e-03,  9.78115765e-02,  1.60930503e-02,
         6.07566861e-02,  9.93110844e-01,  1.40108085e-02,
         5.37167919e-04, -3.56474430e-03],
       [-2.26488861e-02, -9.72210040e-01, -1.41909330e-01,
         5.78614699e-02,  9.46266913e-02, -4.69729766e-02,
        -8.16804621e-04, -1.40168181e-01],
       [-2.24649003e-02,  1.43428710e-01, -9.22467192e-01,
        -3.07013055e-01,  2.09773019e-02, -1.32444542e-01,
        -6.39983017e-04, -1.25454310e-01]])

In [16]:
# Project X onto the three principal components.
# Uses `pca` rather than the shared `fit` name so the result does not depend on
# which section of the notebook was executed last.
pca.transform(X)

array([[-75.71465491, -35.95078264,  -7.26078895],
       [-82.3582676 ,  28.90821322,  -5.49667139],
       [-74.63064344, -67.90649647,  19.46180812],
       ...,
       [ 32.11319827,   3.3766648 ,  -1.58786446],
       [-80.21449431, -14.18601977,  12.3512639 ],
       [-81.30814972,  21.62149606,  -8.15276833]])