In [1]:
import numpy as np
import pandas as pd

In [2]:
# Dataset: -- A classification problem
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv("pima-indians-diabetes.data", names=names)
dataframe.head(10)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
X= dataframe[['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']]
y= dataframe['class']

In [4]:
# Missing values: 
# there are zeros in places where they are biologically impossible, such as the blood pressure attribute.
# It seems very likely that zero values encode missing data. 
# However, since the dataset donors made no such statement we encourage you to use your best judgement 
# and state your assumptions.

In [5]:
# Apply NB classifier, and evaluate the model using Cross Validation
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

clf = GaussianNB()

# fit the classifier on the training features and labels
#clf.fit(X, y) 

scoresAcc= cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print("Accuracy values: "+ str(list(scoresAcc)))
print('Mean:%.2f' %scoresAcc.mean())

Accuracy values: [0.75324675324675328, 0.72727272727272729, 0.74675324675324672, 0.78431372549019607, 0.74509803921568629]
Mean:0.75


In [6]:
X.shape

(768, 8)

# Feature Selection:

# 1. Univariate Selection: 


## Example 1: 
## Univariate Selection: F-test

(https://en.wikipedia.org/wiki/F-test)

In [7]:
from sklearn.feature_selection import SelectKBest # Select features according to the k highest scores.
from sklearn.feature_selection import f_classif

# feature extraction
#X_new = SelectKBest(score_func=chi2, k=4).fit_transform(X, y)
feat_selector= SelectKBest(score_func=f_classif, k=3)
X_new= feat_selector.fit_transform(X, y)

print(X_new.shape)
print(feat_selector.pvalues_)
# summarize scores
print(feat_selector.scores_)

(768, 3)
[  5.06512730e-10   8.93543165e-43   7.15139001e-02   3.83477048e-02
   2.86186460e-04   1.22980749e-16   1.25460701e-06   2.20997546e-11]
[  39.67022739  213.16175218    3.2569504     4.30438091   13.28110753
   71.7720721    23.8713002    46.14061124]


In [14]:
feature_names= X.columns.values # column names
mask = feat_selector.get_support() #list of booleans
new_features = [] # The list of your K best features

for bool, feature in zip(mask, feature_names):
    if bool:
        new_features.append(feature)
        
X_new= pd.DataFrame(X_new, columns= new_features)

In [15]:
# After feature selection: Apply NB classifier, and evaluate the model using Cross Validation
clf = GaussianNB()

# fit the classifier on the training features and labels
#clf.fit(X, y) 

scoresAcc= cross_val_score(clf, X_new, y, cv=5, scoring='accuracy')
print("Accuracy values: "+ str(list(scoresAcc)))
print('Mean:%.2f' %scoresAcc.mean())

Accuracy values: [0.76623376623376627, 0.72727272727272729, 0.74675324675324672, 0.80392156862745101, 0.78431372549019607]
Mean:0.77


## Example 2: 
## Univariate Selection: Chi-square test of independence

    (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html)
    
    Appropriate for only non-negative features such as booleans or frequencies (e.g., term counts in document classification), relative to the classes.

## Example 3: 
## Univariate Selection: mutual_info_classif

(http://scikitlearn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif)

In [16]:
from sklearn.feature_selection import SelectKBest # Select features according to the k highest scores.
from sklearn.feature_selection import mutual_info_classif

# feature extraction
#X_new = SelectKBest(score_func=chi2, k=4).fit_transform(X, y)
feat_selector= SelectKBest(score_func=mutual_info_classif, k=4)
X_new= feat_selector.fit_transform(X, y)

print(X_new.shape)
print(feat_selector.pvalues_)
# summarize scores
print(feat_selector.scores_)

(768, 4)
None
[ 0.0313025   0.10903278  0.          0.          0.02628396  0.07192999
  0.01164758  0.0471049 ]


In [17]:
feature_names= X.columns.values # column names
mask = feat_selector.get_support() #list of booleans
new_features = [] # The list of your K best features

for bool, feature in zip(mask, feature_names):
    if bool:
        new_features.append(feature)
        
X_new= pd.DataFrame(X_new, columns= new_features)

In [18]:
# After feature selection: Apply NB classifier, and evaluate the model using Cross Validation
clf = GaussianNB()

# fit the classifier on the training features and labels
#clf.fit(X, y) 

scoresAcc= cross_val_score(clf, X_new, y, cv=5, scoring='accuracy')
print("Accuracy values: "+ str(list(scoresAcc)))
print('Mean:%.2f' %scoresAcc.mean())

Accuracy values: [0.75974025974025972, 0.69480519480519476, 0.75324675324675328, 0.77124183006535951, 0.77777777777777779]
Mean:0.75


# 2. Removing features with low variance:

In [23]:
from sklearn.feature_selection import VarianceThreshold
# VarianceThreshold removes all features whose variance doesn’t meet some threshold. 
# By default, it removes all zero-variance features, i.e. features that have the same value in all samples.

feat_selector= VarianceThreshold()

X_new= feat_selector.fit_transform(X, y)

print(X_new.shape)

(768, 8)


# 3. Recursive feature elimination: 

# 4. Feature selection using SelectFromModel