## Feature Selection and Classification version1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#from sklearn.preprocessing import LabelEncoder #string labels to int
from sklearn.feature_selection import chi2, mutual_info_classif, SelectKBest

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

### Load the data
load the preprocessed data (filtering, feature extraction).   
and take a look at the dataframe structure

In [2]:
sampling_rate = 128
path = '../data/preprocessed/'
features = pd.read_csv(path+'featuresCarlo.csv', ',')
features.head()

Unnamed: 0,AF3_beta,F7_beta,F3_beta,FC5_beta,T7_beta,P7_beta,O1_beta,O2_beta,P8_beta,T8_beta,...,FC5_mu,T7_mu,P7_mu,O1_mu,O2_mu,P8_mu,T8_mu,FC6_mu,F4_mu,F8_mu
0,9.617036,9.626228,9.590239,9.623327,9.589991,9.590804,9.567983,9.582804,9.574996,9.617209,...,9.972018,9.978088,9.94664,9.907507,9.920496,9.902075,9.984438,9.98215,9.949355,9.957458
1,10.281531,10.350626,10.272987,10.294057,10.321789,10.265471,10.242162,10.268789,10.245133,10.33949,...,10.157041,10.185575,10.147617,10.12731,10.135965,10.117657,10.178475,10.168443,10.142686,10.148809
2,9.704346,9.717879,9.685493,9.684669,9.691322,9.686065,9.678658,9.688431,9.678609,9.711365,...,9.624257,9.659974,9.642986,9.636403,9.645297,9.629779,9.650872,9.650194,9.641817,9.641468
3,10.062664,10.014825,9.952124,9.974973,9.990027,9.940493,9.940601,9.94762,9.951701,10.029456,...,10.596914,10.614033,10.56346,10.550088,10.57149,10.559683,10.656713,10.620954,10.602026,10.608514
4,9.925013,9.979357,9.939241,9.950245,9.985936,9.932586,9.921234,9.930336,9.923825,9.978828,...,10.294519,10.312448,10.267675,10.250802,10.261787,10.250422,10.305293,10.302618,10.269625,10.282871


Get the values and labels

In [10]:
X = features[features.columns.difference(['id', 'label'])]
y = features.label
print("Number of labels 0, 1, -1: ", np.sum(y==0), np.sum(y==1), np.sum(y==-1))
print("Number of features in total:", X.shape[1])
print("Number of data samples:", X.shape[0])

('Number of labels 0, 1, -1: ', 80, 60, 60)
('Number of features in total:', 26)
('Number of data samples:', 200)


### Feature Selection
Get the scores for each feature  
(i used "mutual information measure" because it seemed the most informative but the current data we have is garbage)

In [4]:
K = 10 #number of features selected
selector = SelectKBest(mutual_info_classif, k=K).fit(X, y) #fit returns object
scores_selector = selector.scores_
print("Scores corresponding to features:\n", scores_selector)

('Scores corresponding to features:\n', array([0.08203516, 0.04751304, 0.05753327, 0.0266582 , 0.        ,
       0.        , 0.0384269 , 0.03721635, 0.04866487, 0.02020333,
       0.07332651, 0.        , 0.07447342, 0.04072608, 0.        ,
       0.        , 0.        , 0.        , 0.02495864, 0.        ,
       0.        , 0.        , 0.08316939, 0.        , 0.        ,
       0.00628824]))


You can just make the selector object return the reduced features matrix and the indices.   
Be aware that **selected features are not in ranked order** but this shouldn't be a problem because if you want to increase the number of selected features just do it with the selector object   
*During application, we will create the selector with the train data (X_train) and transform the validation data (X_val) with it.*

In [5]:
X_reduced = selector.transform(X)
selected_features = selector.get_support(indices=True)
print("Selected feature indices:", selected_features)
print("New reduced feature matrix shape:", X_reduced.shape)

('Selected feature indices:', array([ 0,  1,  2,  6,  7,  8, 10, 12, 13, 22]))
('New reduced feature matrix shape:', (200, 10))


## Classification
Comparing training accuracies of different methods.

In [7]:
names = ["LDA", "Linear SVM", "Quadratic SVM", "Nearest Neighbor", "Naive Bayes"]
classifiers = [LDA(), SVC(kernel="linear"), SVC(gamma=2), KNeighborsClassifier(3), GaussianNB()]

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=.2, random_state=7)

print("Cross validation accuracies:")
for n, c in zip(names, classifiers):
        c.fit(X_train, y_train)
        scores = cross_val_score(c, X_test, y_test,  cv=5)
        print(n, "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std())) 

Cross validation accuracies:
('LDA', '0.35 (+/- 0.13)')
('Linear SVM', '0.45 (+/- 0.04)')
('Quadratic SVM', '0.45 (+/- 0.04)')
('Nearest Neighbor', '0.40 (+/- 0.12)')
('Naive Bayes', '0.38 (+/- 0.08)')
