## Module to apply various Feature Selection Algorithms to get a subset of features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import mutual_info_classif as mf

In [3]:
X = np.load("FINAL_scaled_data_sub1.npy")
y = np.load("label_sub1.npy")

## LogisticRegression Selection

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
lsvc = LogisticRegression()
lsvc.fit(X,y)

#print lsvc.coef_
lsvc.coef_ = np.array(lsvc.coef_)

model = SelectFromModel(lsvc, prefit = True)
X_new = model.transform(X)
feature1 = X_new

np.save("LogisticReg_data_selected", X_new)
print X_new.shape

(720, 329)


## Mutual Information selection

In [6]:
relation = mf(X,y)

index = []
for i in range(740):
    if relation[i] > 1.00e-01:
        index.append(i)
print len(index)
X_mf = []
for j in index:
    X_mf.append(X[:,j])
X_mf = np.transpose(np.array(X_mf))
print X_mf.shape
np.save("MF_data_selected", X_mf)

# Principal Component Analysis

In [11]:
import numpy as np
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA


kpca = KernelPCA(n_components = 10,kernel="rbf", fit_inverse_transform=True, gamma=10)
X4 = kpca.fit_transform(X)

print X.shape
print X4.shape

np.save("PCA_data_selected", X4)

### L1 Norm based Feature Selection

In [12]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel


lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_filtered4 = model.transform(X)
print X_filtered4.shape

np.save("L1_Norm_data_selected", X_filtered4)

(720, 2)


### Tree based Feature Selection

In [14]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


clf = ExtraTreesClassifier()
clf = clf.fit(X, y)

model = SelectFromModel(clf, prefit=True)
X_filtered5 = model.transform(X)       

print X_filtered5.shape
np.save("Tree_based_data_selected.npy", X_filtered5)

## Data after Selection for classification  :

In [17]:
X1 = np.load("LogisticReg_data_selected.npy")
X2 = np.load("L1_Norm_data_selected.npy")
X3 = np.load("PCA_data_selected.npy")
X4 = np.load("Tree_based_data_selected.npy")
X5 = np.load("MF_data_selected.npy")
X6 = np.load("FINAL_scaled_data_sub1.npy")
print X1.shape
print X2.shape
print X3.shape
print X4.shape
print X5.shape
print X6.shape
X = [] 
X.append(X1)
X.append(X2)
X.append(X3)
X.append(X4)
X.append(X5)
X.append(X6)



(720, 329)
(720, 2)
(720, 10)
(720, 301)
(720, 51)
(720, 740)


In [21]:
X_all = np.load("data_scaled_all_channels.npy")
X_all = X_all[range(720),:]
y_all = np.load("outfile_label.npy")
y_all = y_all[range(720)]
print X_all.shape
print y_all.shape

(720, 2220)
(720,)


## Classification of motor Imagery

## Naive Bayes Classifier

In [22]:
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

    gnb = GaussianNB()
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    y_cross = cross_validate(gnb, x, y, cv =10)
    print ("cross validation result" )
    print np.mean(y_cross["test_score"])
    print np.mean(y_cross["train_score"])
    print ("normal spliting result")
    print metrics.accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)


  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


cross validation result
0.401388888889
0.61049382716
normal spliting result
0.555555555556
cross validation result
0.277777777778
0.27299382716
normal spliting result
0.244444444444
cross validation result
0.25
0.25
normal spliting result
0.238888888889
cross validation result
0.351388888889
0.495061728395
normal spliting result
0.466666666667
cross validation result
0.355555555556
0.486111111111
normal spliting result
0.444444444444
cross validation result
0.327777777778
0.508641975309
normal spliting result
0.422222222222


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=10)

gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

y_cross = cross_validate(gnb, x, y, cv =10)
print ("cross validation result" )
print np.mean(y_cross["test_score"])
print np.mean(y_cross["train_score"])
print ("normal spliting result")
print metrics.accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)

cross validation result
0.327777777778
0.508641975309
normal spliting result
0.611111111111


### Decision Tree Classification

In [25]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)

    modelDecisionTree = tree.DecisionTreeClassifier()
    modelDecisionTree = modelDecisionTree.fit(X_train, y_train)
    y_cross = cross_validate(modelDecisionTree, x, y, cv =10)
    print ("cross validation result" )
    print ("1. test:")
    print np.mean(y_cross["test_score"])
    print ("2. train")
    print np.mean(y_cross["train_score"])
    print ("normal spliting result")

    y_test_pred = modelDecisionTree.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)



cross validation result
1. test:
0.320833333333
2. train
1.0
normal spliting result
0.422222222222
cross validation result
1. test:
0.297222222222
2. train
1.0
normal spliting result
0.272222222222
cross validation result
1. test:
0.25
2. train
0.25
normal spliting result
0.238888888889
cross validation result
1. test:
0.319444444444
2. train
1.0
normal spliting result
0.488888888889
cross validation result
1. test:
0.319444444444
2. train
1.0
normal spliting result
0.505555555556
cross validation result
1. test:
0.315277777778
2. train
1.0
normal spliting result
0.433333333333


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=40)

modelDecisionTree = tree.DecisionTreeClassifier()
modelDecisionTree = modelDecisionTree.fit(X_train, y_train)
y_cross = cross_validate(modelDecisionTree, x, y, cv =10)
print ("cross validation result" )
print ("1. test:")
print np.mean(y_cross["test_score"])
print ("2. train:")
print np.mean(y_cross["train_score"])
print ("normal spliting result")

y_test_pred = modelDecisionTree.predict(X_test)

print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)


cross validation result
1. test:
0.325
2. train:
1.0
normal spliting result
0.638888888889


### Stocastic Gradient Decent

In [27]:
from sklearn.linear_model import SGDClassifier
for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y , test_size=0.25, random_state=40)

    modelSGD = SGDClassifier(loss="hinge", penalty="l1")
    modelSGD.fit(X_train, y_train)
    y_cross = cross_validate(modelSGD, x, y, cv =10)
    print ("cross validation result" )
    print ("1. test:")
    print np.mean(y_cross["test_score"])
    print ("2. train")
    print np.mean(y_cross["train_score"])
    print ("normal spliting result")
    y_test_pred = modelSGD.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)



cross validation result
1. test:
0.463888888889
2. train
0.741512345679
normal spliting result
0.533333333333
cross validation result
1. test:
0.2625
2. train
0.267901234568
normal spliting result
0.283333333333
cross validation result
1. test:
0.25
2. train
0.25
normal spliting result
0.238888888889
cross validation result
1. test:
0.406944444444
2. train
0.691512345679
normal spliting result
0.533333333333
cross validation result
1. test:
0.320833333333
2. train
0.493981481481
normal spliting result
0.527777777778
cross validation result
1. test:
0.386111111111
2. train
0.679475308642
normal spliting result
0.55


### Support Vector Machines

In [28]:
from sklearn import svm
for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)

    modelSVM = svm.LinearSVC()
    modelSVM.fit(X_train,y_train)
    y_cross = cross_validate(modelSVM, x, y, cv =10)
    print ("cross validation result" )
    print ("1. test:")
    print np.mean(y_cross["test_score"])
    print ("2. train:")
    print np.mean(y_cross["train_score"])
    print ("normal spliting result")

    y_test_pred = modelSVM.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)


cross validation result
1. test:
0.465277777778
2. train:
0.99274691358
normal spliting result
0.727777777778
cross validation result
1. test:
0.315277777778
2. train:
0.309413580247
normal spliting result
0.316666666667
cross validation result
1. test:
0.25
2. train:
0.25
normal spliting result
0.238888888889
cross validation result
1. test:
0.391666666667
2. train:
0.946604938272
normal spliting result
0.672222222222
cross validation result
1. test:
0.363888888889
2. train:
0.581327160494
normal spliting result
0.5
cross validation result
1. test:
0.338888888889
2. train:
1.0
normal spliting result
0.638888888889


In [29]:
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=40)

    modelSVM = svm.LinearSVC()
    modelSVM.fit(X_train,y_train)
    y_cross = cross_validate(modelSVM, X_all, y_all, cv =10)
    print ("cross validation result" )
    print ("1. test:")
    print np.mean(y_cross["test_score"])
    print ("2. train:")
    print np.mean(y_cross["train_score"])
    print ("normal spliting result")

    y_test_pred = modelSVM.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)


cross validation result
1. test:
0.584722222222
2. train:
0.999845679012
normal spliting result
0.833333333333


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=40)

modelSVM = svm.LinearSVC()
modelSVM.fit(X_train,y_train)
y_cross = cross_validate(modelSVM, X_all, y_all, cv =10)
print ("cross validation result" )
print ("1. test:")
print np.mean(y_cross["test_score"])
print ("2. train:")
print np.mean(y_cross["train_score"])
print ("normal spliting result")

y_test_pred = modelSVM.predict(X_test)

print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)


cross validation result
1. test:
0.583333333333
2. train:
1.0
normal spliting result
0.822222222222


In [96]:
for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)
    modelSVM = svm.NuSVC()
    modelSVM.fit(X_train,y_train)

    y_test_pred = modelSVM.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)

0.633333333333
0.203333333333
0.246666666667
0.59
0.23
0.596666666667


In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)

    modelknn = KNeighborsClassifier(n_neighbors=3)
    modelknn.fit(X_train,y_train)
    y_cross = cross_validate(modelknn, x, y, cv =10)
    """
    print ("cross validation result" )
    print ("1. test:")
    print (y_cross["test_score"])
    print ("2. train")
    print (y_cross["train_score"])"""
    print ("\n")
    print np.mean(y_cross["test_score"])
    
 
    y_test_pred = modelknn.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)





0.368055555556
0.883333333333


0.297222222222
0.261111111111


0.25
0.261111111111


0.383333333333
0.866666666667


0.345833333333
0.688888888889


0.333333333333
0.85


In [35]:
    # knn
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=40)

    modelknn = KNeighborsClassifier(n_neighbors=3)
    modelknn.fit(X_train,y_train)
    y_cross = cross_validate(modelknn, X_all, y_all, cv =10)
    """
    print ("cross validation result" )
    print ("1. test:")
    print (y_cross["test_score"])
    print ("2. train")
    print (y_cross["train_score"])"""
    print ("\n")
    print np.mean(y_cross["test_score"])
    
 
    y_test_pred = modelknn.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)



0.559722222222
0.75


In [66]:
for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)
    modelSVM = svm.LinearSVC()
    modelSVM.fit(X_train,y_train)

    y_test_pred = modelSVM.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)

0.62
0.39
0.246666666667
0.6
0.303333333333


### Random Forest Classifier

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

for x in X:
    modelRF = RandomForestClassifier(n_estimators=5, max_depth=None,min_samples_split=3, random_state=10)
    scores = cross_val_score(modelRF, x, y)
    print scores.mean()                             



0.34
0.288333333333
0.248333333333
0.3375
0.344166666667


### Extra Trees Classifier

In [68]:
from sklearn.ensemble import ExtraTreesClassifier
for x in X:
    modelET = ExtraTreesClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
    scores = cross_val_score(modelET, x, y)
    print scores.mean()

0.330833333333
0.29
0.388333333333
0.344166666667
0.368333333333


### AdaBoost

In [69]:
from sklearn.ensemble import AdaBoostClassifier

for x in X:
    modelAda = AdaBoostClassifier(n_estimators=100)
    scores = cross_val_score(modelAda,x,y)
    print scores.mean()                             

0.37
0.293333333333
0.2425
0.37
0.3175


### Gradient Tree Boosting Classifier

In [74]:
from sklearn.ensemble import GradientBoostingClassifier

for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)

    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)
    print clf.score(X_test, y_test) 

0.59
0.36
0.29
0.623333333333
0.63


### Two Class AdaBoost

In [75]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles
for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)

    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME",
                             n_estimators=200)

    bdt.fit(X_train, y_train)

    y_test_pred = bdt.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)


0.516666666667
0.363333333333
0.256666666667
0.526666666667
0.493333333333


### Multi Layer Perceptron

In [70]:
from sklearn.neural_network import MLPClassifier

for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=40)

    modelMLP = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)

    modelMLP.fit(X_train, y_train)

    y_test_pred = modelMLP.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)

0.233333333333
0.24
0.236666666667
0.233333333333
0.24


### Linear Discriminant Analysis

In [73]:

from sklearn.decomposition import LatentDirichletAllocation as LDA

for x in X:
    X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=40)

    modelLDA = LDA()

    modelLDA.fit(X_train, y_train)

    y_test_pred = modelLDA.predict(X_test)

    print metrics.accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)



AttributeError: 'LatentDirichletAllocation' object has no attribute 'predict'

In [7]:
X2

array([[ 0.0006142 ,  0.01422066],
       [ 0.00030224,  0.0062485 ],
       [ 0.00081061,  0.01698939],
       ..., 
       [ 0.00583448,  0.12693771],
       [ 0.00166921,  0.02382961],
       [ 0.00152293,  0.03716975]])