### Classification using KNN

##### Hybrid PSO is used for Feature Selection

##### Naive Bayes is used for calculation of accuracy

In [1]:
#Load the necessory packages and libraries
# Need to install all the libraries that are not in your system

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pywt
from scipy import signal
from sklearn.metrics import accuracy_score
import random
import pyswarms as ps
%matplotlib inline

In [3]:
#Read the extracted features csv file

In [4]:
features = pd.read_csv("features.csv", sep = ',')

In [5]:
features.head()

Unnamed: 0,mean_cA,energy_cA,std_cA,var_cA,absvalue_cA,avgpower_cA,mean_cD1,energy_cD1,std_cD1,var_cD1,...,var_cD3,absvalue_cD3,avgpower_cD3,mean_cD4,energy_cD4,std_cD4,var_cD4,absvalue_cD4,avgpower_cD4,y
0,168.657765,3690802.0,434.350715,188660.5,379.379385,217106.0,42.936816,153526.3,84.778474,7187.389589,...,2178.476128,36.189001,2184.247992,-0.101826,7569.491844,9.0701,82.266717,7.226628,82.277085,0
1,526.897609,29828450.0,1215.316304,1476994.0,1157.791397,1754615.0,-140.260084,11645730.0,815.702345,665370.315473,...,46511.947059,106.431487,46516.016342,-0.642356,74217.980651,28.395502,806.30456,12.574428,806.717181,1
2,-165.463089,595484.0,87.466787,7650.439,166.27727,35028.47,-31.466787,94365.42,67.533312,4560.748192,...,379.181988,16.762068,379.790965,-0.263038,1716.899579,4.311933,18.592763,3.476872,18.661952,0
3,-309.218794,1686014.0,59.674572,3561.055,309.218794,99177.32,8.360773,18258.1,31.687591,1004.103428,...,51.104956,5.85184,51.254145,-0.0406,813.688912,2.973684,8.842796,2.419046,8.844445,0
4,-114.470121,595041.7,147.98326,21899.05,150.707093,35002.45,-16.875378,52989.87,53.219103,2832.27295,...,768.233538,20.67428,770.06384,0.208534,2503.114777,5.211937,27.164283,4.083667,27.207769,0


In [6]:
#EEG Signal Data
eeg_X = features.drop('y',axis=1)
eeg_X.head(1)

Unnamed: 0,mean_cA,energy_cA,std_cA,var_cA,absvalue_cA,avgpower_cA,mean_cD1,energy_cD1,std_cD1,var_cD1,...,std_cD3,var_cD3,absvalue_cD3,avgpower_cD3,mean_cD4,energy_cD4,std_cD4,var_cD4,absvalue_cD4,avgpower_cD4
0,168.657765,3690802.0,434.350715,188660.543812,379.379385,217105.985661,42.936816,153526.315353,84.778474,7187.389589,...,46.674148,2178.476128,36.189001,2184.247992,-0.101826,7569.491844,9.0701,82.266717,7.226628,82.277085


In [7]:
#EEG Signal Label
eeg_Y = features['y']

In [8]:
# Normalization of Features
#standardization
#standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(eeg_X)
scaled_features = scaler.transform(eeg_X)
eeg_features = pd.DataFrame(scaled_features)
eeg_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.949482,-0.042852,0.436261,-0.019769,0.329815,-0.042852,0.784123,-0.306785,-0.357107,-0.310274,...,0.159414,-0.137772,0.245653,-0.137582,-0.139863,-0.099876,0.234121,-0.099132,0.42065,-0.099876
1,2.625738,2.182559,2.695884,2.25353,2.860447,2.182559,-2.495066,2.218741,2.660073,2.215925,...,3.068352,2.214875,2.059336,2.21368,-0.931173,1.653685,2.494261,1.659859,1.431335,1.653685
2,-0.613918,-0.306393,-0.567403,-0.339167,-0.362984,-0.306393,-0.547687,-0.319787,-0.428294,-0.320355,...,-0.308818,-0.233256,-0.255957,-0.233286,-0.37587,-0.253861,-0.322355,-0.253823,-0.28802,-0.253861
3,-1.286572,-0.213543,-0.647816,-0.346382,0.101721,-0.213543,0.165219,-0.336512,-0.576261,-0.334006,...,-0.520954,-0.250666,-0.537662,-0.250711,-0.05023,-0.277625,-0.478865,-0.27751,-0.487939,-0.277625
4,-0.375314,-0.306431,-0.392306,-0.314025,-0.413603,-0.306431,-0.286504,-0.328879,-0.487381,-0.326989,...,-0.166903,-0.21261,-0.154942,-0.212587,0.314491,-0.233175,-0.217098,-0.232999,-0.173341,-0.233175


In [9]:
#Cnvert Dataframe into Numpy Array for PSO
X = np.array(eeg_features)
y = np.array(eeg_Y)
print(X.shape)
print(y.shape)

(11500, 30)
(11500,)


##### Feature Selection using PSO (Naive Bayes)

In [10]:
#Calculating Fitness using Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Create an instance of Naive Bayes
classifier = GaussianNB()

# Define objective function
def f_per_particle(m, alpha):
    """Computes for the objective function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.
    alpha: float (default is 0.5)
        Constant weight for trading-off classifier performance
        and number of features

    Returns
    -------
    numpy.ndarray
        Computed objective function
    """
    total_features = 30
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_subset = X
    else:
        X_subset = X[:,m==1]
    # Perform classification and store performance in P
    classifier.fit(X_subset, y)
    P = (classifier.predict(X_subset) == y).mean()
    # Compute for the objective function
    j = (alpha * (1.0 - P)
        + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))
    return j

In [11]:
# The PSO Process
def f(x, alpha=0.88):
    """Higher-level method to do classification in the
    whole swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        The swarm that will perform the search

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle
    """
    n_particles = x.shape[0]
    j = [f_per_particle(x[i], alpha) for i in range(n_particles)]
    return np.array(j)

In [12]:
# Initialize swarm, arbitrary
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO
dimensions = 30 # dimensions should be the number of features

#Optimize
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)
#Note : We can change the number of iterations to optimize more
# Perform optimization
cost, pos = optimizer.optimize(f, iters=100)

2019-05-07 23:03:55,800 - pyswarms.discrete.binary - INFO - Optimize for 100 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 30, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|100/100, best_cost=0.0597
2019-05-07 23:04:34,410 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.0596730434782609, best pos: [1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 0]


In [14]:
# Apply KNN on selected features to caclulate the accuracy
# Get the selected features from the final positions
selected_features = X[:,pos==1]  # subset

In [15]:
selected_features.shape

(11500, 21)

In [17]:
# Create dataframe of selected features
selected_features = pd.DataFrame(selected_features)

In [18]:
selected_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.949482,-0.042852,0.436261,-0.019769,0.329815,-0.042852,-0.306785,-0.310274,-0.322722,-0.306785,...,0.20629,-0.138815,-0.1391,-0.137582,0.159414,-0.137772,-0.137582,-0.099876,0.234121,0.42065
1,2.625738,2.182559,2.695884,2.25353,2.860447,2.182559,2.218741,2.215925,2.301297,2.218741,...,1.939113,1.070068,1.064722,2.21368,3.068352,2.214875,2.21368,1.653685,2.494261,1.431335
2,-0.613918,-0.306393,-0.567403,-0.339167,-0.362984,-0.306393,-0.319787,-0.320355,-0.463506,-0.319787,...,-0.272174,-0.258945,-0.259174,-0.233286,-0.308818,-0.233256,-0.233286,-0.253861,-0.322355,-0.28802
3,-1.286572,-0.213543,-0.647816,-0.346382,0.101721,-0.213543,-0.336512,-0.334006,-0.569652,-0.336512,...,-0.533629,-0.285521,-0.285623,-0.250711,-0.520954,-0.250666,-0.250711,-0.277625,-0.478865,-0.487939
4,-0.375314,-0.306431,-0.392306,-0.314025,-0.413603,-0.306431,-0.328879,-0.326989,-0.456352,-0.328879,...,-0.378197,-0.27305,-0.273104,-0.212587,-0.166903,-0.21261,-0.212587,-0.233175,-0.217098,-0.173341


#### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_features,eeg_Y,test_size=0.20)

In [20]:
##Training and Predicting

In [21]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

In [22]:
# Create an instance of the classifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [23]:
predictions = clf.predict(X_test)

##### Performance Evaluation

In [24]:
from sklearn.metrics import confusion_matrix
# Confusion Matrix
cm = confusion_matrix(y_test, predictions)

In [25]:
cm

array([[1822,   36],
       [  29,  413]])

In [26]:
True_Positive = cm[0][0]
True_Negative = cm[1][1]
False_Positive = cm[0][1]
False_Negative = cm[1][0]

In [27]:
Accuracy = (True_Positive + True_Negative) / (True_Positive + True_Negative + False_Positive + False_Negative) * 100
print("%.2f" % Accuracy)

97.17


In [28]:
Sensitivity = True_Positive / (True_Positive + False_Negative) * 100
print("%.2f" % Sensitivity)

98.43


In [29]:
Specificity = True_Negative / (True_Negative + False_Positive) * 100
print("%.2f" % Specificity) 

91.98
