# Classification Implementation

In [1]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
LOGGING_TYPES = dict(DEBUG=logging.DEBUG, INFO=logging.INFO, WARNING=logging.WARNING,
                     ERROR=logging.ERROR, CRITICAL=logging.CRITICAL)
logger = logging.getLogger('yasa')

%matplotlib qt


In [2]:
# load reference_df     
reference_df = pd.read_csv("reference_df.csv", index_col="name")
reference_df.head(3)

Unnamed: 0_level_0,hypno,df_feat
name,Unnamed: 1_level_1,Unnamed: 2_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv


In [3]:
# load csv    
rankings_df = pd.read_csv("rankings_df.csv", index_col="method_name")
rankings_df.head(3)

Unnamed: 0_level_0,sb,ab,bs,ag,sg,gs,ba,ta_b,alpha,ga,...,mean_psd,kurt,E,WEn,renyi,mean_distance,diffEnt,skew,tsallisEnt,mean
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,1.0,2.0,3.0,4.0,5.0,10.0,11.0,6.0,7.0,22.0,...,66.0,72.0,67.0,69.0,70.0,65.0,71.0,73.0,74.0,75.0
MI,1.0,2.0,3.0,4.0,8.0,9.0,5.0,7.0,17.0,6.0,...,70.0,67.0,73.0,68.0,71.0,74.0,72.0,66.0,69.0,75.0
chiSqr,2.0,1.0,3.0,6.0,9.0,4.0,8.0,16.0,5.0,7.0,...,66.0,64.0,67.0,71.0,68.0,72.0,69.0,73.0,74.0,75.0


In [4]:
# to see id's
idx = reference_df.index.to_list()

# to load hypno:
hypno_loc = reference_df.loc[idx[10], "hypno"]
hypno_30s = np.loadtxt(hypno_loc, dtype ='int')[:, 0]

# # to load features:
df_feat_loc = reference_df.loc[idx[10], "df_feat"]
df_feat = pd.read_csv(df_feat_loc, index_col=False)


In [5]:
hypno_30s.shape

(822,)

In [6]:
df_feat.shape

(822, 75)

Omitting artifcat epochs

In [7]:
df_feat = df_feat.loc[hypno_30s!=-1]
hypno_30s = hypno_30s[hypno_30s!=-1]

In [8]:
print('Class labels:', np.unique(hypno_30s))

Class labels: [0 1 2 3 5]


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_feat, hypno_30s, test_size=0.2, random_state=1, stratify=hypno_30s
)


In [11]:
print("Labels counts in y:", np.bincount(hypno_30s))
print("Labels counts in y_train:", np.bincount(y_train))
print("Labels counts in y_test:", np.bincount(y_test))

Labels counts in y: [ 24  12 343 131   0 149]
Labels counts in y_train: [ 19  10 274 105   0 119]
Labels counts in y_test: [ 5  2 69 26  0 30]


In [12]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(df_feat)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Perceptron

In [13]:
from sklearn.linear_model import Perceptron

ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)

Perceptron(eta0=0.1, random_state=1)

In [14]:
y_pred = ppn.predict(X_test_std)
print('Misclassified examples: %d' % (y_test != y_pred).sum())

Misclassified examples: 14


In [15]:
from sklearn.metrics import accuracy_score

print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.894


In [16]:
print('Accuracy: %.3f' % ppn.score(X_test_std, y_test))

Accuracy: 0.894


# Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()  # C=100.0, solver='lbfgs', multi_class='ovr')
lr.fit(X_train_std, y_train)

print(lr.predict_proba(X_test_std[:3, :]).argmax(axis=1))


[3 2 4]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
y_pred = lr.predict(X_test_std)
print("Misclassified examples: %d" % (y_test != y_pred).sum())

print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))


Misclassified examples: 6
Accuracy: 0.955


In [19]:
weights, params = [], []
for c in np.arange(-5, 5):
    lr = LogisticRegression(C=10.0**c, multi_class="ovr")
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10.0**c)

weights = np.array(weights)
params = np.array(params)
plt.figure()
plt.plot(params, weights[:, 0], label="std")
plt.plot(params, weights[:, 1], linestyle="--", label="mean")
plt.ylabel("Weight coefficient")
plt.xlabel("C")
plt.legend(loc="upper left")
plt.xscale("log")
# plt.savefig('figures/03_08.png', dpi=300)
plt.show()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

# Maximum margin classification with support vector machines

In [20]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0, random_state=1)
svm.fit(X_train_std, y_train)

SVC(kernel='linear', random_state=1)

In [21]:
y_pred = lr.predict(X_test_std)
print('Misclassified examples: %d' % (y_test != y_pred).sum())

print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Misclassified examples: 5
Accuracy: 0.962


In [22]:
from sklearn.linear_model import SGDClassifier

ppn = SGDClassifier(loss='perceptron')
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)
print('ppn Misclassified examples: %d' % (y_test != y_pred).sum())
print('ppn Accuracy: %.3f' % accuracy_score(y_test, y_pred))

lr = SGDClassifier(loss='log')
lr.fit(X_train_std, y_train)
y_pred = lr.predict(X_test_std)
print('lr Misclassified examples: %d' % (y_test != y_pred).sum())
print('lr Accuracy: %.3f' % accuracy_score(y_test, y_pred))

svm = SGDClassifier(loss='hinge')
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print('svm Misclassified examples: %d' % (y_test != y_pred).sum())
print('svm Accuracy: %.3f' % accuracy_score(y_test, y_pred))


ppn Misclassified examples: 8
ppn Accuracy: 0.939
lr Misclassified examples: 9
lr Accuracy: 0.932
svm Misclassified examples: 6
svm Accuracy: 0.955


# Plotting feature space 

In [23]:
feat1 = "sb"
feat2 = "alpha"

df_feat1 = df_feat.copy()
df_feat1 = df_feat1[[feat1, feat2]]

sc1 = StandardScaler()
sc1.fit(df_feat1)
X = sc1.transform(df_feat1)
y = hypno_30s

plt.figure(figsize=(7,5))
plt.scatter(X[y == 0, 0], X[y == 0, 1], marker="s", label="Class 0")
plt.scatter(X[y == 1, 0], X[y == 1, 1], marker="o", label="Class 1")
plt.scatter(X[y == 2, 0], X[y == 2, 1], marker="o", label="Class 2")
plt.scatter(X[y == 3, 0], X[y == 3, 1], marker="o", label="Class 3")
plt.scatter(X[y == 4, 0], X[y == 4, 1], marker="o", label="Class 4")

plt.xlabel(f"Feature 1 - {feat1}")
plt.ylabel(f"Feature 2 - {feat2}")
plt.legend(loc="best")
plt.tight_layout()
# plt.savefig(f"{feat1}_vs_{feat2} features.png", dpi=300)
# plt.savefig(f"{feat1}_vs_{feat2} features.svg")
plt.show()


In [24]:
fig, axes = plt.subplots(5, 5, figsize=(10, 10))
for i, ax in enumerate(axes.ravel()):
    feat1 = rankings_df.columns.values.tolist()[int(i // 5 + 7)]
    feat2 = rankings_df.columns.values.tolist()[int(np.random.randint(1, 65))]

    df_feat1 = df_feat.copy()
    df_feat1 = df_feat1[[feat1, feat2]]

    sc1 = StandardScaler()
    sc1.fit(df_feat1)
    X = sc1.transform(df_feat1)
    y = hypno_30s

    ax.scatter(X[y == 0, 0], X[y == 0, 1], marker="o", label="Class 0")
    ax.scatter(X[y == 1, 0], X[y == 1, 1], marker="v", label="Class 1")
    ax.scatter(X[y == 2, 0], X[y == 2, 1], marker="s", label="Class 2")
    ax.scatter(X[y == 3, 0], X[y == 3, 1], marker="*", label="Class 3")
    ax.scatter(X[y == 4, 0], X[y == 4, 1], marker="p", label="Class 4")

    ax.set_yticks([])
    ax.set_xticks([])

    ax.set_xlabel(f"{feat1}")
    ax.set_ylabel(f"{feat2}")

handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='lower right')

plt.tight_layout()
# plt.savefig(f"2D features space.png", dpi=300)
# plt.savefig(f"2D features space.svg")
plt.show()


In [None]:
# def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

#     # setup marker generator and color map
#     markers = ('o', 's', '^', 'v', '<')
#     colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
#     cmap = ListedColormap(colors[:len(np.unique(y))])

#     # plot the decision surface
#     x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
#     x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
#     xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
#                            np.arange(x2_min, x2_max, resolution))
#     lab = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
#     lab = lab.reshape(xx1.shape)
#     plt.contourf(xx1, xx2, lab, alpha=0.3, cmap=cmap)
#     plt.xlim(xx1.min(), xx1.max())
#     plt.ylim(xx2.min(), xx2.max())

#     # plot class examples
#     for idx, cl in enumerate(np.unique(y)):
#         plt.scatter(x=X[y == cl, 0], 
#                     y=X[y == cl, 1],
#                     alpha=0.8, 
#                     c=colors[idx],
#                     marker=markers[idx], 
#                     label=f'Class {cl}', 
#                     edgecolor='black')

#     # highlight test examples
#     if test_idx:
#         # plot all examples
#         X_test, y_test = X[test_idx, :], y[test_idx]

#         plt.scatter(X_test[:, 0],
#                     X_test[:, 1],
#                     c='none',
#                     edgecolor='black',
#                     alpha=1.0,
#                     linewidth=1,
#                     marker='o',
#                     s=100, 
#                     label='Test set')     

# svm = SVC(kernel='rbf', random_state=1, gamma=0.10, C=10.0)
# svm.fit(X, y)
# plot_decision_regions(X, y,
#                       classifier=svm)

# plt.legend(loc='upper left')
# plt.tight_layout()
# #plt.savefig('figures/03_14.png', dpi=300)
# plt.show()   

# Solving non-linear problems using a kernel SVM

In [83]:
columns = rankings_df.columns[:20]

X_train, X_test, y_train, y_test = train_test_split(
    df_feat[columns], hypno_30s, test_size=0.2, random_state=1, stratify=hypno_30s
)

X_train, X_test = X_train[columns], X_test[columns]

sc = StandardScaler()
sc.fit(df_feat[columns])
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [84]:
X_train.head(3)

Unnamed: 0,sb,ab,bs,ag,sg,gs,ba,ta_b,alpha,ga,at,tb,ad,iqr,higuchi,bubbleEnt1,sigma,bubbleEnt2,lziv,ta
272,0.622033,1.000696,1.607631,4.626121,2.875599,0.347754,0.999304,4.26084,0.019391,0.216164,0.306948,3.260144,0.02199,9.84453,1.476796,-0.0064,0.012053,-0.010514,836,3.257876
262,1.141445,3.360351,0.876083,14.167325,4.812359,0.207798,0.297588,9.55043,0.018938,0.070585,0.542861,6.190079,0.020303,11.814632,1.282473,0.009255,0.006433,0.011808,774,1.842093
397,1.571452,4.727456,0.636354,14.600504,4.853347,0.206043,0.21153,11.205612,0.070165,0.068491,0.729753,6.478156,0.088736,9.94286,1.312383,0.003868,0.023323,0.004823,734,1.370326


In [85]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', random_state=1, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print('Misclassified examples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))


Misclassified examples: 6
Accuracy: 0.955


## Building a decision tree

In [86]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(criterion='gini', 
                                    max_depth=5, 
                                    random_state=1)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
print('Misclassified examples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Misclassified examples: 11
Accuracy: 0.917


In [87]:
from sklearn import tree

tree.plot_tree(tree_model, filled=True)

# plt.savefig('figures/03_21_1.pdf')
plt.show()


## Combining weak to strong learners via random forests

In [88]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=25, 
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
print('Misclassified examples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Misclassified examples: 11
Accuracy: 0.917


# K-nearest neighbors - a lazy learning algorithm

In [89]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, 
                           p=2, 
                           metric='minkowski')
knn.fit(X_train_std, y_train)
y_pred = knn.predict(X_test_std)
print('Misclassified examples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Misclassified examples: 8
Accuracy: 0.939


# Assessing feature importance with Random Forests

In [95]:
from sklearn.ensemble import RandomForestClassifier

columns = rankings_df.columns[:50]

X_train, X_test, y_train, y_test = train_test_split(
    df_feat[columns], hypno_30s, test_size=0.2, random_state=1, stratify=hypno_30s
)

X_train, X_test = X_train[columns], X_test[columns]

feat_labels = df_feat[columns].columns[:]

forest = RandomForestClassifier(n_estimators=500, random_state=1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print(
        "%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])
    )

plt.title("Feature importance")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")

plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
# plt.savefig('figures/04_10.png', dpi=300)
plt.show()


 1) ab                             0.067876
 2) ba                             0.066054
 3) spec_entropy                   0.048812
 4) ta                             0.045599
 5) at                             0.045500
 6) bs                             0.042016
 7) sb                             0.041040
 8) beta                           0.040812
 9) nzc                            0.038547
10) bd                             0.034809
11) ga                             0.033013
12) gb_da                          0.029125
13) ag                             0.028449
14) db                             0.028225
15) ta_b                           0.023249
16) gb                             0.020812
17) alpha                          0.020804
18) ad                             0.019912
19) gs                             0.018976
20) bg                             0.018525
21) lziv                           0.017723
22) higuchi                        0.017459
23) sd                          

In [96]:
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(forest, threshold=0.02, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of features that meet this threshold criterion:', 
      X_selected.shape[1])

for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            feat_labels[indices[f]], 
                            importances[indices[f]]))

Number of features that meet this threshold criterion: 17
 1) ab                             0.067876
 2) ba                             0.066054
 3) spec_entropy                   0.048812
 4) ta                             0.045599
 5) at                             0.045500
 6) bs                             0.042016
 7) sb                             0.041040
 8) beta                           0.040812
 9) nzc                            0.038547
10) bd                             0.034809
11) ga                             0.033013
12) gb_da                          0.029125
13) ag                             0.028449
14) db                             0.028225
15) ta_b                           0.023249
16) gb                             0.020812
17) alpha                          0.020804


