# Classification Implementation

In [7]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
LOGGING_TYPES = dict(DEBUG=logging.DEBUG, INFO=logging.INFO, WARNING=logging.WARNING,
                     ERROR=logging.ERROR, CRITICAL=logging.CRITICAL)
logger = logging.getLogger('yasa')

%matplotlib qt


In [8]:
# load reference_df     
reference_df = pd.read_csv("reference_df.csv", index_col="name")
reference_df.head(3)

Unnamed: 0_level_0,hypno,df_feat,eeg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...


In [9]:
# load csv    
rankings_df = pd.read_csv("rankings_df aug.csv", index_col="method_name")
rankings_df.head(3)

Unnamed: 0_level_0,ab,sb,ag,sg,lziv,iqr,bs,ta_b,gs,alpha,...,median,mean_psd,E,WEn,ds,mean_distance,diffEnt,renyi,skew,mean
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,2.0,1.0,3.0,4.0,6.0,9.0,7.0,5.0,15.0,8.0,...,61.0,64.0,66.0,62.0,71.0,63.0,67.0,69.0,72.0,73.0
chiSqr,1.0,2.0,4.0,8.0,7.0,6.0,9.0,12.0,3.0,10.0,...,69.0,66.0,65.0,70.0,62.0,71.0,68.0,67.0,72.0,73.0


# Train on 60 nights, test of 1 night
(1560*60 x 75)

In [10]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import kruskal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
idx_all_recordings = np.random.permutation(len(reference_df))
idx_train_recordings = idx_all_recordings[:-11]
idx_test_recordings = idx_all_recordings[-11:]
print(">>>>>>>> train recordings (index): ")
print(idx_train_recordings)
print(">>>>>>>> test recordings: ")
print(idx_test_recordings)


>>>>>>>> train recordings (index): 
[32 42 14 34 13 24 23 47 36  4 48 41 22 37 17 29 10 53  2 43 25  5 19 55
  8 16 56 60 52 27 11 46 12 39  7 44 45  3 26 33 51 30 57  1 54  0  6 35
 20 58]
>>>>>>>> test recordings: 
[28 40 31 49 15  9 50 38 59 18 21]


To split the dataset to train and test + shuffle each night

In [13]:
def train_test_split(test_prop=0.2, n_feat=40):

    idx_all_recordings = np.random.permutation(len(reference_df))
    idx_train_recordings = idx_all_recordings[: -int(test_prop * 61)]
    idx_test_recordings = idx_all_recordings[-int(test_prop * 61) :]
    # print(">>>>>>>> train recordings (index): ")
    # print(idx_train_recordings)
    # print(">>>>>>>> test recordings: ")
    # print(idx_test_recordings)

    df_feat_X_train = np.array([])
    df_feat_X_test = np.array([])
    hypno_y_train = np.array([])
    hypno_y_test = np.array([])

    columns = rankings_df.columns[:n_feat]  # for selecting top n_feat columns

    # to loop over all recording files:
    for i in idx_train_recordings:
        ### to load augmented hypnos for train:
        name = reference_df.iloc[i].name
        hypno_30s_loc = reference_df.iloc[i].hypno
        hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
        hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

        ### to load features of augmented eeg for train:
        df_feat_loc = reference_df.iloc[i].df_feat
        df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
        df_feat = pd.read_csv(df_feat_loc, index_col=False)

        df_feat = df_feat.replace(
            [np.inf, -np.inf], 0
        )  # Replacing infinite values in features

        ### select top n_feat ranks columns
        df_feat = df_feat[columns]

        ### shuffle X
        permut = np.random.permutation(df_feat.shape[0])
        df_feat = df_feat.iloc[permut]

        ### to load features for train: append df_feat to df_feat_X_train
        if i == idx_train_recordings[0]:
            df_feat_X_train = df_feat.to_numpy()
        else:
            df_feat_X_train = np.vstack([df_feat_X_train, df_feat.to_numpy()])

        ### shuffle y
        hypno_30s = hypno_30s[permut]

        ### to load labels for train: append hypno to hypno_y_train
        hypno_y_train = np.append(hypno_y_train, hypno_30s)

    for i in idx_test_recordings:
        ### to load features for test:
        df_feat_loc = reference_df.iloc[i].df_feat
        df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
        df_feat = pd.read_csv(df_feat_loc, index_col=False)

        ### to load labels for test:
        hypno_30s_loc = reference_df.iloc[i].hypno
        hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
        hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

        df_feat = df_feat.replace(
            [np.inf, -np.inf], 0
        )  # Replacing infinite values in features

        ### select top n_feat ranks columns
        df_feat = df_feat[columns].to_numpy()

        ### to load features for train: append df_feat to df_feat_X_train
        if i == idx_test_recordings[0]:
            df_feat_X_test = df_feat
        else:
            df_feat_X_test = np.vstack([df_feat_X_test, df_feat])

        ### to load labels for train: append hypno to hypno_y_train
        hypno_y_test = np.append(hypno_y_test, hypno_30s)

    print(f"Train set: X={df_feat_X_train.shape} y={hypno_y_train.shape}")
    print(f"Test set: X={df_feat_X_test.shape} y={hypno_y_test.shape}")

    ### To standardize all dataset including train and test, after train/test split
    # Generate a numpy array including all epochs:
    df_feat_all = np.array([])

    # # to loop over all recording files:
    # for i in range(len(reference_df)):
    #     ### to load augmented hypno:
    #     name = reference_df.iloc[i].name
    #     hypno_30s_loc = reference_df.iloc[i].hypno
    #     hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    #     hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    #     ### to load features for augmented eeg:
    #     df_feat_loc = reference_df.iloc[i].df_feat
    #     df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    #     df_feat = pd.read_csv(df_feat_loc, index_col=False)

    #     df_feat = df_feat.replace(
    #         [np.inf, -np.inf], 0
    #     )  # Replacing infinite values in features

    #     ### select top 25 ranks columns
    #     df_feat = df_feat[columns]

    #     ### to load features for train: append df_feat to df_feat_X_train
    #     if i == 0:
    #         df_feat_all = df_feat.to_numpy()
    #     else:
    #         df_feat_all = np.vstack([df_feat_all, df_feat.to_numpy()])

    # print(f"All: {df_feat_all.shape}")

    from sklearn.preprocessing import StandardScaler

    # we will standardize the columns in dataset before we feed them to a classifier
    sc = StandardScaler()
    sc.fit(df_feat_X_train)  # first fit all the dataset
    X_train_std = sc.transform(df_feat_X_train)  # then transform train
    X_test_std = sc.transform(df_feat_X_test)  # and test

    return X_train_std, X_test_std, hypno_y_train, hypno_y_test


X_train_std, X_test_std, y_train, y_test = train_test_split(0.1, n_feat=50)


Train set: X=(110073, 50) y=(110073,)
Test set: X=(10199, 50) y=(10199,)


Train simple 

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm = SVC(kernel="rbf", C=10, random_state=1)
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print("Misclassified examples: %d" % (y_test != y_pred).sum())
print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))

Misclassified examples: 1952
Accuracy: 0.809


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def confmat_f(confmat):
    fig, ax = plt.subplots(figsize=(5, 5))

    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va="center", ha="center")
    ax.set(
        xticklabels=["Wake", "N1", "N2", "N3", "REM"],
        xticks=range(5),
        yticklabels=["Wake", "N1", "N2", "N3", "REM"],
        yticks=range(5),
    )
    ax.xaxis.set_label_position("top")
    ax.xaxis.labelpad = 15
    ax.xaxis.set_tick_params(labeltop=True)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title("Confusion Matrix", y=-0.1)
    plt.tight_layout()
    # plt.savefig("confmat.png")
    # plt.savefig("confmat.svg")
    plt.show()

### Using:
# report = classification_report(y_test, y_pred)
# print(report)

# confmat = confusion_matrix(y_test, y_pred)
# confmat_f(confmat)


In [18]:
# This cell took 5 hours to execute 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

n_feat_arr = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 72]
accuracy_arr = np.array([])
confmat_arr = []
report_arr = []

for i, n_feat in enumerate(n_feat_arr):
    # To split dataset into train/test set:
    X_train_std, X_test_std, y_train, y_test = train_test_split(test_prop=0.1, n_feat=n_feat)
    # To initiate model
    svm = SVC(kernel="rbf", C=10, random_state=1)
    # To fit the model to train set:
    svm.fit(X_train_std, y_train)
    # To predit on the test set:
    y_pred = svm.predict(X_test_std)
    # To print results
    print
    (
        f"Fold {i}, {n_feat} features => Misclassified: {(y_test != y_pred).sum()}, Acc.: {accuracy_score(y_test, y_pred)}"
    )
    # To append accuracy to array
    accuracy_arr = np.append(accuracy_arr, accuracy_score(y_test, y_pred))
    # to save report and confmat
    report = classification_report(y_test, y_pred)
    confmat = confusion_matrix(y_test, y_pred)
    report_arr.append(report)
    confmat_arr.append(confmat)
    print(report)
    confmat_f(confmat)


Train set: X=(106994, 5) y=(106994,)
Test set: X=(13278, 5) y=(13278,)
              precision    recall  f1-score   support

         0.0       0.56      0.69      0.62      1733
         1.0       0.45      0.18      0.26      2623
         2.0       0.62      0.47      0.53      2861
         3.0       0.59      0.75      0.66      2941
         4.0       0.59      0.80      0.68      3120

    accuracy                           0.58     13278
   macro avg       0.56      0.58      0.55     13278
weighted avg       0.57      0.58      0.55     13278



  ax.set(


Train set: X=(108545, 10) y=(108545,)
Test set: X=(11727, 10) y=(11727,)
              precision    recall  f1-score   support

         0.0       0.79      0.71      0.75      1978
         1.0       0.51      0.49      0.50      1954
         2.0       0.70      0.70      0.70      2411
         3.0       0.85      0.90      0.88      2588
         4.0       0.79      0.83      0.81      2796

    accuracy                           0.74     11727
   macro avg       0.73      0.73      0.73     11727
weighted avg       0.74      0.74      0.74     11727



  ax.set(


Train set: X=(108298, 15) y=(108298,)
Test set: X=(11974, 15) y=(11974,)
              precision    recall  f1-score   support

         0.0       0.62      0.75      0.67      1218
         1.0       0.58      0.51      0.54      1965
         2.0       0.84      0.69      0.76      2814
         3.0       0.83      0.90      0.86      2826
         4.0       0.82      0.88      0.85      3151

    accuracy                           0.77     11974
   macro avg       0.74      0.75      0.74     11974
weighted avg       0.77      0.77      0.76     11974



  ax.set(


Train set: X=(107687, 20) y=(107687,)
Test set: X=(12585, 20) y=(12585,)
              precision    recall  f1-score   support

         0.0       0.76      0.70      0.73      1890
         1.0       0.56      0.39      0.46      2200
         2.0       0.74      0.78      0.76      2717
         3.0       0.88      0.91      0.89      2856
         4.0       0.75      0.89      0.81      2922

    accuracy                           0.75     12585
   macro avg       0.74      0.73      0.73     12585
weighted avg       0.74      0.75      0.75     12585



  ax.set(


Train set: X=(108546, 25) y=(108546,)
Test set: X=(11726, 25) y=(11726,)
              precision    recall  f1-score   support

         0.0       0.75      0.68      0.71      1684
         1.0       0.58      0.55      0.56      1978
         2.0       0.77      0.71      0.74      2580
         3.0       0.86      0.90      0.88      2714
         4.0       0.80      0.90      0.85      2770

    accuracy                           0.77     11726
   macro avg       0.75      0.75      0.75     11726
weighted avg       0.76      0.77      0.76     11726



  ax.set(


Train set: X=(106935, 30) y=(106935,)
Test set: X=(13337, 30) y=(13337,)
              precision    recall  f1-score   support

         0.0       0.72      0.71      0.72      2019
         1.0       0.60      0.44      0.51      2302
         2.0       0.72      0.66      0.69      2804
         3.0       0.79      0.87      0.83      3020
         4.0       0.75      0.89      0.82      3192

    accuracy                           0.73     13337
   macro avg       0.72      0.72      0.71     13337
weighted avg       0.73      0.73      0.73     13337



  ax.set(


Train set: X=(108028, 35) y=(108028,)
Test set: X=(12244, 35) y=(12244,)
              precision    recall  f1-score   support

         0.0       0.78      0.73      0.75      1396
         1.0       0.65      0.53      0.59      2322
         2.0       0.79      0.76      0.78      2834
         3.0       0.89      0.89      0.89      2860
         4.0       0.75      0.90      0.82      2832

    accuracy                           0.78     12244
   macro avg       0.77      0.76      0.76     12244
weighted avg       0.77      0.78      0.77     12244



  ax.set(


Train set: X=(109030, 40) y=(109030,)
Test set: X=(11242, 40) y=(11242,)
              precision    recall  f1-score   support

         0.0       0.80      0.58      0.67      1891
         1.0       0.59      0.54      0.56      1836
         2.0       0.73      0.78      0.76      2371
         3.0       0.88      0.84      0.86      2595
         4.0       0.70      0.87      0.78      2549

    accuracy                           0.74     11242
   macro avg       0.74      0.72      0.73     11242
weighted avg       0.75      0.74      0.74     11242



  ax.set(


Train set: X=(107545, 45) y=(107545,)
Test set: X=(12727, 45) y=(12727,)
              precision    recall  f1-score   support

         0.0       0.81      0.67      0.73      2025
         1.0       0.57      0.51      0.54      2294
         2.0       0.72      0.80      0.76      2566
         3.0       0.91      0.90      0.90      2860
         4.0       0.80      0.89      0.85      2982

    accuracy                           0.77     12727
   macro avg       0.76      0.75      0.76     12727
weighted avg       0.77      0.77      0.77     12727



  ax.set(


Train set: X=(108721, 50) y=(108721,)
Test set: X=(11551, 50) y=(11551,)
              precision    recall  f1-score   support

         0.0       0.77      0.65      0.71      1445
         1.0       0.56      0.46      0.50      2066
         2.0       0.74      0.76      0.75      2516
         3.0       0.89      0.87      0.88      2722
         4.0       0.73      0.89      0.80      2802

    accuracy                           0.75     11551
   macro avg       0.74      0.73      0.73     11551
weighted avg       0.75      0.75      0.74     11551



  ax.set(


Train set: X=(107545, 55) y=(107545,)
Test set: X=(12727, 55) y=(12727,)
              precision    recall  f1-score   support

         0.0       0.79      0.74      0.76      1777
         1.0       0.66      0.45      0.53      2408
         2.0       0.78      0.75      0.76      2787
         3.0       0.90      0.92      0.91      2890
         4.0       0.71      0.93      0.80      2865

    accuracy                           0.77     12727
   macro avg       0.77      0.76      0.75     12727
weighted avg       0.77      0.77      0.76     12727



  ax.set(


Train set: X=(107708, 60) y=(107708,)
Test set: X=(12564, 60) y=(12564,)
              precision    recall  f1-score   support

         0.0       0.82      0.73      0.77      2229
         1.0       0.61      0.52      0.56      2088
         2.0       0.78      0.77      0.77      2603
         3.0       0.89      0.94      0.91      2681
         4.0       0.78      0.88      0.82      2963

    accuracy                           0.78     12564
   macro avg       0.77      0.77      0.77     12564
weighted avg       0.78      0.78      0.78     12564



  ax.set(


Train set: X=(108750, 65) y=(108750,)
Test set: X=(11522, 65) y=(11522,)
              precision    recall  f1-score   support

         0.0       0.81      0.58      0.67      1570
         1.0       0.64      0.63      0.63      1796
         2.0       0.71      0.79      0.75      2605
         3.0       0.89      0.82      0.85      2858
         4.0       0.77      0.88      0.82      2693

    accuracy                           0.76     11522
   macro avg       0.76      0.74      0.74     11522
weighted avg       0.77      0.76      0.76     11522



  ax.set(


Train set: X=(108330, 70) y=(108330,)
Test set: X=(11942, 70) y=(11942,)
              precision    recall  f1-score   support

         0.0       0.76      0.69      0.73      1973
         1.0       0.63      0.60      0.62      2147
         2.0       0.76      0.69      0.73      2485
         3.0       0.81      0.85      0.83      2577
         4.0       0.77      0.88      0.82      2760

    accuracy                           0.75     11942
   macro avg       0.75      0.74      0.74     11942
weighted avg       0.75      0.75      0.75     11942



  ax.set(


Train set: X=(106580, 72) y=(106580,)
Test set: X=(13692, 72) y=(13692,)
              precision    recall  f1-score   support

         0.0       0.76      0.66      0.71      2112
         1.0       0.61      0.44      0.51      2533
         2.0       0.74      0.78      0.76      2891
         3.0       0.88      0.92      0.90      3004
         4.0       0.74      0.90      0.81      3152

    accuracy                           0.76     13692
   macro avg       0.75      0.74      0.74     13692
weighted avg       0.75      0.76      0.75     13692



  ax.set(


In [36]:
# Write results to a file 
# textfile = open("report_arr hyperparam feat_num.txt", "w")
# for element in report_arr:
#     textfile.write(element + ",\n")
# textfile.close()

# load results from that file
f = open("report_arr hyperparam feat_num.txt")
p = ('').join(f.readlines()).split(',\n')[:-1]


In [54]:
# Write results to a file 
# np.savetxt(
#     "confmat_arr hyperparam feat_num.csv",
#     np.array(confmat_arr).reshape((3, -1)),
#     delimiter=",",
# )

# load results from that file
p = np.loadtxt("confmat_arr hyperparam feat_num.csv", delimiter=',')
p = p.reshape(15,5,5)


In [179]:
# class_i: [precision  recall   f1-score   support]
x = report_arr[0].split('0.0')[1].split("\n")[0].split("   ")
class_0 = np.array([i.strip() for i in list(filter(None, x))])
for j in range(1,15):
    x = report_arr[j].split('0.0')[1].split("\n")[0].split("   ")
    class_0 = np.vstack([class_0,np.array([i.strip() for i in list(filter(None, x))])])
class_0 = class_0.astype('float')
class_0 = class_0*100

x = report_arr[0].split('0.0')[1].split("\n")[1].split("   ")
class_1 = np.array([i.strip() for i in list(filter(None, x))])
for j in range(1,15):
    x = report_arr[j].split('0.0')[1].split("\n")[1].split("   ")
    class_1 = np.vstack([class_1,np.array([i.strip() for i in list(filter(None, x))])])
class_1 = class_1[:,1:]
class_1 = class_1.astype('float')
class_1 = class_1*100

x = report_arr[0].split('0.0')[1].split("\n")[2].split("   ")
class_2 = np.array([i.strip() for i in list(filter(None, x))])
for j in range(1,15):
    x = report_arr[j].split('0.0')[1].split("\n")[2].split("   ")
    class_2 = np.vstack([class_2,np.array([i.strip() for i in list(filter(None, x))])])
class_2 = class_2[:,1:]
class_2 = class_2.astype('float')
class_2 = class_2*100

x = report_arr[0].split('0.0')[1].split("\n")[3].split("   ")
class_3 = np.array([i.strip() for i in list(filter(None, x))])
for j in range(1,15):
    x = report_arr[j].split('0.0')[1].split("\n")[3].split("   ")
    class_3 = np.vstack([class_3,np.array([i.strip() for i in list(filter(None, x))])])
class_3 = class_3[:,1:]
class_3 = class_3.astype('float')
class_3 = class_3*100

x = report_arr[0].split('0.0')[1].split("\n")[4].split("   ")
class_4 = np.array([i.strip() for i in list(filter(None, x))])
for j in range(1,15):
    x = report_arr[j].split('0.0')[1].split("\n")[4].split("   ")
    class_4 = np.vstack([class_4,np.array([i.strip() for i in list(filter(None, x))])])
class_4 = class_4[:,1:]
class_4 = class_4.astype('float')
class_4 = class_4*100

In [236]:
fig, ax = plt.subplots(3,1,figsize=(10, 5), sharex=True)
ylabel = ["Precision","Recall","F1-score"]
for i in range(3):
    ax[i].plot(n_feat_arr,class_0[:,i], label="Wake", color="tomato")
    ax[i].plot(n_feat_arr,class_0[:,i],'o', color="red")
    ax[i].plot(n_feat_arr,class_1[:,i], label="N1", color="gold")
    ax[i].plot(n_feat_arr,class_1[:,i],'o', color="goldenrod")
    ax[i].plot(n_feat_arr,class_2[:,i], label="N2", color="limegreen")
    ax[i].plot(n_feat_arr,class_2[:,i],'o', color="olivedrab")
    ax[i].plot(n_feat_arr,class_3[:,i], label="N3", color="dodgerblue")
    ax[i].plot(n_feat_arr,class_3[:,i],'o', color="royalblue")
    ax[i].plot(n_feat_arr,class_4[:,i], label="REM", color="mediumslateblue")
    ax[i].plot(n_feat_arr,class_4[:,i],'o', color="darkviolet")
    ax[i].set(ylim=[25,100 ], xticks=n_feat_arr)
    ax[i].grid(alpha=0.4)
    ax[i].set(ylabel=ylabel[i])
plt.xlabel('Number of top features')
ax[1].legend()
# ax[1].legend(loc=(1.01,0.05))
ax[0].set(title="SVM RBF C=10 performance metrics with different number of features")
plt.tight_layout()
plt.savefig('svm performace metrics plot.png')
plt.savefig('svm performace metrics plot.svg')
plt.show()

<matplotlib.legend.Legend at 0x29925ef40>

In [None]:
def train_test_split(test_prop=0.2):

    idx_all_recordings = np.random.permutation(len(reference_df))
    idx_train_recordings = idx_all_recordings[: -int(test_prop * 61)]
    idx_test_recordings = idx_all_recordings[-int(test_prop * 61) :]
    # print(">>>>>>>> train recordings (index): ")
    # print(idx_train_recordings)
    # print(">>>>>>>> test recordings: ")
    # print(idx_test_recordings)

    df_feat_X_train = np.array([])
    df_feat_X_test = np.array([])
    hypno_y_train = np.array([])
    hypno_y_test = np.array([])

    columns = rankings_df.columns[:40]  # for selecting top columns

    # to loop over all recording files:
    for i in idx_train_recordings:
        ### to load augmented hypnos for train:
        name = reference_df.iloc[i].name
        hypno_30s_loc = reference_df.iloc[i].hypno
        hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
        hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

        ### to load features of augmented eeg for train:
        df_feat_loc = reference_df.iloc[i].df_feat
        df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
        df_feat = pd.read_csv(df_feat_loc, index_col=False)

        df_feat = df_feat.replace(
            [np.inf, -np.inf], 0
        )  # Replacing infinite values in features

        ### select top 25 ranks columns
        df_feat = df_feat[columns]

        ### shuffle X
        permut = np.random.permutation(df_feat.shape[0])
        df_feat = df_feat.iloc[permut]

        ### to load features for train: append df_feat to df_feat_X_train
        if i == idx_train_recordings[0]:
            df_feat_X_train = df_feat.to_numpy()
        else:
            df_feat_X_train = np.vstack([df_feat_X_train, df_feat.to_numpy()])

        ### shuffle y
        hypno_30s = hypno_30s[permut]

        ### to load labels for train: append hypno to hypno_y_train
        hypno_y_train = np.append(hypno_y_train, hypno_30s)

    for i in idx_test_recordings:
        ### to load features for test:
        df_feat_loc = reference_df.iloc[i].df_feat
        df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
        df_feat = pd.read_csv(df_feat_loc, index_col=False)

        ### to load labels for test:
        hypno_30s_loc = reference_df.iloc[i].hypno
        hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
        hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

        df_feat = df_feat.replace(
            [np.inf, -np.inf], 0
        )  # Replacing infinite values in features

        ### select top 25 ranks columns
        df_feat = df_feat[columns].to_numpy()

        ### to load features for train: append df_feat to df_feat_X_train
        if i == idx_test_recordings[0]:
            df_feat_X_test = df_feat
        else:
            df_feat_X_test = np.vstack([df_feat_X_test, df_feat])

        ### to load labels for train: append hypno to hypno_y_train
        hypno_y_test = np.append(hypno_y_test, hypno_30s)

    print(f"Train set: X={df_feat_X_train.shape} y={hypno_y_train.shape}")
    print(f"Test set: X={df_feat_X_test.shape} y={hypno_y_test.shape}")

    return df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test

df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test = train_test_split(0.1)

Train set: X=(108244, 40) y=(108244,)
Test set: X=(12028, 40) y=(12028,)


[(0.0, 100.0)]

In [75]:

for i in range(len(n_feat_arr)):
    # print(feat_n)
    ax.plot(i, int(confmat_arr[i][0, 0] / np.sum(confmat_arr[i][:, 0]) * 100), 'o', color='g')


# Tune C parameter with learning curve

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=10))

kfold = StratifiedKFold(n_splits=10).split(df_feat_X_train, hypno_y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(df_feat_X_train[train], hypno_y_train[train])
    score = pipe_lr.score(df_feat_X_train[test], hypno_y_train[test])
    scores.append(score)

    print(
        f"Fold: {k+1:02d}, "
        f"Class distr.: {np.bincount(hypno_y_train[train].astype(int))}, "
        f"Acc.: {score:.3f}"
    )

mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f"\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}")


In [None]:
plt.figure()
plt.plot(scores, c="darkturquoise")
plt.plot(range(len(scores)), scores, "s", c="darkslategrey")
plt.ylim([0, 1])
plt.xlim([-0.5, len(scores) - 0.5])
plt.grid()
plt.title(f"Stratified 10-fold CV to estimate accuracy: {mean_acc:.3f} +/- {std_acc:.3f} ")
plt.xticks(range(len(scores)))
plt.xlabel("Folds")
plt.ylabel("Accuracy")
plt.tight_layout()
# plt.savefig("10-fold CV C2.svg")
# plt.savefig("10-fold CV C2.png")
plt.show()


In [None]:
acc_train_arr = np.array([])
acc_test_arr = np.array([])

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

for i in range(len(param_range)):
    # train/test datasets
    df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test = train_test_split()

    # standardize the columns
    sc = StandardScaler()
    sc.fit(df_feat_all)  # first fit all the dataset
    X_train_std = sc.transform(df_feat_X_train)  # then transform train
    X_test_std = sc.transform(df_feat_X_test)  # and test

    # train the classifier
    svm = SVC(kernel="rbf", C=param_range[i])
    svm.fit(X_train_std, hypno_y_train)
    y_pred = svm.predict(X_test_std)
    print("Misclassified examples: %d" % (hypno_y_test != y_pred).sum())
    print("Accuracy: %.3f" % accuracy_score(hypno_y_test, y_pred))
    acc_train = accuracy_score(hypno_y_test, y_pred)

    acc_train_arr = np.append(acc_train_arr, acc_train)

    print(f"Gridsearch {i}: C= {param_range[i]}, Acc.: {np.round(acc_train,3):03d}")


In [None]:
fig, ax = plt.subplots()
plt.plot(param_range, acc_train_arr, "--", color="yellowgreen", linewidth=2)
plt.plot(param_range, acc_train_arr, "s", color="darkolivegreen")
plt.grid()
plt.xscale("log")
plt.xlabel("Parameter C")
plt.ylabel("Accuracy")
plt.title("Tuning C hyperparameter, C=10 is optimum")
plt.ylim([0, 1.0])
plt.tight_layout()
# plt.savefig('hyperparam C.svg')
# plt.savefig('hyperparam C.png')
plt.show()


In [None]:
acc_train_arr = acc_train_arr[:-1]