# Classification Implementation

In [2]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
LOGGING_TYPES = dict(DEBUG=logging.DEBUG, INFO=logging.INFO, WARNING=logging.WARNING,
                     ERROR=logging.ERROR, CRITICAL=logging.CRITICAL)
logger = logging.getLogger('yasa')

%matplotlib qt


In [3]:
# load reference_df     
reference_df = pd.read_csv("reference_df.csv", index_col="name")
reference_df.head(3)

Unnamed: 0_level_0,hypno,df_feat,eeg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...


In [4]:
# load csv    
rankings_df = pd.read_csv("rankings_df aug.csv", index_col="method_name")
rankings_df.head(3)

Unnamed: 0_level_0,iqr,ab,ag,sb,sg,bs,ta_b,gs,ga,std,...,mean,ts,da,dfa,std_psd,ds,dt,katz,mean_psd,mean_distance
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,9.0,2.0,3.0,1.0,4.0,7.0,5.0,15.0,26.0,39.0,...,73.0,54.0,68.0,45.0,60.0,71.0,65.0,56.0,64.0,63.0
MI,1.0,19.0,18.0,22.0,27.0,24.0,23.0,28.0,17.0,2.0,...,14.0,52.0,47.0,70.0,65.0,50.0,63.0,72.0,68.0,73.0
chiSqr,6.0,1.0,4.0,2.0,8.0,9.0,12.0,3.0,5.0,13.0,...,73.0,61.0,58.0,60.0,54.0,62.0,63.0,64.0,66.0,71.0


Plot sleep stage distribution

In [5]:
### to see id's
idx = reference_df.index.to_list()

epochs_count = 0
hypno_30s_all = np.array([])

# to loop over all recording files:
for i in range(0, len(reference_df)):
    # To load information of each night:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")
    # to append current hypno to array of all hypnos - to plotting histogram later:
    hypno_30s_all = np.append(hypno_30s_all, hypno_30s)
    # count the number of epochs
    epochs_count += len(hypno_30s)

print(f"{epochs_count} epochs available across {len(idx)} recordings.")

# plotting histogram of classes in all hypnos:
stages, counts = np.unique(hypno_30s_all, return_counts=True)

fig, ax = plt.subplots(figsize=(7, 6))
ax.bar(stages, counts, color="blueviolet")
ax.set(xticks=np.arange(0, 4 + 1, 1), xticklabels=["Wake", "N1", "N2", "N3", "REM"])
ax.tick_params(axis="x", labelsize=13, labelrotation=20, labelcolor="green", width=3)
ax.tick_params(axis="y", labelsize=13, labelrotation=20, labelcolor="orangered")
plt.xlabel("Sleep stage")
plt.ylabel("Count")
plt.title(f"Sleep stages for {epochs_count} epochs across {len(idx)} recordings")
plt.tight_layout()
# plt.savefig("stage_distribution_count aug.svg")
# plt.savefig("stage_distribution_count aug.png")
plt.show()


120272 epochs available across 61 recordings.


In [7]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import kruskal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

# Train on 60 nights, test of 1 night
(1560*60 x 75)

In [19]:
idx_all_recordings = np.random.permutation(len(reference_df))
idx_train_recordings = idx_all_recordings[:-11]
idx_test_recordings = idx_all_recordings[-11:]
print(">>>>>>>> train recordings (index): ")
print(idx_train_recordings)
print(">>>>>>>> test recordings: ")
print(idx_test_recordings)


>>>>>>>> train recordings (index): 
[39 32 15 27 52 41 19 49  5 60 55  8 26 40 10 16 18 30 42 58 31 34 25 11
 36 22 23 59  1 14 44  2 37 57  4 38  7 33 54  6  0 43 56 47 17 45 51 35
 24 21]
>>>>>>>> test recordings: 
[28  9 20 46  3 53 29 50 12 13 48]


To split the dataset to train and test + shuffle each night

In [24]:
df_feat_X_train = np.array([])
df_feat_X_test = np.array([])
hypno_y_train = np.array([])
hypno_y_test = np.array([])

columns = rankings_df.columns[:40]  # for selecting top columns

# to loop over all recording files:
for i in idx_train_recordings:
    ### to load augmented hypnos for train:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    ### to load features of augmented eeg for train:
    df_feat_loc = reference_df.iloc[i].df_feat
    df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    ### select top 25 ranks columns
    df_feat = df_feat[columns]

    ### shuffle X
    permut = np.random.permutation(df_feat.shape[0])
    df_feat = df_feat.iloc[permut]

    ### to load features for train: append df_feat to df_feat_X_train
    if i == idx_train_recordings[0]:
        df_feat_X_train = df_feat.to_numpy()
    else:
        df_feat_X_train = np.vstack([df_feat_X_train, df_feat.to_numpy()])

    ### shuffle y
    hypno_30s = hypno_30s[permut]

    ### to load labels for train: append hypno to hypno_y_train
    hypno_y_train = np.append(hypno_y_train, hypno_30s)


for i in idx_test_recordings:
    ### to load features for test:
    df_feat_loc = reference_df.iloc[i].df_feat
    df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    ### to load labels for test:
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    ### select top 25 ranks columns
    df_feat = df_feat[columns].to_numpy()

    ### to load features for train: append df_feat to df_feat_X_train
    if i == idx_test_recordings[0]:
        df_feat_X_test = df_feat
    else:
        df_feat_X_test = np.vstack([df_feat_X_test, df_feat])

    ### to load labels for train: append hypno to hypno_y_train
    hypno_y_test = np.append(hypno_y_test, hypno_30s)


print(
    f"Train set: X={df_feat_X_train.shape} y={hypno_y_train.shape}"
)
print(f"Test set: X={df_feat_X_test.shape} y={hypno_y_test.shape}")


Train set: X=(97206, 40) y=(97206,)
Test set: X=(23066, 40) y=(23066,)


Generate a numpy array including all epochs 

To standardize all dataset including train and test, after train/test split

In [25]:
df_feat_all = np.array([])

# to loop over all recording files:
for i in range(len(reference_df)):
    ### to load augmented hypno:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    ### to load features for augmented eeg:
    df_feat_loc = reference_df.iloc[i].df_feat
    df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    ### select top 25 ranks columns
    df_feat = df_feat[columns]

    ### to load features for train: append df_feat to df_feat_X_train
    if i == 0:
        df_feat_all = df_feat.to_numpy()
    else:
        df_feat_all = np.vstack([df_feat_all, df_feat.to_numpy()])

print(df_feat_all.shape)


(120272, 40)


In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# we will standardize the columns in dataset before we feed them to a classifier
sc = StandardScaler()
sc.fit(df_feat_all) # first fit all the dataset
X_train_std = sc.transform(df_feat_X_train) # then transform train 
X_test_std = sc.transform(df_feat_X_test) # and test

svm = SVC(kernel="rbf")
svm.fit(X_train_std, hypno_y_train)
y_pred = svm.predict(X_test_std)
print("Misclassified examples: %d" % (hypno_y_test != y_pred).sum())
print("Accuracy: %.3f" % accuracy_score(hypno_y_test, y_pred))


Misclassified examples: 5055
Accuracy: 0.781


Calculate classification metrics 

In [93]:
def confmat_f(confmat):
    fig, ax = plt.subplots(figsize=(10, 10))

    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va="center", ha="center")
    ax.set(
        xticklabels=["Wake", "N1", "N2", "N3", "REM"],
        xticks=range(5),
        yticklabels=["Wake", "N1", "N2", "N3", "REM"],
        yticks=range(5),
    )
    ax.xaxis.set_label_position("top")
    ax.xaxis.labelpad = 15
    ax.xaxis.set_tick_params(labeltop=True)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title("Confusion Matrix", y=-0.1)
    plt.tight_layout()
    plt.savefig("confmat.png")
    plt.savefig("confmat.svg")
    plt.show()


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

report = classification_report(hypno_y_test, y_pred)
print(report)

confmat = confusion_matrix(hypno_y_test, y_pred)
confmat_f(confmat)


              precision    recall  f1-score   support

         0.0       0.69      0.59      0.64      3393
         1.0       0.60      0.43      0.50      4270
         2.0       0.58      0.77      0.66      5576
         3.0       0.86      0.76      0.81      5906
         4.0       0.75      0.83      0.79      6037

    accuracy                           0.70     25182
   macro avg       0.70      0.68      0.68     25182
weighted avg       0.71      0.70      0.70     25182



  ax.set(


# Tune C parameter with learning curve

In [127]:
def train_test_split(test_prop=0.2):

    idx_all_recordings = np.random.permutation(len(reference_df))
    idx_train_recordings = idx_all_recordings[: -int(test_prop * 61)]
    idx_test_recordings = idx_all_recordings[-int(test_prop * 61) :]
    # print(">>>>>>>> train recordings (index): ")
    # print(idx_train_recordings)
    # print(">>>>>>>> test recordings: ")
    # print(idx_test_recordings)

    df_feat_X_train = np.array([])
    df_feat_X_test = np.array([])
    hypno_y_train = np.array([])
    hypno_y_test = np.array([])

    columns = rankings_df.columns[:40]  # for selecting top columns

    # to loop over all recording files:
    for i in idx_train_recordings:
        ### to load augmented hypnos for train:
        name = reference_df.iloc[i].name
        hypno_30s_loc = reference_df.iloc[i].hypno
        hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
        hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

        ### to load features of augmented eeg for train:
        df_feat_loc = reference_df.iloc[i].df_feat
        df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
        df_feat = pd.read_csv(df_feat_loc, index_col=False)

        df_feat = df_feat.replace(
            [np.inf, -np.inf], 0
        )  # Replacing infinite values in features

        ### select top 25 ranks columns
        df_feat = df_feat[columns]

        ### shuffle X
        permut = np.random.permutation(df_feat.shape[0])
        df_feat = df_feat.iloc[permut]

        ### to load features for train: append df_feat to df_feat_X_train
        if i == idx_train_recordings[0]:
            df_feat_X_train = df_feat.to_numpy()
        else:
            df_feat_X_train = np.vstack([df_feat_X_train, df_feat.to_numpy()])

        ### shuffle y
        hypno_30s = hypno_30s[permut]

        ### to load labels for train: append hypno to hypno_y_train
        hypno_y_train = np.append(hypno_y_train, hypno_30s)

    for i in idx_test_recordings:
        ### to load features for test:
        df_feat_loc = reference_df.iloc[i].df_feat
        df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
        df_feat = pd.read_csv(df_feat_loc, index_col=False)

        ### to load labels for test:
        hypno_30s_loc = reference_df.iloc[i].hypno
        hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
        hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

        df_feat = df_feat.replace(
            [np.inf, -np.inf], 0
        )  # Replacing infinite values in features

        ### select top 25 ranks columns
        df_feat = df_feat[columns].to_numpy()

        ### to load features for train: append df_feat to df_feat_X_train
        if i == idx_test_recordings[0]:
            df_feat_X_test = df_feat
        else:
            df_feat_X_test = np.vstack([df_feat_X_test, df_feat])

        ### to load labels for train: append hypno to hypno_y_train
        hypno_y_test = np.append(hypno_y_test, hypno_30s)

    print(f"Train set: X={df_feat_X_train.shape} y={hypno_y_train.shape}")
    print(f"Test set: X={df_feat_X_test.shape} y={hypno_y_test.shape}")

    return df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test

df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test = train_test_split(0.1)

Train set: X=(108244, 40) y=(108244,)
Test set: X=(12028, 40) y=(12028,)


In [128]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=10))

kfold = StratifiedKFold(n_splits=10).split(df_feat_X_train, hypno_y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(df_feat_X_train[train], hypno_y_train[train])
    score = pipe_lr.score(df_feat_X_train[test], hypno_y_train[test])
    scores.append(score)

    print(
        f"Fold: {k+1:02d}, "
        f"Class distr.: {np.bincount(hypno_y_train[train].astype(int))}, "
        f"Acc.: {score:.3f}"
    )

mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f"\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}")


In [125]:
plt.figure()
plt.plot(scores, c="darkturquoise")
plt.plot(range(len(scores)), scores, "s", c="darkslategrey")
plt.ylim([0, 1])
plt.xlim([-0.5, len(scores) - 0.5])
plt.grid()
plt.title(f"Stratified 10-fold CV to estimate accuracy: {mean_acc:.3f} +/- {std_acc:.3f} ")
plt.xticks(range(len(scores)))
plt.xlabel("Folds")
plt.ylabel("Accuracy")
plt.tight_layout()
# plt.savefig("10-fold CV C1.svg")
# plt.savefig("10-fold CV C1.png")
plt.show()


In [87]:
acc_train_arr = np.array([])
acc_test_arr = np.array([])

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

for i in range(len(param_range)):
    # train/test datasets
    df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test = train_test_split()

    # standardize the columns
    sc = StandardScaler()
    sc.fit(df_feat_all)  # first fit all the dataset
    X_train_std = sc.transform(df_feat_X_train)  # then transform train
    X_test_std = sc.transform(df_feat_X_test)  # and test

    # train the classifier
    svm = SVC(kernel="rbf", C=param_range[i])
    svm.fit(X_train_std, hypno_y_train)
    y_pred = svm.predict(X_test_std)
    print("Misclassified examples: %d" % (hypno_y_test != y_pred).sum())
    print("Accuracy: %.3f" % accuracy_score(hypno_y_test, y_pred))
    acc_train = accuracy_score(hypno_y_test, y_pred)

    acc_train_arr = np.append(acc_train_arr, acc_train)

    print(f"Gridsearch {i}: C= {param_range[i]}, Acc.: {np.round(acc_train,3)}")


Train set: X=(97372, 40) y=(97372,)
Test set: X=(22900, 40) y=(22900,)
Misclassified examples: 6930
Accuracy: 0.697
Gridsearch 0: C= 0.001, Acc.: 0.697
Train set: X=(95806, 40) y=(95806,)
Test set: X=(24466, 40) y=(24466,)
Misclassified examples: 6955
Accuracy: 0.716
Gridsearch 1: C= 0.01, Acc.: 0.716
Train set: X=(96272, 40) y=(96272,)
Test set: X=(24000, 40) y=(24000,)
Misclassified examples: 6179
Accuracy: 0.743
Gridsearch 2: C= 0.1, Acc.: 0.743
Train set: X=(96745, 40) y=(96745,)
Test set: X=(23527, 40) y=(23527,)
Misclassified examples: 5900
Accuracy: 0.749
Gridsearch 3: C= 1.0, Acc.: 0.749
Train set: X=(97712, 40) y=(97712,)
Test set: X=(22560, 40) y=(22560,)
Misclassified examples: 5522
Accuracy: 0.755
Gridsearch 4: C= 10.0, Acc.: 0.755
Train set: X=(95090, 40) y=(95090,)
Test set: X=(25182, 40) y=(25182,)
Misclassified examples: 7541
Accuracy: 0.701
Gridsearch 5: C= 100.0, Acc.: 0.701


In [124]:
fig, ax = plt.subplots()
plt.plot(param_range, acc_train_arr, "--", color="yellowgreen", linewidth=2)
plt.plot(param_range, acc_train_arr, "s", color="darkolivegreen")
plt.grid()
plt.xscale("log")
plt.xlabel("Parameter C")
plt.ylabel("Accuracy")
plt.title("Tuning C hyperparameter, C=10 is optimum")
plt.ylim([0, 1.0])
plt.tight_layout()
# plt.savefig('hyperparam C.svg')
# plt.savefig('hyperparam C.png')
plt.show()


In [103]:
acc_train_arr = acc_train_arr[:-1]