# Classification Implementation

In [7]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging

# Lookup table from level name to the numeric constant defined by `logging`.
LOGGING_TYPES = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}
# Logger registered under the name 'yasa'.
logger = logging.getLogger("yasa")

%matplotlib qt


In [8]:
# Recording index table: one row per recording, keyed by its name, with the
# `hypno`, `df_feat` and `eeg` columns holding file locations.
REFERENCE_CSV = "reference_df.csv"
reference_df = pd.read_csv(REFERENCE_CSV, index_col="name")
reference_df.head(n=3)

Unnamed: 0_level_0,hypno,df_feat,eeg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...


In [9]:
# Feature-ranking table: one row per ranking method (index `method_name`),
# one column per feature. Later cells take the first `n_feat` columns as the
# selected feature set.
RANKINGS_CSV = "rankings_df aug.csv"
rankings_df = pd.read_csv(RANKINGS_CSV, index_col="method_name")
rankings_df.head(n=3)

Unnamed: 0_level_0,ab,sb,ag,sg,lziv,iqr,bs,ta_b,gs,alpha,...,median,mean_psd,E,WEn,ds,mean_distance,diffEnt,renyi,skew,mean
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,2.0,1.0,3.0,4.0,6.0,9.0,7.0,5.0,15.0,8.0,...,61.0,64.0,66.0,62.0,71.0,63.0,67.0,69.0,72.0,73.0
chiSqr,1.0,2.0,4.0,8.0,7.0,6.0,9.0,12.0,3.0,10.0,...,69.0,66.0,65.0,70.0,62.0,71.0,68.0,67.0,72.0,73.0


# Train on 60 nights, test on 1 night
(1560*60 x 75)

In [10]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import kruskal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# Shuffle the recording indices and hold out the last 11 of them for testing.
idx_all_recordings = np.random.permutation(len(reference_df))
idx_train_recordings, idx_test_recordings = (
    idx_all_recordings[:-11],
    idx_all_recordings[-11:],
)

# Show which recordings ended up on each side of the split.
for banner, indices in (
    (">>>>>>>> train recordings (index): ", idx_train_recordings),
    (">>>>>>>> test recordings: ", idx_test_recordings),
):
    print(banner)
    print(indices)


>>>>>>>> train recordings (index): 
[32 42 14 34 13 24 23 47 36  4 48 41 22 37 17 29 10 53  2 43 25  5 19 55
  8 16 56 60 52 27 11 46 12 39  7 44 45  3 26 33 51 30 57  1 54  0  6 35
 20 58]
>>>>>>>> test recordings: 
[28 40 31 49 15  9 50 38 59 18 21]


Split the recordings into train and test sets, and shuffle the epochs within each training night.

In [13]:
def _augmented_path(path, suffix):
    """Return the location of the augmented companion file of ``path``.

    Everything from the first "." in ``path`` onwards is dropped and
    ``suffix`` (e.g. " aug.csv") is appended.
    """
    return path.split(".")[0] + suffix


def _load_augmented_recording(row, columns):
    """Load the selected features and the labels of one augmented recording.

    Parameters
    ----------
    row : pandas.Series
        One row of ``reference_df``; its ``hypno`` and ``df_feat`` entries are
        used to locate the " aug.txt" label file and " aug.csv" feature file.
    columns : index-like
        Feature columns to keep, in order.

    Returns
    -------
    (features, hypno_30s) : tuple of numpy.ndarray
        2-D feature matrix and 1-D label vector with one entry per feature row.

    Raises
    ------
    ValueError
        If the label file and the feature file have different lengths.
    """
    # One label per line. No explicit delimiter is passed: the default
    # whitespace splitting reads such a file identically, whereas newer NumPy
    # releases reject a newline character as `delimiter`.
    hypno_30s = np.loadtxt(_augmented_path(row["hypno"], " aug.txt"), ndmin=1)

    df_feat = pd.read_csv(_augmented_path(row["df_feat"], " aug.csv"), index_col=False)
    # Infinite feature values are replaced by 0.
    df_feat = df_feat.replace([np.inf, -np.inf], 0)
    features = df_feat[columns].to_numpy()

    if len(hypno_30s) != len(features):
        raise ValueError(
            f"Recording {row.name!r}: {len(features)} feature rows "
            f"but {len(hypno_30s)} labels"
        )
    return features, hypno_30s


def train_test_split(test_prop=0.2, n_feat=40, random_state=None):
    """Split the recordings into train/test sets and standardize the features.

    Whole recordings (rows of the global ``reference_df``) are assigned to
    either the train or the test side, so one recording never contributes
    rows to both. Rows inside each training recording are shuffled; test
    recordings keep their row order. Only the first ``n_feat`` columns of the
    global ``rankings_df`` are used as features.

    Note: this definition rebinds the name ``train_test_split`` that an
    earlier cell imports from ``sklearn.model_selection``.

    Parameters
    ----------
    test_prop : float, default 0.2
        Fraction of recordings held out for testing; the number of test
        recordings is ``int(test_prop * len(reference_df))``.
    n_feat : int, default 40
        Number of leading ``rankings_df`` columns to use as features.
    random_state : int or None, default None
        Seed for a private random generator. With ``None`` the global
        ``np.random`` state is used, as before.

    Returns
    -------
    X_train_std, X_test_std : numpy.ndarray
        Feature matrices standardized with statistics of the training set.
    hypno_y_train, hypno_y_test : numpy.ndarray
        Matching label vectors.

    Raises
    ------
    ValueError
        If the split would leave the train or the test side without any
        recording, or if a recording's labels and features differ in length.
    """
    from sklearn.preprocessing import StandardScaler

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_recordings = len(reference_df)
    n_test = int(test_prop * n_recordings)
    if not 0 < n_test < n_recordings:
        # Guards the degenerate case where slicing with "-0" would silently
        # put every recording on the test side.
        raise ValueError(
            f"test_prop={test_prop} selects {n_test} of {n_recordings} recordings "
            "for testing; both train and test need at least one recording"
        )

    shuffled = rng.permutation(n_recordings)
    split_at = n_recordings - n_test
    idx_train_recordings = shuffled[:split_at]
    idx_test_recordings = shuffled[split_at:]

    columns = rankings_df.columns[:n_feat]  # top n_feat ranked features

    # Collect per-recording pieces and stack once at the end instead of
    # re-copying the growing array on every iteration.
    X_train_parts, y_train_parts = [], []
    for i in idx_train_recordings:
        features, hypno_30s = _load_augmented_recording(reference_df.iloc[i], columns)
        # Same permutation for X and y keeps rows and labels aligned.
        permut = rng.permutation(len(features))
        X_train_parts.append(features[permut])
        y_train_parts.append(hypno_30s[permut])

    X_test_parts, y_test_parts = [], []
    for i in idx_test_recordings:
        features, hypno_30s = _load_augmented_recording(reference_df.iloc[i], columns)
        X_test_parts.append(features)
        y_test_parts.append(hypno_30s)

    df_feat_X_train = np.vstack(X_train_parts)
    df_feat_X_test = np.vstack(X_test_parts)
    hypno_y_train = np.concatenate(y_train_parts)
    hypno_y_test = np.concatenate(y_test_parts)

    print(f"Train set: X={df_feat_X_train.shape} y={hypno_y_train.shape}")
    print(f"Test set: X={df_feat_X_test.shape} y={hypno_y_test.shape}")

    # The scaler is fitted on the training rows only and then applied to both
    # sets, so no test-set statistics enter the preprocessing.
    sc = StandardScaler()
    sc.fit(df_feat_X_train)
    X_train_std = sc.transform(df_feat_X_train)
    X_test_std = sc.transform(df_feat_X_test)

    return X_train_std, X_test_std, hypno_y_train, hypno_y_test


# Hold out 10% of the recordings and keep the 50 top-ranked features.
X_train_std, X_test_std, y_train, y_test = train_test_split(test_prop=0.1, n_feat=50)


Train set: X=(110073, 50) y=(110073,)
Test set: X=(10199, 50) y=(10199,)


Train a simple SVM classifier

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Baseline: RBF-kernel SVM fitted on the standardized training rows and
# evaluated on the held-out recordings.
svm = SVC(C=10, kernel="rbf", random_state=1).fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)

print("Misclassified examples: %d" % (y_test != y_pred).sum())
print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))

Misclassified examples: 1952
Accuracy: 0.809


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def confmat_f(confmat, labels=("Wake", "N1", "N2", "N3", "REM")):
    """Draw a confusion matrix with the count written in every cell.

    Parameters
    ----------
    confmat : numpy.ndarray
        2-D array of counts; rows are drawn along the y axis ("True label")
        and columns along the x axis ("Predicted label").
    labels : sequence of str, optional
        Tick labels, one per class. They are applied only when their number
        matches both dimensions of ``confmat``; otherwise plain row/column
        indices are shown so that no class name is attached to the wrong cell.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    n_rows, n_cols = confmat.shape
    fig, ax = plt.subplots(figsize=(5, 5))

    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(x=j, y=i, s=confmat[i, j], va="center", ha="center")

    if len(labels) == n_rows == n_cols:
        x_labels = y_labels = list(labels)
    else:
        x_labels = [str(j) for j in range(n_cols)]
        y_labels = [str(i) for i in range(n_rows)]

    # Fix the tick positions first, then attach the labels: labels set on
    # not-yet-fixed tick positions are not guaranteed to line up with them.
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(x_labels)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(y_labels)

    # Predicted-label axis goes on top; the title is pushed below the plot.
    ax.xaxis.set_label_position("top")
    ax.xaxis.labelpad = 15
    ax.xaxis.set_tick_params(labeltop=True)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title("Confusion Matrix", y=-0.1)
    fig.tight_layout()
    # fig.savefig("confmat.png")
    # fig.savefig("confmat.svg")
    plt.show()

### Using:
# report = classification_report(y_test, y_pred)
# print(report)

# confmat = confusion_matrix(y_test, y_pred)
# confmat_f(confmat)


In [18]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Sweep over the number of top-ranked features and record, for each setting,
# the test accuracy, the classification report and the confusion matrix.
# NOTE: every iteration draws a new random train/test split, so differences
# between feature counts also include split-to-split variation.
n_feat_arr = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 72]
accuracy_arr = np.array([])
confmat_arr = []
report_arr = []

for i, n_feat in enumerate(n_feat_arr):
    # Split dataset into train/test sets using the first n_feat ranked features
    X_train_std, X_test_std, y_train, y_test = train_test_split(test_prop=0.1, n_feat=n_feat)
    # Fit an RBF-kernel SVM on the training set and predict the test set
    svm = SVC(kernel="rbf", C=10, random_state=1)
    svm.fit(X_train_std, y_train)
    y_pred = svm.predict(X_test_std)

    accuracy = accuracy_score(y_test, y_pred)
    # `print(` must open on the same line: a bare `print` followed by a
    # parenthesised string on the next line evaluates both and prints nothing.
    print(
        f"Fold {i}, {n_feat} features: "
        f"Misclassified examples: {(y_test != y_pred).sum()}, "
        f"Accuracy: {accuracy:.3f}"
    )
    accuracy_arr = np.append(accuracy_arr, accuracy)

    # Keep the per-setting report and confusion matrix for later inspection
    report = classification_report(y_test, y_pred)
    confmat = confusion_matrix(y_test, y_pred)
    report_arr.append(report)
    confmat_arr.append(confmat)
    print(report)
    confmat_f(confmat)


Train set: X=(106994, 5) y=(106994,)
Test set: X=(13278, 5) y=(13278,)


# Tune C parameter with learning curve

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Stratified 10-fold cross-validation of the RBF SVM (C=10).
# The folds are built from `X_train_std` / `y_train`, i.e. the training part
# of the most recent `train_test_split(...)` call made at notebook level; the
# unscaled training matrix only exists inside that function. The scaler in
# the pipeline is re-fitted on the training portion of each fold.
pipe_lr = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=10))

kfold = StratifiedKFold(n_splits=10).split(X_train_std, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train_std[train], y_train[train])
    score = pipe_lr.score(X_train_std[test], y_train[test])
    scores.append(score)

    print(
        f"Fold: {k+1:02d}, "
        f"Class distr.: {np.bincount(y_train[train].astype(int))}, "
        f"Acc.: {score:.3f}"
    )

mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f"\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}")


In [None]:
# Accuracy of each cross-validation fold: a line through the scores with a
# square marker on every fold.
fig, ax = plt.subplots()
fold_positions = range(len(scores))
ax.plot(scores, c="darkturquoise")
ax.plot(fold_positions, scores, "s", c="darkslategrey")
ax.set_ylim([0, 1])
ax.set_xlim([-0.5, len(scores) - 0.5])
ax.grid()
ax.set_title(f"Stratified 10-fold CV to estimate accuracy: {mean_acc:.3f} +/- {std_acc:.3f} ")
ax.set_xticks(fold_positions)
ax.set_xlabel("Folds")
ax.set_ylabel("Accuracy")
fig.tight_layout()
# plt.savefig("10-fold CV C2.svg")
# plt.savefig("10-fold CV C2.png")
plt.show()


In [None]:
# Grid search over the SVM regularisation parameter C.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# NOTE: despite its name, `acc_train_arr` collects the accuracy on the
# held-out test recordings; the name is kept because the plotting cell reads
# it. `acc_test_arr` is kept for compatibility and is never filled here.
acc_train_arr = np.array([])
acc_test_arr = np.array([])

# Draw the split once so that every value of C is scored on the same data;
# re-splitting inside the loop would mix the effect of C with split-to-split
# variation (and reload every recording for each C).
df_feat_X_train, df_feat_X_test, hypno_y_train, hypno_y_test = train_test_split()

# `train_test_split` already returns features standardized with statistics of
# the training set, so no further scaling is applied here.
X_train_std, X_test_std = df_feat_X_train, df_feat_X_test

for i, C in enumerate(param_range):
    # train the classifier
    svm = SVC(kernel="rbf", C=C)
    svm.fit(X_train_std, hypno_y_train)
    y_pred = svm.predict(X_test_std)

    acc_train = accuracy_score(hypno_y_test, y_pred)
    print("Misclassified examples: %d" % (hypno_y_test != y_pred).sum())
    print("Accuracy: %.3f" % acc_train)

    acc_train_arr = np.append(acc_train_arr, acc_train)

    # A float cannot be formatted with the integer code "d"; use fixed-point.
    print(f"Gridsearch {i}: C= {C}, Acc.: {acc_train:.3f}")


In [None]:
# Accuracy collected by the grid search against C, on a logarithmic C axis:
# dashed line plus one square marker per evaluated value.
fig, ax = plt.subplots()
ax.plot(param_range, acc_train_arr, "--", color="yellowgreen", linewidth=2)
ax.plot(param_range, acc_train_arr, "s", color="darkolivegreen")
ax.grid()
ax.set_xscale("log")
ax.set_xlabel("Parameter C")
ax.set_ylabel("Accuracy")
ax.set_title("Tuning C hyperparameter, C=10 is optimum")
ax.set_ylim([0, 1.0])
fig.tight_layout()
# plt.savefig('hyperparam C.svg')
# plt.savefig('hyperparam C.png')
plt.show()


In [None]:
# Drop the last entry of `acc_train_arr`.
# CAUTION: this cell is not idempotent - every re-run removes one more entry,
# after which the array no longer has one value per element of `param_range`.
# NOTE(review): the purpose is not evident from this notebook; confirm whether
# the final grid-search point is really meant to be discarded.
acc_train_arr = acc_train_arr[:-1]