# Classification Implementation

In [1]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
# Accepted verbosity names mapped to the matching standard-library logging
# level, in increasing order of severity.
LOGGING_TYPES = {
    level_name: getattr(logging, level_name)
    for level_name in ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
}
# Logger shared by the helper functions defined further down in this notebook.
logger = logging.getLogger('yasa')

%matplotlib qt


In [2]:
# Load the recording index table: one row per recording (indexed by "name"),
# with the paths to its hypnogram ("hypno"), feature table ("df_feat") and EEG ("eeg").
reference_df = pd.read_csv("reference_df.csv", index_col="name")
reference_df.head(3)

Unnamed: 0_level_0,hypno,df_feat,eeg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...


In [3]:
# Load the feature-ranking table: one row per ranking method (indexed by
# "method_name") and one column per feature.
# NOTE(review): the column order is later used to pick the "top" features —
# confirm the columns are stored best-first.
rankings_df = pd.read_csv("rankings_df aug.csv", index_col="method_name")
rankings_df.head(3)

Unnamed: 0_level_0,ab,sb,ag,sg,lziv,iqr,bs,ta_b,gs,alpha,...,median,mean_psd,E,WEn,ds,mean_distance,diffEnt,renyi,skew,mean
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,2.0,1.0,3.0,4.0,6.0,9.0,7.0,5.0,15.0,8.0,...,61.0,64.0,66.0,62.0,71.0,63.0,67.0,69.0,72.0,73.0
chiSqr,1.0,2.0,4.0,8.0,7.0,6.0,9.0,12.0,3.0,10.0,...,69.0,66.0,65.0,70.0,62.0,71.0,68.0,67.0,72.0,73.0


# Train on 60 nights, test on 1 night
(1560*60 x 75)

In [4]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import kruskal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Number of recordings held out for testing in this exploratory split.
N_TEST_RECORDINGS = 11

# Shuffle the positional indices of the recordings in reference_df, then keep
# the last N_TEST_RECORDINGS of the shuffled order for testing and the rest
# for training.
idx_all_recordings = np.random.permutation(len(reference_df))
idx_train_recordings = idx_all_recordings[:-N_TEST_RECORDINGS]
idx_test_recordings = idx_all_recordings[-N_TEST_RECORDINGS:]
print(">>>>>>>> train recordings (index): ")
print(idx_train_recordings)
print(">>>>>>>> test recordings: ")
print(idx_test_recordings)


>>>>>>>> train recordings (index): 
[39 28 24 17 41  5 10 60 20 14 42 49  0 30 35 54 11 23 43 45 50  7 40 32
 18 15 52 13 59 48  8 47 56 27  9 55 25 53 37 46 16 38 58 26 22 36  6  4
 21 44]
>>>>>>>> test recordings: 
[33  3 31 57  2  1 19 12 51 34 29]


Split the dataset into train and test sets by recording, shuffling the epochs within each training night.

In [6]:
def _load_recording(i, columns):
    """Load the augmented features and labels of one recording.

    Parameters
    ----------
    i : int
        Positional index of the recording in ``reference_df``.
    columns : sequence of str
        Feature columns to keep, in the order they should appear.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix with one column per entry of ``columns``.
        Infinite values are replaced by 0.
    y : numpy.ndarray
        1D array of labels read from the augmented hypnogram file.
    """
    record = reference_df.iloc[i]

    # The augmented files sit next to the originals as "<stem> aug.<ext>".
    # NOTE(review): the stem is everything before the FIRST "." of the stored
    # path, so a dot anywhere else in the path would truncate it.
    hypno_loc = record.hypno.split(".")[0] + " aug.txt"
    feat_loc = record.df_feat.split(".")[0] + " aug.csv"

    # The hypnogram file holds one value per line. The default whitespace
    # splitting reads that correctly; an explicit "\n" delimiter is rejected
    # by recent NumPy releases. ndmin=1 keeps a one-line file 1D.
    hypno_30s = np.loadtxt(hypno_loc, ndmin=1)

    df_feat = pd.read_csv(feat_loc, index_col=False)
    df_feat = df_feat.replace([np.inf, -np.inf], 0)  # Replacing infinite values in features
    return df_feat[columns].to_numpy(), hypno_30s


def train_test_split(test_prop=0.2, n_feat=40, random_state=None):
    """Split the recordings into train/test sets and standardize the features.

    The split is made per recording, so all epochs of one recording end up on
    the same side. Epochs of each training recording are shuffled; test
    recordings keep their original epoch order.

    Note: this definition replaces ``sklearn.model_selection.train_test_split``
    imported earlier in the notebook.

    Parameters
    ----------
    test_prop : float
        Proportion of recordings held out for testing. The number of test
        recordings is ``int(test_prop * len(reference_df))``.
    n_feat : int
        Number of feature columns to keep, taken as the first ``n_feat``
        columns of ``rankings_df``.
    random_state : int or None
        Seed for a reproducible split and shuffle. ``None`` (default) draws
        from NumPy's global random state, as before.

    Returns
    -------
    X_train_std, X_test_std : numpy.ndarray
        Standardized feature matrices. The scaler is fitted on the training
        set only and then applied to both sets.
    hypno_y_train, hypno_y_test : numpy.ndarray
        Labels matching the rows of the two feature matrices.

    Raises
    ------
    ValueError
        If ``test_prop`` leaves the train set or the test set empty.
    """
    from sklearn.preprocessing import StandardScaler

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_recordings = len(reference_df)
    n_test = int(test_prop * n_recordings)
    if not 0 < n_test < n_recordings:
        raise ValueError(
            f"test_prop={test_prop} selects {n_test} of {n_recordings} recordings "
            "for testing; both the train and the test set must be non-empty."
        )

    # Split on a positive index: slicing with -n_test would misbehave for 0.
    n_train = n_recordings - n_test
    idx_all_recordings = rng.permutation(n_recordings)
    idx_train_recordings = idx_all_recordings[:n_train]
    idx_test_recordings = idx_all_recordings[n_train:]
    print(">>>>>>>> train recordings (index): ")
    print(idx_train_recordings)
    print(">>>>>>>> test recordings: ")
    print(idx_test_recordings)

    columns = rankings_df.columns[:n_feat]  # for selecting top n_feat columns

    # Collect per-recording arrays and stack once at the end, instead of
    # re-copying the growing array on every iteration.
    X_train_parts, y_train_parts = [], []
    for i in idx_train_recordings:
        X, y = _load_recording(i, columns)
        # Same permutation for X and y keeps epochs and labels aligned.
        permut = rng.permutation(X.shape[0])
        X_train_parts.append(X[permut])
        y_train_parts.append(y[permut])

    X_test_parts, y_test_parts = [], []
    for i in idx_test_recordings:
        X, y = _load_recording(i, columns)
        X_test_parts.append(X)
        y_test_parts.append(y)

    df_feat_X_train = np.vstack(X_train_parts)
    df_feat_X_test = np.vstack(X_test_parts)
    hypno_y_train = np.concatenate(y_train_parts)
    hypno_y_test = np.concatenate(y_test_parts)

    print(f"Train set: X={df_feat_X_train.shape} y={hypno_y_train.shape}")
    print(f"Test set: X={df_feat_X_test.shape} y={hypno_y_test.shape}")

    # Standardize the columns before feeding them to a classifier. The scaler
    # is fitted on the training set only, so no test statistics leak into it.
    sc = StandardScaler()
    sc.fit(df_feat_X_train)
    X_train_std = sc.transform(df_feat_X_train)
    X_test_std = sc.transform(df_feat_X_test)

    return X_train_std, X_test_std, hypno_y_train, hypno_y_test


# Hold out roughly 10% of the recordings for testing and keep the first 50
# columns of rankings_df as features.
X_train_std, X_test_std, y_train, y_test = train_test_split(0.1, n_feat=50)


>>>>>>>> train recordings (index): 
[14 35 16 47 44 38 33 58  8 34  4 59 60 41  1 43 50 40 15  0 54 55 36 21
 45  7 46 17 51 25  9 18 39 12  3 22 49 29 31  2 20  5 52 37 42 48  6 57
 24 28 56 53 32 26 13]
>>>>>>>> test recordings: 
[10 30 27 19 23 11]
Train set: X=(108915, 50) y=(108915,)
Test set: X=(11357, 50) y=(11357,)


Train simple 

In [7]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score

# svm = SVC(kernel="rbf", C=10, random_state=1)
# svm.fit(X_train_std, y_train)
# y_pred = svm.predict(X_test_std)
# print("Misclassified examples: %d" % (y_test != y_pred).sum())
# print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))

In [8]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def confmat_f(confmat, labels=("Wake", "N1", "N2", "N3", "REM")):
    """Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    confmat : numpy.ndarray
        2D confusion matrix, rows = true labels, columns = predicted labels
        (the layout returned by ``sklearn.metrics.confusion_matrix``).
    labels : sequence of str
        Tick labels, one per class, in the row/column order of ``confmat``.
        Defaults to the five sleep stages.
    """
    fig, ax = plt.subplots(figsize=(5, 5))

    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)

    # Write every cell's count at its centre.
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va="center", ha="center")

    # Fix the tick positions BEFORE assigning the labels: setting labels on
    # ticks that are not pinned yet triggers a matplotlib warning and can
    # attach the labels to the wrong ticks.
    ticks = list(range(len(labels)))
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels)

    # Predicted-label axis goes on top, so the title is pushed below the plot.
    ax.xaxis.set_label_position("top")
    ax.xaxis.labelpad = 15
    ax.xaxis.set_tick_params(labeltop=True)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title("Confusion Matrix", y=-0.1)
    plt.tight_layout()
    # plt.savefig("confmat.png")
    # plt.savefig("confmat.svg")
    plt.show()

### Using:
# report = classification_report(y_test, y_pred)
# print(report)

# confmat = confusion_matrix(y_test, y_pred)
# confmat_f(confmat)


## Train with different numbers of top features

In [9]:
# This cell took 5 hours to execute 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Numbers of top features to evaluate, one model fit per entry.
n_feat_arr = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 72]
# Containers for the per-setting results; they are only filled by the
# (currently commented-out) training loop below.
accuracy_arr = np.array([])
confmat_arr = []
report_arr = []

# for i, n_feat in enumerate(n_feat_arr):
#     # To split dataset into train/test set:
#     X_train_std, X_test_std, y_train, y_test = train_test_split(test_prop=0.1, n_feat=n_feat)
#     # To initiate model
#     svm = SVC(kernel="rbf", C=10, random_state=1)
#     # To fit the model to train set:
#     svm.fit(X_train_std, y_train)
#     # To predit on the test set:
#     y_pred = svm.predict(X_test_std)
#     # To print results
#     print
#     (
#         f"Fold {i}, {n_feat} features => Misclassified: {(y_test != y_pred).sum()}, Acc.: {accuracy_score(y_test, y_pred)}"
#     )
#     # To append accuracy to array
#     accuracy_arr = np.append(accuracy_arr, accuracy_score(y_test, y_pred))
#     # to save report and confmat
#     report = classification_report(y_test, y_pred)
#     confmat = confusion_matrix(y_test, y_pred)
#     report_arr.append(report)
#     confmat_arr.append(confmat)
#     print(report)
#     confmat_f(confmat)


In [10]:
# Write results to a file 
# textfile = open("report_arr hyperparam feat_num.txt", "w")
# for element in report_arr:
#     textfile.write(element + ",\n")
# textfile.close()

# load results from that file
# Read the saved classification reports back. The context manager closes the
# file, and report_arr is only ever bound to the final list of report strings.
with open("report_arr hyperparam feat_num.txt") as report_file:
    # Reports are separated by ",\n". The last piece of the split is whatever
    # follows the final separator and is dropped.
    report_arr = report_file.read().split(',\n')[:-1]

for el in report_arr:
    print(el)


              precision    recall  f1-score   support

         0.0       0.56      0.69      0.62      1733
         1.0       0.45      0.18      0.26      2623
         2.0       0.62      0.47      0.53      2861
         3.0       0.59      0.75      0.66      2941
         4.0       0.59      0.80      0.68      3120

    accuracy                           0.58     13278
   macro avg       0.56      0.58      0.55     13278
weighted avg       0.57      0.58      0.55     13278

              precision    recall  f1-score   support

         0.0       0.79      0.71      0.75      1978
         1.0       0.51      0.49      0.50      1954
         2.0       0.70      0.70      0.70      2411
         3.0       0.85      0.90      0.88      2588
         4.0       0.79      0.83      0.81      2796

    accuracy                           0.74     11727
   macro avg       0.73      0.73      0.73     11727
weighted avg       0.74      0.74      0.74     11727

              precisio

In [11]:
# Write results to a file 
# np.savetxt(
#     "confmat_arr hyperparam feat_num.csv",
#     np.array(confmat_arr).reshape((3, -1)),
#     delimiter=",",
# )

# load results from that file
# Read the flattened confusion matrices and restore them to a stack of
# fifteen 5x5 matrices.
confmat_arr = np.reshape(
    np.loadtxt("confmat_arr hyperparam feat_num.csv", delimiter=','),
    (15, 5, 5),
)


In [12]:
# class_i: [precision  recall   f1-score   support], one row per report, all x100
def _parse_class_rows(report):
    """Extract the per-class rows of one ``classification_report`` string.

    Parameters
    ----------
    report : str
        Text produced by ``sklearn.metrics.classification_report``.

    Returns
    -------
    dict
        Maps the integer class label to ``[precision, recall, f1, support]``
        as floats. Header, accuracy and average lines are skipped.
    """
    rows = {}
    for line in report.splitlines():
        tokens = line.split()
        # Class rows are the only lines made of exactly five numeric fields:
        # label, precision, recall, f1-score, support.
        if len(tokens) != 5:
            continue
        try:
            label = int(float(tokens[0]))
            values = [float(token) for token in tokens[1:]]
        except ValueError:
            continue
        rows[label] = values
    return rows


# Parse each report once, by line and column. This avoids splitting on the
# literal text "0.0", which breaks as soon as any metric starts with "0.0".
_class_rows = [_parse_class_rows(report) for report in report_arr]

# One (n_reports, 4) array per sleep stage, scaled by 100. A stage missing
# from any report raises KeyError here.
class_0, class_1, class_2, class_3, class_4 = (
    np.array([rows[stage] for rows in _class_rows], dtype=float) * 100
    for stage in range(5)
)

In [13]:
# For each sleep stage, the index of the report whose
# precision + recall + f1-score is the largest.
maximums = [
    np.argmax(stage[:, 0] + stage[:, 1] + stage[:, 2])
    for stage in (class_0, class_1, class_2, class_3, class_4)
]

maximums

[11, 12, 6, 11, 2]

In [14]:
fig, ax = plt.subplots(3, 1, figsize=(10, 5), sharex=True)
ylabel = ["Precision", "Recall", "F1-score"]

# Per stage: legend label, metrics array, line colour, marker colour and the
# width of the dashed guide marking that stage's best feature count.
stage_styles = [
    ("Wake", class_0, "tomato", "red", 6),
    ("N1", class_1, "gold", "goldenrod", 4),
    ("N2", class_2, "limegreen", "olivedrab", 3),
    ("N3", class_3, "dodgerblue", "royalblue", 4),
    ("REM", class_4, "mediumslateblue", "darkviolet", 4),
]

# One panel per metric (column i of each class_* array); stages are drawn in
# the same order on every panel.
for i, panel in enumerate(ax):
    for (stage_label, metrics, line_color, marker_color, guide_width), best in zip(
        stage_styles, maximums
    ):
        panel.plot(n_feat_arr, metrics[:, i], label=stage_label, color=line_color)
        panel.plot(n_feat_arr, metrics[:, i], "o", color=marker_color)
        panel.axvline(
            x=n_feat_arr[best],
            color=line_color,
            linestyle='--',
            linewidth=guide_width,
            alpha=0.3,
        )
    panel.set(ylim=[25, 100], xticks=n_feat_arr)
    panel.grid(alpha=0.4)
    panel.set(ylabel=ylabel[i])

plt.xlabel("Number of top features")
ax[1].legend()
# ax[1].legend(loc=(1.01,0.05))
ax[0].set(
    title="SVM RBF C=10 performance metrics with different number of features. (train,test)=(55,6) sessions"
)
plt.tight_layout()
# plt.savefig('svm performace metrics plot.png')
# plt.savefig('svm performace metrics plot.svg')
plt.show()


In [15]:
# Rows = sleep stages (Wake, N1, N2, N3, REM), columns = reports; each cell is
# precision + recall + f1-score for that stage in that report.
arr = np.vstack(
    [
        stage[:, 0] + stage[:, 1] + stage[:, 2]
        for stage in (class_0, class_1, class_2, class_3, class_4)
    ]
)

In [16]:
arr

array([[187., 225., 204., 219., 214., 215., 226., 205., 221., 213., 229.,
        232., 206., 218., 213.],
       [ 89., 150., 163., 141., 169., 155., 177., 169., 162., 152., 164.,
        169., 190., 185., 156.],
       [162., 210., 229., 228., 222., 207., 233., 227., 228., 225., 229.,
        232., 225., 218., 228.],
       [200., 263., 259., 268., 264., 249., 267., 258., 271., 264., 273.,
        274., 256., 249., 270.],
       [207., 243., 255., 245., 255., 246., 247., 235., 254., 242., 244.,
        248., 247., 247., 245.]])

In [17]:
fig, ax = plt.subplots(2, 1, figsize=(10, 7), sharex=True)

# Top panel: summed metrics per sleep stage (rows) and feature count (columns).
ax[0].imshow(arr, cmap="Blues")
ax[0].set(
    xticks=list(range(arr.shape[1])),
    xticklabels=n_feat_arr,
    yticks=list(range(arr.shape[0])),
    yticklabels=["Wake", "N1", "N2", "N3", "REM"],
    ylabel="Sleep Stage",
    title="Sum of precision, recall, and f1-score"
)

# Rank the feature counts by their total over all stages: after the flip,
# rank 0 is the largest total (displayed below as 1).
array = np.sum(arr, axis=0)
temp = array.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(array))
ranks = (len(array) - 1) - ranks

# Bottom panel: the totals as a single-row heat map, annotated with the ranks.
ax[1].imshow(array.reshape(1, -1), cmap="Blues")
ax[1].set(
    xticks=list(range(len(array))),
    xticklabels=n_feat_arr,
    yticks=list(range(0, 1)),
    xlabel="Number of top features",
    ylabel="Sum of stages",
)
# Raw string: "\S" is not a valid Python escape sequence.
ax[1].set_yticklabels([r"$\Sigma$"], fontsize=25)
for i in range(len(ranks)):
    # The first cell gets black text, all the others white.
    text_color = "black" if i == 0 else "white"
    ax[1].text(i, 0, ranks[i]+1, ha="center", va="center", color=text_color, fontsize=15)

plt.subplots_adjust(top=1.4)
plt.tight_layout()
# plt.savefig("sum of metrics matrix.png")
# plt.savefig("sum of metrics matrix.svg")
plt.show()


## K-fold CV with C=10, n_feat=35

In [18]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

from sklearn.pipeline import make_pipeline

# pipe_lr = make_pipeline(SVC(kernel="rbf", C=10))

# X_train_std, X_test_std, y_train, y_test = train_test_split(test_prop=0.05, n_feat=35)
# kfold = StratifiedKFold(n_splits=10).split(X_train_std, y_train)

# scores = []
# for k, (train, test) in enumerate(kfold):

#     pipe_lr.fit(X_train_std[train], y_train[train])
#     score = pipe_lr.score(X_train_std[test], y_train[test])
#     scores.append(score)

#     print(
#         f"Fold: {k+1:02d}, Class distr.: {np.bincount(y_train[train].astype(int))}, Acc.: {score:.3f}"
#     )

# mean_acc = np.mean(scores)
# std_acc = np.std(scores)
# print(f"\nCV accuracy: {mean_acc:.3f} +/- {std_acc:.3f}")


In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Single hold-out evaluation: a new random recording-level split, then one
# RBF SVM fit and one prediction pass on the held-out recordings.
# NOTE(review): the section heading says "K-fold CV ... n_feat=35", but this
# call uses n_feat=60 and test_prop=0.25 and runs no cross-validation (the
# k-fold code in the previous cell is commented out) — confirm which is intended.
X_train_std, X_test_std, y_train, y_test = train_test_split(test_prop=0.25, n_feat=60)

svm = SVC(kernel="rbf", C=10, random_state=1)
svm.fit(X_train_std, y_train)
y_pred = svm.predict(X_test_std)
print("Misclassified examples: %d" % (y_test != y_pred).sum())
print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))


>>>>>>>> train recordings (index): 
[14 21 37 22  8 15 48 60 23 40 32 25  6 45 57 28 47 11 10 35 27  3 20 52
 36 29 56 51 39  5 53 59 18  9 54 55 41  1 17 42 19 24 38 50 12 16]
>>>>>>>> test recordings: 
[33 31  4  0 26 13  2 44  7 49 46 34 30 58 43]
Train set: X=(90044, 60) y=(90044,)
Test set: X=(30228, 60) y=(30228,)
Misclassified examples: 7792
Accuracy: 0.742


In [20]:
# Per-class precision / recall / f1-score for the hold-out predictions above.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix of the same predictions, drawn with the helper defined earlier.
confmat = confusion_matrix(y_test, y_pred)
confmat_f(confmat)


              precision    recall  f1-score   support

         0.0       0.73      0.67      0.70      4485
         1.0       0.63      0.51      0.56      5326
         2.0       0.68      0.75      0.71      6441
         3.0       0.83      0.84      0.83      6850
         4.0       0.80      0.85      0.82      7126

    accuracy                           0.74     30228
   macro avg       0.73      0.73      0.73     30228
weighted avg       0.74      0.74      0.74     30228



  ax.set(


# Plot results in hypnogram

In [60]:
def _prepare_hypnogram(hypno, n_samples, label):
    """Validate a hypnogram and reorder its stage codes for plotting.

    The input is cast to int, checked to be 1D with ``n_samples`` values, and
    remapped so that REM (4) becomes 1 and N1/N2/N3 become 2/3/4. REM is then
    drawn directly below Wake. ``label`` is the name used in assertion messages.
    """
    hypno = np.asarray(hypno).astype(int)
    assert hypno.ndim == 1, f"{label} must be 1D."
    assert hypno.size == n_samples, f"{label} must have the same sf as data."
    # Make sure that REM is displayed after Wake
    return pd.Series(hypno).map({-2: -2, -1: -1, 0: 0, 1: 2, 2: 3, 3: 4, 4: 1}).values


def _draw_hypnogram(ax, hypno, sf):
    """Draw an already remapped hypnogram on ``ax``, with REM in red."""
    t_hyp = np.arange(hypno.size) / (sf * 3600)  # sample index -> hours
    hypno_rem = np.ma.masked_not_equal(hypno, 1)  # keep REM samples only

    # Stages are plotted negated so that Wake is on top and N3 at the bottom.
    ax.step(t_hyp, -1 * hypno, color="k")
    ax.step(t_hyp, -1 * hypno_rem, color="r")

    has_unscored = -2 in hypno
    has_artefact = -1 in hypno
    if has_unscored and has_artefact:
        # Both Unscored and Artefacts are present
        ax.set_yticks([2, 1, 0, -1, -2, -3, -4])
        ax.set_yticklabels(["Uns", "Art", "W", "R", "N1", "N2", "N3"])
        ax.set_ylim(-4.5, 2.5)
    elif has_unscored:
        # Only Unscored are present
        ax.set_yticks([2, 0, -1, -2, -3, -4])
        ax.set_yticklabels(["Uns", "W", "R", "N1", "N2", "N3"])
        ax.set_ylim(-4.5, 2.5)
    elif has_artefact:
        # Only Artefacts are present
        ax.set_yticks([1, 0, -1, -2, -3, -4])
        ax.set_yticklabels(["Art", "W", "R", "N1", "N2", "N3"])
        ax.set_ylim(-4.5, 1.5)
    else:
        # No artefacts or Unscored
        ax.set_yticks([0, -1, -2, -3, -4])
        ax.set_yticklabels(["W", "R", "N1", "N2", "N3"])
        ax.set_ylim(-4.5, 0.5)

    ax.set_xlim(0, t_hyp.max())
    ax.set_ylabel("Stage")
    ax.xaxis.set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)


def _draw_spectrogram(ax, t, f, Sxx, norm, cmap):
    """Draw the spectrogram on ``ax`` and return the resulting mesh."""
    im = ax.pcolormesh(
        t, f, Sxx, norm=norm, cmap=cmap, antialiased=True, shading="auto"
    )
    ax.set_xlim(0, t.max())
    ax.set_ylabel("Frequency [Hz]")
    ax.set_xlabel("Time [hrs]")
    return im


def plot_spectrogram(
    data,
    sf,
    hypno=None,
    hypno_pred=None,
    win_sec=30,
    fmin=0.5,
    fmax=25,
    trimperc=2.5,
    cmap="RdBu_r",
):
    """
    Plot a full-night multi-taper spectrogram, optionally with the hypnogram
    (and a predicted hypnogram) on top.
    For more details, please refer to the `Jupyter notebook
    <https://github.com/raphaelvallat/yasa/blob/master/notebooks/10_spectrogram.ipynb>`_
    .. versionadded:: 0.1.8
    Parameters
    ----------
    data : :py:class:`numpy.ndarray`
        Single-channel EEG data. Must be a 1D NumPy array.
    sf : float
        The sampling frequency of data AND the hypnogram.
    hypno : array_like
        Sleep stage (hypnogram), optional.
        The hypnogram must have the exact same number of samples as ``data``.
        To upsample your hypnogram, please refer to :py:func:`yasa.hypno_upsample_to_data`.
        .. note::
            The default hypnogram format in YASA is a 1D integer
            vector where:
            - -2 = Unscored
            - -1 = Artefact / Movement
            - 0 = Wake
            - 1 = N1 sleep
            - 2 = N2 sleep
            - 3 = N3 sleep
            - 4 = REM sleep
    hypno_pred : array_like
        Predicted hypnogram, optional. Same format and length requirements as
        ``hypno``. When both are given, ``hypno`` is drawn on the top axis and
        ``hypno_pred`` on a second axis above the spectrogram. It is ignored
        when ``hypno`` is None.
    win_sec : int or float
        The length of the sliding window, in seconds, used for multitaper PSD
        calculation. Default is 30 seconds. Note that ``data`` must be at least
        twice longer than ``win_sec`` (e.g. 60 seconds).
    fmin, fmax : int or float
        The lower and upper frequency of the spectrogram. Default 0.5 to 25 Hz.
    trimperc : int or float
        The amount of data to trim on both ends of the distribution when
        normalizing the colormap. This parameter directly impacts the
        contrast of the spectrogram plot (higher values = higher contrast).
        Default is 2.5, meaning that the min and max of the colormap
        are defined as the 2.5 and 97.5 percentiles of the spectrogram.
    cmap : str
        Colormap. Default to 'RdBu_r'.
    Returns
    -------
    fig : :py:class:`matplotlib.figure.Figure`
        Matplotlib Figure
    Examples
    --------
    1. Full-night multitaper spectrogram on Cz, no hypnogram
    .. plot::
        >>> import yasa
        >>> import numpy as np
        >>> # In the next 5 lines, we're loading the data from GitHub.
        >>> import requests
        >>> from io import BytesIO
        >>> r = requests.get('https://github.com/raphaelvallat/yasa/raw/master/notebooks/data_full_6hrs_100Hz_Cz%2BFz%2BPz.npz', stream=True)
        >>> npz = np.load(BytesIO(r.raw.read()))
        >>> data = npz.get('data')[0, :]
        >>> sf = 100
        >>> fig = yasa.plot_spectrogram(data, sf)
    2. Full-night multitaper spectrogram on Cz with the hypnogram on top
    .. plot::
        >>> import yasa
        >>> import numpy as np
        >>> # In the next lines, we're loading the data from GitHub.
        >>> import requests
        >>> from io import BytesIO
        >>> r = requests.get('https://github.com/raphaelvallat/yasa/raw/master/notebooks/data_full_6hrs_100Hz_Cz%2BFz%2BPz.npz', stream=True)
        >>> npz = np.load(BytesIO(r.raw.read()))
        >>> data = npz.get('data')[0, :]
        >>> sf = 100
        >>> # Load the 30-sec hypnogram and upsample to data
        >>> hypno = np.loadtxt('https://raw.githubusercontent.com/raphaelvallat/yasa/master/notebooks/data_full_6hrs_100Hz_hypno_30s.txt')
        >>> hypno = yasa.hypno_upsample_to_data(hypno, 1/30, data, sf)
        >>> fig = yasa.plot_spectrogram(data, sf, hypno, cmap='Spectral_r')
    """
    # Increase font size while preserving original
    old_fontsize = plt.rcParams["font.size"]
    plt.rcParams.update({"font.size": 13})

    # The try/finally restores the font size on every exit path, including
    # the spectrogram-only branch and failed safety checks.
    try:
        # Safety checks
        assert isinstance(data, np.ndarray), "Data must be a 1D NumPy array."
        assert isinstance(sf, (int, float)), "sf must be int or float."
        assert data.ndim == 1, "Data must be a 1D (single-channel) NumPy array."
        assert isinstance(win_sec, (int, float)), "win_sec must be int or float."
        assert isinstance(fmin, (int, float)), "fmin must be int or float."
        assert isinstance(fmax, (int, float)), "fmax must be int or float."
        assert fmin < fmax, "fmin must be strictly inferior to fmax."
        assert fmax < sf / 2, "fmax must be less than Nyquist (sf / 2)."

        # Calculate multi-taper spectrogram
        nperseg = int(win_sec * sf)
        assert data.size > 2 * nperseg, "Data length must be at least 2 * win_sec."
        f, t, Sxx = spectrogram_lspopt(data, sf, nperseg=nperseg, noverlap=0)
        Sxx = 10 * np.log10(Sxx)  # Convert uV^2 / Hz --> dB / Hz

        # Select only relevant frequencies (fmin to fmax, inclusive)
        good_freqs = np.logical_and(f >= fmin, f <= fmax)
        Sxx = Sxx[good_freqs, :]
        f = f[good_freqs]
        t /= 3600  # Convert t to hours

        # Normalization
        vmin, vmax = np.percentile(Sxx, [0 + trimperc, 100 - trimperc])
        norm = Normalize(vmin=vmin, vmax=vmax)

        if hypno is None:
            # Spectrogram only, with a colorbar.
            fig, ax = plt.subplots(nrows=1, figsize=(12, 4))
            im = _draw_spectrogram(ax, t, f, Sxx, norm, cmap)

            # Add colorbar
            cbar = fig.colorbar(im, ax=ax, shrink=0.95, fraction=0.1, aspect=25)
            cbar.ax.set_ylabel("Log Power (dB / Hz)", rotation=270, labelpad=20)
            return fig

        hypno = _prepare_hypnogram(hypno, data.size, "Hypno")

        if hypno_pred is None:
            # Hypnogram (top axis) + spectrogram (bottom axis)
            fig, (ax0, ax1) = plt.subplots(
                nrows=2, figsize=(12, 6), gridspec_kw={"height_ratios": [1, 2]}
            )
            plt.subplots_adjust(hspace=0.1)
            _draw_hypnogram(ax0, hypno, sf)
            _draw_spectrogram(ax1, t, f, Sxx, norm, cmap)
            return fig

        hypno_pred = _prepare_hypnogram(hypno_pred, data.size, "hypno_pred")

        # Hypnogram (top), predicted hypnogram (middle), spectrogram (bottom)
        fig, (ax0, ax1, ax2) = plt.subplots(
            nrows=3, figsize=(12, 6), gridspec_kw={"height_ratios": [1, 1, 2]}
        )
        plt.subplots_adjust(hspace=0.1)
        _draw_hypnogram(ax0, hypno, sf)
        _draw_hypnogram(ax1, hypno_pred, sf)
        _draw_spectrogram(ax2, t, f, Sxx, norm, cmap)
        return fig
    finally:
        # Revert font-size
        plt.rcParams.update({"font.size": old_fontsize})


def format_seconds_to_hhmmss(seconds):
    """Render a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Parameters
    ----------
    seconds : int or float
        Duration in seconds. Fractional parts are truncated by the ``%i``
        formatting of each field.

    Returns
    -------
    str
        The duration formatted as ``HH:MM:SS``. Hours are not wrapped at 24,
        e.g. 90000 seconds gives ``"25:00:00"``.
    """
    # Peel off whole hours first, then whole minutes from what is left.
    hours, remainder = divmod(seconds, 60 * 60)
    minutes, secs = divmod(remainder, 60)
    return "%02i:%02i:%02i" % (hours, minutes, secs)


def set_log_level(verbose=None):
    """Convenience function for setting the level of the ``yasa`` logger.

    This function comes from the PySurfer package. See :
    https://github.com/nipy/PySurfer/blob/master/surfer/utils.py

    Parameters
    ----------
    verbose : bool, str, int, or None
        The verbosity of messages to print.

        * bool : ``True`` maps to 'INFO', ``False`` maps to 'WARNING'.
        * str : case-insensitive key of ``LOGGING_TYPES``, i.e. one of
          DEBUG, INFO, WARNING, ERROR, or CRITICAL.
        * int : numeric logging level (e.g. ``logging.DEBUG``), passed to the
          logger unchanged.
        * None : the current level is left untouched.

    Raises
    ------
    ValueError
        If ``verbose`` is a str that is not a key of ``LOGGING_TYPES``.
    """
    logger = logging.getLogger("yasa")
    # bool must be handled before int because bool is a subclass of int.
    if isinstance(verbose, bool):
        verbose = "INFO" if verbose else "WARNING"
    if isinstance(verbose, str):
        level_name = verbose.upper()
        if level_name not in LOGGING_TYPES:
            raise ValueError("verbose must be in %s" % ", ".join(LOGGING_TYPES))
        logger.setLevel(LOGGING_TYPES[level_name])
    elif isinstance(verbose, int):
        # Numeric levels are documented as accepted but used to be ignored.
        logger.setLevel(verbose)


def hypno_upsample_to_data(hypno, sf_hypno, data, sf_data=None, verbose=True):
    """Upsample a hypnogram to the sampling frequency of the EEG data and
    crop / pad it so that the hypnogram and the EEG data have the exact same
    number of samples.

    .. versionadded:: 0.1.5

    Parameters
    ----------
    hypno : array_like
        The sleep staging (hypnogram) 1D array. Must be a list,
        :py:class:`numpy.ndarray` or :py:class:`pandas.Series`.
    sf_hypno : float
        The current sampling frequency of the hypnogram, in Hz, e.g.

        * 1/30 = 1 value per each 30 seconds of EEG data,
        * 1 = 1 value per second of EEG data
    data : array_like or :py:class:`mne.io.BaseRaw`
        1D or 2D EEG data. Can also be a :py:class:`mne.io.BaseRaw`, in which
        case ``data`` and ``sf_data`` will be automatically extracted.
    sf_data : float
        The sampling frequency of ``data``, in Hz (e.g. 100 Hz, 256 Hz, ...).
        Can be omitted if ``data`` is a :py:class:`mne.io.BaseRaw`.
    verbose : bool or str
        Verbose level, forwarded to ``set_log_level``. The default (True)
        corresponds to 'info'. The logging levels are 'debug', 'info',
        'warning', 'error', and 'critical'. For most users the choice is
        between 'info' (or ``verbose=True``) and warning (``verbose=False``).

    Returns
    -------
    hypno : :py:class:`numpy.ndarray`
        The hypnogram, upsampled to ``sf_data`` and cropped/padded to
        ``max(data.shape)``.

    Raises
    ------
    TypeError
        If ``sf_data`` is missing and ``data`` is not a
        :py:class:`mne.io.BaseRaw`.
    AssertionError
        If ``sf_hypno`` is larger than ``sf_data``, if ``sf_data / sf_hypno``
        is not a whole number, or if ``hypno`` has an unsupported type.

    Notes
    -----
    A message is emitted through ``logger.warning`` when the upsampled
    ``hypno`` is shorter / longer than ``max(data.shape)`` and therefore
    needs to be padded / cropped. This output can be disabled by passing
    ``verbose='ERROR'``.
    """
    set_log_level(verbose)
    if isinstance(data, mne.io.BaseRaw):
        sf_data = data.info["sfreq"]
        data = data.times  # 1D array and does not require to preload data
    if sf_data is None:
        # Same exception type as the former `None / float` failure, but explicit.
        raise TypeError("sf_data must be provided when data is not an mne.io.BaseRaw.")

    # Upsample the hypnogram to the sampling frequency of the data
    repeats = sf_data / sf_hypno
    assert sf_hypno <= sf_data, "sf_hypno must be less than sf_data."
    # Tolerate floating-point noise in the ratio (e.g. 0.3 / 0.1 is not exactly 3).
    is_whole = bool(np.isfinite(repeats)) and abs(repeats - round(repeats)) <= 1e-9
    assert is_whole, "sf_data / sf_hypno must be a whole number."
    assert isinstance(hypno, (list, np.ndarray, pd.Series))
    # np.repeat expects an integer count; do not rely on implicit float casting.
    n_repeats = int(round(repeats))
    hypno = np.repeat(np.asarray(hypno), n_repeats)
    assert hypno.ndim == 1, "Hypno must be 1D."

    # Crop or pad the hypnogram to fit the length of data.
    data = np.asarray(data)
    npts_hyp = hypno.size
    npts_data = max(data.shape)  # Support for 2D data
    npts_diff = npts_data - npts_hyp
    if npts_diff > 0:
        # Hypnogram is shorter than data: repeat its last value
        logger.warning(
            "Hypnogram is SHORTER than data by %.2f seconds. "
            "Padding hypnogram with last value to match data.size.",
            npts_diff / sf_data,
        )
        hypno = np.pad(hypno, (0, npts_diff), mode="edge")
    elif npts_diff < 0:
        # Hypnogram is longer than data: drop the trailing samples
        logger.warning(
            "Hypnogram is LONGER than data by %.2f seconds. "
            "Cropping hypnogram to match data.size.",
            -npts_diff / sf_data,
        )
        hypno = hypno[0:npts_data]

    return hypno


def transition_matrix(hypno):
    """Create a state-transition matrix from a hypnogram.

    .. versionadded:: 0.1.9

    Parameters
    ----------
    hypno : array_like
        Hypnogram. The dtype of ``hypno`` must be integer
        (e.g. [0, 2, 2, 1, 1, 1, ...]); values are cast with ``dtype=int``.
        The sampling frequency must be the original one, i.e. 1 value per
        30 seconds if the staging was done in 30 seconds epochs. Using an
        upsampled hypnogram will result in an incorrect transition matrix.
        For best results, we recommend using a hypnogram cropped to
        either the time in bed (TIB) or the sleep period time (SPT), without
        any artefact / unscored epochs.

    Returns
    -------
    counts : :py:class:`pandas.DataFrame`
        Counts transition matrix (number of transitions from stage A to
        stage B). The pre-transition states are the rows ("From Stage") and
        the post-transition states are the columns ("To Stage").
    probs : :py:class:`pandas.DataFrame`
        Conditional probability transition matrix, i.e.
        given that current state is A, what is the probability that
        the next state is B.
        ``probs`` is a `right stochastic matrix
        <https://en.wikipedia.org/wiki/Stochastic_matrix>`_,
        i.e. each row sums to 1. A stage that has no outgoing transition
        (it only occurs as the very last epoch) has an all-NaN row, since
        its conditional probabilities are undefined.

    Examples
    --------
    >>> import numpy as np
    >>> from yasa import transition_matrix
    >>> a = [0, 0, 0, 1, 1, 0, 1, 2, 2, 3, 3, 2, 3, 3, 0, 2, 2, 1, 2, 2, 3, 3]
    >>> counts, probs = transition_matrix(a)
    >>> counts
    To Stage    0  1  2  3
    From Stage
    0           2  2  1  0
    1           1  1  2  0
    2           0  1  3  3
    3           1  0  1  3
    >>> probs.round(2)
    To Stage       0     1     2     3
    From Stage
    0           0.40  0.40  0.20  0.00
    1           0.25  0.25  0.50  0.00
    2           0.00  0.14  0.43  0.43
    3           0.20  0.00  0.20  0.60

    Several metrics of sleep fragmentation can be calculated from the
    probability matrix. For example, the stability of sleep stages can be
    calculated by taking the average of the diagonal values (excluding Wake
    and N1 sleep):

    >>> np.diag(probs.loc[2:, 2:]).mean().round(3)
    0.514

    Finally, we can plot the transition matrix using :py:func:`seaborn.heatmap`

    .. plot::
        >>> import numpy as np
        >>> import seaborn as sns
        >>> import matplotlib.pyplot as plt
        >>> from yasa import transition_matrix
        >>> # Calculate probability matrix
        >>> a = [1, 1, 1, 0, 0, 2, 2, 0, 2, 0, 1, 1, 0, 0]
        >>> _, probs = transition_matrix(a)
        >>> # Start the plot
        >>> grid_kws = {"height_ratios": (.9, .05), "hspace": .1}
        >>> f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws,
        ...                                 figsize=(5, 5))
        >>> sns.heatmap(probs, ax=ax, square=False, vmin=0, vmax=1, cbar=True,
        ...             cbar_ax=cbar_ax, cmap='YlOrRd', annot=True, fmt='.2f',
        ...             cbar_kws={"orientation": "horizontal", "fraction": 0.1,
        ...                       "label": "Transition probability"})
        >>> ax.set_xlabel("To sleep stage")
        >>> ax.xaxis.tick_top()
        >>> ax.set_ylabel("From sleep stage")
        >>> ax.xaxis.set_label_position('top')
    """
    x = np.asarray(hypno, dtype=int)
    unique, inverse = np.unique(x, return_inverse=True)  # unique is sorted
    n = unique.size
    # Integer transition counts: pair each epoch with the one that follows it
    counts = np.zeros((n, n), dtype=int)
    np.add.at(counts, (inverse[:-1], inverse[1:]), 1)
    # Conditional probabilities. A row total of 0 means 0 / 0 for that row:
    # keep the NaN result but do not emit a RuntimeWarning for it.
    row_totals = counts.sum(axis=-1, keepdims=True)
    with np.errstate(invalid="ignore"):
        probs = counts / row_totals
    # Convert to a Pandas DataFrame
    counts = pd.DataFrame(counts, index=unique, columns=unique)
    probs = pd.DataFrame(probs, index=unique, columns=unique)
    counts.index.name = "From Stage"
    probs.index.name = "From Stage"
    counts.columns.name = "To Stage"
    probs.columns.name = "To Stage"
    return counts, probs


In [59]:
### recording to inspect (positional row of `reference_df`):
RECORD_IDX = 33
record = reference_df.iloc[RECORD_IDX]
name = record.name  # index label of the selected row

### to load augmented hypno:
hypno_30s_loc = record.hypno
hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
# Default whitespace splitting already reads a one-value-per-line file;
# newer NumPy releases raise ValueError for a newline delimiter.
hypno_30s = np.loadtxt(hypno_30s_loc)

### to load features for augmented eeg:
df_feat_loc = record.df_feat
df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
df_feat = pd.read_csv(df_feat_loc, index_col=False)

### to load augmented eeg:
eeg_loc = record.eeg
eeg_loc = eeg_loc.split(".")[0] + " aug.txt"
data = np.loadtxt(eeg_loc, delimiter=",")  # took ~7 seconds # this is filtered data actually

In [56]:
# Sanity check: number of entries in the loaded hypnogram array.
hypno_30s.shape

(1677,)

In [61]:
sf = 256  # sampling frequency of the EEG, in Hz (passed as `sf_data` below)

# Flatten once and derive the duration from the same 1D signal everywhere, so
# the printed duration and the figure titles cannot disagree.
eeg = data.flatten()
duration_sec = eeg.shape[0] / sf
duration_str = format_seconds_to_hhmmss(duration_sec)

print(f"Duration: {duration_sec} (sec) OR {duration_str}")

fig = plot_spectrogram(eeg, sf, fmax=45)
plt.title(f"Spectrogram of {name} - {duration_str}", fontsize=16)
plt.tight_layout()
# plt.savefig(f'spectro QS {folder} {LR}.png', dpi=100, bbox_inches='tight')
plt.show()

# Bring both hypnograms (one value per 30 s) to one value per EEG sample.
hypno = hypno_upsample_to_data(
    hypno=hypno_30s, sf_hypno=(1 / 30), data=eeg, sf_data=sf
)
# NOTE(review): `y_pred` is not defined in this cell; it must already exist in
# the kernel. It is truncated to the number of scored epochs.
hypno_pred = hypno_upsample_to_data(
    hypno=y_pred[: len(hypno_30s)], sf_hypno=(1 / 30), data=eeg, sf_data=sf
)

fig = plot_spectrogram(
    eeg, sf, hypno=hypno, hypno_pred=hypno_pred, fmax=30, trimperc=5
)
fig.suptitle(
    f"Spectrogram and Hypnogram of {name} - {duration_str}",
    fontsize=16,
)
plt.tight_layout()
# plt.savefig(f'spectro-hypno QS {folder} {LR}.png', dpi=100, bbox_inches='tight')
plt.show()


Duration: 50310.0 (sec) OR 13:58:30


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# `train_test_split` is called with `test_prop` / `n_feat` keywords, so it is
# presumably a helper defined in an earlier cell, not sklearn's function of
# the same name — TODO confirm.
X_train_std2, X_test_std2, y_train2, y_test2 = train_test_split(n_feat=60, test_prop=0.25)

# RBF-kernel SVM; `fit` returns the estimator itself, so `svm` is the fitted model.
svm = SVC(C=10, kernel="rbf", random_state=1).fit(X_train_std2, y_train2)
y_pred2 = svm.predict(X_test_std2)

n_misclassified = (y_test2 != y_pred2).sum()
test_accuracy = accuracy_score(y_test2, y_pred2)
print("Misclassified examples: %d" % n_misclassified)
print("Accuracy: %.3f" % test_accuracy)


In [None]:
# Per-class metrics and confusion matrix for the predictions in `y_pred`.
# NOTE(review): this evaluates `y_test` / `y_pred`, not the `y_test2` /
# `y_pred2` produced by the SVM cell just before — confirm which model this
# report is meant for.
# `classification_report`, `confusion_matrix` and `confmat_f` are not defined
# in this cell; they must already be in the kernel namespace.
report = classification_report(y_test, y_pred)
print(report)

confmat = confusion_matrix(y_test, y_pred)
confmat_f(confmat)
