# Classification Implementation

In [1]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
LOGGING_TYPES = dict(DEBUG=logging.DEBUG, INFO=logging.INFO, WARNING=logging.WARNING,
                     ERROR=logging.ERROR, CRITICAL=logging.CRITICAL)
logger = logging.getLogger('yasa')

%matplotlib qt


In [2]:
# load reference_df     
reference_df = pd.read_csv("reference_df.csv", index_col="name")
reference_df.head(3)

Unnamed: 0_level_0,hypno,df_feat,eeg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P18_N3 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N3 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P18_N2 R,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P18_N2 R.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...
P17_N2 L,/Users/amirhosseindaraie/Desktop/data/synced-h...,feature/P17_N2 L.csv,/Users/amirhosseindaraie/Desktop/data/autoscor...


In [3]:
# load csv    
rankings_df = pd.read_csv("rankings_df aug.csv", index_col="method_name")
rankings_df.head(3)

Unnamed: 0_level_0,iqr,ab,ag,sb,sg,bs,ta_b,gs,ga,std,...,mean,ts,da,dfa,std_psd,ds,dt,katz,mean_psd,mean_distance
method_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f_classif,9.0,2.0,3.0,1.0,4.0,7.0,5.0,15.0,26.0,39.0,...,73.0,54.0,68.0,45.0,60.0,71.0,65.0,56.0,64.0,63.0
MI,1.0,19.0,18.0,22.0,27.0,24.0,23.0,28.0,17.0,2.0,...,14.0,52.0,47.0,70.0,65.0,50.0,63.0,72.0,68.0,73.0
chiSqr,6.0,1.0,4.0,2.0,8.0,9.0,12.0,3.0,5.0,13.0,...,73.0,61.0,58.0,60.0,54.0,62.0,63.0,64.0,66.0,71.0


In [4]:
### to see id's
idx = reference_df.index.to_list()

epochs_count = 0
hypno_30s_all = np.array([])

# to loop over all recording files:
for i in range(0, len(reference_df)):
    # To load information of each night:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")
    # to append current hypno to array of all hypnos - to plotting histogram later:
    hypno_30s_all = np.append(hypno_30s_all, hypno_30s)
    # count the number of epochs
    epochs_count += len(hypno_30s)

print(f"{epochs_count} epochs available across {len(idx)} recordings.")

# plotting histogram of classes in all hypnos:
stages, counts = np.unique(hypno_30s_all, return_counts=True)

fig, ax = plt.subplots(figsize=(7, 6))
ax.bar(stages, counts, color="blueviolet")
ax.set(xticks=np.arange(0, 4 + 1, 1), xticklabels=["Wake", "N1", "N2", "N3", "REM"])
ax.tick_params(axis="x", labelsize=13, labelrotation=20, labelcolor="green", width=3)
ax.tick_params(axis="y", labelsize=13, labelrotation=20, labelcolor="orangered")
plt.xlabel("Sleep stage")
plt.ylabel("Count")
plt.title(f"Sleep stages for {epochs_count} epochs across {len(idx)} recordings")
plt.tight_layout()
plt.savefig("stage_distribution_count aug.svg")
plt.savefig("stage_distribution_count aug.png")
plt.show()


120272 epochs available across 61 recordings.


In [None]:
### to load augmented hypno:
name = reference_df.iloc[0].name
hypno_30s_loc = reference_df.iloc[0].hypno
hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

### to load features for augmented eeg:
df_feat_loc = reference_df.iloc[0].df_feat
df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
df_feat = pd.read_csv(df_feat_loc, index_col=False)

### to load augmented eeg:
eeg_loc = reference_df.iloc[0].eeg
eeg_loc = eeg_loc.split(".")[0] + " aug.txt"
data = np.loadtxt(eeg_loc, delimiter=",")  # took ~7 seconds # this is filtered data actually

In [5]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import kruskal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

# Train on 60 nights, test of 1 night
(1560*60 x 75)

In [6]:
idx_all_recordings = np.random.permutation(len(reference_df))
idx_train_recordings = idx_all_recordings[0:-1]
idx_test_recordings = idx_all_recordings[-1]
print("train recordings (index): ")
print(idx_train_recordings)
print("test recordings: ")
print(idx_test_recordings)


train recordings (index): 
[ 3 26 11 16 35 19 44 28  0 37 17 43 33 60 46 39  2 25 34 51 22 48 24  8
 57 38 53 54 41 30 29 55 36 56 40 58 50  5 20 14 10 45  7 52 18  1 42  6
 12 49 15 32 23 21  9 47 31 13 59  4]
test recordings: 
27


In [7]:
df_feat_X_train = np.array([])
df_feat_X_test = np.array([])
hypno_y_train = np.array([])
hypno_y_test = np.array([])

columns = rankings_df.columns[:40]  # for selecting top columns

# to loop over all recording files:
for i in idx_train_recordings:
    ### to load augmented hypno:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    ### to load features for augmented eeg:
    df_feat_loc = reference_df.iloc[i].df_feat
    df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    ### select top 25 ranks columns
    df_feat = df_feat[columns]

    ### shuffle X
    permut = np.random.permutation(df_feat.shape[0])
    df_feat = df_feat.iloc[permut]

    ### to load features for train: append df_feat to df_feat_X_train
    if i == idx_train_recordings[0]:
        df_feat_X_train = df_feat.to_numpy()
    else:
        df_feat_X_train = np.vstack([df_feat_X_train, df_feat.to_numpy()])

    ### shuffle y
    hypno_30s = hypno_30s[permut]

    ### to load labels for train: append hypno to hypno_y_train
    hypno_y_train = np.append(hypno_y_train, hypno_30s)


### to load features for test:
df_feat_loc = reference_df.iloc[idx_test_recordings].df_feat
df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
df_feat = pd.read_csv(df_feat_loc, index_col=False)
df_feat_X_test = df_feat[columns].to_numpy()

### to load labels for test:
hypno_30s_loc = reference_df.iloc[idx_test_recordings].hypno
hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")
hypno_y_test = hypno_30s

print(
    f"Train set: X={df_feat_X_train.shape} y={df_feat_X_train.shape, hypno_y_train.shape}"
)
print(f"Test set: {df_feat_X_test.shape} y={df_feat_X_test.shape, hypno_y_test.shape}")


Train set: X=(118258, 40) y=((118258, 40), (118258,))
Test set: (2014, 40) y=((2014, 40), (2014,))


In [8]:
df_feat_all = np.array([])

# to loop over all recording files:
for i in range(len(reference_df)):
    ### to load augmented hypno:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    ### to load features for augmented eeg:
    df_feat_loc = reference_df.iloc[i].df_feat
    df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    ### select top 25 ranks columns
    df_feat = df_feat[columns]

    ### to load features for train: append df_feat to df_feat_X_train
    if i == 0:
        df_feat_all = df_feat.to_numpy()
    else:
        df_feat_all = np.vstack([df_feat_all, df_feat.to_numpy()])

print(df_feat_all.shape)


(120272, 40)


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# we will standardize the columns in dataset before we feed them to a classifier
sc = StandardScaler()
sc.fit(df_feat_all)
X_train_std = sc.transform(df_feat_X_train)
X_test_std = sc.transform(df_feat_X_test)

svm = SVC(kernel="rbf")
svm.fit(X_train_std, hypno_y_train)
y_pred = svm.predict(X_test_std)
print("Misclassified examples: %d" % (hypno_y_test != y_pred).sum())
print("Accuracy: %.3f" % accuracy_score(hypno_y_test, y_pred))


In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(hypno_y_test, y_pred)


In [None]:
def find_TP(y_true, y_pred):
    # counts the number of true positives (y_true = 1, y_pred = 1)
    return sum((y_true == 1) & (y_pred == 1))


def find_FN(y_true, y_pred):
    # counts the number of false negatives (y_true = 1, y_pred = 0)
    return  # your code here


def find_FP(y_true, y_pred):
    # counts the number of false positives (y_true = 0, y_pred = 1)
    return  # your code here


def find_TN(y_true, y_pred):
    # counts the number of true negatives (y_true = 0, y_pred = 0)
    return  # your code here


print("TP:", find_TP((hypno_y_test, y_pred)))
print("FN:", find_FN((hypno_y_test, y_pred)))
print("FP:", find_FP((hypno_y_test, y_pred)))
print("TN:", find_TN((hypno_y_test, y_pred)))


In [None]:
df_feat_X_train = np.array([])
df_feat_X_test = np.array([])
hypno_y_train = np.array([])
hypno_y_test = np.array([])

columns = rankings_df.columns[:40]  # for selecting top columns

# to loop over all recording files:
for i in idx_train_recordings:
    ### to load augmented hypno:
    name = reference_df.iloc[i].name
    hypno_30s_loc = reference_df.iloc[i].hypno
    hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
    hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")

    ### to load features for augmented eeg:
    df_feat_loc = reference_df.iloc[i].df_feat
    df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    ### select top 25 ranks columns
    df_feat = df_feat[columns]

    ### shuffle X
    permut = np.random.permutation(df_feat.shape[0])
    df_feat = df_feat.iloc[permut]

    ### to load features for train: append df_feat to df_feat_X_train
    if i == idx_train_recordings[0]:
        df_feat_X_train = df_feat.to_numpy()
    else:
        df_feat_X_train = np.vstack([df_feat_X_train, df_feat.to_numpy()])

    ### shuffle y
    hypno_30s = hypno_30s[permut]

    ### to load labels for train: append hypno to hypno_y_train
    hypno_y_train = np.append(hypno_y_train, hypno_30s)


### to load features for test:
df_feat_loc = reference_df.iloc[idx_test_recordings].df_feat
df_feat_loc = df_feat_loc.split(".")[0] + " aug.csv"
df_feat = pd.read_csv(df_feat_loc, index_col=False)
df_feat_X_test = df_feat[columns].to_numpy()

### to load labels for test:
hypno_30s_loc = reference_df.iloc[idx_test_recordings].hypno
hypno_30s_loc = hypno_30s_loc.split(".")[0] + " aug.txt"
hypno_30s = np.loadtxt(hypno_30s_loc, delimiter="\n")
hypno_y_test = hypno_30s

print(
    f"Train set: X={df_feat_X_train.shape} y={df_feat_X_train.shape, hypno_y_train.shape}"
)
print(f"Test set: {df_feat_X_test.shape} y={df_feat_X_test.shape, hypno_y_test.shape}")


Train set: X=(118258, 40) y=((118258, 40), (118258,))
Test set: (2014, 40) y=((2014, 40), (2014,))
