# Feature selection across all recordings to find the optimal set of features

In [1]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
# Level-name -> numeric level lookup, built from the standard ``logging`` constants.
LOGGING_TYPES = {
    level_name: getattr(logging, level_name)
    for level_name in ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
}
# Logger registered under the name 'yasa'.
logger = logging.getLogger("yasa")

%matplotlib qt

In [2]:
# Load the reference table, indexed by its "name" column (one row per recording).
# Later cells read the "hypno" and "df_feat" columns of each row as file paths.
reference_df = pd.read_csv("reference_df.csv", index_col="name")


In [3]:
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from scipy.stats import kruskal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import seaborn as sns

# ANOVA F-value

In [4]:
def feature_selection_f_classif(x, y):
    """
    Compute the ANOVA F-value of every feature in ``x`` against labels ``y``.

    Parameters
    ----------
    x : pandas.DataFrame or array-like, shape (n_samples, n_features)
        Feature matrix. When it has a ``columns`` attribute, the column
        labels are used to index the returned scores.
    y : array-like, shape (n_samples,)
        Class label of each sample.

    Returns
    -------
    fvals : pandas.Series
        One F-value per feature, sorted in ascending order. Indexed by the
        column labels of ``x`` (or by position if ``x`` has no columns).

    Example
    -------
    >>> fvals = feature_selection_f_classif(df_feat, hypno_30s)
    """
    # k="all" keeps every feature: only the scores are of interest here.
    selector = SelectKBest(f_classif, k="all")
    fit = selector.fit(x, y)
    # Label the scores with the columns of the *argument*, not of whatever
    # ``df_feat`` happens to exist in the notebook's global namespace.
    feature_names = getattr(x, "columns", None)
    fvals = pd.Series(fit.scores_, index=feature_names).sort_values()
    return fvals


# Collect one single-row frame of F-values per recording and concatenate once
# after the loop (growing a DataFrame with concat inside the loop is quadratic).
fvals_rows = []

for fname in reference_df.index.to_list():

    # to load hypno:
    hypno_loc = reference_df.loc[fname, "hypno"]
    hypno_30s = np.loadtxt(hypno_loc)[:, 0]

    # to load features:
    df_feat_loc = reference_df.loc[fname, "df_feat"]
    df_feat = pd.read_csv(df_feat_loc, index_col=False)

    # do not process -1/artifact: drop those epochs from features and labels alike
    is_scored = hypno_30s != -1
    df_feat = df_feat.iloc[is_scored]
    hypno_30s = hypno_30s[is_scored]

    df_feat = df_feat.replace(
        [np.inf, -np.inf], 0
    )  # Replacing infinite values in features

    # Compute the ANOVA F-value for the provided sample
    fvals = feature_selection_f_classif(df_feat, hypno_30s)

    # fig = plt.figure(figsize=(10, 10))
    # plt.suptitle(f'ANOVA F-value for features {fname}')
    # sns.barplot(y=fvals.index, x=fvals, palette="RdYlGn")
    # plt.xlabel("ANOVA F-value")
    # plt.xticks(rotation=20)
    # plt.yticks(size=8)
    # plt.tight_layout()
    # plt.savefig(f"fs_fclassif figures/fs_fclassif {fname}.png", format="png")
    # plt.savefig(f"fs_fclassif figures/fs_fclassif {fname}.svg", format="svg")
    # plt.close(fig)

    # One row labelled with the recording name. The label lives in the index,
    # so the values stay numeric instead of being mixed with a string.
    fvals_rows.append(fvals.rename(fname).to_frame().T)

if not fvals_rows:
    raise ValueError("reference_df lists no recordings; nothing to score.")

# Rows: recordings, columns: features. Column order follows the first
# recording's ascending F-value order (concat keeps first-seen column order).
fvals_df = pd.concat(fvals_rows).rename_axis("name").astype(float)

# Rank features within each recording: rank 1 = largest F-value,
# ties broken by column order.
fvals_rank_df = fvals_df.rank(axis=1, ascending=False, method="first")


In [5]:
# Preview the per-recording ANOVA F-values (rows: recordings, columns: features).
fvals_df.head()

Unnamed: 0_level_0,mean,mean_distance,kurt,tsallisEnt,ta_ab,skew,kurt_psd,hmob_psd,skew_psd,hcomp_psd,...,ta_b,ag,sb,bs,ab,bubbleEnt2,bubbleEnt1,higuchi,petrosian,perm_entropy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P18_N3 L,0.0928,6.149911,7.692397,11.067577,14.120312,14.196453,15.915379,17.275372,18.200308,18.566625,...,322.636758,349.377676,408.420443,425.036249,465.798516,477.2886,503.638771,549.004697,727.020899,733.173883
P18_N2 R,1.500872,10.749421,12.394198,6.636104,134.130197,14.671352,0.629736,2.685262,0.180387,1.173113,...,380.517748,179.256889,185.807955,274.501092,276.15998,326.350628,319.089471,321.482972,357.859509,376.608552
P17_N2 L,0.101974,13.74741,4.106133,1.912803,23.509277,24.574931,34.463018,28.87749,31.529567,47.311055,...,98.802522,48.973867,184.81137,88.172361,103.455912,143.761545,140.083655,156.364232,156.887225,158.272113
P15_N3 L,0.203199,1.480968,12.550943,4.6729,43.016497,20.763756,30.550745,27.175153,33.382739,25.066674,...,375.339801,310.956197,383.652952,185.725355,369.576229,138.154496,160.329126,88.843691,82.148455,92.989689
P15_N2 L,1.829339,2.444973,11.41833,12.688154,32.638936,15.668298,11.526563,9.559136,12.219586,9.148662,...,33.889055,53.687051,72.672184,35.712206,49.578912,54.414961,66.50891,83.255641,96.330132,102.109769


In [6]:
# Preview the per-recording feature ranks (1 = largest F-value in that recording).
fvals_rank_df.head()

Unnamed: 0_level_0,mean,mean_distance,kurt,tsallisEnt,ta_ab,skew,kurt_psd,hmob_psd,skew_psd,hcomp_psd,...,ta_b,ag,sb,bs,ab,bubbleEnt2,bubbleEnt1,higuchi,petrosian,perm_entropy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P18_N3 L,75.0,74.0,73.0,72.0,71.0,70.0,69.0,68.0,67.0,66.0,...,10.0,9.0,8.0,7.0,6.0,5.0,4.0,3.0,2.0,1.0
P18_N2 R,72.0,64.0,63.0,66.0,19.0,60.0,74.0,71.0,75.0,73.0,...,1.0,14.0,13.0,10.0,9.0,5.0,7.0,6.0,3.0,2.0
P17_N2 L,75.0,59.0,73.0,74.0,45.0,41.0,27.0,34.0,31.0,22.0,...,9.0,20.0,1.0,11.0,8.0,6.0,7.0,5.0,4.0,3.0
P15_N3 L,75.0,74.0,71.0,73.0,40.0,63.0,51.0,56.0,49.0,59.0,...,2.0,6.0,1.0,9.0,3.0,12.0,10.0,17.0,21.0,16.0
P15_N2 L,75.0,74.0,64.0,59.0,34.0,53.0,63.0,68.0,60.0,69.0,...,33.0,12.0,6.0,28.0,17.0,11.0,7.0,3.0,2.0,1.0


In [7]:
# Box plot of each feature's ANOVA F-value across recordings, with the
# features ordered by their median rank (lowest median rank first).
plt.figure(figsize=(15, 5))

# first name is too long :) so abbreviate it!
abbreviations = {"central_tendency_measure": "CTM"}
fvals_df = fvals_df.rename(columns=abbreviations)
fvals_rank_df = fvals_rank_df.rename(columns=abbreviations)

columns_by_median_rank = fvals_rank_df.median().sort_values().index
fvals_df = fvals_df.astype(float)[columns_by_median_rank]

ax = fvals_df.boxplot()
for axis_name, rotation, color in (("x", -90, "darkblue"), ("y", 20, "orangered")):
    ax.tick_params(
        axis=axis_name,
        labelsize=10,
        labelrotation=rotation,
        labelcolor=color,
    )

n_recordings = reference_df.shape[0]
plt.xlabel("Feature")
plt.ylabel("ANOVA F-value")
plt.title(f"ANOVA F-value across {n_recordings} recordings")
plt.tight_layout()
# plt.savefig("fs_fclassif_ranking.png")
# plt.savefig("fs_fclassif_ranking.svg")
plt.show()

## Plot signal with the best feature from ANOVA

In [11]:
# Load the EDF file
# NOTE(review): `location` is an absolute, machine-specific path; point it at
# your own data directory before running this cell.
fname = "P18_N2"  # define here
lr = "R"  # define here
location = f"/Users/amirhosseindaraie/Desktop/data/autoscoring-material/data/Zmax Donders/{fname}"
raw = mne.io.read_raw_edf(f"{location}/EEG {lr}.edf", preload=True, verbose=0)
raw.pick_types(eeg=True)
# fig = raw.plot(use_opengl=False)

# Apply a zero-phase bandpass filter between 0.5 ~ 45 Hz
raw.filter(0.5, 45)

# Extract the data (public accessor instead of the private ``_data``)
# and convert from V to uV. Shape: (n_channels, n_samples).
data = raw.get_data() * 1e6
sf = raw.info["sfreq"]
chan = raw.ch_names

# Time vector in seconds: one entry per sample along the last axis.
# ``data.size`` would count the samples of all channels together.
times = np.arange(data.shape[-1]) / sf


def sliding_window(data, sf, window, step=None, axis=-1):
    """Calculate a sliding window of a 1D or 2D EEG signal.
    .. versionadded:: 0.1.7
    Parameters
    ----------
    data : numpy array
        The 1D or 2D EEG data.
    sf : float
        The sampling frequency of ``data``. Must be a whole number.
    window : int
        The sliding window length, in seconds.
    step : int
        The sliding window step length, in seconds.
        If None (default), ``step`` is set to ``window``,
        which results in no overlap between the sliding windows.
    axis : int
        The axis to slide over. Defaults to the last axis. Negative values
        count from the end, as in NumPy indexing.
    Returns
    -------
    times : numpy array
        Time vector, in seconds, corresponding to the START of each sliding
        epoch in ``strided``.
    strided : numpy array
        A matrix where row in last dimension consists of one instance
        of the sliding window, shape (n_epochs, ..., n_samples).
    Raises
    ------
    AssertionError
        If an argument has the wrong type, ``axis`` is out of range,
        ``window * sf`` or ``step * sf`` is not a whole number >= 1, or the
        window is longer than the selected axis.
    Notes
    -----
    This is a wrapper around the
    :py:func:`numpy.lib.stride_tricks.as_strided` function. The returned
    epochs are a view on ``data``, not a copy.
    Examples
    --------
    With a 1-D array
    >>> import numpy as np
    >>> from yasa import sliding_window
    >>> data = np.arange(20)
    >>> times, epochs = sliding_window(data, sf=1, window=5)
    >>> times
    array([ 0.,  5., 10., 15.])
    >>> epochs
    array([[ 0,  1,  2,  3,  4],
           [ 5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14],
           [15, 16, 17, 18, 19]])
    >>> sliding_window(data, sf=1, window=5, step=2)[1]
    array([[ 0,  1,  2,  3,  4],
           [ 2,  3,  4,  5,  6],
           [ 4,  5,  6,  7,  8],
           [ 6,  7,  8,  9, 10],
           [ 8,  9, 10, 11, 12],
           [10, 11, 12, 13, 14],
           [12, 13, 14, 15, 16],
           [14, 15, 16, 17, 18]])
    >>> sliding_window(data, sf=1, window=11)[1]
    array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])
    With a N-D array
    >>> np.random.seed(42)
    >>> # 4 channels x 20 samples
    >>> data = np.random.randint(-100, 100, size=(4, 20))
    >>> epochs = sliding_window(data, sf=1, window=10)[1]
    >>> epochs.shape  # shape (n_epochs, n_channels, n_samples)
    (2, 4, 10)
    >>> epochs
    array([[[  2,  79,  -8, -86,   6, -29,  88, -80,   2,  21],
            [-13,  57, -63,  29,  91,  87, -80,  60, -43, -79],
            [-50,   7, -46, -37,  30, -50,  34, -80, -28,  66],
            [ -9,  10,  87,  98,  71, -93,  74, -66, -20,  63]],
           [[-26, -13,  16,  -1,   3,  51,  30,  49, -48, -99],
            [-12, -52, -42,  69,  87, -86,  89,  89,  74,  89],
            [-83,  31, -12, -41, -87, -92, -11, -48,  29, -17],
            [-51,   3,  31, -99,  33, -47,   5, -97, -47,  90]]])
    """
    from numpy.lib.stride_tricks import as_strided

    assert isinstance(axis, int), "axis must be int."
    # Valid axes are -ndim .. ndim - 1; ``axis == ndim`` is already out of range.
    assert -data.ndim <= axis < data.ndim, "Axis value out of range."
    assert isinstance(sf, (int, float)), "sf must be int or float"
    assert isinstance(window, (int, float)), "window must be int or float"
    assert isinstance(step, (int, float, type(None))), (
        "step must be int, " "float or None."
    )
    if isinstance(sf, float):
        assert sf.is_integer(), "sf must be a whole number."
        sf = int(sf)

    # Work with a non-negative axis so it can be used after the extra
    # (window) dimension is appended.
    axis = axis % data.ndim

    # window and step in samples instead of seconds
    window *= sf
    step = window if step is None else step * sf

    if isinstance(window, float):
        assert window.is_integer(), "window * sf must be a whole number."
        window = int(window)

    if isinstance(step, float):
        assert step.is_integer(), "step * sf must be a whole number."
        step = int(step)

    n_samples = data.shape[axis]
    assert step >= 1, "Stepsize may not be zero or negative."
    assert window >= 1, "Window size may not be zero or negative."
    # A window exactly as long as the signal is valid and yields one epoch.
    assert window <= n_samples, (
        "Sliding window size may not exceed " "size of selected axis"
    )

    # Number of complete windows. Integer arithmetic avoids floating-point
    # rounding, which matters because as_strided does no bounds checking.
    n_epochs = (n_samples - window) // step + 1

    # Define output shape: the slid axis becomes the epoch axis and a new
    # trailing axis holds the samples of each window.
    shape = list(data.shape)
    shape[axis] = n_epochs
    shape.append(window)

    # Calculate strides and time vector
    strides = list(data.strides)
    strides[axis] *= step
    strides.append(data.strides[axis])
    strided = as_strided(data, shape=shape, strides=strides)
    t = np.arange(n_epochs) * (step / sf)

    # Swap axis: n_epochs, ..., n_samples. Move the epoch axis (wherever
    # ``axis`` put it) to the front.
    if strided.ndim > 2:
        strided = np.moveaxis(strided, axis, 0)
    return t, strided


# Cut the first channel into consecutive, non-overlapping 30-second windows.
epoch_starts_sec, data_win = sliding_window(data[0], sf, window=30)

# Window start times, expressed in minutes.
times = epoch_starts_sec / 60

# Two stacked panels sharing the x axis: hypnogram on top, feature below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax1, ax2 = axes

fig.suptitle(f'Anova F-value top rank $\\sigma/\\beta$ for {fname} {lr}')

# Load hypnogram (first column of the text file).
# NOTE(review): the file name is hard-coded and does not follow `fname` --
# update it together with `fname` when plotting another recording.
location_hypno = "/Users/amirhosseindaraie/Desktop/data/synced-hypnos-merged"
hypno_path = f"{location_hypno}/p18n2_synced.txt"
hypno_30s = np.loadtxt(hypno_path)[:, 0]

# Re-code the stage values to plotting levels. After negation below they line
# up with the tick labels: 0 -> W, 1 -> R, 2 -> N1, 3 -> N2, 4 -> N3.
plot_level_of_stage = {-1: -1, 0: 0, 1: 2, 2: 3, 3: 4, 4: 1}
hypno = pd.Series(hypno_30s).map(plot_level_of_stage).values
# Keep only the level-1 (R) epochs visible; everything else is masked out.
hypno_rem = np.ma.masked_where(hypno != 1, hypno)

# Load feature object as a dataframe
feature_csv = f"feature/{fname} {lr}.csv"
df_feat = pd.read_csv(feature_csv, index_col=False)

# Plot the hypnogram: full trace in black, R epochs overdrawn in red.
for trace, color, width in ((hypno, "k", 1.5), (hypno_rem, "r", 2.5)):
    ax1.step(times, -1 * trace, color=color, lw=width)

tick_of_label = {"W": 0, "R": -1, "N1": -2, "N2": -3, "N3": -4}
ax1.set_yticks(list(tick_of_label.values()))
ax1.set_yticklabels(list(tick_of_label.keys()))
ax1.set_ylim(-4.5, 0.5)
ax1.set_ylabel("Sleep stage")

def normalize(v):
    """Divide ``v`` by its norm (``np.linalg.norm`` with default arguments).

    ``v`` is returned unchanged when that norm is zero, which avoids a
    division by zero.
    """
    magnitude = np.linalg.norm(v)
    return v if magnitude == 0 else v / magnitude

# Plot the normalized "sb" feature against time in the lower panel.
sb_normalized = normalize(df_feat["sb"])
ax2.plot(times, sb_normalized)
ax2.set_ylabel("Sigma / Beta ($\\sigma/\\beta$)")
ax2.set_xlabel("Time [minutes]")
ax2.set_xlim(0, times[-1])

plt.tight_layout()
# Save the figure as PNG first, then SVG, before showing it.
for extension in ("png", "svg"):
    plt.savefig(f"fs_fclassif_rank1 {fname} {lr}.{extension}")
plt.show()


Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 1691 samples (6.605 sec)



# chi squared