In [1]:
import numpy as np
import mne
from scipy import signal
from scipy.interpolate import RectBivariateSpline
from mne.filter import resample, filter_data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from lspopt import spectrogram_lspopt
from matplotlib.colors import Normalize, ListedColormap

import logging
LOGGING_TYPES = dict(DEBUG=logging.DEBUG, INFO=logging.INFO, WARNING=logging.WARNING,
                     ERROR=logging.ERROR, CRITICAL=logging.CRITICAL)
logger = logging.getLogger('yasa')

%matplotlib qt

In [2]:
# Load the EDF file
fname = "P18_N3"  # define here
lr = "L"  # define here
location = f"/Users/amirhosseindaraie/Desktop/data/autoscoring-material/data/Zmax Donders/{fname}"
raw = mne.io.read_raw_edf(f"{location}/EEG {lr}.edf", preload=True, verbose=0)
raw.pick_types(eeg=True)

# Apply a zero-phase bandpass filter between 0.5 ~ 45 Hz
raw.filter(0.5, 45)

# Extract the data and convert from V to uV
data = raw._data * 1e6
sf = raw.info["sfreq"]
chan = raw.ch_names


def format_seconds_to_hhmmss(seconds):
    # Return hhmmss of total seconds parameter
    hours = seconds // (60 * 60)
    seconds %= 60 * 60
    minutes = seconds // 60
    seconds %= 60
    return "%02i:%02i:%02i" % (hours, minutes, seconds)


print(
    f"Duration: {data.shape[1]/sf} (sec) OR {format_seconds_to_hhmmss(data.shape[1]/sf)}"
)


Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 1691 samples (6.605 sec)

Duration: 28490.0 (sec) OR 07:54:50


In [3]:
import antropy as ant
import scipy.signal as sp_sig
import scipy.stats as sp_stats
from numpy import apply_along_axis as apply

pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("max_colwidth", -1)

# Time vector in seconds
times = np.arange(data.size) / sf


def sliding_window(data, sf, window, step=None, axis=-1):
    """Calculate a sliding window of a 1D or 2D EEG signal.
    .. versionadded:: 0.1.7
    Parameters
    ----------
    data : numpy array
        The 1D or 2D EEG data.
    sf : float
        The sampling frequency of ``data``.
    window : int
        The sliding window length, in seconds.
    step : int
        The sliding window step length, in seconds.
        If None (default), ``step`` is set to ``window``,
        which results in no overlap between the sliding windows.
    axis : int
        The axis to slide over. Defaults to the last axis.
    Returns
    -------
    times : numpy array
        Time vector, in seconds, corresponding to the START of each sliding
        epoch in ``strided``.
    strided : numpy array
        A matrix where row in last dimension consists of one instance
        of the sliding window, shape (n_epochs, ..., n_samples).
    Notes
    -----
    This is a wrapper around the
    :py:func:`numpy.lib.stride_tricks.as_strided` function.
    Examples
    --------
    With a 1-D array
    >>> import numpy as np
    >>> from yasa import sliding_window
    >>> data = np.arange(20)
    >>> times, epochs = sliding_window(data, sf=1, window=5)
    >>> times
    array([ 0.,  5., 10., 15.])
    >>> epochs
    array([[ 0,  1,  2,  3,  4],
           [ 5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14],
           [15, 16, 17, 18, 19]])
    >>> sliding_window(data, sf=1, window=5, step=1)[1]
    array([[ 0,  1,  2,  3,  4],
           [ 2,  3,  4,  5,  6],
           [ 4,  5,  6,  7,  8],
           [ 6,  7,  8,  9, 10],
           [ 8,  9, 10, 11, 12],
           [10, 11, 12, 13, 14],
           [12, 13, 14, 15, 16],
           [14, 15, 16, 17, 18]])
    >>> sliding_window(data, sf=1, window=11)[1]
    array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])
    With a N-D array
    >>> np.random.seed(42)
    >>> # 4 channels x 20 samples
    >>> data = np.random.randint(-100, 100, size=(4, 20))
    >>> epochs = sliding_window(data, sf=1, window=10)[1]
    >>> epochs.shape  # shape (n_epochs, n_channels, n_samples)
    (2, 4, 10)
    >>> epochs
    array([[[  2,  79,  -8, -86,   6, -29,  88, -80,   2,  21],
            [-13,  57, -63,  29,  91,  87, -80,  60, -43, -79],
            [-50,   7, -46, -37,  30, -50,  34, -80, -28,  66],
            [ -9,  10,  87,  98,  71, -93,  74, -66, -20,  63]],
           [[-26, -13,  16,  -1,   3,  51,  30,  49, -48, -99],
            [-12, -52, -42,  69,  87, -86,  89,  89,  74,  89],
            [-83,  31, -12, -41, -87, -92, -11, -48,  29, -17],
            [-51,   3,  31, -99,  33, -47,   5, -97, -47,  90]]])
    """
    from numpy.lib.stride_tricks import as_strided

    assert axis <= data.ndim, "Axis value out of range."
    assert isinstance(sf, (int, float)), "sf must be int or float"
    assert isinstance(window, (int, float)), "window must be int or float"
    assert isinstance(step, (int, float, type(None))), (
        "step must be int, " "float or None."
    )
    if isinstance(sf, float):
        assert sf.is_integer(), "sf must be a whole number."
        sf = int(sf)
    assert isinstance(axis, int), "axis must be int."

    # window and step in samples instead of points
    window *= sf
    step = window if step is None else step * sf

    if isinstance(window, float):
        assert window.is_integer(), "window * sf must be a whole number."
        window = int(window)

    if isinstance(step, float):
        assert step.is_integer(), "step * sf must be a whole number."
        step = int(step)

    assert step >= 1, "Stepsize may not be zero or negative."
    assert window < data.shape[axis], (
        "Sliding window size may not exceed " "size of selected axis"
    )

    # Define output shape
    shape = list(data.shape)
    shape[axis] = np.floor(data.shape[axis] / step - window / step + 1).astype(int)
    shape.append(window)

    # Calculate strides and time vector
    strides = list(data.strides)
    strides[axis] *= step
    strides.append(data.strides[axis])
    strided = as_strided(data, shape=shape, strides=strides)
    t = np.arange(strided.shape[-2]) * (step / sf)

    # Swap axis: n_epochs, ..., n_samples
    if strided.ndim > 2:
        strided = np.rollaxis(strided, -2, 0)
    return t, strided


# Convert the EEG data to 30-sec data
times, data_win = sliding_window(data[0], sf, window=30)

# Convert times to minutes
times /= 60


def lziv(x):
    """Binarize the EEG signal and calculate the Lempel-Ziv complexity."""
    return ant.lziv_complexity(x > x.mean(), normalize=True)


  pd.set_option("max_colwidth", -1)


In [4]:
# This cell took ~2min for ~8 hours of sleep data

# Calculate standard descriptive statistics
hmob, hcomp = ant.hjorth_params(data_win, axis=1)

# Feature extraction
df_feat = {
    # Statistical
    "std": apply(np.std, arr=data_win, axis=1, ddof=1),
    "mean": apply(np.mean, arr=data_win, axis=1),
    "median": apply(np.median, arr=data_win, axis=1),
    "iqr": apply(sp_stats.iqr, arr=data_win, axis=1, rng=(25, 75)),
    "skew": apply(sp_stats.skew, arr=data_win, axis=1),
    "kurt": apply(sp_stats.kurtosis, arr=data_win, axis=1),
    "nzc": apply(ant.num_zerocross, arr=data_win, axis=1),
    "hmob": hmob,
    "hcomp": hcomp,
    # Entropy
    "perm_entropy": apply(ant.perm_entropy, axis=1, arr=data_win, normalize=True),
    "svd_entropy": apply(ant.svd_entropy, 1, data_win, normalize=True),
    "sample_entropy": apply(ant.sample_entropy, 1, data_win),
    "app_entropy": apply(ant.app_entropy, 1, data_win, order=2),
    "spec_entropy": apply(
        ant.spectral_entropy,
        1,
        data_win,
        sf,
        normalize=True,
        method="welch",
        nperseg=50,
    ),
    "lziv": apply(ant.lziv_complexity, 1, data_win),
    # Fractal dimension
    "dfa": apply(ant.detrended_fluctuation, 1, data_win),
    "petrosian": apply(ant.petrosian_fd, 1, data_win),
    "katz": apply(ant.katz_fd, 1, data_win),
    "higuchi": apply(ant.higuchi_fd, 1, data_win),
}


df_feat = pd.DataFrame(df_feat)
df_feat.head()


Unnamed: 0,std,mean,median,iqr,skew,kurt,nzc,hmob,hcomp,perm_entropy,svd_entropy,sample_entropy,app_entropy,spec_entropy,lziv,dfa,petrosian,katz,higuchi
0,54.8822,-0.003694,2.306805,44.005194,0.015072,1.379865,192,0.082981,9.403449,0.807286,0.268363,0.177962,0.293946,0.355766,1485,1.486522,1.011184,2.136577,1.39811
1,75.845872,0.738753,1.611624,50.347797,1.052544,6.475114,309,0.084321,8.252569,0.80829,0.268092,0.157969,0.243013,0.341539,1629,1.428058,1.011227,1.831358,1.390486
2,28.866452,-0.206621,2.344649,24.914854,-0.837093,3.476886,473,0.119,7.322419,0.832472,0.34579,0.409602,0.472739,0.538437,1418,1.437477,1.012296,2.184215,1.640853
3,67.180801,-2.024212,-0.236644,15.44831,-0.809879,6.938453,504,0.037268,20.826391,0.812461,0.149264,0.066868,0.096754,0.377652,1208,1.677114,1.011405,1.77912,1.511126
4,72.467643,2.259023,4.616672,76.77325,-0.33279,1.408277,234,0.092555,9.321508,0.813432,0.292107,0.230731,0.303413,0.444736,1684,1.499897,1.011453,1.9954,1.503711


In [5]:
from scipy.integrate import simps
from scipy.signal import welch

# Estimate power spectral density using Welch's method
freqs, psd = welch(data_win, sf, nperseg=int(4 * sf))


def bandpower_from_psd_ndarray(
    psd,
    freqs,
    bands=[
        (0.5, 4, "Delta"),
        (4, 8, "Theta"),
        (8, 12, "Alpha"),
        (12, 16, "Sigma"),
        (16, 30, "Beta"),
        (30, 40, "Gamma"),
    ],
    relative=True,
):
    """Compute bandpowers in N-dimensional PSD.
    This is a np-only implementation of the :py:func:`yasa.bandpower_from_psd` function,
    which supports 1-D arrays of shape (n_freqs), or N-dimensional arays (e.g. 2-D (n_chan,
    n_freqs) or 3-D (n_chan, n_epochs, n_freqs))
    .. versionadded:: 0.2.0
    Parameters
    ----------
    psd : :py:class:`np.ndarray`
        Power spectral density of data, in uV^2/Hz. Must be a N-D array of shape (..., n_freqs).
        See :py:func:`scipy.signal.welch` for more details.
    freqs : :py:class:`np.ndarray`
        Array of frequencies. Must be a 1-D array of shape (n_freqs,)
    bands : list of tuples
        List of frequency bands of interests. Each tuple must contain the lower and upper
        frequencies, as well as the band name (e.g. (0.5, 4, 'Delta')).
    relative : boolean
        If True, bandpower is divided by the total power between the min and
        max frequencies defined in ``band`` (default 0.5 to 40 Hz).
    Returns
    -------
    bandpowers : :py:class:`np.ndarray`
        Bandpower array of shape *(n_bands, ...)*.
    """
    # Type checks
    assert isinstance(bands, list), "bands must be a list of tuple(s)"
    assert isinstance(relative, bool), "relative must be a boolean"

    # Safety checks
    freqs = np.asarray(freqs)
    psd = np.asarray(psd)
    assert freqs.ndim == 1, "freqs must be a 1-D array of shape (n_freqs,)"
    assert psd.shape[-1] == freqs.shape[-1], "n_freqs must be last axis of psd"

    # Extract frequencies of interest
    all_freqs = np.hstack([[b[0], b[1]] for b in bands])
    fmin, fmax = min(all_freqs), max(all_freqs)
    idx_good_freq = np.logical_and(freqs >= fmin, freqs <= fmax)
    freqs = freqs[idx_good_freq]
    res = freqs[1] - freqs[0]

    # Trim PSD to frequencies of interest
    psd = psd[..., idx_good_freq]

    # Check if there are negative values in PSD
    if (psd < 0).any():
        msg = (
            "There are negative values in PSD. This will result in incorrect "
            "bandpower values. We highly recommend working with an "
            "all-positive PSD. For more details, please refer to: "
            "https://github.com/raphaelvallat/yasa/issues/29"
        )
        logger.warning(msg)

    # Calculate total power
    total_power = simps(psd, dx=res, axis=-1)
    total_power = total_power[np.newaxis, ...]

    # Initialize empty array
    bp = np.zeros((len(bands), *psd.shape[:-1]), dtype=np.float64)

    # Enumerate over the frequency bands
    labels = []
    for i, band in enumerate(bands):
        b0, b1, la = band
        labels.append(la)
        idx_band = np.logical_and(freqs >= b0, freqs <= b1)
        bp[i] = simps(psd[..., idx_band], dx=res, axis=-1)

    if relative:
        bp /= total_power
    return bp


# Compute bandpowers in N-dimensional PSD
bp = bandpower_from_psd_ndarray(psd, freqs)
bp = pd.DataFrame(bp.T, columns=["delta", "theta", "alpha", "sigma", "beta", "gamma"])
df_feat = pd.concat([df_feat, bp], axis=1)
df_feat.head()


Unnamed: 0,std,mean,median,iqr,skew,kurt,nzc,hmob,hcomp,perm_entropy,svd_entropy,sample_entropy,app_entropy,spec_entropy,lziv,dfa,petrosian,katz,higuchi,delta,theta,alpha,sigma,beta,gamma
0,54.8822,-0.003694,2.306805,44.005194,0.015072,1.379865,192,0.082981,9.403449,0.807286,0.268363,0.177962,0.293946,0.355766,1485,1.486522,1.011184,2.136577,1.39811,0.943567,0.043672,0.004745,0.001304,0.003798,0.002913
1,75.845872,0.738753,1.611624,50.347797,1.052544,6.475114,309,0.084321,8.252569,0.80829,0.268092,0.157969,0.243013,0.341539,1629,1.428058,1.011227,1.831358,1.390486,0.881081,0.095164,0.014046,0.003701,0.003483,0.002525
2,28.866452,-0.206621,2.344649,24.914854,-0.837093,3.476886,473,0.119,7.322419,0.832472,0.34579,0.409602,0.472739,0.538437,1418,1.437477,1.012296,2.184215,1.640853,0.9519,0.018425,0.00458,0.002265,0.013815,0.009016
3,67.180801,-2.024212,-0.236644,15.44831,-0.809879,6.938453,504,0.037268,20.826391,0.812461,0.149264,0.066868,0.096754,0.377652,1208,1.677114,1.011405,1.77912,1.511126,0.982728,0.006495,0.002767,0.001265,0.00422,0.002525
4,72.467643,2.259023,4.616672,76.77325,-0.33279,1.408277,234,0.092555,9.321508,0.813432,0.292107,0.230731,0.303413,0.444736,1684,1.499897,1.011453,1.9954,1.503711,0.948788,0.029935,0.005475,0.00184,0.007322,0.006639


In [6]:
# Ratio of spectral power
df_feat.eval("dt = delta / theta", inplace=True)
df_feat.eval("da = delta / alpha", inplace=True)
df_feat.eval("ds = delta / sigma", inplace=True)
df_feat.eval("db = delta / beta", inplace=True)
df_feat.eval("dg = delta / gamma", inplace=True)

df_feat.eval("td = theta / delta", inplace=True)
df_feat.eval("ta = theta / alpha", inplace=True)
df_feat.eval("ts = theta / sigma", inplace=True)
df_feat.eval("tb = theta / beta", inplace=True)
df_feat.eval("tg = theta / gamma", inplace=True)

df_feat.eval("ad = alpha / delta", inplace=True)
df_feat.eval("at = alpha / theta", inplace=True)
df_feat.eval("asi = alpha / sigma", inplace=True)
df_feat.eval("ab = alpha / beta", inplace=True)
df_feat.eval("ag = alpha / gamma", inplace=True)

df_feat.eval("sd = sigma / delta", inplace=True)
df_feat.eval("st = sigma / theta", inplace=True)
df_feat.eval("sa = sigma / alpha", inplace=True)
df_feat.eval("sb = sigma / beta", inplace=True)
df_feat.eval("sg = sigma / gamma", inplace=True)

df_feat.eval("bd = beta / delta", inplace=True)
df_feat.eval("bt = beta / theta", inplace=True)
df_feat.eval("ba = beta / alpha", inplace=True)
df_feat.eval("bs = beta / sigma", inplace=True)
df_feat.eval("bg = beta / gamma", inplace=True)

df_feat.eval("gd = gamma / delta", inplace=True)
df_feat.eval("gt = gamma / theta", inplace=True)
df_feat.eval("ga = gamma / alpha", inplace=True)
df_feat.eval("gs = gamma / sigma", inplace=True)
df_feat.eval("gb = gamma / beta", inplace=True)

df_feat.eval("ta_b = (theta + alpha)/beta", inplace=True)
df_feat.eval("ta_ab = (theta + alpha)/(alpha + beta)", inplace=True)
df_feat.eval("gb_da = (gamma + beta)/(delta + alpha)", inplace=True)

df_feat.head()


Unnamed: 0,std,mean,median,iqr,skew,kurt,nzc,hmob,hcomp,perm_entropy,svd_entropy,sample_entropy,app_entropy,spec_entropy,lziv,dfa,petrosian,katz,higuchi,delta,theta,alpha,sigma,beta,gamma,dt,da,ds,db,dg,td,ta,ts,tb,tg,ad,at,asi,ab,ag,sd,st,sa,sb,sg,bd,bt,ba,bs,bg,gd,gt,ga,gs,gb,ta_b,ta_ab,gb_da
0,54.8822,-0.003694,2.306805,44.005194,0.015072,1.379865,192,0.082981,9.403449,0.807286,0.268363,0.177962,0.293946,0.355766,1485,1.486522,1.011184,2.136577,1.39811,0.943567,0.043672,0.004745,0.001304,0.003798,0.002913,21.605609,198.834382,723.597811,248.410628,323.970098,0.046284,9.202906,33.491202,11.497507,14.994722,0.005029,0.108661,3.639199,1.249334,1.629346,0.001382,0.029859,0.274786,0.343299,0.447721,0.004026,0.086975,0.800426,2.91291,1.304172,0.003087,0.06669,0.613743,2.233533,0.76677,12.746841,5.666939,0.007077
1,75.845872,0.738753,1.611624,50.347797,1.052544,6.475114,309,0.084321,8.252569,0.80829,0.268092,0.157969,0.243013,0.341539,1629,1.428058,1.011227,1.831358,1.390486,0.881081,0.095164,0.014046,0.003701,0.003483,0.002525,9.258548,62.727287,238.091497,252.97289,348.984682,0.108008,6.775068,25.715858,27.323172,37.693242,0.015942,0.1476,3.795661,4.0329,5.563523,0.0042,0.038887,0.263459,1.062503,1.465759,0.003953,0.036599,0.247961,0.941174,1.379534,0.002865,0.02653,0.179742,0.682241,0.724883,31.356072,6.230219,0.006711
2,28.866452,-0.206621,2.344649,24.914854,-0.837093,3.476886,473,0.119,7.322419,0.832472,0.34579,0.409602,0.472739,0.538437,1418,1.437477,1.012296,2.184215,1.640853,0.9519,0.018425,0.00458,0.002265,0.013815,0.009016,51.66301,207.852751,420.227703,68.905277,105.581494,0.019356,4.023241,8.134015,1.333745,2.043657,0.004811,0.248556,2.021757,0.33151,0.507963,0.00238,0.122941,0.494619,0.163971,0.251248,0.014513,0.749769,3.0165,6.098629,1.53227,0.009471,0.489319,1.968648,3.980127,0.652626,1.665255,1.250651,0.023869
3,67.180801,-2.024212,-0.236644,15.44831,-0.809879,6.938453,504,0.037268,20.826391,0.812461,0.149264,0.066868,0.096754,0.377652,1208,1.677114,1.011405,1.77912,1.511126,0.982728,0.006495,0.002767,0.001265,0.00422,0.002525,151.295862,355.211728,777.011653,232.864582,389.258515,0.00661,2.347795,5.13571,1.539134,2.57283,0.002815,0.425931,2.187461,0.655566,1.095849,0.001287,0.194715,0.457151,0.299693,0.500969,0.004294,0.649716,1.5254,3.336753,1.671609,0.002569,0.388677,0.912534,1.996133,0.598226,2.194699,1.325649,0.006844
4,72.467643,2.259023,4.616672,76.77325,-0.33279,1.408277,234,0.092555,9.321508,0.813432,0.292107,0.230731,0.303413,0.444736,1684,1.499897,1.011453,1.9954,1.503711,0.948788,0.029935,0.005475,0.00184,0.007322,0.006639,31.694711,173.287419,515.506576,129.575775,142.91227,0.031551,5.467392,16.264751,4.088246,4.509026,0.005771,0.182903,2.974864,0.747751,0.824712,0.00194,0.061483,0.33615,0.251356,0.277227,0.007717,0.244604,1.337344,3.978418,1.102924,0.006997,0.221777,1.212544,3.607154,0.906681,4.835997,2.766983,0.01463


In [7]:
def hjorth_activity(x):
    """Column-wise computation of Hjorth activity (variance)."""
    return np.var(x, axis=0)


def hjorth_mobility(x):
    """Column-wise computation of Hjorth mobility"""
    return np.sqrt(np.var(np.gradient(x, axis=0), axis=0) / np.var(x, axis=0))


def hjorth_complexity(x):
    """Column-wise computation of Hjorth complexity"""
    return hjorth_mobility(np.gradient(x, axis=0)) / hjorth_mobility(x)

# Energy (E) of the signal is the sum of the squares of amplitude
def energy_fn(x):
    x /= np.max(x)
    return np.mean(x**2)

def calc_wavelet_energy(data_set):
    """
    Input : 1 * N vector
    Output: Float with the wavelet energy of the input vector,
    rounded to 3 decimal places.
    """
    # p_sqr = [i ** 2 for i in data_set]
    wavelet_energy = np.nansum(np.log2(np.square(data_set)))
    return round(wavelet_energy, 3)

In [8]:
E = np.apply_along_axis(energy_fn, 1, data_win)
df_feat["E"] = E

from scipy.integrate import simps
from scipy.signal import welch

# Estimate power spectral density using Welch's method
freqs, psd = welch(data_win, sf, nperseg=int(4 * sf))

# Compute features
## Compute featrues for normal singal (to compare w/ psd later)
hmob, hcomp = ant.hjorth_params(data_win, axis=1)
std_nor = np.apply_along_axis(np.std, 1, data_win, ddof=1)
mean_nor = np.apply_along_axis(np.mean, 1, data_win)
median_nor = np.apply_along_axis(np.median, 1, data_win)
iqr_nor = np.apply_along_axis(sp_stats.iqr, 1, data_win, rng=(25, 75))
skew_nor = np.apply_along_axis(sp_stats.skew, 1, data_win)
kurt_nor = np.apply_along_axis(sp_stats.kurtosis, 1, data_win)
hmob_nor = hmob
hcomp_nor = hcomp

## Compute featrues for PSD
hmob, hcomp = ant.hjorth_params(psd, axis=1)
std_psd = np.apply_along_axis(np.std, 1, psd, ddof=1)
mean_psd = np.apply_along_axis(np.mean, 1, psd)
median_psd = np.apply_along_axis(np.median, 1, psd)
iqr_psd = np.apply_along_axis(sp_stats.iqr, 1, psd, rng=(25, 75))
skew_psd = np.apply_along_axis(sp_stats.skew, 1, psd)
kurt_psd = np.apply_along_axis(sp_stats.kurtosis, 1, psd)
hmob_psd = hmob
hcomp_psd = hcomp

# Add features to features dataframe
df_feat["E"] = E
df_feat["std_psd"] = std_psd
df_feat["mean_psd"] = mean_psd
df_feat["iqr_psd"] = iqr_psd
df_feat["skew_psd"] = skew_psd
df_feat["kurt_psd"] = kurt_psd
df_feat["hmob_psd"] = hmob_psd
df_feat["hcomp_psd"] = hcomp_psd

wavelet_energy = np.apply_along_axis(calc_wavelet_energy, 1, data_win)

# Add features to features dataframe
df_feat["WEn"] = wavelet_energy


In [9]:
import math, sys


def __to_inc(x):
    incs = x[1:] - x[:-1]
    return incs


def __to_pct(x):
    pcts = x[1:] / x[:-1] - 1.0
    return pcts


def __get_RS(series, kind):
    """
    Get rescaled range (using the range of cumulative sum
    of deviations instead of the range of a series as in the simplified version
    of R/S) from a time-series of values.
    Parameters
    ----------
    series : array-like
        (Time-)series
    kind : str
        The kind of series (refer to compute_Hc docstring)
    """

    if kind == "random_walk":
        incs = __to_inc(series)
        mean_inc = (series[-1] - series[0]) / len(incs)
        deviations = incs - mean_inc
        Z = np.cumsum(deviations)
        R = max(Z) - min(Z)
        S = np.std(incs, ddof=1)

    elif kind == "price":
        incs = __to_pct(series)
        mean_inc = np.sum(incs) / len(incs)
        deviations = incs - mean_inc
        Z = np.cumsum(deviations)
        R = max(Z) - min(Z)
        S = np.std(incs, ddof=1)

    elif kind == "change":
        incs = series
        mean_inc = np.sum(incs) / len(incs)
        deviations = incs - mean_inc
        Z = np.cumsum(deviations)
        R = max(Z) - min(Z)
        S = np.std(incs, ddof=1)

    if R == 0 or S == 0:
        return 0  # return 0 to skip this interval due undefined R/S

    return R / S


def __get_simplified_RS(series, kind):
    """
    Simplified version of rescaled range
    Parameters
    ----------
    series : array-like
        (Time-)series
    kind : str
        The kind of series (refer to compute_Hc docstring)
    """

    if kind == "random_walk":
        incs = __to_inc(series)
        R = max(series) - min(series)  # range in absolute values
        S = np.std(incs, ddof=1)
    elif kind == "price":
        pcts = __to_pct(series)
        R = max(series) / min(series) - 1.0  # range in percent
        S = np.std(pcts, ddof=1)
    elif kind == "change":
        incs = series
        _series = np.hstack([[0.0], np.cumsum(incs)])
        R = max(_series) - min(_series)  # range in absolute values
        S = np.std(incs, ddof=1)

    if R == 0 or S == 0:
        return 0  # return 0 to skip this interval due the undefined R/S ratio

    return R / S


def compute_Hc(
    series, kind="random_walk", min_window=10, max_window=None, simplified=True
):
    """
    Compute H (Hurst exponent) and C according to Hurst equation:
    E(R/S) = c * T^H
    Refer to:
    https://en.wikipedia.org/wiki/Hurst_exponent
    https://en.wikipedia.org/wiki/Rescaled_range
    https://en.wikipedia.org/wiki/Random_walk
    Parameters
    ----------
    series : array-like
        (Time-)series
    kind : str
        Kind of series
        possible values are 'random_walk', 'change' and 'price':
        - 'random_walk' means that a series is a random walk with random increments;
        - 'price' means that a series is a random walk with random multipliers;
        - 'change' means that a series consists of random increments
            (thus produced random walk is a cumulative sum of increments);
    min_window : int, default 10
        the minimal window size for R/S calculation
    max_window : int, default is the length of series minus 1
        the maximal window size for R/S calculation
    simplified : bool, default True
        whether to use the simplified or the original version of R/S calculation
    Returns tuple of
        H, c and data
        where H and c — parameters or Hurst equation
        and data is a list of 2 lists: time intervals and R/S-values for correspoding time interval
        for further plotting log(data[0]) on X and log(data[1]) on Y
    """

    if len(series) < 100:
        raise ValueError("Series length must be greater or equal to 100")

    ndarray_likes = [np.ndarray]
    if "pandas.core.series" in sys.modules.keys():
        ndarray_likes.append(pd.core.series.Series)

    # convert series to np array if series is not np array or pandas Series
    if type(series) not in ndarray_likes:
        series = np.array(series)

    if (
        "pandas.core.series" in sys.modules.keys()
        and type(series) == pd.core.series.Series
    ):
        if series.isnull().values.any():
            raise ValueError("Series contains NaNs")
        series = series.values  # convert pandas Series to np array
    elif np.isnan(np.min(series)):
        raise ValueError("Series contains NaNs")

    if simplified:
        RS_func = __get_simplified_RS
    else:
        RS_func = __get_RS

    err = np.geterr()
    np.seterr(all="raise")

    max_window = max_window or len(series) - 1
    window_sizes = list(
        map(
            lambda x: int(10**x),
            np.arange(math.log10(min_window), math.log10(max_window), 0.25),
        )
    )
    window_sizes.append(len(series))

    RS = []
    for w in window_sizes:
        rs = []
        for start in range(0, len(series), w):
            if (start + w) > len(series):
                break
            _ = RS_func(series[start : start + w], kind)
            if _ != 0:
                rs.append(_)
        RS.append(np.mean(rs))

    A = np.vstack([np.log10(window_sizes), np.ones(len(RS))]).T
    H, c = np.linalg.lstsq(A, np.log10(RS), rcond=-1)[0]
    np.seterr(**err)

    c = 10**c
    return H, c  # , [window_sizes, RS]


# H, c, [window_sizes, RS] = compute_Hc(data_win[0,:])


In [10]:
Hurst_coeffs = np.apply_along_axis(compute_Hc, 1, data_win, kind="random_walk")
Hurst_H1 = Hurst_coeffs[:, 0]
Hurst_C1 = Hurst_coeffs[:, 1]
Hurst_coeffs = np.apply_along_axis(compute_Hc, 1, data_win, kind="change")
Hurst_H2 = Hurst_coeffs[:, 0]
Hurst_C2 = Hurst_coeffs[:, 1]


In [11]:
import collections
import numpy as np
import scipy.stats as stat
from scipy.stats import iqr as IQR


class Outlier:
    """
    Find outlier in a numerical dataset with two different methods:
        - `sd_outlier`: z-score based method
        - `IQR_outlier`: IQR based method
    Also allows to remove/filter-out the detected outliers with `filter` method.
    `plot` method allows you to plot the original and filtered dataset and inspect the performance.
    """

    def __init__(self, x=None):
        self.x = x
        self.outliers = None
        self.outliersIndices = np.array([])
        self.x_filt = None

    def sd_outlier(self=None, x=None, axis=None, bar=3, side="both"):
        """
        z-score based method
        This method will test if the numbers falls outside the three standard deviations.
        Based on this rule, if the value is outlier, the method will return true, if not, return false.
        """

        assert side in ["gt", "lt", "both"], "Side should be `gt`, `lt` or `both`."

        if (x is None) and (self.x is not None):
            x = self.x
        elif (x is None) and (self.x is None):
            raise ValueError("Enter x input!")

        d_z = stat.zscore(x, axis=axis)

        if side == "gt":
            self.outliers = d_z > bar
            return d_z > bar
        elif side == "lt":
            self.outliers = d_z < -bar
            return d_z < -bar
        elif side == "both":
            self.outliers = np.abs(d_z) > bar
            return np.abs(d_z) > bar

    def __Q1(self, x, axis=None):
        if (x is None) and (self.x is not None):
            x = self.x
        elif (x is None) and (self.x is None):
            raise ValueError("Enter x input!")

        return np.percentile(x, 25, axis=axis)

    def __Q3(self, x, axis=None):
        if (x is None) and (self.x is not None):
            x = self.x
        elif (x is None) and (self.x is None):
            raise ValueError("Enter x input!")

        return np.percentile(x, 75, axis=axis)

    def IQR_outlier(self, x=None, axis=None, bar=1.5, side="both"):
        """
        IQR based method
        This method will test if the value is less than q1 - 1.5 * iqr or
        greater than q3 + 1.5 * iqr.
        """
        self.method = "IQR_outlier"

        assert side in ["gt", "lt", "both"], "Side should be `gt`, `lt` or `both`."

        if (x is None) and (self.x is not None):
            x = self.x
        elif (x is None) and (self.x is None):
            raise ValueError("Enter x input!")

        d_IQR = IQR(x, axis=axis)
        d_Q1 = self.__Q1(x, axis=axis)
        d_Q3 = self.__Q3(x, axis=axis)
        IQR_distance = np.multiply(d_IQR, bar)

        stat_shape = list(x.shape)

        if isinstance(axis, collections.Iterable):
            for single_axis in axis:
                stat_shape[single_axis] = 1
        else:
            stat_shape[axis] = 1

        if side in ["gt", "both"]:
            upper_range = d_Q3 + IQR_distance
            upper_outlier = np.greater(x - upper_range.reshape(stat_shape), 0)
        if side in ["lt", "both"]:
            lower_range = d_Q1 - IQR_distance
            lower_outlier = np.less(x - lower_range.reshape(stat_shape), 0)

        if side == "gt":
            self.outliers = upper_outlier
            return upper_outlier
        if side == "lt":
            self.outliers = lower_outlier
            return lower_outlier
        if side == "both":
            self.outliers = np.logical_or(upper_outlier, lower_outlier)
            return np.logical_or(upper_outlier, lower_outlier)

    def filter(self, x=None):
        if (x is None) and (self.x is not None):
            x = self.x
        elif (x is None) and (self.x is None):
            raise ValueError("Enter x input!")

        self.outliersIndices = np.where(self.outliers == True)
        print(f"Outliers are detected in {len(self.outliersIndices[0])} points.")
        self.x_filt = np.copy(x)
        self.x_filt[self.outliersIndices] = np.mean(x[~self.outliers])
        return self.x_filt, self.outliersIndices[0]

    def plot(self, plot_original=False):
        # Plot the signal and detected outliers
        plt.figure()

        if plot_original:
            # plt.plot(np.asarray(self.x), "ok", label="Orginal Signal")
            plt.plot(np.asarray(self.x), "-k", linewidth=7, label="Orginal Signal")

        for outlier in self.outliersIndices[0]:
            plt.axvline(outlier, color="red", linestyle="--", alpha=0.5, linewidth=4)

        if plot_original:
            plt.plot(filtered, "-", c="cyan", linewidth=1, label="Filtered Signal")
        else:
            plt.plot(filtered, "-", c="blue", linewidth=1, label="Filtered Signal")

        plt.legend()
        plt.tight_layout()
        plt.show()


In [12]:
# detect and remove outliers from Hurst coefficients
outlier = Outlier(np.asarray(Hurst_H1))
outlier.IQR_outlier(axis=0, bar=1.5, side="both")
filtered, outlierIndices = outlier.filter()
outlier.plot(plot_original=True)
Hurst_H1 = filtered


Outliers are detected in 1 points.


  if isinstance(axis, collections.Iterable):


In [13]:
# detect and remove outliers from Hurst coefficients
outlier = Outlier(np.asarray(Hurst_H2))
outlier.IQR_outlier(axis=0, bar=1.5, side="both")
filtered, outlierIndices = outlier.filter()
outlier.plot(plot_original=True)
Hurst_H2 = filtered


Outliers are detected in 0 points.


In [14]:
# detect and remove outliers from Hurst coefficients
outlier = Outlier(np.asarray(Hurst_C1))
outlier.IQR_outlier(axis=0, bar=1.5, side="both")
filtered, outlierIndices = outlier.filter()
outlier.plot(plot_original=True)
Hurst_C1 = filtered


Outliers are detected in 7 points.


In [15]:
# detect and remove outliers from Hurst coefficients
outlier = Outlier(np.asarray(Hurst_C2))
outlier.IQR_outlier(axis=0, bar=1.5, side="both")
filtered, outlierIndices = outlier.filter()
outlier.plot(plot_original=True)
Hurst_C2 = filtered


Outliers are detected in 15 points.


In [16]:
def calc_mean_and_ctm(X, Y):
    # features = pd.DataFrame(columns=['radius','mean_distance','central_tendency_measure'])
    r = 0.5
    d = [math.sqrt(X[i] * X[i] + Y[i] * Y[i]) for i in range(0, len(X))]
    delta = [1 if i < r else 0 for i in d]
    d = [i for i in d if i < r]

    ctm = np.sum(delta[:-2]) / (len(delta) - 2)
    mean_distance = np.mean(d)

    # features.loc[0] = [r] + [ctm] + [mean_distance]
    return r, ctm, mean_distance


def mean_ctm_wrapper(x):
    """
    A wrapper function for calc_mean_and_ctm().
    This function calculates mean and central tendancy measure for a given time series `x`.

    Parameters
    ----------
    x : :py:class:`np.ndarray`
        Array of time series data. Must be a 1-D array of shape `(dataPoints,)`

    Returns
    -------
    Tuple of `mean_distance` and `central_tendency_measure`

    Example
    -------
        >>> y = np.random.randn(7680)*10 + 100
        >>> md, ctm = mean_ctm_wrapper(y)
        (0.054281767955801107, 0.33950566436214885)
    """
    upper_quartile = np.percentile(x, 80)
    lower_quartile = np.percentile(x, 20)
    IQR = (upper_quartile - lower_quartile) * 1.5
    quartileSet = (lower_quartile - IQR, upper_quartile + IQR)
    x = x[np.where((x >= quartileSet[0]) & (x <= quartileSet[1]))]
    # plotting SODP
    X = np.subtract(x[1:], x[0:-1])  # x(n+1)-x(n)
    Y = np.subtract(x[2:], x[0:-2]).tolist()  # x(n+2)-x(n-1)
    Y.extend([0])
    # calculate MD and CTM
    _, mean_distance, central_tendency_measure = calc_mean_and_ctm(X, Y)
    return mean_distance, central_tendency_measure


In [17]:
# Calculate feature for all epochs. Then add them to FeaturesDataFrame
mean_ctm = np.apply_along_axis(mean_ctm_wrapper, 1, arr=data_win)
df_feat["mean_distance"] = mean_ctm[:, 0]
df_feat["central_tendency_measure"] = mean_ctm[:, 1]


In [18]:
from collections import Counter


class Counter(Counter):
    def prob(self):
        return np.array(list(self.values()))


def symbols_to_prob(symbols):
    """
    Return a dict mapping symbols to  probability.
    input:
    -----
        symbols:     iterable of hashable items
                     works well if symbols is a zip of iterables
    """
    myCounter = Counter(symbols)

    N = float(len(list(symbols)))  # symbols might be a zip object in python 3

    for k in myCounter:
        myCounter[k] /= N

    return myCounter


def entropy(data=None, prob=None, tol=1e-5):
    """
    given a probability distribution (prob) or an interable of symbols (data) compute and
    return its entropy
    inputs:
    ------
        data:       iterable of symbols
        prob:       iterable with probabilities
        tol:        if prob is given, 'entropy' checks that the sum is about 1.
                    It raises an error if abs(sum(prob)-1) >= tol
    """

    if prob is None and data is None:
        raise ValueError(
            "%s.entropy requires either 'prob' or 'data' to be defined" % __name__
        )

    if prob is not None and data is not None:
        raise ValueError(
            "%s.entropy requires only 'prob' or 'data to be given but not both"
            % __name__
        )

    if prob is not None and not isinstance(prob, np.ndarray):
        raise TypeError("'entropy' in '%s' needs 'prob' to be an ndarray" % __name__)

    if prob is not None and abs(prob.sum() - 1) > tol:
        raise ValueError("parameter 'prob' in '%s.entropy' should sum to 1" % __name__)

    if data is not None:
        prob = symbols_to_prob(data).prob()

    # compute the log2 of the probability and change any -inf by 0s
    logProb = np.log2(prob)
    logProb[logProb == -np.inf] = 0

    # return dot product of logProb and prob
    return -float(np.dot(prob, logProb))


def renyi(data=None, a=2):
    if data is not None:
        prob = symbols_to_prob(data).prob()

    # compute the log2 of the probability and change any -inf by 0s
    powerProb = prob ** int(a)
    logProb = np.log(powerProb)
    # return dot product of logProb and prob
    return -(a / (1 - a)) * (np.sum(logProb))


def tsallis(data=None, q=2):
    if data is not None:
        prob = symbols_to_prob(data).prob()

    # compute the log2 of the probability and change any -inf by 0s
    powerProb = prob ** int(q)
    # return dot product of logProb and prob
    return (1 / (q - 1)) * (1 - np.sum(powerProb))


In [19]:
# Calculate feature for all epochs. Then add them to FeaturesDataFrame
data_win_rnd3 = np.around(data_win, decimals=3)
tsallisEnt = np.apply_along_axis(tsallis, 1, arr=data_win_rnd3)
df_feat["tsallisEnt"] = tsallisEnt


In [20]:
data_win_rnd3 = np.around(data_win, decimals=3)
renyiEnt = np.apply_along_axis(renyi, 1, arr=data_win_rnd3)
df_feat["renyi"] = renyiEnt

In [21]:
# Manis and Sassi, “A Python Library with Fast Algorithms for Popular Entropy Definitions.”

from numpy import histogram, log


def bubble_count(x):
    """
    counts the number of swaps when sorting
    :param x: the input vector
    :return: the total number of swaps
    """
    y = 0
    for i in range(len(x) - 1, 0, -1):
        for j in range(i):
            if x[j] > x[j + 1]:
                x[j], x[j + 1] = x[j + 1], x[j]
                y += 1
    return y


def complexity_count_fast(x, m):
    """
    :param x: the input series
    :param m: the dimension of the space
    :return: the series of complexities for total number of swaps
    """

    if len(x) < m:
        return []

    y = [bubble_count(x[:m])]
    v = sorted(x[:m])

    for i in range(m, len(x)):
        steps = y[i - m]
        steps -= v.index(x[i - m])
        v.pop(v.index(x[i - m]))
        v.append(x[i])
        j = m - 1
        while j > 0 and v[j] < v[j - 1]:
            v[j], v[j - 1] = v[j - 1], v[j]
            steps += 1
            j -= 1
        y.append(steps)

    return y


def renyi_int(data):
    """
    returns renyi entropy (order 2) of an integer series and bin_size=1
    (specified for the needs of bubble entropy)
    :param data: the input series
    :return: metric
    """
    counter = [0] * (max(data) + 1)
    for x in data:
        counter[x] += 1
    r = 0
    for c in counter:
        p = c / len(data)
        r += p * p
    return -log(r)


def bubble_entropy(x, m=10):
    """
    computes bubble entropy following the definition
    :param x: the input signal
    :param m: the dimension of the embedding space
    :return: metric
    """
    complexity = complexity_count_fast(x, m)
    B = renyi_int(complexity) / log(1 + m * (m - 1) / 2)

    complexity = complexity_count_fast(x, m + 1)
    A = renyi_int(complexity) / log(1 + (m + 1) * m / 2)

    return A - B


def bubble_entropy_2(x, m=10):
    """
    computes bubble entropy following the definition
    :param x: the input signal
    :param m: the dimension of the embedding space
    :return: metric
    """
    complexity = complexity_count_fast(x, m)
    B = renyi_int(complexity) / log(1 + m * (m - 1) / 2)

    complexity = complexity_count_fast(x, m + 2)
    A = renyi_int(complexity) / log(1 + (m + 2) * (m + 1) / 2)

    return A - B


In [22]:
# This cell took ~40 seconds from ~8 hours of sleep data

# Calculate feature for all epochs. Then add them to FeaturesDataFrame
data_win_rnd3 = np.around(data_win, decimals=3)
bubbleEnt1 = np.apply_along_axis(bubble_entropy, 1, arr=data_win_rnd3)
df_feat["bubbleEnt1"] = bubbleEnt1

# Calculate feature for all epochs. Then add them to FeaturesDataFrame
data_win_rnd3 = np.around(data_win, decimals=3)
bubbleEnt2 = np.apply_along_axis(bubble_entropy_2, 1, arr=data_win_rnd3)
df_feat["bubbleEnt2"] = bubbleEnt2


In [23]:
from scipy.stats import differential_entropy

# Calculate feature for all epochs. Then add them to FeaturesDataFrame
data_win_rnd3 = np.around(data_win, decimals=3)
diffEnt = np.apply_along_axis(differential_entropy, 1, arr=data_win_rnd3)
diffEntMean = np.mean(diffEnt[~(diffEnt == -np.inf)])
diffEnt[diffEnt == -np.inf] = diffEntMean
df_feat["diffEnt"] = diffEnt

  logs = np.log(n/(2*m) * differences)


In [24]:
fig = plt.figure(figsize=(10, 5))
plt.plot(
    diffEnt - np.mean(diffEnt),
    label="Differential Entropy",
    color="darkgreen",
    linewidth=1.5,
)
plt.title("Differential Entropy (Normalized)")
plt.xlabel("Epoch")
plt.ylabel("Entropy")
plt.legend()
plt.tight_layout()
plt.show()

In [25]:
# This cell took ~11 min to execute

# pip install EntropyHub
import EntropyHub as enth


def fuzzEnt_f(x, m=1, tau=1):
    """
    A wrapper function for EntropyHub.FuzzEn() Function

    Input
    ------
    `sig`: Time series signal, a vector of length > 10.
    `m`: Embedding dimension, a positive integer (for embbeding dim).
    `tau`: Time delay, a positive integer (for embbeding dim).
    `Fx`: Type of fuzzy function for distance transformation, one of the following strings
    Return
    ------
    `Fuzz`: Fuzzy entropy estimates for each embedding dimension 1:m.
    `Ps1`: The average fuzzy distances for embedding dimensions 1:m.
    `Ps2`: The average fuzzy distances for embedding dimensions 2:m+1.
    Example
    -------
    >>> [Fuzz, Ps1, Ps2] = enth.FuzzEn(x, m=1, tau=1)
    Source
    ------
    https://github.com/MattWillFlood/EntropyHub/blob/main/Guide/EntropyHub%20Guide.pdf
    """

    [Fuzz, Ps1, Ps2] = enth.FuzzEn(x, m=m, tau=tau)
    return Fuzz[0]


# Calculate feature for all epochs. Then add them to FeaturesDataFrame
data_win_rnd3 = np.around(data_win, decimals=3)
fuzzEnt = np.apply_along_axis(fuzzEnt_f, 1, arr=data_win_rnd3)
df_feat["fuzzEnt"] = fuzzEnt


In [26]:
# Write feature object to a comma-separated values (csv) file
df_feat.to_csv(f"feature/{fname} {lr}.csv", index=False)