# Notebook to look for standout features that show statistical relevance

In [1]:
import sys
from importlib import reload
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, normalize
from scipy.spatial.distance import pdist, squareform
from skbio.stats.distance import permanova, DistanceMatrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from skbio.stats.distance import permdisp
from pyclustertend.hopkins import hopkins
from sklearn.cluster import KMeans
from scipy.spatial import ConvexHull
import matplotlib.patches as mpatches
from sklearn.metrics import silhouette_score
from scipy.stats import spearmanr


# Make the sibling project directories importable. The membership check keeps
# re-runs of this cell from appending duplicate entries to sys.path.
for _module_dir in (
    "../file_loader",
    "../preprocessing-block",
    "../features_calculation",
    "../statistical_tests",
):
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)


import ecg_file_loader
import rpeaks_finder
import ectopic_beats_filtering
import time_features
import frequency_features
import nonlinear_features


# Reload BEFORE copying names out of the modules. importlib.reload() re-executes
# a module but does not rebind names that were previously pulled out of it with
# `from ... import ...`; importing first and reloading afterwards would leave
# the notebook namespace one edit behind until the cell is run a second time.
for _module in (
    ecg_file_loader,
    rpeaks_finder,
    ectopic_beats_filtering,
    time_features,
    frequency_features,
    nonlinear_features,
):
    reload(_module)


# Bind the (freshly reloaded) public names into the notebook namespace.
# NOTE(review): the star imports are kept because later cells may rely on the
# names they expose; replacing them with explicit imports would be clearer.
from ecg_file_loader import ECGFileLoader
from rpeaks_finder import *
from ectopic_beats_filtering import *
from time_features import *
from frequency_features import *
from nonlinear_features import *

  import pkg_resources


<module 'nonlinear_features' from '/Users/ashleyandrea/Documents/StartUps/seizury/seizure-block-codes/statistical_tests/../features_calculation/nonlinear_features.py'>

In [2]:
# Load the feature table from giorgio.csv; the relative path is resolved
# against the notebook's working directory.
giorgio = pd.read_csv("giorgio.csv")

In [3]:
# Preview the first rows of the loaded table as a quick sanity check.
giorgio.head()

Unnamed: 0,minute,tm_nni_counter,tm_nni_mean,tm_nni_min,tm_nni_max,tm_hr_mean,tm_hr_min,tm_hr_max,tm_hr_std,tm_nni_diff_mean,...,lf_hf_ratio,fft_rel_lf,fft_rel_hf,total_power_y,vlf_abs,vlf_rel,vlf_peak,group,win_start_time_s,win_end_time_s
0,111,438.0,683.340183,320.0,1054.0,88.634778,56.925996,187.5,10.199581,24.514874,...,1.635241,62.052802,37.947198,1596.40646,911.191707,57.077676,0.003906,non_seizure-1,6359.582031,6659.777344
1,112,434.0,689.725806,351.0,894.0,87.589609,67.114094,170.940171,8.404609,18.662818,...,1.144514,53.36938,46.63062,1439.520022,724.954968,50.360881,0.003906,non_seizure-1,6420.0,6720.234375
2,113,432.0,692.06713,351.0,894.0,87.300412,67.114094,170.940171,8.452257,18.069606,...,1.493377,59.893757,40.106243,1596.211305,793.762379,49.727901,0.003906,non_seizure-1,6479.890625,6779.761719
3,114,430.0,695.081395,351.0,894.0,86.885457,67.114094,170.940171,8.262244,18.846154,...,1.871609,65.17632,34.82368,1679.845857,809.946775,48.215541,0.007812,non_seizure-1,6540.429688,6840.183594
4,115,432.0,692.736111,375.0,796.0,86.960196,75.376884,160.0,6.099031,13.937355,...,2.421761,70.775282,29.224718,1617.419859,825.615091,51.045193,0.007812,non_seizure-1,6599.679688,6899.949219


In [5]:
# Summarise column names, non-null counts and dtypes to spot missing data
# (in the saved output, tm_sdnn_index and tm_sdann have 0 non-null values).
giorgio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 49 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   minute            273 non-null    int64  
 1   tm_nni_counter    273 non-null    float64
 2   tm_nni_mean       273 non-null    float64
 3   tm_nni_min        273 non-null    float64
 4   tm_nni_max        273 non-null    float64
 5   tm_hr_mean        273 non-null    float64
 6   tm_hr_min         273 non-null    float64
 7   tm_hr_max         273 non-null    float64
 8   tm_hr_std         273 non-null    float64
 9   tm_nni_diff_mean  273 non-null    float64
 10  tm_nni_diff_min   273 non-null    float64
 11  tm_nni_diff_max   273 non-null    float64
 12  tm_sdnn           273 non-null    float64
 13  tm_sdnn_index     0 non-null      float64
 14  tm_sdann          0 non-null      float64
 15  tm_rmssd          273 non-null    float64
 16  tm_sdsd           273 non-null    float64
 1

In [6]:
# Give the two generically suffixed total-power columns descriptive names.
# NOTE(review): the `_x`/`_y` suffixes look like leftovers from a pandas merge
# of the LF/HF and VLF feature tables — confirm against the export code.
#
# The result is rebound instead of using `inplace=True`, so the cell is a plain
# "new frame from old frame" step. Labels that are not present are ignored by
# `rename` by default, so re-running the cell is a harmless no-op.
giorgio = giorgio.rename(
    columns={
        "total_power_x": "total_power_lf_hf",
        "total_power_y": "total_power_vlf",
    }
)

## First of all, what do these columns tell us?

### Time domain features

pyHRV — `time_domain()` Parameter Descriptions

| Key (in our df)     | Description                                                  |
|---------------------|---------------------------------------------------------------|
| tm_nni_counter      | Number of NN intervals (i.e., count of NNI values)           |
| tm_nni_mean         | Mean value of the NN interval series [ms]                    |
| tm_nni_min          | Minimum NN interval [ms]                                     |
| tm_nni_max          | Maximum NN interval [ms]                                     |
| tm_hr_mean          | Mean heart rate derived from NN intervals [bpm]              |
| tm_hr_min           | Minimum heart rate [bpm]                                     |
| tm_hr_max           | Maximum heart rate [bpm]                                     |
| tm_hr_std           | Standard deviation of the heart rate series [bpm]            |
| tm_nni_diff_mean    | Mean of absolute successive NN interval differences [ms]     |
| tm_nni_diff_min     | Minimum of absolute successive NN interval differences [ms]  |
| tm_nni_diff_max     | Maximum of absolute successive NN interval differences [ms]  |
| tm_sdnn             | Standard deviation of NN interval series (SDNN) [ms]         |
| tm_sdnn_index       | Mean of segment-wise SDNN values (~5-minute windows) [ms]    |
| tm_sdann            | Standard deviation of segment means of NN intervals [ms]     |
| tm_rmssd            | Root mean square of successive NN-interval differences [ms]  |
| tm_sdsd             | Standard deviation of successive NN-interval differences [ms]|
| tm_nn50             | Count of successive NN interval differences greater than 50 ms|
| tm_pnn50            | Percentage of successive NN interval differences greater than 50 ms [%]|
| tm_nn20             | Count of successive NN interval differences greater than 20 ms|
| tm_pnn20            | Percentage of successive NN interval differences greater than 20 ms [%]|
| tm_tinn_n           | Left corner (N) value of the interpolated triangle in NNI histogram [ms]|
| tm_tinn_m           | Right corner (M) value of the interpolated triangle in NNI histogram [ms]|
| tm_tinn             | Baseline width (M−N) of the interpolated triangle in histogram [ms]|
| tm_tri_index        | Triangular index from NN-interval histogram (geometric HRV measure)|


### Frequency features 

pyHRV — Frequency Domain Feature Descriptions

| Key (in our df) | Description                                                                 |
|-----------------|-----------------------------------------------------------------------------|
| lf_abs          | Absolute power of the low frequency (LF) band [ms²]                          |
| lf_rel          | Relative power of the LF band [%] (relative to all defined bands)            |
| fft_rel_lf      | Relative power of the LF band [%] (relative to LF + HF only)                 |
| lf_peak         | Peak frequency of the LF band [Hz]                                          |
| lf_log          | Logarithmic power of the LF band [log(ms²)]                                 |
| lf_norm         | Normalised power of the LF band (unitless) [-]                              |
| hf_abs          | Absolute power of the high frequency (HF) band [ms²]                         |
| hf_rel          | Relative power of the HF band [%] (relative to all defined bands)            |
| fft_rel_hf      | Relative power of the HF band [%] (relative to LF + HF only)                 |
| hf_peak         | Peak frequency of the HF band [Hz]                                          |
| hf_log          | Logarithmic power of the HF band [log(ms²)]                                 |
| hf_norm         | Normalised power of the HF band [-]                                          |
| lf_hf_ratio     | Ratio of LF power to HF power (LF/HF) [-]                                   |
| vlf_abs         | Absolute power of the very low frequency (VLF) band [ms²]                    |
| vlf_rel         | Relative power of the VLF band [%] (relative to all defined bands)           |
| vlf_peak        | Peak frequency of the VLF band [Hz]                                         |
| vlf_log         | Logarithmic power of the VLF band [log(ms²)]                                |


### Nonlinear features

pyHRV — Nonlinear Features Descriptions

| Key (in our df)  | Description                                                        |
|------------------|---------------------------------------------------------------------|
| nl_sd1           | Standard deviation of the short-term variability (perpendicular axis of the Poincaré plot) |
| nl_sd2           | Standard deviation of the long-term variability (along the major axis of the Poincaré plot) |
| nl_sd_ratio      | Ratio of SD2 to SD1 — quantifies the relative balance of long- vs short-term variability |
| nl_ellipse_area  | Area of the fitted ellipse in the Poincaré plot (π × SD1 × SD2)    |
| nl_sampen        | Sample entropy of the NN interval series — measure of signal irregularity/unpredictability |
| nl_dfa_alpha1    | Detrended fluctuation analysis exponent α₁ — short-term correlation scaling exponent in the NN interval series |
| nl_dfa_alpha2    | Detrended fluctuation analysis exponent α₂ — long-term correlation scaling exponent in the NN interval series |