In [20]:
import sys
import os
# Register the grandparent and then the parent of the working directory on
# sys.path (skipping entries already present) -- presumably so the project
# modules imported below can be resolved when the notebook runs from its folder.
for relative_parent in (os.path.join(os.pardir, os.pardir), os.path.join('..')):
    module_path = os.path.abspath(relative_parent)
    if module_path not in sys.path:
        sys.path.append(module_path)
import pandas as pd
#import seaborn as sns
import platform
from sleep_stage_config import Config
from utilities.utils import *
from sklearn.preprocessing import Normalizer
#sns.set(style='whitegrid', rc={'axes.facecolor': '#EFF2F7'})
import hrvanalysis as hrvana
from datetime import datetime
from pathlib import Path
from tqdm import tqdm

## This tutorial explains the feature extraction pipeline for activity counts and heart rate variability in detail
This tutorial focuses only on the feature extraction pipeline for each modality. It does not align the actigraphy data and the RR-interval data by sleep epoch. You can find detailed information about aligning activity counts and RR intervals in `align_actigraphy_rri.py`.

In [38]:
# Input/output locations and options used by the cells below -- edit these to match your machine.
acc_path = r"\Dataset\MESA\actigraphy"   # directory holding the actigraphy files downloaded from MESA
acc_feature_output_path = r"\tmp\sleep\act_features" # directory where the per-subject actigraphy feature CSVs are written
hr_path = r"\Dataset\MESA\annotations-rpoints" # directory holding the R-point files downloaded from MESA
admin_file_path = r"\Dataset\MESA\mesa-sleep-dataset-0.3.0.csv" # CSV file with the admin data downloaded from MESA (its 'mesaid' column is read below)
hrv_feature_output_path = r"\tmp\sleep\hrv_features" # directory where the per-subject HRV feature CSVs are written
standarize_feature = True  # when True, Part 2 calls standardize_df_given_feature on the actigraphy features before saving

In [22]:
# Read the admin table and collect the distinct subject ids from its 'mesaid' column;
# both feature-extraction loops iterate over this array.
admin_df = pd.read_csv(admin_file_path)
total_subjects_list = pd.unique(admin_df['mesaid'])

In [137]:
# os.listdir returns entries in arbitrary order; sort the listings so that anything
# read positionally from them (e.g. all_hr_files[0]) is the same on every run and platform.
all_acc_files = sorted(os.listdir(acc_path))
all_hr_files = sorted(os.listdir(hr_path))

In [138]:
# Sanity check: how many entries were found in the R-point directory.
len(all_hr_files)

1966

### Part 1: The feature extraction pipeline for heart rate variability in detail
Note: 
In our paper, we used a single window length of 30 s to extract the RR-interval features. This choice diminishes the physiological meaning of these features but yields a better classification outcome.


In [39]:
# HRV window length, counted in sleep epochs (one epoch = 30 s):
#   0      -> features are extracted from each sleep epoch on its own;
#   non-0  -> features are extracted from the epochs surrounding the current one
#             (e.g. 10 for a 5-minute window).
hrv_win = 0

In [140]:
# Peek at one entry to see the R-point file-naming pattern that the subject id is matched against.
all_hr_files[0]

'mesa-sleep-0001-rpoint.csv'

In [None]:
# Extract per-epoch HRV features for every subject in the admin table that also has
# an R-point file, and write one CSV per subject to hrv_feature_output_path.

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(hrv_feature_output_path, exist_ok=True)

# Each hrvanalysis feature family is attempted on its own so that a failure in one
# family does not discard the others. The second item is the message printed when
# that family raises; "{}" receives the exception text.
hrv_feature_extractors = (
    (hrvana.get_time_domain_features, "processed time domain features with error message: {}"),
    (hrvana.get_frequency_domain_features, "processed frequency domain features with error message: {}"),
    (hrvana.get_poincare_plot_features, "processed poincare features: {} with error message"),
    (hrvana.get_csi_cvi_features, "processed csi cvi domain features: {} with error message"),
    (hrvana.get_geometrical_features, "processed geometrical features: {} with error message"),
)

for PID in tqdm(total_subjects_list):
    mesa_id = "%04d" % PID
    # R-point files whose name contains the zero-padded subject id
    hr_inlist_idx = [s for s in all_hr_files if mesa_id in s]
    if len(hr_inlist_idx) == 0:
        continue  # the subject has no R-point file
    if len(hr_inlist_idx) > 1:
        # More than one file name contains the id; use the first instead of failing.
        print("Subject %s matches several R-point files %s, using %s"
              % (mesa_id, hr_inlist_idx, hr_inlist_idx[0]))
    hr_file_name = hr_inlist_idx[0]
    hr_df = pd.read_csv(os.path.join(hr_path, hr_file_name))

    # Keep rows with a positive TPoint and renumber from 0, so the first remaining
    # row is always label 0 (row 0 of the raw file may itself be filtered out).
    hr_df = hr_df[hr_df['TPoint'] > 0].reset_index(drop=True)
    if hr_df.empty:
        print("Subject %s has no rows with TPoint > 0, skipped" % mesa_id)
        continue

    # RR interval = difference between consecutive 'seconds' values, scaled by 1000.
    hr_df['RR Intervals'] = hr_df['seconds'].diff() * 1000
    # diff() leaves the first row as NaN; use the first beat's own timestamp there
    # so the exported series does not start with a NaN.
    hr_df.loc[0, 'RR Intervals'] = hr_df.loc[0, 'seconds'] * 1000

    # Clean the series with hrvanalysis: out-of-range intervals (bounds 300 and 2000)
    # and ectopic beats (Malik rule) are removed, and the gaps are interpolated.
    clean_rri = hr_df['RR Intervals'].values
    clean_rri = hrvana.remove_outliers(rr_intervals=clean_rri, low_rri=300, high_rri=2000)
    clean_rri = hrvana.interpolate_nan_values(rr_intervals=clean_rri, interpolation_method="linear")
    clean_rri = hrvana.remove_ectopic_beats(rr_intervals=clean_rri, method="malik")
    clean_rri = hrvana.interpolate_nan_values(rr_intervals=clean_rri)
    hr_df["RR Intervals"] = clean_rri
    # calculate the Heart Rate
    hr_df['HR'] = np.round((60000.0 / hr_df['RR Intervals']), 0)

    # Drop sleep epochs that contain fewer than 3 beats. Counting on the Series
    # directly avoids depending on the column names produced by
    # value_counts().reset_index(), which differ between pandas versions.
    # NOTE(review): the previous comment asked for "at least 4 valid beats" while the
    # threshold in code is 3 -- confirm which one is intended.
    beats_per_epoch = hr_df['epoch'].value_counts()
    invalid_idx = set(beats_per_epoch[beats_per_epoch < 3].index)
    hr_df = hr_df[~hr_df['epoch'].isin(list(invalid_idx))]

    feature_list = []
    # go through all sleep epochs and extract HRV features from them
    for hr_epoch_idx in hr_df['epoch'].unique():
        epoch_rows = hr_df[hr_df['epoch'] == hr_epoch_idx]
        # in MESA r-points files, the stage is annotated by the sleep experts
        gt_label = epoch_rows["stage"].values[0]
        if hrv_win != 0:
            # Multi-epoch window, e.g. hrv_win = 10 for 5 minutes of 30 s epochs.
            # np.arange is half-open, so the window spans epochs
            # [hr_epoch_idx - offset, hr_epoch_idx + offset).
            offset = int(np.floor(hrv_win / 2))
            tmp_hr_df = hr_df[hr_df['epoch'].isin(np.arange(hr_epoch_idx - offset, hr_epoch_idx + offset))]
        else:
            tmp_hr_df = epoch_rows

        # calculate the HRV features for each epoch
        rr_epoch = tmp_hr_df['RR Intervals'].values
        all_hr_features = {}
        for extract_features, error_template in hrv_feature_extractors:
            try:
                all_hr_features.update(extract_features(rr_epoch))
            except Exception as ee:
                print(error_template.format(str(ee)))

        all_hr_features.update({'stages': gt_label,
                                'mesaid': str(mesa_id),
                                'epoch': hr_epoch_idx})
        feature_list.append(all_hr_features)

    hrv_feature_df = pd.DataFrame(feature_list)
    hrv_feature_df.to_csv(os.path.join(hrv_feature_output_path, (mesa_id + '_hrv_features.csv')), index=False)


### Part 2: The feature extraction pipeline for activity counts in detail
Note: 

For the actigraphy-based sliding-window method, the `get_statistic_feature` function calculates statistical features using two window-centring methods: the centred window and the backwards-looking window. The dataframe passed in as the function's argument is expanded to include the calculated features. In this tutorial, we show an example that uses 20 sleep epochs as the window length. For MESA actigraphy, the 370 calculated features are appended after the column *daybynoon*; the calculated actigraphy features can be found in `acc_feature_output_path`.


In [None]:
# Build the sliding-window activity-count features for every subject in the admin
# table that also has an actigraphy file, and write one CSV per subject to
# acc_feature_output_path.

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(acc_feature_output_path, exist_ok=True)

for PID in tqdm(total_subjects_list):
    mesa_id = "%04d" % PID
    print('*' * 100)
    print("Processing subject %s dataset" % mesa_id)
    # actigraphy files whose name contains the zero-padded subject id
    acc_inlist_idx = [s for s in all_acc_files if mesa_id in s]
    feature_list = []
    if len(acc_inlist_idx) == 0:
        continue  # the subject has no actigraphy file
    if len(acc_inlist_idx) > 1:
        # More than one file name contains the id; use the first instead of failing.
        print("Subject %s matches several actigraphy files %s, using %s"
              % (mesa_id, acc_inlist_idx, acc_inlist_idx[0]))
    acc_file_name = acc_inlist_idx[0]

    # load the actigraphy recording into pandas
    acc_df = pd.read_csv(os.path.join(acc_path, acc_file_name))
    # Filter out excluded intervals, as recommended on the MESA website. The copy
    # makes the column assignments below act on an independent frame, not a slice.
    acc_df = acc_df[acc_df['interval'] != 'EXCLUDED'].copy()

    # Seconds elapsed since midnight for each 'linetime' value: subtracting each
    # timestamp's own midnight keeps the result independent of the date pandas
    # attaches when parsing a time-only string.
    linetime = pd.to_datetime(acc_df['linetime'])
    acc_df['seconds'] = (linetime - linetime.dt.normalize()).dt.seconds

    # Window statistics over 'activity' with a 20-epoch window. The returned names are
    # kept in featnames; the call is relied on to add the feature columns to acc_df
    # (see the note above this cell) -- confirm against utilities.utils.
    featnames = get_statistic_feature(acc_df, column_name="activity", windows_size=20)

    # np.array of an empty row list is 1-D, so fewer than two dimensions here means
    # that no rows are left for this subject.
    list_size_chk = np.array(acc_df[['marker', 'activity']].values.tolist())
    if len(list_size_chk.shape) < 2:
        print(
            "File {f_name} doesn't meet dimension requirement, it's size is {wrong_dim}".format(
                f_name=acc_file_name, wrong_dim=list_size_chk.shape)
        )
        continue

    # Fill missing values with the column median, which is resistant to outliers.
    # numeric_only=True is required on pandas >= 2.0, where median() no longer
    # silently skips text columns such as 'interval' and 'linetime'.
    acc_df = acc_df.fillna(acc_df.median(numeric_only=True))

    # Standardise every column except identifiers, labels and the raw signal.
    feature_list = acc_df.columns.to_list()
    std_feature = [x for x in feature_list if x not in ['two_stages', 'seconds', 'activity', 'interval', 'wake', 'linetime', 'mesaid', 'stages', 'line']]
    if standarize_feature:
        # NOTE(review): the return value is discarded, so this presumably modifies
        # acc_df in place -- confirm in utilities.utils.
        standardize_df_given_feature(acc_df, std_feature, df_name='acc_df', simple_method=False)
    acc_df.to_csv(os.path.join(acc_feature_output_path, (mesa_id + '_act_features.csv')), index=False)