In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyedflib

import os 
from tqdm import tqdm

from  scipy import signal
from scipy.signal import lfilter
from neurokit2 import ecg_clean, ecg_quality

import xml.etree.ElementTree as ET

In [2]:
def extract_sleep_stages(anno_path, epoch_seconds=10):
    """Expand the scored sleep-stage events of an annotation XML into per-epoch labels.

    The file is searched for a top-level ``ScoredEvents`` element. Every child
    event whose ``EventType`` text is ``"Stages|Stages"`` contributes its stage
    label (the part of ``EventConcept`` after the last ``"|"``) once per
    ``epoch_seconds`` of its ``Duration``.

    Parameters
    ----------
    anno_path : str
        Path to the annotation XML file.
    epoch_seconds : int, optional
        Length of one output epoch in seconds. Defaults to 10.

    Returns
    -------
    list of str
        One stage label per epoch, in document order. Empty when the file has
        no ``ScoredEvents`` element or no stage events.

    Raises
    ------
    ValueError
        If ``epoch_seconds`` is not positive, or a stage event lacks an
        ``EventConcept`` or ``Duration``.
    """
    if epoch_seconds <= 0:
        raise ValueError("epoch_seconds must be a positive integer")

    root = ET.parse(anno_path).getroot()

    # Use None (not a string sentinel) so a missing section is detectable;
    # as before, the last matching child wins if there are several.
    scored_events = None
    for child in root:
        if child.tag == "ScoredEvents":
            scored_events = child

    if scored_events is None:
        return []

    sleep_stages = []

    for event in scored_events:
        # Reset per event so a label or duration can never leak over from the
        # previous event.
        is_stage_event = False
        stage_label = None
        duration_text = None

        for child in event:
            if child.tag == "EventType":
                if child.text == "Stages|Stages":
                    is_stage_event = True
            elif child.tag == "EventConcept":
                if child.text is not None:
                    stage_label = child.text.split("|")[-1]
            elif child.tag == "Duration":
                duration_text = child.text

        if not is_stage_event:
            continue

        if stage_label is None or duration_text is None:
            # Skipping silently would shift every later label by one or more
            # epochs, so fail loudly instead.
            raise ValueError(
                "Stage event without EventConcept/Duration in " + str(anno_path)
            )

        # Durations are written as floats (e.g. "30.0"); only whole seconds count.
        duration = int(float(duration_text))

        # Same count as iterating range(0, duration, epoch_seconds): a partial
        # trailing epoch still yields one label.
        n_epochs = len(range(0, duration, epoch_seconds))
        sleep_stages.extend([stage_label] * n_epochs)

    return sleep_stages

In [3]:
def write_signal(signal, path):
    """Write a sequence of samples to a text file, one value per line.

    Parameters
    ----------
    signal : iterable
        Values to write; each is converted with ``str()``. Note that inside
        this function the parameter name hides the ``scipy.signal`` module
        imported at the top of the notebook.
    path : str
        Destination file. It is created or truncated; the parent directory
        must already exist.
    """
    # The context manager closes the handle even if a write fails part-way.
    with open(path, 'w') as f:
        f.writelines(str(value) + "\n" for value in signal)

# SHHS2 PSG Recordings

In [9]:
# Build Data/RECORDS.txt: one record ID per line, taken from the EDF file
# names found in the SHHS2 directory.
dir_path = "D:/shhs/polysomnography/edfs/shhs2"

# os.listdir returns entries in arbitrary order; sorting keeps RECORDS.txt
# (and any later records[:k] slice) reproducible. Only ".edf" entries are kept
# because the extraction cells rebuild the name as "shhs2-<id>.edf".
records = sorted(
    name for name in os.listdir(dir_path) if name.lower().endswith(".edf")
)

# open(..., 'w') does not create missing directories.
os.makedirs("Data", exist_ok=True)

# The context manager closes the file even if a write fails.
with open('Data/RECORDS.txt', 'w') as f:
    for rec in records:
        # "shhs2-<id>.edf" -> "<id>"
        rec = rec.split("-")[-1]
        rec = rec.split(".")[0]
        f.write(rec + "\n")

In [4]:
# Load the record IDs (one per line) written to Data/RECORDS.txt.
with open('Data/RECORDS.txt') as f:
    lines = f.readlines()

# strip() rather than line[:-1]: slicing would cut a real character off the
# last ID if the file does not end with a newline. Blank lines are dropped so
# they cannot turn into empty record IDs.
records = [line.strip() for line in lines if line.strip()]

In [5]:
# Cut the ECG channel of each SHHS2 recording into fixed-length strips, keep
# the strips rated 'Excellent' by ecg_quality, write each kept strip to its
# own text file and save an index of the kept strips as CSV.
edf_path = "D:/shhs/polysomnography/edfs/shhs2"
sleep_stages_dir = "D:/shhs/polysomnography/annotations-events-nsrr/shhs2"
output_dir = "Data/SHHS2 Strips/"

l = 10          # strip length in seconds
final_fs = 100  # sampling rate (Hz) of the saved strips
strip_len = l * final_fs  # samples per saved strip

# write_signal opens the file directly and does not create directories.
os.makedirs(output_dir, exist_ok=True)

selected_strips_names, selected_sleep_stages, strips_number, selected_patient = [], [], [], []

for pat in tqdm(records[:5]):
    file_name = "shhs2-" + pat + ".edf"
    file_path = os.path.join(edf_path, file_name)

    annot_file = "shhs2-" + pat + "-nsrr.xml"
    anno_path = os.path.join(sleep_stages_dir, annot_file)

    sleep_stages = extract_sleep_stages(anno_path)

    # Read everything needed inside the context manager so the EDF file is
    # closed again, including when this record is skipped below.
    with pyedflib.EdfReader(file_path) as f:
        signal_labels = np.array(f.getSignalLabels())
        ecg_idx = np.where(signal_labels == "ECG")[0][0]
        ecg_signal = f.readSignal(ecg_idx)
        fs = f.getSampleFrequencies()[ecg_idx]

    # Number of complete strips in the recording at the original rate.
    num_strips = int(ecg_signal.shape[0] / (fs * l))

    # Skip (and report) records whose annotation does not give exactly one
    # label per strip, or that are shorter than one strip.
    if num_strips == 0 or num_strips != len(sleep_stages):
        print(pat)
        continue

    # Signal Cleaning
    # Polarity is inverted before filtering (reason not stated in this notebook).
    ecg_signal = ecg_signal * (-1)
    # Named `sos` rather than `filter` so the builtin is not shadowed for the
    # rest of the kernel session.
    sos = signal.butter(N=5, Wn=0.5, btype='highpass', output='sos', fs=fs)
    ecg_signal = signal.sosfilt(sos, ecg_signal)

    # Signal Resampling
    # Drop the trailing partial strip first: resampling the full recording to
    # num_strips * strip_len samples would otherwise squeeze the time axis and
    # shift strip boundaries away from the annotation epochs.
    ecg_signal = ecg_signal[:int(round(num_strips * l * fs))]
    resampled_ecg = signal.resample(ecg_signal, num=num_strips * strip_len)
    counter = 0

    for i in range(num_strips):
        tmp_signal = resampled_ecg[i * strip_len : (i + 1) * strip_len]

        try:
            # The strip has already been resampled, so its rate is final_fs,
            # not the recording's original fs.
            q = ecg_quality(tmp_signal, rpeaks=None, sampling_rate=final_fs, method='zhao2018')

        except Exception:
            # Best effort: a strip that cannot be rated is dropped. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            continue

        if q == 'Excellent':
            output_file_name = pat + "_" + str(counter) + ".txt"
            output_file_path = os.path.join(output_dir, output_file_name)

            write_signal(tmp_signal, output_file_path)

            selected_strips_names.append(output_file_name)
            selected_sleep_stages.append(sleep_stages[i])
            strips_number.append(i + 1)  # 1-based position of the strip in the recording
            selected_patient.append(pat)

            counter += 1

# Column vectors so the four fields can be concatenated side by side.
selected_strips_names = np.array(selected_strips_names).reshape(-1, 1)
selected_sleep_stages = np.array(selected_sleep_stages).reshape(-1, 1)
strips_number = np.array(strips_number).reshape(-1, 1)
selected_patient = np.array(selected_patient).reshape(-1, 1)

dataset = np.concatenate([selected_strips_names, strips_number, selected_patient, selected_sleep_stages], axis=1)
# NOTE(review): this cell processes SHHS2 but writes "sshs1.csv", while the
# SHHS1 cell writes "sshs2.csv"; the names look swapped and misspelled. Left
# unchanged because other code may read these paths - confirm, then rename both
# together.
pd.DataFrame(dataset, columns=["Strip ID", "Strip Time Step", "Patient ID", "Sleep Stage"]).to_csv("sshs1.csv", index=False)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5/5 [01:41<00:00, 20.29s/it]


In [11]:
# Cut the ECG channel of each SHHS1 recording into fixed-length strips, keep
# the strips rated 'Excellent' by ecg_quality, write each kept strip to its
# own text file and save an index of the kept strips as CSV.
# NOTE(review): `records` was built from the shhs2 directory listing;
# presumably every such ID also has an shhs1 recording - verify.
edf_path = "D:/shhs/polysomnography/edfs/shhs1"
sleep_stages_dir = "D:/shhs/polysomnography/annotations-events-nsrr/shhs1"
output_dir = "Data/SHHS1 Strips/"

l = 10          # strip length in seconds
final_fs = 100  # sampling rate (Hz) of the saved strips
strip_len = l * final_fs  # samples per saved strip

# write_signal opens the file directly and does not create directories.
os.makedirs(output_dir, exist_ok=True)

selected_strips_names, selected_sleep_stages, strips_number, selected_patient = [], [], [], []

for pat in tqdm(records[:5]):
    file_name = "shhs1-" + pat + ".edf"
    file_path = os.path.join(edf_path, file_name)

    annot_file = "shhs1-" + pat + "-nsrr.xml"
    anno_path = os.path.join(sleep_stages_dir, annot_file)

    sleep_stages = extract_sleep_stages(anno_path)

    # Read everything needed inside the context manager so the EDF file is
    # closed again, including when this record is skipped below.
    with pyedflib.EdfReader(file_path) as f:
        signal_labels = np.array(f.getSignalLabels())
        ecg_idx = np.where(signal_labels == "ECG")[0][0]
        ecg_signal = f.readSignal(ecg_idx)
        fs = f.getSampleFrequencies()[ecg_idx]

    # Number of complete strips in the recording at the original rate.
    num_strips = int(ecg_signal.shape[0] / (fs * l))

    # Skip (and report) records whose annotation does not give exactly one
    # label per strip, or that are shorter than one strip.
    if num_strips == 0 or num_strips != len(sleep_stages):
        print(pat)
        continue

    # Signal Cleaning
    # Polarity is inverted before filtering (reason not stated in this notebook).
    ecg_signal = ecg_signal * (-1)
    # Named `sos` rather than `filter` so the builtin is not shadowed for the
    # rest of the kernel session.
    sos = signal.butter(N=5, Wn=0.5, btype='highpass', output='sos', fs=fs)
    ecg_signal = signal.sosfilt(sos, ecg_signal)

    # Signal Resampling
    # Drop the trailing partial strip first: resampling the full recording to
    # num_strips * strip_len samples would otherwise squeeze the time axis and
    # shift strip boundaries away from the annotation epochs.
    ecg_signal = ecg_signal[:int(round(num_strips * l * fs))]
    resampled_ecg = signal.resample(ecg_signal, num=num_strips * strip_len)
    counter = 0

    for i in range(num_strips):
        tmp_signal = resampled_ecg[i * strip_len : (i + 1) * strip_len]

        try:
            # The strip has already been resampled, so its rate is final_fs,
            # not the recording's original fs.
            q = ecg_quality(tmp_signal, rpeaks=None, sampling_rate=final_fs, method='zhao2018')

        except Exception:
            # Best effort: a strip that cannot be rated is dropped. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            continue

        if q == 'Excellent':
            output_file_name = pat + "_" + str(counter) + ".txt"
            output_file_path = os.path.join(output_dir, output_file_name)

            write_signal(tmp_signal, output_file_path)

            selected_strips_names.append(output_file_name)
            selected_sleep_stages.append(sleep_stages[i])
            strips_number.append(i + 1)  # 1-based position of the strip in the recording
            selected_patient.append(pat)

            counter += 1


# Column vectors so the four fields can be concatenated side by side.
selected_strips_names = np.array(selected_strips_names).reshape(-1, 1)
selected_sleep_stages = np.array(selected_sleep_stages).reshape(-1, 1)
strips_number = np.array(strips_number).reshape(-1, 1)
selected_patient = np.array(selected_patient).reshape(-1, 1)

dataset = np.concatenate([selected_strips_names, strips_number, selected_patient, selected_sleep_stages], axis=1)
# NOTE(review): this cell processes SHHS1 but writes "sshs2.csv", while the
# SHHS2 cell writes "sshs1.csv"; the names look swapped and misspelled. Left
# unchanged because other code may read these paths - confirm, then rename both
# together.
pd.DataFrame(dataset, columns=["Strip ID", "Strip Time Step", "Patient ID", "Sleep Stage"]).to_csv("sshs2.csv", index=False)

100%|██████████| 5/5 [02:22<00:00, 28.53s/it]
