In [7]:
!pip install mne --quiet

In [1]:
#download
import os

os.makedirs("sleep-edf-database/sleep-cassette", exist_ok=True)
os.makedirs("sleep-edf-database/sleep-telemetry", exist_ok=True)

!wget -r -N -c -np https://physionet.org/files/sleep-edfx/1.0.0/sleep-cassette/ \
    -P sleep-edf-database/ --no-check-certificate --cut-dirs=3 -nH -nd -A '*PSG.edf,*Hypnogram.edf'
!wget -r -N -c -np https://physionet.org/files/sleep-edfx/1.0.0/sleep-telemetry/ \
    -P sleep-edf-database/ --no-check-certificate --cut-dirs=3 -nH -nd -A '*PSG.edf,*Hypnogram.edf'
!mkdir -p sleep-edf-database/sleep-cassette sleep-edf-database/sleep-telemetry
!mv sleep-edf-database/*SC* sleep-edf-database/sleep-cassette/
!mv sleep-edf-database/*ST* sleep-edf-database/sleep-telemetry/

--2025-06-28 11:19:10--  https://physionet.org/files/sleep-edfx/1.0.0/sleep-cassette/
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘sleep-edf-database/index.html.tmp’

index.html.tmp          [ <=>                ]  37.31K  --.-KB/s    in 0.03s   

Last-modified header missing -- time-stamps turned off.
2025-06-28 11:19:10 (1.18 MB/s) - ‘sleep-edf-database/index.html.tmp’ saved [38201]

Loading robots.txt; please ignore errors.
--2025-06-28 11:19:10--  https://physionet.org/robots.txt
Reusing existing connection to physionet.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 22 [text/plain]
Saving to: ‘sleep-edf-database/robots.txt.tmp’


2025-06-28 11:19:10 (21.0 MB/s) - ‘sleep-edf-database/robots.txt.tmp’ saved [22/22]

Removing sleep-edf-database/robots.txt.tmp.
Removing sleep-edf-database/index.ht

In [None]:
#imports
import os
import numpy as np
import mne

In [None]:
# Integer class label for each sleep-stage code. The key is the last
# whitespace-separated token of a hypnogram annotation description; any
# description whose last token is not listed here is skipped by the loader.
stage_map = {
    "W": 0,  # wake
    "1": 1,  # stage 1
    "2": 2,  # stage 2
    "3": 3,  # stages 3 and 4 share a single label
    "4": 3,
    "R": 4,  # REM
}

In [None]:
def load_and_segment(path_psg, path_hyp, subject_id, epoch_sec=30.0):
    """Cut one PSG recording into fixed-length, stage-labelled epochs.

    Parameters
    ----------
    path_psg : str
        Path to the ``*PSG.edf`` recording.
    path_hyp : str
        Path to the matching ``*Hypnogram.edf`` annotation file.
    subject_id : int
        Identifier repeated once per returned epoch.
    epoch_sec : float, optional
        Epoch length in seconds (default 30).

    Returns
    -------
    X_eeg, X_eog, X_emg : np.ndarray
        Arrays of shape ``(n_epochs, epoch_len, n_channels)`` for the channels
        whose label starts with ``EEG``, ``EOG`` and ``EMG`` respectively.
    labels : np.ndarray
        Integer stage label per epoch, taken from ``stage_map``.
    subjects : np.ndarray
        ``subject_id`` repeated ``n_epochs`` times.

    Raises
    ------
    ValueError
        If ``epoch_sec`` yields an empty window or a modality has no channel.

    Notes
    -----
    Every scored epoch is returned, including long wake periods, so the
    arrays for a full night can be large.
    """
    raw = mne.io.read_raw_edf(path_psg, preload=True, stim_channel=None, verbose=False)

    sfreq = raw.info["sfreq"]
    epoch_len = int(round(epoch_sec * sfreq))
    if epoch_len <= 0:
        raise ValueError(f"epoch_sec={epoch_sec!r} gives an empty window at {sfreq} Hz")

    def _signals(prefix):
        # Select by label prefix: without type inference the EDF reader types
        # every channel as EEG, so picking by channel type cannot tell the
        # modalities apart.
        names = [ch for ch in raw.ch_names if ch.upper().startswith(prefix)]
        if not names:
            raise ValueError(f"no {prefix} channel found in {path_psg}")
        return raw.get_data(picks=names)

    sig_eeg = _signals("EEG")
    sig_eog = _signals("EOG")
    sig_emg = _signals("EMG")
    n_samples = sig_eeg.shape[1]

    ann = mne.read_annotations(path_hyp)

    Xe, Xo, Xm, labs = [], [], [], []
    for onset, duration, desc in zip(ann.onset, ann.duration, ann.description):
        tokens = desc.split()
        # Unknown codes (movement, unscored) and empty descriptions are skipped.
        if not tokens or tokens[-1] not in stage_map:
            continue
        label = stage_map[tokens[-1]]

        # round() instead of int(): float onsets such as 29.999999 must not
        # land one sample early.
        first = int(round(onset * sfreq))
        # One annotation covers a whole bout of a stage, so split it into
        # consecutive windows; zero-length annotations still give one window.
        n_windows = max(1, int(round(duration * sfreq)) // epoch_len)

        for k in range(n_windows):
            start = first + k * epoch_len
            stop = start + epoch_len
            if start < 0:
                continue
            if stop > n_samples:
                break  # the hypnogram runs past the end of the recording

            # Transpose so each epoch is (epoch_len, n_channels).
            Xe.append(sig_eeg[:, start:stop].T)
            Xo.append(sig_eog[:, start:stop].T)
            Xm.append(sig_emg[:, start:stop].T)
            labs.append(label)

    def _stack(segments, sig):
        # Keep a well-formed 3-D shape even when no epoch was extracted, so
        # callers can concatenate results from several recordings.
        if segments:
            return np.stack(segments)
        return np.empty((0, epoch_len, sig.shape[0]), dtype=sig.dtype)

    return (
        _stack(Xe, sig_eeg),
        _stack(Xo, sig_eog),
        _stack(Xm, sig_emg),
        np.array(labs, dtype=int),
        np.full(len(labs), subject_id),
    )


In [None]:
data_dir = "sleep-edf-database"
# The two subsets are separate studies whose two-digit subject numbers overlap,
# so telemetry ids are shifted to keep every subject id unique across subsets.
subject_offset = {"sleep-cassette": 0, "sleep-telemetry": 100}
all_Xe, all_Xo, all_Xm, all_y, all_subj = [], [], [], [], []

for subset in ["sleep-cassette", "sleep-telemetry"]:
    subdir = os.path.join(data_dir, subset)
    # List the folder once; sorting makes the epoch order reproducible.
    fnames = sorted(os.listdir(subdir))
    hyp_names = [f for f in fnames if f.endswith("Hypnogram.edf")]

    for fname in fnames:
        if not fname.endswith("PSG.edf"):
            continue

        prefix = fname.replace("-PSG.edf", "")
        # A recording and its hypnogram share the prefix except for its last
        # character, so match on everything before it.
        matches = [f for f in hyp_names if f.startswith(prefix[:-1])]
        if not matches:
            print(f"skipping {fname}: no matching hypnogram")
            continue

        psg_path = os.path.join(subdir, fname)
        hyp_path = os.path.join(subdir, matches[0])
        # Characters 3-4 of the file name hold the subject number in both subsets.
        subj_id = subject_offset[subset] + int(prefix[3:5])

        Xe, Xo, Xm, labs, subs = load_and_segment(psg_path, hyp_path, subj_id)
        if len(labs) == 0:
            # Nothing usable in this recording; an empty array would break
            # the concatenation below.
            print(f"skipping {fname}: no labelled epochs")
            continue

        all_Xe.append(Xe)
        all_Xo.append(Xo)
        all_Xm.append(Xm)
        all_y.append(labs)
        all_subj.append(subs)

if not all_y:
    raise RuntimeError(f"no usable recordings found under {data_dir!r}")

X_eeg = np.concatenate(all_Xe)
X_eog = np.concatenate(all_Xo)
X_emg = np.concatenate(all_Xm)
y = np.concatenate(all_y)
subjects = np.concatenate(all_subj)

print(f"Total epochs: {len(y)} (EEG shape: {X_eeg.shape}, EOG shape: {X_eog.shape}, EMG shape: {X_emg.shape})")
print(f"class distrib: {np.bincount(y)}")
print(f"# of subjects: {len(np.unique(subjects))}")