In [3]:
# Standard library imports
from datetime import time, datetime, timedelta
import os

# Third-party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import resample
from tqdm import tqdm
from joblib import Parallel, delayed

# Local imports
from helper_code import *
from team_code import *
from segment_quality_evaluation import process_EEG

from ClassicalFeatureExtractor import ClassicalFeatureExtractor
from FMMFeatureExtractor import FMMFeatureExtractor

# autoreload changes in helper_code and team_code
%load_ext autoreload
%autoreload 2


# Get the start time (as a time object) from the recording data.
def get_start_time_triplet(string):
    """Parse the ``#Start time`` field of a recording header into a ``datetime.time``.

    ``string`` is the header text handed to the project helper ``get_variable``;
    the value it returns is expected to look like ``H:M:S`` (three
    colon-separated integers), otherwise a ``ValueError`` is raised.
    """
    raw_value = get_variable(string, '#Start time', str)
    hh, mm, ss = map(int, raw_value.split(':'))
    return time(hh, mm, ss)

# Get the end time (as a time object) from the recording data.
def get_end_time_triplet(string):
    """Parse the ``#End time`` field of a recording header into a ``datetime.time``.

    ``string`` is the header text handed to the project helper ``get_variable``;
    the value it returns is expected to look like ``H:M:S`` (three
    colon-separated integers), otherwise a ``ValueError`` is raised.
    """
    raw_value = get_variable(string, '#End time', str)
    hh, mm, ss = map(int, raw_value.split(':'))
    return time(hh, mm, ss)

# Base directory for all data files (kept with its trailing slash because
# ROOT_FOLDER is built by plain string concatenation).
DATA_FOLDER = "../data/"
# Directory that holds one sub-folder per patient of the training set.
ROOT_FOLDER = f"{DATA_FOLDER}physionet.org/files/i-care/2.1/training"

In [5]:
def segmentos_5_minutos(start: time, end: time):
    """Return the complete, clock-aligned 5-minute segments inside ``[start, end]``.

    Segments start on multiples of five minutes (``hh:00:00``, ``hh:05:00``, ...)
    and end 4 minutes 59 seconds later. The first segment begins at the first
    such boundary at or after ``start``; a segment is included only if its last
    second is not later than ``end``.

    Parameters
    ----------
    start, end : datetime.time
        Recording start and end, interpreted on the same day. Sub-second
        precision is ignored.

    Returns
    -------
    list of (datetime.time, datetime.time)
        ``(segment_start, segment_end)`` pairs in chronological order. Empty
        when no full segment fits, including when ``start`` is later than
        23:55:00 (the next boundary would fall on the following day).
    """
    # Anchor both times on an arbitrary common date so timedelta arithmetic works.
    start_dt = datetime(2000, 1, 1, start.hour, start.minute, start.second)
    end_dt = datetime(2000, 1, 1, end.hour, end.minute, end.second)

    step = timedelta(minutes=5)
    one_second = timedelta(seconds=1)

    # Round the start up to the next 5-minute boundary. Doing this with a
    # timedelta (instead of rebuilding a datetime from hour/minute fields)
    # rolls over hours -- and midnight -- without producing an invalid hour 24.
    seconds_past_boundary = (start_dt.minute % 5) * 60 + start_dt.second
    start_segment = start_dt
    if seconds_past_boundary:
        start_segment += timedelta(seconds=5 * 60 - seconds_past_boundary)

    segments = []
    while start_segment + step - one_second <= end_dt:
        segment_end = start_segment + step - one_second
        segments.append((start_segment.time(), segment_end.time()))
        start_segment += step

    return segments

def dividir_dataframe_en_segmentos(df, start: time, end: time, sampling_frequency: int):
    """Return the 5-minute segment boundaries that apply to a recording.

    Despite its name, this function does not slice ``df``; it only uses its
    length to decide whether the recording is long enough, and then delegates
    to ``segmentos_5_minutos`` to compute the time boundaries.

    Parameters
    ----------
    df : sized object (e.g. pandas.DataFrame)
        Recording samples, one row per sample; only ``len(df)`` is used.
    start, end : datetime.time
        Recording start and end times.
    sampling_frequency : int
        Samples per second of ``df``.

    Returns
    -------
    list of (datetime.time, datetime.time)
        ``(segment_start, segment_end)`` pairs; an empty list when the
        recording holds fewer samples than one 5-minute segment.
    """
    samples_per_5_min = sampling_frequency * 5 * 60

    # A recording shorter than 5 minutes cannot contain a full segment. Return
    # an empty list (same shape as the normal result) so callers can always
    # iterate over (start, end) pairs.
    if len(df) < samples_per_5_min:
        return []

    return segmentos_5_minutos(start, end)

In [6]:
# Example recording window: the last hour of the day.
start_time = time(23, 0, 0)
end_time = time(23, 59, 59)

# Build a dummy DataFrame sampled at 128 Hz that spans that window.
sampling_frequency = 128  # 128 samples per second
total_seconds = (
    datetime(2000, 1, 1, end_time.hour, end_time.minute, end_time.second)
    - datetime(2000, 1, 1, start_time.hour, start_time.minute, start_time.second)
).total_seconds()
data = {
    column: range(int(total_seconds * sampling_frequency))
    for column in ("col1", "col2")
}
df = pd.DataFrame(data)

# Compute the 5-minute segment boundaries for the dummy recording.
segments = dividir_dataframe_en_segmentos(df, start_time, end_time, sampling_frequency)

# Print the start and end time of every segment.
for i, (start_frag, end_frag) in enumerate(segments):
    print(f"Segmento {i + 1} (Inicio: {start_frag}, Fin: {end_frag}):")

Segmento 1 (Inicio: 23:00:00, Fin: 23:04:59):
Segmento 2 (Inicio: 23:05:00, Fin: 23:09:59):
Segmento 3 (Inicio: 23:10:00, Fin: 23:14:59):
Segmento 4 (Inicio: 23:15:00, Fin: 23:19:59):
Segmento 5 (Inicio: 23:20:00, Fin: 23:24:59):
Segmento 6 (Inicio: 23:25:00, Fin: 23:29:59):
Segmento 7 (Inicio: 23:30:00, Fin: 23:34:59):
Segmento 8 (Inicio: 23:35:00, Fin: 23:39:59):
Segmento 9 (Inicio: 23:40:00, Fin: 23:44:59):
Segmento 10 (Inicio: 23:45:00, Fin: 23:49:59):
Segmento 11 (Inicio: 23:50:00, Fin: 23:54:59):
Segmento 12 (Inicio: 23:55:00, Fin: 23:59:59):


In [7]:
# Notebook-wide configuration read by the cells below.
# Root of the training set; `process_recording` joins patient folders onto it.
data_folder = ROOT_FOLDER
# Patient folder names found under the root (project helper from helper_code).
patient_ids = find_data_folders(data_folder)
num_patients = len(patient_ids)
# Not referenced in the visible cells -- presumably a leftover accumulator; TODO confirm.
current_features = None
# Signal group suffix: recordings are opened as "<recording_id>_<GROUP>".
GROUP = 'EEG'



def time_to_seconds(t):
    """Return the number of whole seconds elapsed since midnight for ``t``.

    ``t`` is any object exposing ``hour``, ``minute`` and ``second``
    (e.g. ``datetime.time``); sub-second precision is ignored.
    """
    total_minutes = t.hour * 60 + t.minute
    return total_minutes * 60 + t.second

def resample_segment(data, original_sampling_frequency, target_sampling_frequency):
    """Resample ``data`` along axis 1 to a new sampling frequency.

    The output length is the input length scaled by
    ``target_sampling_frequency / original_sampling_frequency`` and truncated
    to an integer; the resampling itself is done by ``scipy.signal.resample``.
    """
    n_samples_in = data.shape[1]
    n_samples_out = int(
        n_samples_in * target_sampling_frequency / original_sampling_frequency
    )
    return resample(data, n_samples_out, axis=1)

In [8]:
# Analysis parameters. None of N_HOURS, MIN_MINUTES, DROP_MEAN_STD,
# BANDPASS_FREQS or TARGET_SAMPLING_FREQUENCY is referenced by the cells
# visible here -- presumably they are read by later cells or imported helpers;
# TODO confirm before changing or removing them.
N_HOURS = 12

MIN_MINUTES = 5
DROP_MEAN_STD = 0
# Electrode pairs of the bipolar montage, grouped by chain (18 pairs in total).
DOUBLE_BANANA = [
        ("Fp1", "F7"), ("F7", "T3"), ("T3", "T5"), ("T5", "O1"),  # Left temporal chain
        ("Fp2", "F8"), ("F8", "T4"), ("T4", "T6"), ("T6", "O2"),  # Right temporal chain
        ("Fp1", "F3"), ("F3", "C3"), ("C3", "P3"), ("P3", "O1"),  # Left parasagittal chain
        ("Fp2", "F4"), ("F4", "C4"), ("C4", "P4"), ("P4", "O2"),  # Right parasagittal chain
        ("Fz", "Cz"), ("Cz", "Pz")                                # Central chain
]
# [low, high] band-pass limits; presumably in Hz -- TODO confirm.
BANDPASS_FREQS = [0.1, 45.0]
# 128 exceeds twice the upper band-pass limit (2 * 45), i.e. the Nyquist criterion.
TARGET_SAMPLING_FREQUENCY = 128 # Nyquist

In [9]:
# Sanity check: number of electrode pairs defined in the montage.
len(DOUBLE_BANANA)

18

## FMM features

In [10]:
# convert this into list of tuples
epoch_subset = pd.read_csv(os.path.join(DATA_FOLDER, "epoch_subset.csv"))
epoch_subset_starts = list(zip(epoch_subset["patient_id"], epoch_subset["start_time"]))
print(epoch_subset_starts[0:5])
epoch_subset_hours = list(zip(epoch_subset["patient_id"], epoch_subset["hour"]))
print(epoch_subset_hours[0:5])

[(284, '04:10:00'), (284, '06:00:00'), (284, '06:05:00'), (284, '06:50:00'), (284, '07:35:00')]
[(284, 4), (284, 6), (284, 6), (284, 6), (284, 7)]


In [11]:
# Display the loaded table (long frames are truncated in the rendered output).
epoch_subset

Unnamed: 0,patient_id,hour,time_block,start_time,quality,end_time,nan_quality,gap_quality,outlier_quality,flat_quality,sharpness_quality,cohesion_quality,max,min,random,central,lastgood
0,284,4,4-8H,04:10:00,57.375176,04:14:59,100.0,100.0,93.104890,47.725839,59.640442,29.029533,True,False,False,False,False
1,284,6,4-8H,06:00:00,47.548611,06:04:59,100.0,100.0,95.108507,49.954861,23.283882,21.847196,False,False,False,True,False
2,284,6,4-8H,06:05:00,47.419414,06:09:59,100.0,100.0,95.189525,48.716869,22.050632,23.720628,False,False,True,False,False
3,284,6,4-8H,06:50:00,42.707738,06:54:59,100.0,100.0,94.382957,49.324942,11.693447,15.429606,False,True,False,False,False
4,284,7,4-8H,07:35:00,50.155103,07:39:59,100.0,100.0,95.070891,49.093461,26.735795,29.720265,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8110,1020,19,16-20H,19:10:00,46.832234,19:14:59,100.0,100.0,90.842014,56.883970,22.858524,16.744427,False,False,True,False,False
8111,1020,20,20-24H,20:05:00,44.841225,20:09:59,100.0,100.0,89.418113,55.547743,16.948358,17.450685,False,False,True,False,False
8112,1020,20,20-24H,20:10:00,43.993089,20:14:59,100.0,100.0,89.308304,55.577836,11.752766,19.333452,False,True,False,False,False
8113,1020,22,20-24H,22:00:00,45.705842,22:04:59,100.0,100.0,89.150897,56.002894,11.685924,25.983654,False,False,False,True,False


In [None]:
# Define a function to process a single recording file
def process_recording(patient_id, recording_id):
    """Extract classical and FMM features for the selected epochs of one recording.

    The recording header ``<recording_id>_<GROUP>.hea`` under
    ``data_folder/patient_id`` provides the start and end time. The recording
    is skipped unless ``(int(patient_id), start hour)`` appears in
    ``epoch_subset_hours``. Otherwise the signal is loaded, preprocessed and
    converted to the bipolar montage (project helpers), and the 5-minute
    segments returned by ``dividir_dataframe_en_segmentos`` are visited. Only
    segments whose ``(int(patient_id), "HH:MM:SS" start)`` is in
    ``epoch_subset_starts`` are used; each is cut into consecutive,
    non-overlapping 10-second epochs and both feature extractors are run on
    every complete epoch.

    Parameters
    ----------
    patient_id : str
        Patient folder name (e.g. ``"0284"``); must be convertible with ``int``.
    recording_id : str
        Recording name without the group suffix (e.g. ``"0284_001_004"``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame) or None
        ``(classical_features, fmm_features)``, one block of rows per epoch,
        each prefixed with ``patient_id``, ``start_time``, ``end_time`` and
        ``epoch_num`` columns. ``None`` when the recording is not selected,
        has no usable data, is shorter than 5 minutes, or no epoch produced
        features.

    Notes
    -----
    Relies on the notebook globals ``data_folder``, ``GROUP``,
    ``epoch_subset_hours`` and ``epoch_subset_starts``. An exception raised
    while extracting features for an epoch is printed and that epoch is
    skipped (best effort).
    """
    recording_location = os.path.join(
        data_folder, patient_id, f"{recording_id}_{GROUP}"
    )
    header_path = recording_location + ".hea"

    with open(header_path, "r") as f:
        header = f.read()
    start_time = get_start_time_triplet(header)
    end_time = get_end_time_triplet(header)

    patient_key = int(patient_id)

    # Cheap pre-filter: avoid loading the signal when no selected segment
    # starts in this recording's hour.
    if (patient_key, start_time.hour) not in epoch_subset_hours:
        return None

    data, channels, sampling_frequency = load_recording_data(recording_location)
    utility_frequency = get_utility_frequency(header_path)
    data, sampling_frequency = preprocess_data(
        data, sampling_frequency, utility_frequency, channels
    )

    if data is None:
        return None  # Skip empty data

    data = get_bipolar_data(data)  # Create double-banana montage

    # A recording with fewer samples than one 5-minute segment cannot yield
    # any segment; skip it explicitly rather than depending on how the
    # segmentation helper represents "no segments".
    if len(data) < 5 * 60 * sampling_frequency:
        return None

    segments = dividir_dataframe_en_segmentos(
        data, start_time, end_time, sampling_frequency
    )

    samples_per_epoch = 10 * sampling_frequency
    recording_start_s = time_to_seconds(start_time)

    classical_frames = []
    fmm_frames = []

    for start_frag, end_frag in segments:
        # Check the selection first so unselected segments are never sliced.
        if (patient_key, start_frag.strftime("%H:%M:%S")) not in epoch_subset_starts:
            continue

        # Sample offsets of the segment relative to the recording start.
        # end_frag is the last second of the segment (hh:mm:59), hence the
        # extra `sampling_frequency` samples to make the stop index exclusive.
        segment_start_idx = (
            time_to_seconds(start_frag) - recording_start_s
        ) * sampling_frequency
        segment_stop_idx = (
            time_to_seconds(end_frag) - recording_start_s
        ) * sampling_frequency + sampling_frequency
        segment_df = data.iloc[segment_start_idx:segment_stop_idx, :].reset_index(drop=True)

        n_segment_samples = segment_df.shape[0]

        # Walk the segment in consecutive 10-second epochs.
        for epoch_start_idx in range(0, n_segment_samples, samples_per_epoch):
            epoch_stop_idx = epoch_start_idx + samples_per_epoch

            # Drop a trailing, incomplete epoch.
            if epoch_stop_idx > n_segment_samples:
                break

            epoch_num = epoch_start_idx // samples_per_epoch + 1  # 1-based
            epoch_df = segment_df.iloc[epoch_start_idx:epoch_stop_idx, :].reset_index(drop=True)

            eeg_extractor = ClassicalFeatureExtractor(epoch_df, sf = sampling_frequency)
            fmm_extractor = FMMFeatureExtractor(epoch_df)

            try:
                features_classical = eeg_extractor.extract_all_features()
                features_fmm = fmm_extractor.estimate_parameters(n_components=10)

                # Prefix both feature tables with the same identifying columns.
                for features in (features_classical, features_fmm):
                    features.insert(0, "patient_id", patient_id)
                    features.insert(1, "start_time", start_frag)
                    features.insert(2, "end_time", end_frag)
                    features.insert(3, "epoch_num", epoch_num)

                # Append only when both extractions succeeded so the two
                # lists stay aligned epoch by epoch.
                classical_frames.append(features_classical)
                fmm_frames.append(features_fmm)

            except Exception as e:
                print(f"Error processing recording {recording_id} for patient {patient_id}: {e}")
                continue

    if not classical_frames or not fmm_frames:
        return None

    df_classical = pd.concat(classical_frames, axis=0)
    df_fmm = pd.concat(fmm_frames, axis=0)

    return (df_classical, df_fmm)

In [None]:
# have list of all files in a given folder
lista = os.listdir("../data/features_fmm")
lista = [x[0:4] for x in lista]
diffs = set(patient_ids) - set(lista)
print("Number of patient without extracted features for any reason: ", len(diffs))
folder_counts = pd.Series({folder: len(os.listdir(os.path.join(data_folder, folder))) for folder in os.listdir(data_folder)})
non_eeg_ids = folder_counts.sort_values(ascending=True)
non_eeg_ids = non_eeg_ids[non_eeg_ids <=1].index.tolist()
non_eeg_ids = list(sorted(non_eeg_ids))
print("Patients without extracted features because of computation error", set(diffs) - set(non_eeg_ids))

Number of patient without extracted features for any reason:  89
Patients without extracted features because of computation error {'0830', '0548', '0666', '0569'}


In [50]:
# NOTE: this deliberately narrows the notebook-wide `patient_ids` to the four
# patients reported above as failing with a computation error. Re-run the
# configuration cell to restore the full list before processing every patient.
patient_ids = ['0548', '0569', '0666', '0830']

# Number of joblib worker processes per patient (n_jobs=-1 would use all cores).
num_jobs = 8

# The output directories are the same for every patient, so create them once.
output_dir_classical = os.path.join(DATA_FOLDER, "features_classical")
output_dir_fmm = os.path.join(DATA_FOLDER, "features_fmm")
os.makedirs(output_dir_classical, exist_ok=True)
os.makedirs(output_dir_fmm, exist_ok=True)

for patient_id in tqdm(patient_ids, desc="Processing Patients"):
    # Process all recordings of a single patient in parallel.
    patient_results = Parallel(n_jobs=num_jobs)(
        delayed(process_recording)(patient_id, recording_id)
        for recording_id in find_recording_files(data_folder, patient_id)
    )

    # Recordings that were skipped or produced no features come back as None.
    patient_results = [res for res in patient_results if res is not None]

    if not patient_results:
        continue

    # Each result is a (classical_df, fmm_df) pair; split and stack them.
    classical_features = [res[0] for res in patient_results]
    fmm_features = [res[1] for res in patient_results]

    classical_features_df = pd.concat(classical_features, axis=0)
    fmm_features_df = pd.concat(fmm_features, axis=0)

    # Save one CSV per patient and feature type as soon as it is ready, so a
    # later failure does not lose the work already done.
    classical_csv_path = os.path.join(output_dir_classical, f"{patient_id}_features_classical.csv")
    fmm_csv_path = os.path.join(output_dir_fmm, f"{patient_id}_features_fmm.csv")

    classical_features_df.to_csv(classical_csv_path, index=False)
    fmm_features_df.to_csv(fmm_csv_path, index=False)

    print(f"Saved classical features: {classical_csv_path}")
    print(f"Saved FMM features: {fmm_csv_path}")


Processing Patients: 100%|██████████| 4/4 [00:49<00:00, 12.32s/it]


In [205]:
# Manual spot check: run the extraction on a single recording of one patient.
patient_id =  "0284"
recording_id = "0284_002_005"
# The assignment below overrides the previous one, so only "0284_001_004" is processed.
recording_id = "0284_001_004"
process_recording(patient_id, recording_id)

YES  04:10:00 04:14:59   (38400, 18)
NO  (284, '04:15:00')
NO  (284, '04:20:00')
NO  (284, '04:25:00')
NO  (284, '04:30:00')
NO  (284, '04:35:00')
NO  (284, '04:40:00')
NO  (284, '04:45:00')
NO  (284, '04:50:00')
NO  (284, '04:55:00')


(  patient_id start_time  end_time  epoch_num  ENT_perm_Fp1-F7  \
 0       0284   04:10:00  04:14:59          1         0.995237   
 0       0284   04:10:00  04:14:59          2         0.996659   
 0       0284   04:10:00  04:14:59          3         0.998784   
 0       0284   04:10:00  04:14:59          4         0.999233   
 0       0284   04:10:00  04:14:59          5         0.998300   
 0       0284   04:10:00  04:14:59          6         0.994240   
 0       0284   04:10:00  04:14:59          7         0.992685   
 0       0284   04:10:00  04:14:59          8         0.983276   
 0       0284   04:10:00  04:14:59          9         0.994831   
 0       0284   04:10:00  04:14:59         10         0.979899   
 0       0284   04:10:00  04:14:59         11         0.987067   
 0       0284   04:10:00  04:14:59         12         0.998813   
 0       0284   04:10:00  04:14:59         13         0.999831   
 0       0284   04:10:00  04:14:59         14         0.998381   
 0       0