In [None]:
# ============================================
# Notebook 01 - Preprocessing of HRV Time Series
# ============================================

# Mount Google Drive at /content/drive.
# NOTE: the google.colab import is unconditional, so this cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os

# --------------------------------------------
# 1. Load metadata
# --------------------------------------------
# Per-subject metadata table; later steps read its 'Age (years)' and 'File' columns.
df_info = pd.read_csv("/content/drive/MyDrive/Paper_TDA_HRV/patient-info.csv")

# --------------------------------------------
# 2. Define pediatric age groups (WHO criteria)
# --------------------------------------------
def create_age_groups_who(df):
    """Label every row with a pediatric age group and drop out-of-range rows.

    The age is read from the 'Age (years)' column and the label is written
    to an 'Age_Group' column. The column is assigned on ``df`` itself, so
    the caller's DataFrame gains it as well.

    Rows with a missing age are labelled "Unknown" and kept. Ages outside
    [0, 18) are labelled "Excluded" and are absent from the returned frame.

    Returns:
        The rows of ``df`` whose 'Age_Group' is not "Excluded".
    """
    # (inclusive lower bound, exclusive upper bound, label); ages in years.
    bands = (
        (0, 1/12, "Neonates (0–1 mo)"),
        (1/12, 6/12, "Early Infancy (1–5 mo)"),
        (6/12, 1.0, "Late Infancy (6–11 mo)"),
        (1.0, 3.0, "Toddlers (1–2 yr)"),
        (3.0, 6.0, "Preschoolers (3–5 yr)"),
        (6.0, 12.0, "School-age (6–11 yr)"),
        (12.0, 18.0, "Adolescents (12–17 yr)"),
    )

    def label_for(age):
        if pd.isna(age):
            return "Unknown"
        for lower, upper, label in bands:
            if lower <= age < upper:
                return label
        # Negative ages and ages of 18 or more fall through every band.
        return "Excluded"

    df['Age_Group'] = df['Age (years)'].apply(label_for)
    in_range = df['Age_Group'] != "Excluded"
    return df[in_range]

# Tag every subject with an age group, keep only non-"Excluded" rows, and show group sizes.
df_info = create_age_groups_who(df_info)
print(df_info['Age_Group'].value_counts())

# --------------------------------------------
# 3. Load and preprocess RR time series
# --------------------------------------------
base_path = "/content/drive/MyDrive/Paper_TDA_HRV/rr_data/rr-interval-time-series-from-healthy-subjects-1.0.0"
N_SAMPLE = 3000  # Fixed number of samples kept per subject

def load_rr_series(file_path, N_sample=N_SAMPLE):
    """Load one RR series, z-score it and truncate it to a fixed length.

    The first column of the headerless text file at ``file_path`` is read as
    strings and converted to numbers; entries that cannot be parsed are
    dropped.

    Args:
        file_path: Path of the text file holding the series.
        N_sample: Number of leading samples to keep.

    Returns:
        A NumPy array with the first ``N_sample`` standardized values, or
        ``None`` when the series is unusable: fewer than ``N_sample`` numeric
        samples, or a standard deviation that is zero or not finite (which
        would otherwise turn the whole series into NaN).
    """
    raw = pd.read_csv(file_path, header=None, usecols=[0], dtype=str).iloc[:, 0]
    rr = pd.to_numeric(raw, errors='coerce').dropna().values
    if len(rr) < N_sample:
        return None

    sigma = np.std(rr)
    # A constant series (sigma == 0) or one containing inf (sigma is NaN)
    # cannot be standardized; reject it instead of returning an all-NaN array.
    if not np.isfinite(sigma) or sigma == 0:
        return None

    # NOTE(review): mean and std are taken over the full recording, so the
    # truncated window is not exactly zero-mean / unit-variance. Confirm that
    # this is intended before changing the order.
    rr = (rr - np.mean(rr)) / sigma      # Standardization (z-score)
    return rr[:N_sample]                 # Truncate to fixed length

# Collect one record per subject whose recording exists and passes loading.
processed = []
for _, row in df_info.iterrows():
    # Recordings are named by zero-padded subject number, e.g. "007.txt".
    file_path = os.path.join(base_path, f"{int(row['File']):03d}.txt")
    if not os.path.exists(file_path):
        continue  # no recording on disk for this subject
    rr = load_rr_series(file_path)
    if rr is None:
        continue  # load_rr_series returned None for this recording
    processed.append(
        {"File": int(row['File']), "Age_Group": row['Age_Group'], "RR_series": rr}
    )

df_rr = pd.DataFrame(processed)
print(f" Preprocessed {len(df_rr)} valid subjects.")

# --------------------------------------------
# 4. Save preprocessed dataset
# --------------------------------------------
output_path = "/content/drive/MyDrive/Paper_TDA_HRV/data_preprocessed/RR_preprocessed.pkl"
# Create the output folder if needed; exist_ok avoids an error when it already exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Serialize the whole DataFrame, including the per-subject arrays in 'RR_series', with pickle.
df_rr.to_pickle(output_path)
print(f" Saved preprocessed RR series to: {output_path}")
