Code to plot the mean PPG signal and all PPG signals overlaid, per age group \
Code to evaluate a mean-prediction baseline for age, including cross-validation

In [None]:
pip install scikit-learn

In [2]:
import os
import pickle
#import torch
#import torch.nn as nn
#import torch.nn.functional as F
#from scipy.signal import periodogram
import numpy as np
import random
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed NumPy's and Python's global random number generators with one shared
# value; the same value is passed as random_state to the splits further down.
seed = 0
np.random.seed(seed)
random.seed(seed)
# PyTorch seeding is kept for reference only (the torch imports are commented out):
#torch.manual_seed(seed)
#if torch.cuda.is_available():
#    torch.cuda.manual_seed(seed)
#torch.backends.cudnn.deterministic = True

device = "cpu"  # device to use

In [3]:
def parse_ppg(ppg_str):
    """Extract the y-values from a "|"-delimited PPG string.

    The text before the first "|" is skipped. Every following non-blank
    field must have the form "<x>,<y>"; only the y part is kept.

    Args:
        ppg_str: Raw string such as "hdr|0,1.5|1,2.5".

    Returns:
        1-D NumPy array with the y-values as floats, in input order.

    Raises:
        ValueError: If a field does not split into exactly two
            comma-separated parts, or its y part is not a valid float.
    """
    _leading, *fields = ppg_str.split("|")
    # Lazily split the non-blank fields; unpacking below enforces "exactly two parts".
    pairs = (field.split(",") for field in fields if field.strip() != "")
    return np.array([float(y_text) for _x_text, y_text in pairs])

# Load the table and keep only rows that have a PPG recording. The explicit
# copy makes the filtered frame independent, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
df = pd.read_csv("data.csv")
df = df[df['p4205_i0'].notna()].copy()
# Optional subsampling for quick experiments:
#df = df.sample(n=10000, random_state=seed)
df['y_values'] = df['p4205_i0'].apply(parse_ppg)

# np.stack requires every parsed signal to have the same length.
X = np.stack(df['y_values'].values)

# Min-max scale each signal (row) to [0, 1].
# np.ptp is used as a function because the ndarray.ptp method was removed in
# NumPy 2.0. A flat signal has zero range; dividing by 1 instead maps it to
# all zeros rather than NaN/inf.
signal_min = X.min(axis=1, keepdims=True)
signal_range = np.ptp(X, axis=1, keepdims=True)
signal_range = np.where(signal_range == 0, 1.0, signal_range)
X = (X - signal_min) / signal_range
Y = df['p21003_i0'].values

# Alternative 80/20 train/test split:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# 70/15/15 train/val/test: hold out 30%, then halve the hold-out set.
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=seed)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=seed)


In [6]:
# Mean baseline: predict the training-set mean target for every training sample.
mean_Y_train = np.mean(Y_train)
# Force a float dtype: np.full_like otherwise reuses Y_train's dtype and would
# truncate the mean to an integer when the targets are stored as integers.
predictions_train = np.full_like(Y_train, mean_Y_train, dtype=np.float64)


In [None]:
# Error of the mean baseline on the training split.
mse = np.square(predictions_train - Y_train).mean()
mae = np.abs(predictions_train - Y_train).mean()
print("Train MSE:", mse)
print("Train MAE:", mae)


In [8]:
# Predict the training-set mean for every validation sample. The float dtype
# keeps np.full_like from truncating the mean when Y_val holds integers.
predictions_val = np.full_like(Y_val, mean_Y_train, dtype=np.float64)


In [None]:
# Error of the mean baseline on the validation split.
mse = np.square(predictions_val - Y_val).mean()
mae = np.abs(predictions_val - Y_val).mean()
print("Val MSE:", mse)
print("Val MAE:", mae)


# Cross Validation

In [None]:
import numpy as np
from tqdm import tqdm
import random
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr
import subprocess
import sys # For exiting on error

# Seed NumPy's and Python's global random number generators; the same value is
# reused as random_state for the K-fold splitter below.
seed = 0
np.random.seed(seed)
random.seed(seed)

def parse_ppg(ppg_str):
    """Extract the y-values from a "|"-delimited PPG string, tolerating bad input.

    The text before the first "|" is skipped. Every following non-blank
    field must have the form "<x>,<y>"; only the y part is kept.

    Args:
        ppg_str: Raw string such as "hdr|0,1.5|1,2.5". Non-string values
            (e.g. NaN or None) are treated as malformed.

    Returns:
        1-D NumPy array with the y-values as floats. An EMPTY array is
        returned when any part of the record is malformed, so a corrupt
        record never yields a truncated signal.
    """
    try:
        _leading, *fields = ppg_str.split("|")
        y_values = [
            float(y_val)
            # Unpacking enforces that each field has exactly two comma-separated parts.
            for _x_val, y_val in (field.split(",") for field in fields if field.strip())
        ]
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a str; ValueError: wrong
        # number of parts in a field or a y part that is not a number.
        return np.array([])
    return np.array(y_values)

def run_dx_upload_minimal(filename):
    """Upload *filename* by running ``dx upload <filename> --brief``.

    Best-effort helper: a missing ``dx`` executable, a non-zero exit status
    or any other ``Exception`` is reported on stderr instead of being raised.

    Args:
        filename: Path passed to ``dx upload``.

    Returns:
        None.
    """
    try:
        # check=True turns a non-zero exit status into CalledProcessError;
        # the captured stderr text is included in that error report.
        subprocess.run(
            ["dx", "upload", filename, "--brief"],
            check=True,
            capture_output=True,
            text=True,
        )
        print(f"Successfully uploaded {filename}")
    except subprocess.CalledProcessError as e:
        print(f"Error during 'dx upload {filename}':\nStderr: {e.stderr}", file=sys.stderr)
    except FileNotFoundError:
        print("Error: 'dx' command not found.", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error during upload of {filename}: {e}", file=sys.stderr)

# Build the target vector Y for cross-validation. Rows are kept only when both
# the PPG string and the target are present and the PPG string parses to at
# least one value. Any failure is reported on stderr and ends with sys.exit(1).
print("Loading and processing data...")
try:
    df = pd.read_csv("data.csv")
    df = df.dropna(subset=['p4205_i0', 'p21003_i0'])
    df['y_values'] = df['p4205_i0'].apply(parse_ppg)
    has_signal = df['y_values'].apply(len).gt(0)
    df = df.loc[has_signal]
    Y = df['p21003_i0'].values
    if not len(Y):
        raise ValueError("No valid data remaining after processing.")
    print(f"Data loaded successfully. Target variable size: {len(Y)}")
except ValueError as e:
    print(f"Error: {e}", file=sys.stderr)
    sys.exit(1)
except FileNotFoundError:
    print("Error: data.csv not found.", file=sys.stderr)
    sys.exit(1)
except Exception as e:
    print(f"An unexpected error occurred during data loading/processing: {e}", file=sys.stderr)
    sys.exit(1)


# 5-fold cross-validation of the mean baseline: in each fold the prediction for
# every test sample is the mean target of that fold's training part.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
all_results = []
indices = np.arange(len(Y))

print("\nStarting 5-Fold Cross-Validation for Mean Baseline...")

for fold, (train_idx, test_idx) in enumerate(
    tqdm(kf.split(indices), total=kf.get_n_splits(), desc="CV Folds"), start=1
):
    Y_tr = Y[train_idx]
    Y_te = Y[test_idx]

    if not len(Y_tr) or not len(Y_te):
        print(f"Warning: Skipping Fold {fold} due to empty train/test set.", file=sys.stderr)
        continue

    mean_y_train = np.mean(Y_tr)
    preds_flat = np.full_like(Y_te, fill_value=mean_y_train, dtype=np.float64).ravel()
    Y_te_flat = Y_te.ravel()

    # Score only positions where prediction and target are both finite
    # (neither NaN nor +/-inf).
    valid_indices = np.isfinite(preds_flat) & np.isfinite(Y_te_flat)
    n_test_valid = int(np.count_nonzero(valid_indices))

    if n_test_valid >= 2:
        y_true = Y_te_flat[valid_indices]
        y_pred = preds_flat[valid_indices]
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        spearman_corr, spearman_p = spearmanr(y_pred, y_true)
    else:
        mse = mae = spearman_corr = spearman_p = np.nan
        print(f"Warning: Fold {fold} - Not enough valid points ({n_test_valid}) for metrics.", file=sys.stderr)

    all_results.append({
        "fold": fold,
        "mean_train_age": mean_y_train,
        "test_mse": mse,
        "test_mae": mae,
        "test_spearman_rho": spearman_corr,
        "test_spearman_p": spearman_p,
        "n_train": len(Y_tr),
        "n_test": len(Y_te),
        "n_test_valid": n_test_valid,
    })
    print(f"Fold {fold} - MAE: {mae:.5f}, MSE: {mse:.5f}, Rho: {spearman_corr:.5f}")


# Persist the per-fold metrics as CSV, upload the file, and print the
# across-fold averages. Nothing is written when no fold produced a result.
if not all_results:
    print("\nNo results generated.")
else:
    overall_results_filename = "all_folds_results_mean_baseline.csv"
    overall_results_df = pd.DataFrame(all_results)
    overall_results_df.to_csv(overall_results_filename, index=False, float_format='%.8g')
    print(f"\nSaved overall results to {overall_results_filename}")
    run_dx_upload_minimal(overall_results_filename)

    avg_mse, avg_mae, avg_rho = (
        overall_results_df[column].mean()
        for column in ('test_mse', 'test_mae', 'test_spearman_rho')
    )
    print("\n--- Average Metrics Across Folds ---")
    print(f"Average MSE: {avg_mse:.5f}")
    print(f"Average MAE: {avg_mae:.5f}")
    print(f"Average Spearman ρ: {avg_rho:.5f}")

print("\nCross-validation for mean baseline finished.")

In [None]:
# Show the per-fold results table as this cell's output.
overall_results_df

In [None]:
# Summarise each cross-validation metric over the folds as mean and standard
# deviation (pandas' default Series.std, i.e. the sample SD).
mse_per_fold = overall_results_df['test_mse']
mean_mse, std_mse = mse_per_fold.mean(), mse_per_fold.std()

mae_per_fold = overall_results_df['test_mae']
mean_mae, std_mae = mae_per_fold.mean(), mae_per_fold.std()

rho_per_fold = overall_results_df['test_spearman_rho']
mean_rho, std_rho = rho_per_fold.mean(), rho_per_fold.std()


print("--- Mean Baseline Metrics ---")
print(f"Mean MSE: {mean_mse:.3f}")
print(f"Std Dev MSE: {std_mse:.3f}")
print(f"MSE (Mean ± SD): {mean_mse:.3f} ± {std_mse:.3f}")
print(f"MAE (Mean ± SD): {mean_mae:.3f} ± {std_mae:.3f}")
# Report a plain NaN when either Spearman statistic is missing.
if pd.notna(mean_rho) and pd.notna(std_rho):
    print(f"Spearman (Mean ± SD): {mean_rho:.3f} ± {std_rho:.3f}")
else:
    print(f"Spearman (Mean ± SD): NaN")

# Plotting PPG signals per age group

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import gc

# Draw one row of panels per age bin: the mean PPG signal on the left and all
# individual signals of that bin overlaid on the right, then save the figure.
plt.rcdefaults()

# Release figures (and their memory) left over from earlier cells.
plt.close('all')
gc.collect()

threshold = 12000  # NOTE(review): not used anywhere in this cell
age_bins = [(40, 45), (45, 50), (50, 55), (55, 60), (60, 65), (65, 70)]
num_bins = len(age_bins)

# X and Y are built by different cells with different row filters. A boolean
# mask derived from Y can only select rows of X when both have the same number
# of rows, so fail early with a clear message. Equal length is necessary but
# does not by itself prove the rows are aligned.
if len(X) != len(Y):
    raise ValueError(
        f"X has {len(X)} rows but Y has {len(Y)}; rebuild both from the same data frame."
    )

# squeeze=False always yields a 2-D (num_bins, 2) array of axes, even for one bin.
fig, axes = plt.subplots(nrows=num_bins, ncols=2, sharex=True, sharey=True,
                         figsize=(8, 10), squeeze=False)

max_y_val = -np.inf
min_y_val = np.inf

title_fontsize = 22
xlabel_fontsize = 20
ylabel_fontsize = 20
current_labelpad = 35
alpha_val = 0.1

for i, (low, high) in enumerate(age_bins):
    ax_mean, ax_over = axes[i]

    # Half-open bin [low, high).
    mask = (Y >= low) & (Y < high)
    X_group = X[mask]

    # An empty bin keeps its (labelled) panels blank; np.max/np.min would
    # raise ValueError on an empty array.
    if X_group.shape[0] > 0:
        mean_signal = X_group.mean(axis=0)
        ax_mean.plot(mean_signal)
        # plot() draws every column of a 2-D array as its own line, so the
        # transposed group gives one line per signal in a single call.
        ax_over.plot(X_group.T, alpha=alpha_val, linewidth=0.8)

        max_y_val = max(max_y_val, np.max(mean_signal), np.max(X_group))
        min_y_val = min(min_y_val, np.min(mean_signal), np.min(X_group))

    if i == 0:
        ax_mean.set_title("Mean PPG Signal", fontsize=title_fontsize)
        ax_over.set_title("All PPG Overlaid", fontsize=title_fontsize)

    ax_mean.tick_params(axis='both', which='major', labelsize=18)
    ax_over.tick_params(axis='both', which='major', labelsize=18)

    # Horizontal row label naming the age bin, placed left of the mean panel.
    ax_mean.set_ylabel(f"{low}–{high} yr",
                       rotation=0,
                       labelpad=current_labelpad,
                       va="center",
                       ha='right',
                       fontsize=ylabel_fontsize)

    if i == num_bins - 1:
        ax_mean.set_xlabel("Time point", fontsize=xlabel_fontsize)
        ax_over.set_xlabel("Time point", fontsize=xlabel_fontsize)

# Shared y-limits with 5% padding; skipped when no bin contained finite data.
# sharey=True propagates the limits from the first axes to all panels.
if np.isfinite(min_y_val) and np.isfinite(max_y_val):
    y_padding = (max_y_val - min_y_val) * 0.05
    current_ylim_low = min(min_y_val - y_padding, min_y_val * 0.9) if min_y_val > 0 else min_y_val - y_padding
    current_ylim_high = max_y_val + y_padding
    axes[0, 0].set_ylim(current_ylim_low, current_ylim_high)

plt.tight_layout()

# Save before showing so the written file does not depend on what the active
# backend does to the figure during show().
fig.savefig("ppg_by_age_paired_larger_fonts.png", dpi=300, bbox_inches="tight")
plt.show()

plt.close(fig)
gc.collect()