In [1]:
# Ensure time_hours exists immediately after dataset creation
# This cell should run early so later summary cells can use it
# NOTE: on a fresh kernel (Restart & Run All) `df_complete` does not exist yet
# when this cell executes, so it only reports that and changes nothing.
def ensure_time_hours(df, cadence_minutes=5):
    """Return `df` with a `time_hours` column (hours since each patient's first sample).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check. Returned unchanged when it already has `time_hours`.
    cadence_minutes : float, default 5
        Sampling interval assumed when there is no `timestamp` column and time
        has to be inferred from row order.

    Returns
    -------
    pandas.DataFrame
        `df` itself if `time_hours` already exists, otherwise a new frame with
        the column added (the input frame is not modified).

    Raises
    ------
    KeyError
        If neither `timestamp` nor `patient_id` is available.
    """
    if 'time_hours' in df.columns:
        return df

    if 'timestamp' in df.columns:
        stamps = pd.to_datetime(df['timestamp'])
        if 'patient_id' in df.columns:
            # Each patient's clock starts at that patient's own first sample,
            # not at the first row of the whole frame.
            origin = stamps.groupby(df['patient_id']).transform('min')
        else:
            origin = stamps.min()
        hours = (stamps - origin).dt.total_seconds() / 3600.0
    else:
        # Fallback: infer time from row order within each patient.
        hours = df.groupby('patient_id').cumcount() * (cadence_minutes / 60.0)

    return df.assign(time_hours=hours)


try:
    # Imported here because this cell runs before the notebook's import cell.
    import pandas as pd
    if 'df_complete' in globals():
        df_complete = ensure_time_hours(df_complete)
        print('time_hours ready (top patch). Max:', round(float(df_complete['time_hours'].max()), 2))
    else:
        print('Dataset df_complete not yet defined; this cell will be effective after generation.')
except Exception as e:
    # Deliberately best-effort: report the problem and let the notebook continue.
    print('Failed to ensure time_hours:', e)

Dataset df_complete not yet defined; this cell will be effective after generation.


# Phase 1: Generate Mocked Physiological Data
## AI-Based Evolution for Personalized Adaptive Hypothermia Device

This notebook generates synthetic physiological data for infants undergoing therapeutic hypothermia for Hypoxic-Ischemic Encephalopathy (HIE) treatment. The data will be used to train ML models for personalized temperature tuning, seizure prediction, and outcome assessment.

**Key physiological parameters:**
- Rectal temperature (core temperature monitoring)
- Heart rate and rhythm
- Blood pressure (systolic/diastolic)
- Oxygen saturation (SpO2)
- EEG signals (for seizure detection)
- Blood gas analysis (pH, pCO2, pO2, lactate)
- Seizure indicators

## Section 1: Import Required Libraries

In [2]:
# Import required libraries
import sys
import os

# Add utils directory to path
# A later cell imports `data_generator`; presumably that module lives in
# ../utils (relative to the notebook's working directory) — TODO confirm.
sys.path.insert(0, '../utils')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from pandas/seaborn/matplotlib.
warnings.filterwarnings('ignore')

# Plot appearance used by every figure in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Report the library versions this run used, one line per library
print("Libraries imported successfully!")
for library_label, library_module in (
    ("NumPy", np),
    ("Pandas", pd),
    ("Matplotlib", plt.matplotlib),
    ("Seaborn", sns),
):
    print(f"{library_label} version: {library_module.__version__}")

Libraries imported successfully!
NumPy version: 2.2.3
Pandas version: 2.3.3
Matplotlib version: 3.9.2
Seaborn version: 0.13.2


## Section 2: Initialize Data Generator and Generate Mocked Data

In [3]:
# Synthetic-data generator (project module, found via the sys.path entry added above)
from data_generator import InfantPhysiologicalDataGenerator

# A fixed seed keeps the generated cohort identical from run to run
GENERATOR_SEED = 42
generator = InfantPhysiologicalDataGenerator(seed=GENERATOR_SEED)

print("Data Generator initialized successfully!")

Data Generator initialized successfully!


In [4]:
# Generate dataset for a cohort of 50 infants over 72 hours (3 days of therapeutic hypothermia)
# This creates realistic mock data for model training
NUM_PATIENTS = 50
DURATION_HOURS = 72
# Cadence assumed when elapsed time has to be inferred from row order
SAMPLING_INTERVAL_MINUTES = 5

print(f"Generating physiological data for {NUM_PATIENTS} infants over {DURATION_HOURS} hours...")
print("This may take a few moments...\n")

# Generate dataset
df_complete = generator.generate_batch_dataset(
    num_patients=NUM_PATIENTS,
    duration_hours=DURATION_HOURS
)

# Ensure 'time_hours' exists even if generator skipped it
if 'time_hours' not in df_complete.columns:
    print("'time_hours' missing — deriving from timestamps or index...")

    # Parse timestamps instead of requiring a datetime dtype, so string
    # timestamps are used rather than silently ignored.
    timestamps = None
    if 'timestamp' in df_complete.columns:
        try:
            timestamps = pd.to_datetime(df_complete['timestamp'])
        except (ValueError, TypeError):
            # Unparseable timestamps: fall back to row order below.
            timestamps = None

    if timestamps is not None:
        # Hours since each patient's own first sample
        patient_start = timestamps.groupby(df_complete['patient_id']).transform('min')
        df_complete['time_hours'] = (timestamps - patient_start).dt.total_seconds() / 3600.0
    else:
        # Row position within each patient times the assumed sampling cadence
        df_complete['time_hours'] = (
            df_complete.groupby('patient_id').cumcount() * (SAMPLING_INTERVAL_MINUTES / 60.0)
        )
    print("✓ 'time_hours' column created.")
else:
    print("'time_hours' already present.")

n_patients = df_complete['patient_id'].nunique()
# Guard the division so an empty cohort reports 0 instead of raising
records_per_patient = len(df_complete) // n_patients if n_patients else 0
median_duration_hours = df_complete.groupby('patient_id')['time_hours'].max().median()

print("\n✓ Dataset generation completed!")
print(f"Total records: {len(df_complete):,}")
print(f"Number of patients: {n_patients}")
print(f"Records per patient: {records_per_patient}")
print(f"Duration per patient: {median_duration_hours:.1f} hours (median)")
print(f"Sampling interval: {SAMPLING_INTERVAL_MINUTES} minutes")

Generating physiological data for 50 infants over 72 hours...
This may take a few moments...

Generating data for PATIENT_0001...
Generating data for PATIENT_0002...
Generating data for PATIENT_0003...
Generating data for PATIENT_0004...
Generating data for PATIENT_0005...
Generating data for PATIENT_0006...
Generating data for PATIENT_0007...
Generating data for PATIENT_0008...
Generating data for PATIENT_0009...
Generating data for PATIENT_0010...
Generating data for PATIENT_0011...
Generating data for PATIENT_0012...
Generating data for PATIENT_0013...
Generating data for PATIENT_0014...
Generating data for PATIENT_0015...
Generating data for PATIENT_0016...
Generating data for PATIENT_0017...
Generating data for PATIENT_0018...
Generating data for PATIENT_0019...
Generating data for PATIENT_0020...
Generating data for PATIENT_0021...
Generating data for PATIENT_0011...
Generating data for PATIENT_0012...
Generating data for PATIENT_0013...
Generating data for PATIENT_0014...
Genera

KeyError: 'time_hours'

In [None]:
# Ensure 'time_hours' exists even if generator skipped it
# (safety net: does nothing when the column is already present)
if 'time_hours' not in df_complete.columns:
    print("'time_hours' missing — deriving from timestamps or index...")

    # Parse rather than assume a datetime dtype: `.dt` raises on string columns
    timestamps = None
    if 'timestamp' in df_complete.columns:
        try:
            timestamps = pd.to_datetime(df_complete['timestamp'])
        except (ValueError, TypeError):
            # Unparseable timestamps: fall back to row order below.
            timestamps = None

    if timestamps is not None:
        # Hours since each patient's own first timestamp
        patient_start = timestamps.groupby(df_complete['patient_id']).transform('min')
        df_complete['time_hours'] = (timestamps - patient_start).dt.total_seconds() / 3600.0
    else:
        # Fallback: infer by row order assuming 5-minute sampling cadence per patient
        cadence_minutes = 5
        df_complete['time_hours'] = (
            df_complete.groupby('patient_id').cumcount() * (cadence_minutes / 60.0)
        )
    print("✓ 'time_hours' column created.")
else:
    print("'time_hours' already present.")

In [None]:
# Overview of the generated frame: shape, column dtypes and a first look at the rows
overview_rule = "=" * 80
print(overview_rule)
print("DATASET OVERVIEW")
print(overview_rule)

print(f"\nDataset shape: {df_complete.shape}")

print("\nColumn names and types:")
print(df_complete.dtypes)

print("\nFirst 10 rows:")
print(df_complete.head(10))

In [None]:
# Descriptive statistics for the dataset's numeric columns
summary_rule = "=" * 80
print(f"\n{summary_rule}")
print("STATISTICAL SUMMARY")
print(summary_rule)
print(df_complete.describe())

## Section 3: Visualize Temperature Profiles

In [None]:
# Visualize temperature profiles for different HIE severity levels
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Rectal Temperature Profiles by HIE Severity', fontsize=16, fontweight='bold')

hie_levels = ['mild', 'moderate', 'severe']
colors = ['green', 'orange', 'red']
# Explicit severity -> colour mapping so every panel colours a severity the same way
severity_palette = dict(zip(hie_levels, colors))
# Length of the initial window used for the cooling-rate panel
COOLING_WINDOW_HOURS = 12

# Split the frame once into time-ordered per-patient frames instead of
# re-filtering the full dataset for every patient in every panel.
patient_frames = {
    patient: frame.sort_values('time_hours', kind='stable')
    for patient, frame in df_complete.groupby('patient_id', sort=False)
}
# Patient ids per severity, in order of first appearance in the dataset
patients_by_severity = {
    hie: df_complete.loc[df_complete['hie_severity'] == hie, 'patient_id'].dropna().unique()
    for hie in hie_levels
}

# Plot 1: Individual temperature traces
ax = axes[0, 0]
for hie, color in zip(hie_levels, colors):
    # Faint traces for up to five patients of this severity
    for patient in patients_by_severity[hie][:5]:
        patient_data = patient_frames[patient]
        ax.plot(patient_data['time_hours'], patient_data['rectal_temperature_c'],
                alpha=0.3, color=color, linewidth=1)

    # Plot mean
    mean_temp = df_complete[df_complete['hie_severity'] == hie].groupby('time_hours')['rectal_temperature_c'].mean()
    ax.plot(mean_temp.index, mean_temp.values, color=color, linewidth=2.5,
            label=f'{hie.capitalize()} (mean)', marker='o', markersize=3, markevery=20)

ax.axhline(y=33.5, color='blue', linestyle='--', linewidth=2, label='Standard target (33.5°C)')
ax.set_xlabel('Time (hours)', fontsize=11, fontweight='bold')
ax.set_ylabel('Rectal Temperature (°C)', fontsize=11, fontweight='bold')
ax.set_title('Temperature Trajectories by HIE Severity', fontsize=12, fontweight='bold')
ax.legend(loc='best')
ax.grid(True, alpha=0.3)

# Plot 2: Distribution of temperature changes
ax = axes[0, 1]
for hie, color in zip(hie_levels, colors):
    # Last-minus-first temperature, taken in time order rather than row order
    temp_changes = [
        patient_frames[patient]['rectal_temperature_c'].iloc[-1]
        - patient_frames[patient]['rectal_temperature_c'].iloc[0]
        for patient in patients_by_severity[hie]
    ]
    ax.hist(temp_changes, bins=10, alpha=0.6, color=color, label=f'{hie.capitalize()}', edgecolor='black')

ax.set_xlabel('Temperature Change (°C)', fontsize=11, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax.set_title('Distribution of Temperature Changes', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Plot 3: Cooling rate comparison
ax = axes[1, 0]
cooling_rates = []
severities = []
for patient, patient_data in patient_frames.items():
    # Calculate cooling rate over the initial window
    window = patient_data[patient_data['time_hours'] <= COOLING_WINDOW_HOURS]
    if len(window) < 2:
        continue
    # Divide by the time actually covered, not the nominal window length, so a
    # window that ends early does not understate the rate.
    elapsed_hours = window['time_hours'].iloc[-1] - window['time_hours'].iloc[0]
    if elapsed_hours <= 0:
        continue
    temp_delta = window['rectal_temperature_c'].iloc[-1] - window['rectal_temperature_c'].iloc[0]
    cooling_rates.append(temp_delta / elapsed_hours)
    severities.append(window['hie_severity'].iloc[0])

cooling_df = pd.DataFrame({'cooling_rate': cooling_rates, 'hie_severity': severities})
# `order` fixes the box positions and the dict palette ties each colour to its
# severity; a bare colour list would be assigned in order of appearance.
# `hue` + `legend=False` is the seaborn 0.13 form for colouring by the x variable.
sns.boxplot(data=cooling_df, x='hie_severity', y='cooling_rate',
            hue='hie_severity', order=hie_levels, hue_order=hie_levels,
            palette=severity_palette, legend=False, ax=ax)
ax.set_xlabel('HIE Severity', fontsize=11, fontweight='bold')
ax.set_ylabel('Cooling Rate (°C/hour)', fontsize=11, fontweight='bold')
ax.set_title(f'Cooling Rate by HIE Severity (First {COOLING_WINDOW_HOURS}h)', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

# Plot 4: Target temperature by severity
ax = axes[1, 1]
for hie, color in zip(hie_levels, colors):
    # Distinct target temperatures seen for this severity
    targets = df_complete[df_complete['hie_severity'] == hie]['target_temp_c'].unique()
    ax.hist(targets, bins=15, alpha=0.6, color=color, label=f'{hie.capitalize()}', edgecolor='black')

ax.set_xlabel('Target Temperature (°C)', fontsize=11, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax.set_title('Distribution of Target Temperatures', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
# savefig does not create missing directories
os.makedirs('../data', exist_ok=True)
plt.savefig('../data/temperature_profiles.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Temperature profile visualizations completed")

## Section 4: Visualize Vital Signs Patterns

In [None]:
# Visualize vital signs during therapeutic hypothermia
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Vital Signs Patterns During Therapeutic Hypothermia', fontsize=16, fontweight='bold')

colors_map = {'mild': 'green', 'moderate': 'orange', 'severe': 'red'}

# Select sample patient from each severity group
sample_patients = []
for hie in ['mild', 'moderate', 'severe']:
    candidates = df_complete.loc[df_complete['hie_severity'] == hie, 'patient_id']
    if candidates.empty:
        # A severity with no patients would otherwise raise IndexError on .iloc[0]
        print(f"No '{hie}' patients in dataset; skipping that severity in the plots.")
        continue
    sample_patients.append((candidates.iloc[0], hie))

# Time-ordered data for each sample patient, computed once and shared by all panels
sample_traces = {
    patient: df_complete[df_complete['patient_id'] == patient].sort_values('time_hours')
    for patient, _ in sample_patients
}

# One entry per panel: (axes, column, y-axis label, title, marker)
panels = [
    (axes[0, 0], 'heart_rate_bpm', 'Heart Rate (bpm)',
     'Heart Rate During Hypothermia', 'o'),
    (axes[0, 1], 'systolic_bp_mmhg', 'Systolic BP (mmHg)',
     'Systolic Blood Pressure During Hypothermia', 's'),
    (axes[1, 0], 'oxygen_saturation_percent', 'SpO2 (%)',
     'Oxygen Saturation During Hypothermia', '^'),
    (axes[1, 1], 'lactate_mmol', 'Lactate (mmol/L)',
     'Lactate Levels During Hypothermia', 'd'),
]

for ax, column, ylabel, title, marker in panels:
    for patient, hie in sample_patients:
        patient_data = sample_traces[patient]
        ax.plot(patient_data['time_hours'], patient_data[column],
                linewidth=2, label=f'{hie.capitalize()}', color=colors_map[hie],
                marker=marker, markersize=4, markevery=15)

    if column == 'oxygen_saturation_percent':
        # Reference line is drawn before legend() so it gets a legend entry
        ax.axhline(y=95, color='blue', linestyle='--', linewidth=1.5, alpha=0.7, label='Target min (95%)')
        ax.set_ylim([92, 101])

    ax.set_xlabel('Time (hours)', fontsize=11, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories
os.makedirs('../data', exist_ok=True)
plt.savefig('../data/vital_signs_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Vital signs visualization completed")

## Section 5: Correlation Analysis

In [None]:
# Pairwise correlation between the physiological parameters
# Columns entering the correlation matrix
numeric_cols = [
    'rectal_temperature_c',
    'heart_rate_bpm',
    'systolic_bp_mmhg',
    'diastolic_bp_mmhg',
    'oxygen_saturation_percent',
    'pH',
    'lactate_mmol',
]

# Keep only rows where every selected parameter is present
correlation_data = df_complete.loc[:, numeric_cols].dropna()

# Correlation matrix of the complete rows
corr_matrix = correlation_data.corr()

# Heatmap: diverging colours centred on zero, full [-1, 1] scale
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1,
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Matrix of Physiological Parameters', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../data/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Correlation analysis completed")
print("\nKey correlations with rectal temperature:")
temperature_correlations = corr_matrix['rectal_temperature_c'].sort_values(ascending=False)
print(temperature_correlations)

## Section 6: Save Generated Data

In [None]:
# Save the complete dataset
output_path = '../data/complete_mocked_dataset.csv'
output_dir = os.path.dirname(output_path)
# to_csv does not create missing directories
os.makedirs(output_dir, exist_ok=True)

df_complete.to_csv(output_path, index=False)
print(f"✓ Complete dataset saved to: {output_path}")
print(f"  File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")

# Save dataset summary statistics
summary_stats = df_complete.describe()
summary_stats.to_csv(os.path.join(output_dir, 'dataset_summary_statistics.csv'))
print("✓ Summary statistics saved")

# Save patient metadata: one row per patient, taking each attribute's first value
patient_metadata = df_complete.groupby('patient_id').agg({
    'hie_severity': 'first',
    'target_temp_c': 'first',
    'baseline_birth_weight_kg': 'first',
    'baseline_gestational_age_weeks': 'first',
    'baseline_seizure_risk_factor': 'first'
}).reset_index()

patient_metadata.to_csv(os.path.join(output_dir, 'patient_metadata.csv'), index=False)
print(f"✓ Patient metadata saved ({len(patient_metadata)} patients)")

# Display sample from each HIE severity
print("\n" + "="*80)
print("DATASET BREAKDOWN BY HIE SEVERITY")
print("="*80)
for hie in ['mild', 'moderate', 'severe']:
    subset = df_complete[df_complete['hie_severity'] == hie]
    print(f"\n{hie.upper()}:")
    print(f"  Patients: {subset['patient_id'].nunique()}")
    print(f"  Records: {len(subset):,}")
    if subset.empty:
        # Means of an empty subset are NaN; say so instead of printing "nan"
        print("  No records for this severity")
        continue
    print(f"  Avg target temp: {subset['target_temp_c'].mean():.2f}°C")
    print(f"  Seizure risk avg: {subset['baseline_seizure_risk_factor'].mean():.2f}")

In [None]:
# Ensure time_hours is present for summary and downstream usage
import numpy as np
import pandas as pd

if 'time_hours' not in df_complete.columns:
    if 'timestamp' in df_complete.columns:
        # Hours since each patient's own first timestamp. Using the first row of
        # the whole frame as the origin would offset every patient whose
        # recording starts at a different time.
        timestamps = pd.to_datetime(df_complete['timestamp'])
        patient_start = timestamps.groupby(df_complete['patient_id']).transform('min')
        elapsed_hours = (timestamps - patient_start).dt.total_seconds() / 3600.0
    else:
        # fallback: derive from row order with 5-minute interval
        # group per patient to start each at 0
        elapsed_hours = df_complete.groupby('patient_id').cumcount() * 5 / 60.0
    # assign() returns a new frame, so no temporary column or in-place edit is needed
    df_complete = df_complete.assign(time_hours=elapsed_hours)
print('time_hours column ready. Max:', round(float(df_complete['time_hours'].max()), 2))