# Phase 2: Data Preprocessing and Feature Engineering
## AI-Based Personalized Adaptive Hypothermia - ML Model Development

In this phase, we will:
1. Load and clean the mocked dataset from Phase 1
2. Handle missing values and data validation
3. Normalize/standardize physiological parameters
4. Engineer time-series features (rolling statistics, gradients)
5. Create domain-specific clinical features
6. Prepare data for ML model training

## Section 1: Import Libraries and Load Data

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Read the Phase 1 output; the path is relative to the notebook's working directory.
data_path = '../data/complete_mocked_dataset.csv'
df = pd.read_csv(data_path)

# Short summary of what was loaded: size, patient count and time span covered.
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
patient_count = df['patient_id'].nunique()
print(f"Patients: {patient_count}")
hours = df['time_hours']
print(f"Date range: {hours.min():.1f} to {hours.max():.1f} hours")

## Section 2: Data Validation and Missing Value Analysis

In [None]:
# Report, per column, how many values are missing and what share of rows that is.
rule = "=" * 80
print(rule)
print("MISSING VALUE ANALYSIS")
print(rule)

missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100

# One row per column; keep only columns that actually have gaps, worst first.
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percentage': missing_percentage.values,
    })
    .loc[lambda table: table['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

if missing_df.empty:
    print("No missing values found!")
else:
    print(missing_df.to_string(index=False))

# Show the dtype pandas inferred for every column.
print("\n" + rule)
print("DATA TYPE SUMMARY")
print(rule)
print(df.dtypes)

In [None]:
# Impute gaps with last-observation-carried-forward, then back-fill any leading gaps.
# The fill is done per patient and in chronological order so that a value recorded for
# one patient is never carried into another patient's rows. (A plain frame-wide
# ffill/bfill crosses patient boundaries, and `fillna(method=...)` is deprecated.)
print("\nHandling missing values...")
fill_cols = [col for col in df.columns if col != 'patient_id']

# Sorting returns a new frame that keeps the original index labels, so the filled
# values can be written back without changing the row order of `df`.
ordered = df.sort_values(['patient_id', 'time_hours'])
forward_filled = (
    ordered.groupby('patient_id', sort=False, dropna=False)[fill_cols].ffill()
)
fully_filled = (
    forward_filled.groupby(ordered['patient_id'], sort=False, dropna=False).bfill()
)
df[fill_cols] = fully_filled.reindex(df.index)

# Final missing value check. Anything left means a patient has no observation at all
# for that column (or patient_id itself is missing); it is reported, not borrowed
# from a different patient.
remaining_missing = df.isnull().sum().sum()
if remaining_missing == 0:
    print(f"✓ Remaining missing values: {remaining_missing}")
else:
    print(f"⚠ Remaining missing values: {remaining_missing} "
          "(no observation available within the same patient)")

# Check physiological data ranges
print("\n" + "=" * 80)
print("PHYSIOLOGICAL DATA RANGES VALIDATION")
print("=" * 80)

# column -> (lower bound, upper bound, display label); bounds are inclusive.
vital_checks = {
    'rectal_temperature_c': (30, 40, 'Temperature'),
    'heart_rate_bpm': (50, 200, 'Heart Rate'),
    'systolic_bp_mmhg': (30, 100, 'Systolic BP'),
    'diastolic_bp_mmhg': (10, 70, 'Diastolic BP'),
    'oxygen_saturation_percent': (85, 100, 'SpO2'),
    'pH': (6.8, 7.8, 'pH'),
    'pCO2_mmhg': (20, 80, 'pCO2'),
    'pO2_mmhg': (40, 200, 'pO2'),
    'lactate_mmol': (0, 20, 'Lactate'),
}

for col, (min_val, max_val, label) in vital_checks.items():
    # Columns absent from this dataset are skipped silently.
    if col not in df.columns:
        continue
    # NaN compares False on both sides, so missing values are not counted here.
    out_of_range = ((df[col] < min_val) | (df[col] > max_val)).sum()
    if out_of_range > 0:
        print(f"⚠ {label}: {out_of_range} values out of range [{min_val}, {max_val}]")
    else:
        print(f"✓ {label}: All values within normal range")

print("\nData validation completed!")

## Section 3: Feature Engineering - Time-Series Features

In [None]:
print("Engineering time-series features...")

# Per-patient frames with engineered columns, concatenated after the loop.
features_list = []

# A single groupby pass replaces re-filtering the whole frame once per patient.
# sort=False visits patients in order of first appearance, matching `unique()`.
for _, patient_rows in df.groupby('patient_id', sort=False):
    patient_data = patient_rows.sort_values('time_hours').reset_index(drop=True)

    temperature = patient_data['rectal_temperature_c']
    target = patient_data['target_temp_c']
    heart_rate = patient_data['heart_rate_bpm']
    systolic = patient_data['systolic_bp_mmhg']
    diastolic = patient_data['diastolic_bp_mmhg']

    # NOTE: the order in which columns are added below is relied on by the metadata
    # table written at the end of the notebook; keep it stable.

    # 1. Temperature gradient: difference to the row 1 / 6 / 12 samples earlier.
    # The column names assume a 5-minute sampling interval (not checked here), and the
    # values are plain differences, not divided by elapsed time.
    patient_data['temp_gradient_5min'] = temperature.diff()
    patient_data['temp_gradient_30min'] = temperature.diff(6)   # 6 * 5min
    patient_data['temp_gradient_1h'] = temperature.diff(12)     # 12 * 5min

    # 2. Deviation from target temperature
    patient_data['temp_deviation_from_target'] = temperature - target

    # 3. Rolling statistics (5-sample window = 25 minutes at 5-minute sampling)
    # NOTE(review): center=True makes each window span two samples before and two
    # samples after the current row, so these features look ahead in time. If they
    # feed a real-time model, a trailing window would avoid that — confirm intent.
    hr_window = heart_rate.rolling(window=5, center=True)
    patient_data['hr_rolling_mean_25min'] = hr_window.mean()
    patient_data['hr_rolling_std_25min'] = hr_window.std()
    patient_data['hr_rolling_min_25min'] = hr_window.min()
    patient_data['hr_rolling_max_25min'] = hr_window.max()

    # 4. Heart Rate Variability (simplified: rolling std of HR)
    patient_data['hrv_25min'] = patient_data['hr_rolling_std_25min']

    # 5. Blood Pressure features
    patient_data['mean_arterial_pressure'] = (systolic + 2 * diastolic) / 3
    patient_data['pulse_pressure'] = systolic - diastolic

    # 6. Metabolic features
    patient_data['lactate_elevation'] = patient_data['lactate_mmol'] - 2.0  # Normal: ~2 mmol/L
    patient_data['ph_deviation'] = (patient_data['pH'] - 7.40).abs()  # Normal: 7.35-7.45
    patient_data['oxygen_reserve'] = 100 - patient_data['oxygen_saturation_percent']

    # 7. Cumulative temperature change since the patient's first sample
    patient_data['cumulative_temp_change'] = temperature.diff().fillna(0).cumsum()

    # 8. Time in therapeutic window (target temp ± 0.5°C, bounds inclusive)
    patient_data['in_therapeutic_window'] = (
        (temperature >= target - 0.5) & (temperature <= target + 0.5)
    ).astype(int)

    features_list.append(patient_data)

# Rows come out grouped by patient and ordered by time within each patient.
df_features = pd.concat(features_list, ignore_index=True)

print(f"✓ Time-series features created!")
print(f"Original columns: {len(df.columns)}")
print(f"New columns: {len(df_features.columns) - len(df.columns)}")
print(f"Total columns: {len(df_features.columns)}")

In [None]:
# List every column present in df_features that the loaded dataset did not have.
separator = "=" * 80
print("\n" + separator)
print("NEW ENGINEERED FEATURES")
print(separator)

original_columns = set(df.columns)
new_cols = [name for name in df_features.columns if name not in original_columns]

print(f"\nTotal new features: {len(new_cols)}")
print("\nFeatures created:")
# Numbered from 1, right-aligned to two digits.
for position, feature_name in enumerate(new_cols, start=1):
    print(f"  {position:2d}. {feature_name}")

## Section 4: Data Normalization and Standardization

In [None]:
print("Standardizing physiological features...\n")

# Identify numeric columns to normalize (exclude IDs, categorical, and baseline params)
exclude_cols = ['patient_id', 'hie_severity', 'target_temp_c'] + \
               [col for col in df_features.columns if col.startswith('baseline_')]

# select_dtypes(include='number') picks up every integer/float width, so the set of
# scaled columns does not depend on whether an integer column happens to be int32 or
# int64 on the current platform. Column order is preserved.
numeric_candidates = df_features.select_dtypes(include='number').columns
numeric_cols_to_normalize = [col for col in numeric_candidates
                             if col not in exclude_cols]

print(f"Columns to normalize: {len(numeric_cols_to_normalize)}")

# Create a copy for normalized data; df_features keeps the unscaled values.
df_normalized = df_features.copy()

# Use StandardScaler for physiological features (zero mean, unit variance)
# NOTE(review): the scaler is fitted on every row in the frame; if a train/test split
# happens later, fitting on the training portion only would avoid leaking statistics.
scaler = StandardScaler()
df_normalized[numeric_cols_to_normalize] = scaler.fit_transform(df_features[numeric_cols_to_normalize])

print(f"✓ Standardization completed using StandardScaler (μ=0, σ=1)")

# Verify normalization
print(f"\nSample of normalized features:")
sample_cols = numeric_cols_to_normalize[:5]
print(df_normalized[sample_cols].describe().round(3))

## Section 5: Create Clinical Risk Labels for Supervised Learning

In [None]:
print("Creating clinical risk labels for model training...\n")

# The thresholds below are in clinical units (°C, bpm, mmHg, mmol/L, pH), so they must
# be evaluated on the unscaled measurements. df_normalized holds z-scores for these
# columns, where e.g. `pH < 7.30` is true for essentially every row. df_normalized was
# created as a copy of df_features, so both frames share the same row index and the
# resulting labels can be assigned directly.
raw = df_features

# 1. Temperature Stability Label (for temperature control models)
# Overshoot: cooled more than 1°C below target. Undershoot: more than 1°C above target.
df_normalized['temp_overshoot'] = (
    raw['rectal_temperature_c'] < raw['target_temp_c'] - 1.0
).astype(int)

df_normalized['temp_undershoot'] = (
    raw['rectal_temperature_c'] > raw['target_temp_c'] + 1.0
).astype(int)

# 2. Physiological Instability Index
# Flag is set when any one of the listed risk factors is present.
df_normalized['seizure_risk_high'] = (
    (raw['baseline_seizure_risk_factor'] > 0.5) |
    (raw['lactate_mmol'] > 4) |
    (raw['pH'] < 7.30)
).astype(int)

# 3. Complication Risk Labels
df_normalized['sepsis_risk'] = (
    (raw['heart_rate_bpm'] > 160) |
    (raw['lactate_mmol'] > 6) |
    (raw['pH'] < 7.25)
).astype(int)

# Rows where hrv_25min is NaN (edges of the rolling window) compare False on that term.
df_normalized['cardiac_distress'] = (
    (raw['mean_arterial_pressure'] < 35) |
    (raw['hrv_25min'] > 25) |
    (raw['pulse_pressure'] < 10)
).astype(int)

df_normalized['renal_dysfunction_risk'] = (
    (raw['systolic_bp_mmhg'] < 45) |
    (raw['pO2_mmhg'] < 60) |
    (raw['lactate_mmol'] > 7)
).astype(int)

# 4. Neurodevelopmental Outcome Predictor (binary outcome at 72 hours)
# Based on severity and response to hypothermia
def predict_neuro_outcome(row):
    """Return 1 (poor outcome) or 0 (good outcome) for one observation.

    `row` is any mapping-like object (e.g. a DataFrame row) providing
    'hie_severity' ('mild' / 'moderate' / 'severe'),
    'temp_deviation_from_target', 'pH' and 'lactate_mmol'.
    Raises KeyError for any other severity value.

    NOTE(review): the constants used here (pH 7.2-7.5, lactate 1-8) read as
    clinical units, so the row presumably needs unscaled values — confirm
    against the caller.
    """
    severity_by_grade = {'mild': 0, 'moderate': 1, 'severe': 2}
    severity_score = severity_by_grade[row['hie_severity']]

    # Temperature control: 1 at zero deviation, falling to 0 at |deviation| >= 2.
    squared_deviation_penalty = row['temp_deviation_from_target'] ** 2 / 4
    temp_stability = np.clip(1 - squared_deviation_penalty, 0, 1)

    # Metabolic recovery: rises with pH above 7.2, falls with lactate above 1.
    ph_term = (row['pH'] - 7.2) / 0.3
    lactate_term = (row['lactate_mmol'] - 1) / 7
    metabolic_recovery = np.clip(ph_term - lactate_term, 0, 1)

    # Weighted risk (severity 30%, temperature 35%, metabolic 35%), halved.
    outcome_score = (0.3 * severity_score + 0.35 * (1 - temp_stability) + 0.35 * (1 - metabolic_recovery)) / 2

    # Strictly above 0.5 counts as a poor outcome.
    if outcome_score > 0.5:
        return 1
    return 0

# predict_neuro_outcome works with constants in clinical units (pH around 7.2-7.5,
# lactate in mmol/L), so it is applied to the unscaled rows of df_features rather than
# the z-scored rows of df_normalized. df_normalized was created as a copy of
# df_features, so the resulting Series aligns with it by index.
df_normalized['neuro_outcome_72h'] = df_features.apply(predict_neuro_outcome, axis=1)

print("✓ Clinical labels created:")
print(f"  - Temperature overshoot: {df_normalized['temp_overshoot'].sum()} cases")
print(f"  - Temperature undershoot: {df_normalized['temp_undershoot'].sum()} cases")
print(f"  - Seizure risk high: {df_normalized['seizure_risk_high'].sum()} cases")
print(f"  - Sepsis risk: {df_normalized['sepsis_risk'].sum()} cases")
print(f"  - Cardiac distress: {df_normalized['cardiac_distress'].sum()} cases")
print(f"  - Renal dysfunction risk: {df_normalized['renal_dysfunction_risk'].sum()} cases")
print(f"  - Poor neuro outcome (72h): {df_normalized['neuro_outcome_72h'].sum()} cases")

## Section 6: Save Preprocessed Dataset

In [None]:
# Save preprocessed and normalized dataset
output_path_normalized = '../data/preprocessed_normalized_dataset.csv'
df_normalized.to_csv(output_path_normalized, index=False)
print(f"✓ Normalized dataset saved: {output_path_normalized}")
print(f"  File size: {os.path.getsize(output_path_normalized) / (1024*1024):.2f} MB")
print(f"  Shape: {df_normalized.shape}")

# Save feature engineering metadata.
# Descriptions are looked up by column name instead of being paired with `new_cols`
# by position, so adding, removing or reordering an engineered feature cannot attach a
# description to the wrong column or raise a length-mismatch error.
feature_descriptions = {
    'temp_gradient_5min': 'Temperature change rate (5-min interval)',
    'temp_gradient_30min': 'Temperature change rate (30-min interval)',
    'temp_gradient_1h': 'Temperature change rate (1-hour interval)',
    'temp_deviation_from_target': 'Deviation from individualized target temperature',
    'hr_rolling_mean_25min': 'Rolling mean heart rate (25-min window)',
    'hr_rolling_std_25min': 'Rolling std dev heart rate (25-min window)',
    'hr_rolling_min_25min': 'Rolling min heart rate (25-min window)',
    'hr_rolling_max_25min': 'Rolling max heart rate (25-min window)',
    'hrv_25min': 'Heart rate variability indicator',
    'mean_arterial_pressure': 'Mean arterial pressure (MAP)',
    'pulse_pressure': 'Pulse pressure (systolic - diastolic)',
    'lactate_elevation': 'Lactate above normal baseline',
    'ph_deviation': 'pH deviation from normal (7.40)',
    'oxygen_reserve': 'Percentage below 100% SpO2',
    'cumulative_temp_change': 'Cumulative temperature adjustment during cooling',
    'in_therapeutic_window': 'Whether infant is within therapeutic temperature window',
    'temp_overshoot': 'Temperature overshoot risk indicator',
    'temp_undershoot': 'Temperature undershoot risk indicator',
    'seizure_risk_high': 'High seizure risk flag',
    'sepsis_risk': 'Sepsis risk flag',
    'cardiac_distress': 'Cardiac distress risk flag',
    'renal_dysfunction_risk': 'Renal dysfunction risk flag',
    'neuro_outcome_72h': 'Predicted poor neurodevelopmental outcome at 72h',
}

label_cols = ['temp_overshoot', 'temp_undershoot', 'seizure_risk_high',
              'sepsis_risk', 'cardiac_distress', 'renal_dysfunction_risk', 'neuro_outcome_72h']
documented_cols = new_cols + label_cols

feature_info = pd.DataFrame({
    'Feature': documented_cols,
    'Type': ['Time-Series'] * len(new_cols) + ['Target Label'] * len(label_cols),
    # Columns without an entry above get a visible placeholder rather than failing.
    'Description': [feature_descriptions.get(name, 'No description available')
                    for name in documented_cols],
})

feature_info.to_csv('../data/feature_engineering_metadata.csv', index=False)
print(f"✓ Feature metadata saved")

print("\n" + "=" * 80)
print("PREPROCESSING SUMMARY")
print("=" * 80)
print(f"Original dataset shape: {df.shape}")
print(f"Preprocessed dataset shape: {df_normalized.shape}")
print(f"New features engineered: {len(new_cols)}")
print(f"Clinical labels created: {len(label_cols)}")
print(f"Total features for modeling: {len(df_normalized.columns)}")

print("\nDataset ready for Phase 3: Model Development!")