In [None]:
# === Load core datasets ===
import pandas as pd
import numpy as np

# Patient demographics and outcomes (semicolon-delimited export)
df_base = pd.read_csv('castor data/continuous_monitoring_export_20250423.csv', delimiter=';')

# Diagnoses export; status code 2 marks a final diagnosis
FINAL_DIAGNOSIS_STATUS = 2
df_diag_raw = pd.read_csv('castor data/continuous_monitoring_admission_diagnoses_export_20250423.csv', delimiter=';')

# Keep only final diagnoses. The explicit .copy() makes df_diag an independent
# frame, so the column assignments that follow write to df_diag itself instead
# of to a slice of the raw export (avoids SettingWithCopyWarning).
is_final_diagnosis = df_diag_raw['1.5|Status of diagnosis'] == FINAL_DIAGNOSIS_STATUS
df_diag = df_diag_raw.loc[is_final_diagnosis].copy()

# Convert diagnosis time column to datetime; unparseable values become NaT
df_diag['1.1|Time of diagnosis'] = pd.to_datetime(df_diag['1.1|Time of diagnosis'], errors='coerce')

# === Map diagnosis main group to readable categories ===
# Keys are the numeric codes stored in '1.2|Diagnosis main group'.
# NOTE(review): the ordering looks like the ICD-10 chapter list — confirm
# against the study's data dictionary.
main_group_mapping = {
    1: "infectious_diseases",
    2: "neoplasms",
    3: "blood_diseases",
    4: "endocrine_diseases",
    5: "mental_disorders",
    6: "nervous_system",
    7: "eye_diseases",
    8: "ear_diseases",
    9: "circulatory_system",
    10: "respiratory_system",
    11: "digestive_system",
    12: "skin_diseases",
    13: "musculoskeletal_diseases",
    14: "genitourinary_diseases",
    15: "pregnancy_childbirth",
    16: "perinatal_conditions",
    17: "congenital_abnormalities",
    18: "symptoms_not_classified",
    19: "injury_poisoning",
    20: "external_causes",
    21: "health_status_contact"
}


In [None]:
# === Encode final diagnoses as per-patient indicator columns ===
# .assign returns a new frame, so this neither writes into a filtered slice
# nor changes anything when the cell is re-run.
df_diag = df_diag.assign(
    diagnosis_group=df_diag['1.2|Diagnosis main group'].map(main_group_mapping)
)

# Normalise the ID *before* aggregating: grouping on the raw value and
# stripping afterwards would let whitespace variants of the same ID form
# separate groups that then collide as duplicate merge keys.
# Rows without an ID are dropped first (groupby would discard them anyway);
# otherwise astype(str) would turn them into the literal string 'nan'.
diag_groups = df_diag.loc[
    df_diag['Participant Id'].notna(), ['Participant Id', 'diagnosis_group']
].copy()
diag_groups['Participant Id'] = diag_groups['Participant Id'].astype(str).str.strip()

# One-hot encode diagnosis groups (one 0/1 column per group present in the data)
df_diag_binary = pd.get_dummies(diag_groups, columns=['diagnosis_group'], dtype='int')

# Aggregate per patient: a flag is 1 if any of the patient's final diagnoses
# falls in that group
df_diag_summary = (
    df_diag_binary
    .groupby('Participant Id')
    .max()
    .reset_index()
    .rename(columns={'Participant Id': 'Participant_Id'})
)

# Select core variables
cols_to_keep = ['Participant_Id', 'Sex', 'Age', 'BMI', 'AdmissionTime', 'ASA_Score', 'Outcome', 'ICU_Type', 'SurgeryType']

# Rename first, then normalise the ID under its new name. rename() ignores
# keys that are already gone, so re-running this cell is safe (the original
# order raised KeyError on 'Participant Id' the second time round).
df_base = (
    df_base
    .rename(columns={
        'pat_sex': 'Sex',
        'pat_age': 'Age',
        'pat_BMI': 'BMI',
        'adm_time': 'AdmissionTime',
        'adm_ASA': 'ASA_Score',
        '6.2|Endpoint': 'Outcome',
        '6.2.1|Guarded care': 'ICU_Type',
        '6.2.3|Surgery': 'SurgeryType',
        'Participant Id': 'Participant_Id'
    })
    .assign(Participant_Id=lambda d: d['Participant_Id'].astype(str).str.strip())
    .loc[:, cols_to_keep]
)


In [None]:
# === Load and process vitals ===
# Rows with a missing castorNr are dropped before the string cast: astype(str)
# would otherwise turn them into the literal ID 'nan', pooling every
# unidentified measurement into one phantom patient downstream.
# The frame is then sorted by patient and time.
df_vitals = (
    pd.read_csv('castor data/filtered_visi_data.csv', parse_dates=['Date_time'])
    .dropna(subset=['castorNr'])
    .assign(castorNr=lambda d: d['castorNr'].astype(str).str.strip())
    .sort_values(by=['castorNr', 'Date_time'])
)

# Define summary windows and columns
summary_windows = [15, 30, 60]  # minutes
vital_cols = ['HeartRate', 'PulseRate', 'BreathingRate', 'SpO2', 'SkinTemp', 'SysBp', 'DiasBp', 'MapBp']


In [None]:
# === Summarize vitals per patient ===
def _summarize_patient_vitals(patient_vitals, vital_cols, summary_windows, full_stats_window=60):
    """Compute early-window summary statistics for one patient's vitals.

    For each vital sign the window starts at the timestamp of that sign's
    first non-missing measurement (so each sign has its own start time) and
    includes both endpoints.

    Parameters
    ----------
    patient_vitals : pd.DataFrame
        Rows of a single patient, sorted by 'Date_time', containing the
        'Date_time' column and every column named in `vital_cols`.
    vital_cols : list of str
        Vital-sign columns to summarise.
    summary_windows : list of int
        Window lengths in minutes.
    full_stats_window : int, default 60
        Window for which min, max and 90th percentile are reported in
        addition to the mean.

    Returns
    -------
    dict
        Keys of the form '<col>_<stat>_<window>m'. Vital signs without any
        valid measurement (or whose first valid row has no timestamp)
        contribute no keys.
    """
    stats = {}
    times = patient_vitals['Date_time']

    for col in vital_cols:
        first_valid_idx = patient_vitals[col].first_valid_index()
        if first_valid_idx is None:
            continue  # this sign was never recorded for the patient

        start_time = times.loc[first_valid_idx]
        if pd.isna(start_time):
            continue  # no usable anchor time, so no window can be built

        for window in summary_windows:
            cutoff_time = start_time + pd.Timedelta(minutes=window)
            # between() is inclusive on both ends by default
            values = patient_vitals.loc[times.between(start_time, cutoff_time), col]

            stats[f'{col}_mean_{window}m'] = values.mean()
            if window == full_stats_window:
                stats[f'{col}_min_{window}m'] = values.min()
                stats[f'{col}_max_{window}m'] = values.max()
                stats[f'{col}_p90_{window}m'] = values.quantile(0.9)

    return stats


summary_list = []
for patient_id, group in df_vitals.groupby('castorNr'):
    patient_summary = {'Participant_Id': patient_id}
    patient_summary.update(
        _summarize_patient_vitals(group.sort_values('Date_time'), vital_cols, summary_windows)
    )
    summary_list.append(patient_summary)

# With no vitals at all, DataFrame([]) has no columns and the merges below
# would fail with a KeyError; keep the key column so the pipeline still runs.
df_vitals_summary = (
    pd.DataFrame(summary_list)
    if summary_list
    else pd.DataFrame({'Participant_Id': pd.Series(dtype='object')})
)

# === Merge datasets ===
# Left joins keep every patient that has vitals. Patients missing from
# df_base or df_diag_summary get NaN in the corresponding columns.
df_merged = (
    df_vitals_summary
    .merge(df_base, on='Participant_Id', how='left')
    .merge(df_diag_summary, on='Participant_Id', how='left')
)

# df_vitals_summary has one row per patient, so a repeated ID here means a
# right-hand table had duplicate keys and silently multiplied rows.
assert df_merged['Participant_Id'].is_unique, (
    "Duplicate Participant_Id after merge: df_base or df_diag_summary has repeated IDs"
)


In [None]:
# === Persist the merged dataset ===
# Write df_merged to CSV (relative path), leaving out the row index.
df_merged.to_csv(path_or_buf="merged_clinical_dataset.csv", index=False)
print("✅ Final merged dataset with time-windowed vitals saved as 'merged_clinical_dataset.csv'")
