In [1]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic cardiovascular-risk dataset and save it as CSV.
#
# Steps: (1) seed NumPy's global RNG, (2) sample 18 predictor columns,
# (3) combine predictors into one linear log-odds score per outcome,
# (4) draw each binary outcome from a Bernoulli with the logistic of that score,
# (5) write everything to "multi_cvd_dataset.csv".
#
# The order of the np.random calls below determines the generated values for
# a given seed, so statements that draw random numbers must not be reordered.
np.random.seed(42)
n_samples = 500


def _sigmoid(logit):
    """Return the logistic function of `logit`, i.e. 1 / (1 + exp(-logit))."""
    return 1/(1 + np.exp(-logit))


# --- Continuous / integer clinical measurements -----------------------------
# randint's upper bound is exclusive, so age takes integer values 30..79.
age = np.random.randint(low=30, high=80, size=n_samples)
bmi = np.random.normal(loc=27, scale=4, size=n_samples)
glucose = np.random.normal(loc=100, scale=15, size=n_samples)
hdl_chol = np.random.normal(loc=55, scale=10, size=n_samples)
ldl_chol = np.random.normal(loc=130, scale=30, size=n_samples)
triglycerides = np.random.normal(loc=150, scale=50, size=n_samples)
systolic_bp = np.random.normal(loc=130, scale=15, size=n_samples)
diastolic_bp = np.random.normal(loc=85, scale=10, size=n_samples)
heart_rate = np.random.normal(loc=75, scale=10, size=n_samples)

# --- Lifestyle factors ------------------------------------------------------
# binomial(n=1, p) yields 0/1 flags; the *_score columns are integers 1..10.
alcohol_use = np.random.binomial(n=1, p=0.3, size=n_samples)
smoking_status = np.random.binomial(n=1, p=0.25, size=n_samples)
physical_activity = np.random.binomial(n=1, p=0.5, size=n_samples)
diet_score = np.random.randint(low=1, high=11, size=n_samples)
sleep_hours = np.random.normal(loc=6.5, scale=1.5, size=n_samples)
stress_score = np.random.randint(low=1, high=11, size=n_samples)

# --- Family history flags (0/1) ---------------------------------------------
family_history_diabetes = np.random.binomial(n=1, p=0.3, size=n_samples)
family_history_stroke = np.random.binomial(n=1, p=0.2, size=n_samples)
family_history_heart_disease = np.random.binomial(n=1, p=0.25, size=n_samples)

# --- Linear log-odds score for each outcome ---------------------------------
# Each score is an intercept plus a weighted sum of predictors. Terms are kept
# in a fixed left-to-right order so floating-point results stay unchanged.
logit_stroke = (
    -10
    + 0.04*age
    + 0.05*bmi
    + 0.03*glucose
    + 0.7*smoking_status
    + 0.8*family_history_stroke
    - 0.4*physical_activity
    + 0.02*systolic_bp
    + 0.05*stress_score
)

logit_heart_disease = (
    -9
    + 0.05*age
    + 0.08*bmi
    + 0.02*ldl_chol
    - 0.04*hdl_chol
    + 0.9*family_history_heart_disease
    + 0.6*alcohol_use
    + 0.9*smoking_status
    + 0.01*triglycerides
    - 0.3*diet_score
)

# NOTE(review): at typical predictor values (age ~55, bmi ~27, glucose ~100,
# stress_score ~5, systolic_bp ~130) this score is roughly +7, so almost every
# row will be drawn as positive -- confirm the intercept is intended.
logit_heart_failure = (
    -8
    + 0.06*age
    + 0.1*bmi
    + 0.03*glucose
    + 0.8*smoking_status
    + 0.7*family_history_heart_disease
    + 0.5*stress_score
    + 0.01*heart_rate
    + 0.02*systolic_bp
)

# NOTE(review): same concern as above -- roughly +4 at typical predictor
# values, so this label is also heavily skewed toward 1; confirm.
logit_htn = (
    -9
    + 0.06*age
    + 0.1*bmi
    + 0.03*systolic_bp
    + 0.9*family_history_heart_disease
    + 0.6*stress_score
    - 0.4*physical_activity
)

logit_afib = (
    -11
    + 0.04*age
    + 0.03*heart_rate
    + 0.7*smoking_status
    + 0.6*stress_score
    + 0.9*family_history_heart_disease
)

logit_pad = (
    -10
    + 0.05*age
    + 0.03*ldl_chol
    + 0.9*smoking_status
    + 0.5*stress_score
    + 0.8*family_history_heart_disease
    - 0.3*physical_activity
)

# --- Assemble the table ------------------------------------------------------
# Predictor columns first, in the order they should appear in the CSV.
_columns = {
    'age': age,
    'bmi': bmi,
    'glucose': glucose,
    'hdl_chol': hdl_chol,
    'ldl_chol': ldl_chol,
    'triglycerides': triglycerides,
    'systolic_bp': systolic_bp,
    'diastolic_bp': diastolic_bp,
    'heart_rate': heart_rate,
    'alcohol_use': alcohol_use,
    'smoking_status': smoking_status,
    'physical_activity': physical_activity,
    'diet_score': diet_score,
    'sleep_hours': sleep_hours,
    'stress_score': stress_score,
    'family_history_diabetes': family_history_diabetes,
    'family_history_stroke': family_history_stroke,
    'family_history_heart_disease': family_history_heart_disease,
}

# Outcome column name -> log-odds score. Dict order is the draw order, which
# must stay stroke, heart disease, heart failure, hypertension, afib, PAD.
_outcome_logits = {
    'risk_stroke': logit_stroke,
    'risk_heart_disease': logit_heart_disease,
    'risk_heart_failure': logit_heart_failure,
    'risk_hypertension': logit_htn,
    'risk_afib': logit_afib,
    'risk_pad': logit_pad,
}

# One Bernoulli draw per row, with per-row success probability sigmoid(logit).
for _outcome_name, _outcome_logit in _outcome_logits.items():
    _columns[_outcome_name] = np.random.binomial(1, _sigmoid(_outcome_logit))

df = pd.DataFrame(_columns)

# Persist without the index so the CSV holds only the 24 data columns.
df.to_csv("multi_cvd_dataset.csv", index=False)
print("✅ Dataset saved as multi_cvd_dataset.csv")


✅ Dataset saved as multi_cvd_dataset.csv
