In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global random generator so repeated runs give the same draws.
np.random.seed(seed=42)
# Number of synthetic patients to generate.
n_patients = 2_000

In [3]:
# Generate dataset 1: one row of synthetic health attributes per patient.
# The dict entries are evaluated top to bottom, and that order of random calls
# determines the seeded output -- do not reorder the columns.
df_health = pd.DataFrame({
    # Sequential identifiers 1..n_patients.
    'Patient_Number': range(1, n_patients + 1),
    # 0 = normal, 1 = abnormal; drawn with probabilities 0.8 / 0.2.
    'Blood_Pressure_Abnormality': np.random.choice([0, 1], size=n_patients, p=[0.8, 0.2]),
    # Normal draw (mean 14, sd 2), rounded to one decimal.
    'Level_of_Hemoglobin': np.random.normal(loc=14, scale=2, size=n_patients).round(1),
    # Uniform draw on the 0-to-1 scale.
    'Genetic_Pedigree_Coefficient': np.random.uniform(low=0, high=1, size=n_patients),
    # Integers from 18 up to 89 (the upper bound of randint is exclusive).
    'Age': np.random.randint(low=18, high=90, size=n_patients),
    # Normal draw (mean 25, sd 5), rounded to one decimal.
    'BMI': np.random.normal(loc=25, scale=5, size=n_patients).round(1),
    # 0 = male, 1 = female, equally likely.
    'Sex': np.random.choice([0, 1], size=n_patients),
    # All-zero float column at creation time.
    'Pregnancy': np.zeros(shape=n_patients),
    # Binary flag, equally likely.
    'Smoking': np.random.choice([0, 1], size=n_patients),
    # Integers in [1000, 5000).
    'salt_content_in_the_diet': np.random.randint(low=1000, high=5000, size=n_patients),
    # Integers in [0, 500).
    'alcohol_consumption_per_day': np.random.randint(low=0, high=500, size=n_patients),
    # 1 = low, 2 = normal, 3 = high, equally likely.
    'Level_of_Stress': np.random.choice([1, 2, 3], size=n_patients),
    # Binary flags drawn with probabilities 0.9 / 0.1.
    'Chronic_kidney_disease': np.random.choice([0, 1], size=n_patients, p=[0.9, 0.1]),
    'Adrenal_and_thyroid_disorders': np.random.choice([0, 1], size=n_patients, p=[0.9, 0.1]),
})

In [5]:
# Logic fix: only females (Sex == 1) may be flagged as pregnant.
# The column is rebuilt in full on every execution, so re-running this cell
# cannot stack additional pregnancies on top of flags set by an earlier run.
is_female = df_health['Sex'] == 1
# One uniform draw per row; a draw above 0.95 selects roughly 5% of rows.
selected = np.random.rand(len(df_health)) > 0.95
# Kept as float so the column holds the same 0.0 / 1.0 values it is created with.
df_health['Pregnancy'] = (is_female & selected).astype(float)

In [7]:
# Generate dataset 2: ten days of simulated step counts for every patient.
# Patients flagged with chronic kidney disease are centred on a lower daily mean.
# The flags are read into a dict once, so each patient costs one O(1) lookup
# rather than a boolean scan over the whole of df_health.
ckd_by_patient = dict(zip(df_health['Patient_Number'], df_health['Chronic_kidney_disease']))

records = []
for pid in range(1, n_patients + 1):
    is_sick = ckd_by_patient[pid]
    base = 3000 if is_sick else 7000
    for d in range(1, 11):
        # One scalar draw per (patient, day), patient first then day; the seeded
        # output depends on this draw order.
        steps = int(np.random.normal(base, 1500))
        # Negative draws are clipped to zero steps.
        records.append([pid, d, max(0, steps)])

df_activity = pd.DataFrame(records, columns=['Patient_Number', 'Day_Number', 'Physical_activity'])

In [9]:
# (rows, columns) of the health table and of the activity table.
(df_health.shape, df_activity.shape)

((2000, 14), (20000, 3))

In [10]:
# Preview the first five rows of the health table.
df_health.head(n=5)

Unnamed: 0,Patient_Number,Blood_Pressure_Abnormality,Level_of_Hemoglobin,Genetic_Pedigree_Coefficient,Age,BMI,Sex,Pregnancy,Smoking,salt_content_in_the_diet,alcohol_consumption_per_day,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders
0,1,0,12.2,0.599299,88,26.6,0,0.0,0,2005,241,3,0,0
1,2,1,12.3,0.513708,88,19.9,1,0.0,1,1697,31,3,0,0
2,3,0,13.5,0.288185,41,22.3,0,0.0,1,3773,262,1,0,0
3,4,0,14.7,0.006464,28,21.9,0,0.0,0,1305,210,1,0,0
4,5,0,15.8,0.496239,29,27.7,0,0.0,1,4635,70,2,0,0


In [11]:
# Preview the first five rows of the activity table.
df_activity.head(n=5)

Unnamed: 0,Patient_Number,Day_Number,Physical_activity
0,1,1,7024
1,1,2,6393
2,1,3,4914
3,1,4,3692
4,1,5,6950


In [13]:
# Write both tables to CSV (row index omitted) under ../data.
# The folder is created first because to_csv raises when the target
# directory does not exist.
output_dir = Path('../data')
output_dir.mkdir(parents=True, exist_ok=True)
df_health.to_csv(output_dir / 'health_dataset1.csv', index=False)
df_activity.to_csv(output_dir / 'health_dataset2.csv', index=False)