In [1]:
import pandas as pd
import numpy as np

In [2]:
np.random.seed(42)  # reproducibility

n = 1000  # number of patients

data = {
    "PatientID": range(1, n + 1),
    "Age": np.random.randint(18, 80, size=n),
    "Gender": np.random.choice(["Male", "Female"], size=n),
    "Cholesterol": np.random.normal(loc=220, scale=40, size=n).round(1),
    "BloodPressure": np.random.normal(loc=120, scale=15, size=n).round(1),
    "Smoker": np.random.choice(["Yes", "No"], size=n, p=[0.3, 0.7])
}

df = pd.DataFrame(data)
df.head()


Unnamed: 0,PatientID,Age,Gender,Cholesterol,BloodPressure,Smoker
0,1,56,Male,234.3,122.0,No
1,2,69,Male,196.5,140.0,Yes
2,3,46,Female,212.3,122.9,No
3,4,32,Male,213.8,134.9,No
4,5,60,Male,274.3,119.0,No


In [3]:
#introducing the missing values
df.loc[np.random.choice(df.index, 50), "Cholesterol"] = np.nan
df.loc[np.random.choice(df.index, 30), "BloodPressure"] = np.nan

df.isna().sum()

PatientID         0
Age               0
Gender            0
Cholesterol      48
BloodPressure    30
Smoker            0
dtype: int64

In [4]:
df.to_csv("../data/healthcare_synthetic_data.csv", index=False)

In [5]:
df.shape

(1000, 6)

In [9]:
df.describe(include="all")

Unnamed: 0,PatientID,Age,Gender,Cholesterol,BloodPressure,Smoker
count,1000.0,1000.0,1000,952.0,970.0,1000
unique,,,2,,,2
top,,,Male,,,No
freq,,,523,,,677
mean,500.5,49.857,,220.164601,121.139691,
std,288.819436,18.114267,,41.152718,15.525692,
min,1.0,18.0,,84.5,76.1,
25%,250.75,35.0,,194.25,110.825,
50%,500.5,50.0,,219.2,122.0,
75%,750.25,66.0,,249.225,131.05,


In [6]:
cholesterol_median = df["Cholesterol"].median()
bp_median = df["BloodPressure"].median()

cholesterol_median, bp_median

(219.2, 122.0)

In [13]:
df["Cholesterol"].fillna(cholesterol_median, inplace=True)
df["BloodPressure"].fillna(bp_median, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Cholesterol"].fillna(cholesterol_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["BloodPressure"].fillna(bp_median, inplace=True)


In [7]:
df["Cholesterol"] = df["Cholesterol"].fillna(cholesterol_median)
df["BloodPressure"] = df["BloodPressure"].fillna(bp_median)


In [8]:
df.isna().sum()

PatientID        0
Age              0
Gender           0
Cholesterol      0
BloodPressure    0
Smoker           0
dtype: int64

In [None]:
#Note: Explicit assignment was used instead of inplace operations to avoid chained assignment ambiguity and ensure future compatibility with pandas 3.0.


In [9]:
bins = [18, 30, 45, 60,80]
labels = ["Young", "Middel-aged", "Senior", "Elderly"]

df["AgeGroup"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)
df[["Age", "AgeGroup"]].head()


Unnamed: 0,Age,AgeGroup
0,56,Senior
1,69,Elderly
2,46,Senior
3,32,Middel-aged
4,60,Elderly


In [10]:
def cholesterol_risk(value):
    if value < 200:
        return "Normal"
    elif value < 240:
        return "Borderline"
    else:
        return "High"

df["CholesterolRisk"] = df["Cholesterol"].apply(cholesterol_risk)

df[["Cholesterol", "CholesterolRisk"]].head()


Unnamed: 0,Cholesterol,CholesterolRisk
0,234.3,Borderline
1,196.5,Normal
2,212.3,Borderline
3,213.8,Borderline
4,274.3,High


In [11]:
df["HighRiskFlag"] = (
    (df["CholesterolRisk"] == "High") |
    (df["Smoker"] == "Yes") |
    (df["BloodPressure"] > 140)
)

df["HighRiskFlag"].value_counts()


HighRiskFlag
True     577
False    423
Name: count, dtype: int64

In [None]:
### Feature Engineering

To enable analytical segmentation, new features were created:
- AgeGroup: Categorizes patients into meaningful age bands
- CholesterolRisk: Classifies cholesterol levels using clinical thresholds
- HighRiskFlag: Identifies patients with elevated health risk based on multiple factors

These features improve interpretability and support population-level analysis.


In [5]:
df.shape

(1000, 6)

In [8]:
df.dtypes

PatientID          int64
Age                int32
Gender            object
Cholesterol      float64
BloodPressure    float64
Smoker            object
dtype: object

In [9]:
df["Gender"].value_counts()

Gender
Male      523
Female    477
Name: count, dtype: int64

In [11]:
df["Smoker"].value_counts()

Smoker
No     677
Yes    323
Name: count, dtype: int64

In [12]:
df["Gender"].value_counts(normalize=True)*100

Gender
Male      52.3
Female    47.7
Name: proportion, dtype: float64

In [15]:
df["AgeGroup"].value_counts().sort_index()

AgeGroup
Young          178
Middel-aged    225
Senior         253
Elderly        344
Name: count, dtype: int64

In [16]:
df["AgeGroup"].value_counts(normalize=True).sort_index()*100

AgeGroup
Young          17.8
Middel-aged    22.5
Senior         25.3
Elderly        34.4
Name: proportion, dtype: float64

In [22]:
df["Risk"].value_counts() 

KeyError: 'Risk'

In [23]:
df.columns

Index(['PatientID', 'Age', 'Gender', 'Cholesterol', 'BloodPressure', 'Smoker',
       'AgeGroup', 'CholesterolRisk', 'HighRiskFlag'],
      dtype='object')

In [24]:
df.HighRiskFlag

0      False
1       True
2      False
3      False
4       True
       ...  
995    False
996     True
997    False
998    False
999     True
Name: HighRiskFlag, Length: 1000, dtype: bool

In [25]:
df["HighRiskFlag"].value_counts()

HighRiskFlag
True     587
False    413
Name: count, dtype: int64

In [26]:
df["HighRiskFlag"].value_counts(normalize=True)*100

HighRiskFlag
True     58.7
False    41.3
Name: proportion, dtype: float64

In [27]:
pd.crosstab(df['Gender'], df['HighRiskFlag'])

HighRiskFlag,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,198,279
Male,215,308


In [29]:
pd.crosstab(df['Gender'], df['HighRiskFlag'], normalize='index')*100

HighRiskFlag,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,41.509434,58.490566
Male,41.108987,58.891013


In [30]:
pd.crosstab(df['AgeGroup'], df['HighRiskFlag'])

HighRiskFlag,False,True
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1
Young,75,103
Middel-aged,96,129
Senior,104,149
Elderly,138,206


In [31]:
pd.crosstab(df['AgeGroup'], df['HighRiskFlag'], normalize='index')*100

HighRiskFlag,False,True
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1
Young,42.134831,57.865169
Middel-aged,42.666667,57.333333
Senior,41.106719,58.893281
Elderly,40.116279,59.883721


In [32]:
df.groupby('HighRiskFlag')['Cholesterol'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
HighRiskFlag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,413.0,198.966102,28.534964,84.5,182.3,204.1,221.7,239.7
True,587.0,235.154344,42.249769,119.3,202.85,242.4,261.9,345.7


In [33]:
pd.crosstab(
    index=[df['AgeGroup'], df['Gender']],
    columns=df['HighRiskFlag'],
    normalize='index'
) * 100


Unnamed: 0_level_0,HighRiskFlag,False,True
AgeGroup,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1
Young,Female,41.860465,58.139535
Young,Male,42.391304,57.608696
Middel-aged,Female,41.666667,58.333333
Middel-aged,Male,43.589744,56.410256
Senior,Female,40.458015,59.541985
Senior,Male,41.803279,58.196721
Elderly,Female,42.105263,57.894737
Elderly,Male,38.541667,61.458333


In [13]:
df.groupby("HighRiskFlag")[["Cholesterol", "BloodPressure", "Age"]].mean()

Unnamed: 0_level_0,Cholesterol,BloodPressure,Age
HighRiskFlag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,200.703546,117.886761,49.271868
True,234.3513,123.569151,50.285962


In [18]:
mean_table = df.groupby("HighRiskFlag")[["Cholesterol", "BloodPressure", "Age"]].mean()
mean_table.loc[True] - mean_table.loc[False]

Cholesterol      33.647754
BloodPressure     5.682390
Age               1.014094
dtype: float64

In [19]:
df.groupby("Gender")[["Cholesterol", "BloodPressure", "Age"]].mean()


Unnamed: 0_level_0,Cholesterol,BloodPressure,Age
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,219.244025,121.812159,49.169811
Male,220.915679,120.575717,50.483748


In [20]:
pd.crosstab(df["Gender"], df["HighRiskFlag"], normalize="index") * 100


HighRiskFlag,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,42.767296,57.232704
Male,41.873805,58.126195


In [21]:
df.groupby(pd.cut(df["Age"], bins=[20,30,40,50,60,70]))["Cholesterol"].mean()

  df.groupby(pd.cut(df["Age"], bins=[20,30,40,50,60,70]))["Cholesterol"].mean()


Age
(20, 30]    217.976978
(30, 40]    223.470213
(40, 50]    219.376923
(50, 60]    219.428859
(60, 70]    216.703571
Name: Cholesterol, dtype: float64

In [23]:
df["Outcome"] = df["HighRiskFlag"].map({
    True: "High Risk",
    False: "Low Risk"
})


In [24]:
df[["HighRiskFlag", "Outcome"]].head()

Unnamed: 0,HighRiskFlag,Outcome
0,False,Low Risk
1,True,High Risk
2,False,Low Risk
3,False,Low Risk
4,True,High Risk


In [25]:
df["Outcome"].value_counts()

Outcome
High Risk    577
Low Risk     423
Name: count, dtype: int64

In [26]:
df.groupby("Outcome")[["Cholesterol", "BloodPressure", "Age"]].mean()

Unnamed: 0_level_0,Cholesterol,BloodPressure,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High Risk,234.3513,123.569151,50.285962
Low Risk,200.703546,117.886761,49.271868


In [27]:
pd.crosstab(df["Outcome"], df["Gender"])

Gender,Female,Male
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
High Risk,273,304
Low Risk,204,219


In [28]:
outcome_summary = df.groupby("Outcome").agg(
    PatientCount=("PatientID", "count"),
    AvgAge=("Age", "mean"),
    AvgCholesterol=("Cholesterol", "mean"),
    AvgBloodPressure=("BloodPressure", "mean")
)

outcome_summary

Unnamed: 0_level_0,PatientCount,AvgAge,AvgCholesterol,AvgBloodPressure
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High Risk,577,50.285962,234.3513,123.569151
Low Risk,423,49.271868,200.703546,117.886761


In [29]:
outcome_summary = outcome_summary.round(2)
outcome_summary

Unnamed: 0_level_0,PatientCount,AvgAge,AvgCholesterol,AvgBloodPressure
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High Risk,577,50.29,234.35,123.57
Low Risk,423,49.27,200.7,117.89


In [30]:
df["Outcome"].value_counts(normalize=True) * 100

Outcome
High Risk    57.7
Low Risk     42.3
Name: proportion, dtype: float64

In [31]:
gender_outcome = pd.crosstab(
    df["Gender"],
    df["Outcome"],
    margins=True
)

gender_outcome

Outcome,High Risk,Low Risk,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,273,204,477
Male,304,219,523
All,577,423,1000


In [32]:
mean_table = df.groupby("Outcome")[["Cholesterol", "BloodPressure"]].mean()

mean_difference = mean_table.loc["High Risk"] - mean_table.loc["Low Risk"]
mean_difference

Cholesterol      33.647754
BloodPressure     5.682390
dtype: float64

In [33]:
age_trend = df.groupby("Outcome")["Age"].mean()
age_trend

Outcome
High Risk    50.285962
Low Risk     49.271868
Name: Age, dtype: float64

In [34]:
df["AgeGroup"] = pd.cut(
    df["Age"],
    bins=[0, 30, 45, 60, 100],
    labels=["<30", "30–45", "45–60", "60+"]
)

age_group_trend = pd.crosstab(
    df["AgeGroup"],
    df["Outcome"],
    normalize="index"
) * 100

age_group_trend

Outcome,High Risk,Low Risk
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1
<30,57.068063,42.931937
30–45,56.779661,43.220339
45–60,58.474576,41.525424
60+,58.160237,41.839763


In [35]:
high_risk = df[df["Outcome"] == "High Risk"]
low_risk = df[df["Outcome"] == "Low Risk"]

In [36]:
from scipy.stats import ttest_ind

t_stat_chol, p_value_chol = ttest_ind(
    high_risk["Cholesterol"],
    low_risk["Cholesterol"],
    equal_var=False
)

t_stat_chol, p_value_chol

(np.float64(15.254313139689252), np.float64(2.402279323816184e-47))

In [37]:
t_stat_bp, p_value_bp = ttest_ind(
    high_risk["BloodPressure"],
    low_risk["BloodPressure"],
    equal_var=False
)

t_stat_bp, p_value_bp

(np.float64(6.121741380825982), np.float64(1.3309224450369332e-09))

In [38]:
stats_summary = pd.DataFrame({
    "Metric": ["Cholesterol", "BloodPressure"],
    "T-statistic": [t_stat_chol, t_stat_bp],
    "P-value": [p_value_chol, p_value_bp]
})

stats_summary

Unnamed: 0,Metric,T-statistic,P-value
0,Cholesterol,15.254313,2.402279e-47
1,BloodPressure,6.121741,1.330922e-09
