In [None]:
# 01_patient_cleaning.py
import pandas as pd
import numpy as np
import gzip
from pathlib import Path

In [None]:
# Configuration
data_dir = Path('data/original')      # raw, gzip-compressed inputs
output_dir = Path('data/processed')   # cleaned CSVs are written here
# parents=True: a fresh checkout may not have the data/ folder yet, and
# mkdir() without it raises FileNotFoundError in that case.
output_dir.mkdir(parents=True, exist_ok=True)
VALID_AGE_RANGE = (0, 120)            # inclusive (min, max) age in years

In [None]:
def load_gzipped_csv(path):
    """Read a gzip-compressed CSV file into a DataFrame.

    Args:
        path: Location of the compressed file, handed to ``gzip.open``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    handle = gzip.open(path, 'rb')
    try:
        frame = pd.read_csv(handle)
    finally:
        # Close the stream whether or not parsing succeeded.
        handle.close()
    return frame

In [None]:
def clean_patients_data(patients, reference_date=None, valid_age_range=None):
    """Standardise the patients table and flag rows that fail basic checks.

    NOTE: ``patients`` is modified in place (column names are lower-cased and
    columns are overwritten/added) and the same object is returned. Pass
    ``patients.copy()`` if the raw frame is still needed afterwards.

    Args:
        patients: DataFrame whose columns include, in any letter case,
            ``birthdate``, ``gender`` and ``race``.
        reference_date: Date at which age is evaluated (anything accepted by
            ``pd.Timestamp``). ``None`` (default) uses the current date/time,
            so results then change from run to run.
        valid_age_range: Inclusive ``(min, max)`` age in years. ``None``
            (default) uses the module-level ``VALID_AGE_RANGE``.

    Returns:
        The same DataFrame with ``age`` and ``data_quality_flag`` columns
        added. ``data_quality_flag`` is ``'Invalid'`` when birthdate, gender
        or age is missing after cleaning, otherwise ``'Valid'``.
    """
    if valid_age_range is None:
        valid_age_range = VALID_AGE_RANGE
    min_age, max_age = valid_age_range

    if reference_date is None:
        as_of = pd.to_datetime('today')
    else:
        as_of = pd.Timestamp(reference_date)

    patients.columns = patients.columns.str.lower()

    # Date handling: unparseable birthdates become NaT, which yields a NaN age.
    patients['birthdate'] = pd.to_datetime(patients['birthdate'], errors='coerce')
    patients['age'] = (as_of - patients['birthdate']).dt.days / 365.25

    # Age validation: between() is inclusive and False for NaN, so anything
    # outside the range (or already missing) ends up NaN.
    patients['age'] = patients['age'].where(patients['age'].between(min_age, max_age))

    # Demographics cleaning: map() turns every value other than 'M'/'F' into NaN.
    patients['gender'] = patients['gender'].map({'M': 'M', 'F': 'F'})

    # Race cleaning: lower-case, then collapse anything unrecognised
    # (including missing values) into 'other'.
    patients['race'] = patients['race'].str.lower()
    valid_races = ['white', 'black', 'asian', 'hawaiian', 'other', 'native']
    patients['race'] = patients['race'].where(
        patients['race'].isin(valid_races),
        'other'
    )

    # Quality flags
    patients['data_quality_flag'] = np.where(
        patients[['birthdate', 'gender', 'age']].isna().any(axis=1),
        'Invalid',
        'Valid'
    )

    return patients

In [None]:
def print_race_verification(patients, clean_patients):
    """Print a before/after report of the ``race`` column.

    Args:
        patients: Frame holding the reference ``race`` values.
        clean_patients: Frame holding the cleaned ``race`` values.

    Prints the sorted distinct reference values, the cleaned value counts
    and, for each cleaned category, up to five reference spellings that
    lower-case to it. Returns nothing.
    """
    standard_races = ['white','black','asian','hawaiian','native']

    print("\n=== Race Distribution Verification ===")

    # Distinct values on the reference side
    print("\nAll original race values:")
    print(np.sort(patients['race'].unique()))

    # Distribution on the cleaned side
    print("\nCleaned race categories with counts:")
    cleaned_race = clean_patients['race']
    print(cleaned_race.value_counts())

    # Per-category breakdown
    print("\nOriginal -> Cleaned mapping examples:")
    for category in cleaned_race.unique():
        reference_race = patients['race']
        matching = reference_race[reference_race.str.lower() == category.lower()].unique()
        n_rows = len(clean_patients[cleaned_race == category])
        print(f"\n{category} (n={n_rows}):")
        print(f"Original values: {matching[:5]}")  # first 5 examples only
        if category != 'other':
            continue
        unmapped = reference_race[~reference_race.str.lower().isin(standard_races)].unique()
        print(f"Non-standard values mapped to 'other': {unmapped}")

In [None]:
if __name__ == "__main__":
    # Load
    patients = load_gzipped_csv(data_dir / 'patients.csv.gz')

    # clean_patients_data() overwrites the frame it is given, so take an
    # independent snapshot of the raw race column first. Without it the race
    # report below would compare cleaned values with cleaned values.
    # The header is lower-cased to the name the report expects, and missing
    # values get a visible placeholder so the report can sort them.
    raw_race = patients.rename(columns=str.lower)[['race']].fillna('<missing>')

    # Clean
    clean_patients = clean_patients_data(patients)

    # Split and save
    valid_patients = clean_patients[clean_patients['data_quality_flag'] == 'Valid']
    invalid_patients = clean_patients[clean_patients['data_quality_flag'] == 'Invalid']

    valid_patients.to_csv(output_dir / 'clean_patients.csv', index=False)
    invalid_patients.to_csv(output_dir / 'excluded_patients.csv', index=False)

    # Reporting (guard the percentage against an empty input file)
    initial_count = len(patients)
    valid_share = len(valid_patients) / initial_count if initial_count else 0.0
    print("\n=== Final Cleaning Report ===")
    print(f"Initial patients: {initial_count}")
    print(f"Valid patients: {len(valid_patients)} ({valid_share:.1%})")
    print(f"Excluded patients: {len(invalid_patients)}")

    print("\nFinal age distribution (years):")
    print(valid_patients['age'].describe())

    # Race verification: raw snapshot versus the rows that were kept
    print_race_verification(raw_race, valid_patients)

02_conditions_cleaning.py

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import gzip
from IPython.display import display, Markdown

In [None]:
# Raw inputs are read from data/original; cleaned tables go to data/processed.
DATA_DIR, OUTPUT_DIR = Path('data', 'original'), Path('data', 'processed')

In [None]:
def load_conditions(conditions_path, clean_patients_path, snomed_path=None):
    """Load conditions for clean patients whose code is in the SNOMED dictionary.

    Args:
        conditions_path: CSV with ``PATIENT``, ``START`` and ``CODE`` columns.
        clean_patients_path: CSV whose ``id`` column lists the patients to keep.
        snomed_path: CSV whose ``CODE`` column lists the accepted codes.
            ``None`` (default) uses ``DATA_DIR/'dictionary_snomed.csv'``.

    Returns:
        DataFrame of the retained rows. ``START`` is parsed to datetime;
        unparseable dates become NaT and those rows are kept.
    """
    if snomed_path is None:
        snomed_path = DATA_DIR/'dictionary_snomed.csv'

    conditions = pd.read_csv(conditions_path)
    valid_patients = pd.read_csv(clean_patients_path)['id']

    # QC checks. .copy() makes the filtered rows an independent frame, so the
    # START assignment below is not a write into a slice of the raw table
    # (which triggers SettingWithCopyWarning).
    conditions = conditions[conditions['PATIENT'].isin(valid_patients)].copy()
    conditions['START'] = pd.to_datetime(conditions['START'], errors='coerce')

    # SNOMED validation
    snomed_codes = pd.read_csv(snomed_path)['CODE']
    valid_conditions = conditions[conditions['CODE'].isin(snomed_codes)]

    return valid_conditions

In [None]:
if __name__ == "__main__":
    raw_conditions_path = DATA_DIR/'conditions.csv.gz'
    conditions = load_conditions(raw_conditions_path, OUTPUT_DIR/'clean_patients.csv')

    # Persist the cleaned table
    conditions.to_csv(OUTPUT_DIR/'clean_conditions.csv', index=False)

    # Report: the raw file is read a second time only to count its rows
    display(Markdown("### Conditions Cleaning Report"))
    initial_count = len(pd.read_csv(raw_conditions_path))
    display(f"Initial conditions: {initial_count}")
    display(f"Valid conditions: {len(conditions)}")
    unique_codes = conditions['CODE'].nunique()
    display(f"SNOMED codes: {unique_codes} unique codes")

In [None]:
# 03_observations_cleaning.py

import pandas as pd
import numpy as np
from pathlib import Path
import gzip
from IPython.display import display, Markdown
import matplotlib.pyplot as plt

# Input and output folders, both relative to the working directory.
DATA_DIR = Path('data') / 'original'
OUTPUT_DIR = Path('data') / 'processed'

In [None]:
# Key LOINC Codes for Analysis
# - **Blood Pressure**: 
#   - Systolic: `8480-6`
#   - Diastolic: `8462-4`  
# - **BMI**: `39156-5`

In [None]:
def clean_observations(obs_path, clean_patients_path, loinc_path=None):
    """Clean the observations table.

    Steps performed:
    1. Patient linkage: keep rows whose ``PATIENT`` is in the clean patients file.
    2. LOINC verification: keep rows whose ``CODE`` is in the LOINC dictionary.
    3. Numeric extraction: add ``VALUE_NUM`` (NaN where ``VALUE`` is not numeric).
    4. Unit standardisation: lower-case and strip ``UNITS``.

    No range validation of the values is performed.

    Args:
        obs_path: CSV with ``PATIENT``, ``CODE``, ``VALUE`` and ``UNITS`` columns.
        clean_patients_path: CSV whose ``id`` column lists the patients to keep.
        loinc_path: CSV whose ``CODE`` column lists the accepted codes.
            ``None`` (default) uses ``DATA_DIR/'dictionary_loinc.csv'``.

    Returns:
        A new DataFrame with the retained rows and the extra ``VALUE_NUM`` column.
    """
    if loinc_path is None:
        loinc_path = DATA_DIR/'dictionary_loinc.csv'

    obs = pd.read_csv(obs_path)
    valid_patients = pd.read_csv(clean_patients_path)['id']

    # 1. Patient linkage
    obs = obs[obs['PATIENT'].isin(valid_patients)]

    # 2. LOINC validation (.copy() so the column writes below hit an independent frame)
    loinc_codes = pd.read_csv(loinc_path)['CODE']
    valid_obs = obs[obs['CODE'].isin(loinc_codes)].copy()

    # 3. Numeric value extraction; the original VALUE column is left untouched
    valid_obs['VALUE_NUM'] = pd.to_numeric(valid_obs['VALUE'], errors='coerce')

    # 4. Unit standardisation
    valid_obs['UNITS'] = valid_obs['UNITS'].str.lower().str.strip()

    return valid_obs

if __name__ == "__main__":
    raw_observations_path = DATA_DIR/'observations.csv.gz'
    observations = clean_observations(raw_observations_path, OUTPUT_DIR/'clean_patients.csv')

    # Persist the cleaned table
    observations.to_csv(OUTPUT_DIR/'clean_observations.csv', index=False)

    # Report: the raw file is read a second time only to count its rows
    print("### Observations Cleaning Report")
    original_count = len(pd.read_csv(raw_observations_path))
    print(f"Original observations: {original_count:,}")
    print(f"Valid observations: {len(observations):,}")

    # Blood pressure rows: systolic (8480-6) and diastolic (8462-4)
    bp_codes = ['8480-6', '8462-4']
    bp_data = observations.loc[observations['CODE'].isin(bp_codes)]
    print(f"**Blood Pressure Records**: {len(bp_data):,}")

    



04_medications_cleaning.py

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display, Markdown
import gzip

In [None]:
# Configuration: where raw inputs are read from and cleaned tables are written to
DATA_DIR = Path('data').joinpath('original')
OUTPUT_DIR = Path('data').joinpath('processed')

In [None]:
def load_medications(meds_path, clean_patients_path, rxnorm_path=None):
    """Load medications for clean patients and validate their RxNorm codes.

    Args:
        meds_path: CSV with ``PATIENT`` and ``CODE`` columns.
        clean_patients_path: CSV whose ``id`` column lists the patients to keep.
        rxnorm_path: CSV with a ``CODE`` column of accepted codes. ``None``
            (default) uses ``DATA_DIR / 'dictionary_rxnorm.csv'``.

    Returns:
        Tuple ``(medications, valid_meds, rxnorm_codes)``:
        * ``medications``: rows of clean patients with ``CODE`` normalised to
          an integer string (NaN where the code is not numeric). This is NOT
          the unfiltered raw file.
        * ``valid_meds``: the subset of ``medications`` whose ``CODE`` is in
          the dictionary.
        * ``rxnorm_codes``: the dictionary frame with ``CODE`` cast to str.
    """
    if rxnorm_path is None:
        rxnorm_path = DATA_DIR / 'dictionary_rxnorm.csv'

    # Load raw medications
    medications = pd.read_csv(meds_path)
    valid_patients = pd.read_csv(clean_patients_path)['id']

    # Filter to valid patients. .copy() so the CODE assignment below writes
    # to an independent frame instead of a slice (SettingWithCopyWarning).
    medications = medications[medications['PATIENT'].isin(valid_patients)].copy()

    # Normalize CODE for matching: '123.0' / 123.0 / '123' all become '123'.
    # Non-numeric codes are dropped before the integer cast and therefore
    # come back as NaN when the result is aligned on the index.
    numeric_code = pd.to_numeric(medications['CODE'], errors='coerce')
    medications['CODE'] = numeric_code.dropna().astype('int64').astype(str)

    # Load and normalize RXNORM dictionary
    rxnorm_codes = pd.read_csv(rxnorm_path)
    rxnorm_codes['CODE'] = rxnorm_codes['CODE'].astype(str)

    # Filter meds by valid RXNORM codes
    valid_meds = medications[medications['CODE'].isin(rxnorm_codes['CODE'])]

    return medications, valid_meds, rxnorm_codes

In [None]:
if __name__ == "__main__":
    meds_path = DATA_DIR / 'medications.csv.gz'
    # NOTE: despite its name, raw_meds is already restricted to clean patients
    # and has its CODE column normalised by load_medications().
    raw_meds, valid_meds, rxnorm = load_medications(
        meds_path,
        OUTPUT_DIR / 'clean_patients.csv'
    )

    # Save cleaned output
    valid_meds.to_csv(OUTPUT_DIR / 'clean_medications.csv', index=False)

    # The "raw" figures must come from the unfiltered file; taking them from
    # raw_meds would report the post-filter numbers under a raw label.
    all_meds = pd.read_csv(meds_path)

    # Reporting
    display(Markdown("### Medications Cleaning Report"))
    print(f"Total raw medications: {len(all_meds)}")
    print(f"Valid medications: {len(valid_meds)}")
    print(f"Unique patients in meds: {all_meds['PATIENT'].nunique()}")
    # raw_meds only holds clean patients, so its distinct patients are the overlap.
    print(f"Overlap with clean patients: {raw_meds['PATIENT'].nunique()}")
    print(f"Unique RXNORM codes in meds: {raw_meds['CODE'].nunique()}")
    print(f"Overlap with RXNORM dict: {valid_meds['CODE'].nunique()}")

05_encounters_cleaning.py

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import gzip

In [None]:
# Configuration: raw inputs under data/original, cleaned outputs under data/processed
_DATA_ROOT = Path('data')
DATA_DIR = _DATA_ROOT / 'original'
OUTPUT_DIR = _DATA_ROOT / 'processed'

In [None]:
def load_encounters(encounters_path, clean_patients_path):
    """Load encounters for clean patients and parse their dates.

    Args:
        encounters_path: CSV with ``PATIENT``, ``START`` and ``STOP`` columns.
        clean_patients_path: CSV whose ``id`` column lists the patients to keep.

    Returns:
        DataFrame of encounters of clean patients with ``START``/``STOP``
        parsed to datetime. Rows without a patient or with a missing or
        unparseable ``START`` are dropped; an unparseable ``STOP`` is kept as NaT.
    """
    encounters = pd.read_csv(encounters_path)
    valid_patients = pd.read_csv(clean_patients_path)['id']

    # QC: keep encounters of valid patients. .copy() so the date assignments
    # below write to an independent frame instead of a slice of the raw
    # table (SettingWithCopyWarning).
    encounters = encounters[encounters['PATIENT'].isin(valid_patients)].copy()

    # Date columns: unparseable values become NaT
    for date_column in ('START', 'STOP'):
        encounters[date_column] = pd.to_datetime(encounters[date_column], errors='coerce')

    # Drop rows with no patient or no usable start date
    encounters = encounters.dropna(subset=['PATIENT', 'START'])

    return encounters

In [None]:
if __name__ == "__main__":
    raw_encounters_path = DATA_DIR/'encounters.csv.gz'
    encounters = load_encounters(raw_encounters_path, OUTPUT_DIR/'clean_patients.csv')

    # Persist the cleaned table
    encounters.to_csv(OUTPUT_DIR/'clean_encounters.csv', index=False)

    # Report: the raw file is read a second time only to count its rows
    initial_count = len(pd.read_csv(raw_encounters_path))
    print(f"Initial encounters: {initial_count}")
    print(f"Valid encounters: {len(encounters)}")
    patient_count = encounters['PATIENT'].nunique()
    print(f"Unique patients in encounters: {patient_count}")

06_data_desc.py

In [None]:
import pandas as pd
from pathlib import Path  # Make sure to import Path

In [None]:
# Load the five cleaned tables written by the earlier cleaning notebooks
OUTPUT_DIR = Path('data/processed')
clean_patients = pd.read_csv(OUTPUT_DIR.joinpath('clean_patients.csv'))
clean_conditions = pd.read_csv(OUTPUT_DIR.joinpath('clean_conditions.csv'))
clean_observations = pd.read_csv(OUTPUT_DIR.joinpath('clean_observations.csv'))
clean_medications = pd.read_csv(OUTPUT_DIR.joinpath('clean_medications.csv'))
clean_encounters = pd.read_csv(OUTPUT_DIR.joinpath('clean_encounters.csv'))

In [None]:
# 1. Unique patients in each dataset
# The patients table keys on 'id'; every other table references it via 'PATIENT'.
tables_and_keys = (
    ("clean_patients", clean_patients, "id"),
    ("clean_conditions", clean_conditions, "PATIENT"),
    ("clean_observations", clean_observations, "PATIENT"),
    ("clean_medications", clean_medications, "PATIENT"),
    ("clean_encounters", clean_encounters, "PATIENT"),
)
for table_name, table, key_column in tables_and_keys:
    print(f"Unique patients in {table_name}: {table[key_column].nunique()}")

In [None]:
# 2. Most frequent ontology terms (SNOMED, LOINC, RXNORM)
# Conditions: five most common SNOMED codes
print("\nMost frequent SNOMED codes in conditions:")
top_snomed_codes = clean_conditions['CODE'].value_counts().head(5)
print(top_snomed_codes)

In [None]:
# Observations: five most common LOINC codes
print("\nMost frequent LOINC codes in observations:")
top_loinc_codes = clean_observations['CODE'].value_counts().head(5)
print(top_loinc_codes)

In [None]:
# Medications: five most common RXNORM codes
print("\nMost frequent RXNORM codes in medications:")
top_rxnorm_codes = clean_medications['CODE'].value_counts().head(5)
print(top_rxnorm_codes)

In [None]:
# 3. General stats (optional, you can expand with other metrics)
# Section banner only; the per-table describe() summaries are printed by the cells that follow.
print("\nGeneral stats for cleaned data:")

In [None]:
# Clean patients: descriptive statistics as computed by DataFrame.describe()
patients_summary = clean_patients.describe()
print(f"\nClean patients data summary:\n{patients_summary}")

In [None]:
# Clean conditions: descriptive statistics as computed by DataFrame.describe()
conditions_summary = clean_conditions.describe()
print(f"\nClean conditions data summary:\n{conditions_summary}")

In [None]:
# Clean observations: descriptive statistics as computed by DataFrame.describe()
observations_summary = clean_observations.describe()
print(f"\nClean observations data summary:\n{observations_summary}")

In [None]:
# Clean medications: descriptive statistics as computed by DataFrame.describe()
medications_summary = clean_medications.describe()
print(f"\nClean medications data summary:\n{medications_summary}")

In [None]:
# Clean encounters: descriptive statistics as computed by DataFrame.describe()
encounters_summary = clean_encounters.describe()
print(f"\nClean encounters data summary:\n{encounters_summary}")

07_hypertension_bp_bmi_analysis.py

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# --- Load Data ---
# Reads the raw (not the cleaned) tables; gzip compression is inferred from the file extension.
print("Loading data...")
conditions = pd.read_csv("data/original/conditions.csv.gz", compression="infer")
observations = pd.read_csv("data/original/observations.csv.gz", compression="infer")

In [None]:
# --- Identify hypertensive patients ---
print("Identifying hypertensive patients...")
hypertension_code = 59621000
# Compare numerically: a plain `== 59621000` silently matches nothing when CODE
# was parsed as text (e.g. '59621000' or '59621000.0'), while to_numeric handles
# int, float and string columns alike. Non-numeric codes become NaN and never match.
condition_code = pd.to_numeric(conditions['CODE'], errors='coerce')
hypertension_patients = conditions.loc[condition_code == hypertension_code, 'PATIENT'].unique()
print(f"Number of hypertensive patients: {len(hypertension_patients)}")

In [None]:
# --- Filter BP observations ---
print("Filtering BP observations...")
bp_codes = ['8462-4', '8480-6']  # Diastolic, Systolic
# Overwrites VALUE for the whole observations table; non-numeric values become NaN.
observations['VALUE'] = pd.to_numeric(observations['VALUE'], errors='coerce')
is_bp_code = observations['CODE'].isin(bp_codes)
bp_obs = observations[is_bp_code]
is_hypertensive_row = bp_obs['PATIENT'].isin(hypertension_patients)
bp_obs_hyper = bp_obs[is_hypertensive_row].copy()

In [None]:
# One row per (DATE, PATIENT, ENCOUNTER) with systolic and diastolic side by side.
# pivot_table's default aggregation (mean) applies if a key has several readings of one code.
bp_by_visit = pd.pivot_table(
    bp_obs_hyper,
    index=['DATE', 'PATIENT', 'ENCOUNTER'],
    columns='CODE',
    values='VALUE'
)
bp_pivot = bp_by_visit.reset_index().rename(
    columns={'8480-6': 'SYSTOLIC_BP', '8462-4': 'DIASTOLIC_BP'}
)
print(f"Blood pressure observations: {len(bp_pivot)}")

In [None]:
# --- Filter BMI observations ---
print("Filtering BMI observations...")
bmi_code = '39156-5'
is_hypertensive_bmi = (observations['CODE'] == bmi_code) & (observations['PATIENT'].isin(hypertension_patients))
# .copy() so the VALUE assignment below writes to an independent frame rather
# than to a slice of `observations` (which raises SettingWithCopyWarning).
bmi_obs = observations.loc[is_hypertensive_bmi].copy()
bmi_obs['VALUE'] = pd.to_numeric(bmi_obs['VALUE'], errors='coerce')
print(f"BMI observations: {len(bmi_obs)}")

In [None]:
# --- Summary ---
# Descriptive statistics for both BP components, then for BMI.
print("\n--- Summary Statistics ---")
bp_summary = bp_pivot[['SYSTOLIC_BP', 'DIASTOLIC_BP']].describe()
print(bp_summary)
print("\nBMI Summary:")
bmi_summary = bmi_obs['VALUE'].describe()
print(bmi_summary)

In [None]:
# --- Plots ---
# Overlaid density curves of systolic and diastolic pressure.
plt.figure(figsize=(12, 5))
for bp_column, curve_label in (('SYSTOLIC_BP', "Systolic"), ('DIASTOLIC_BP', "Diastolic")):
    sns.kdeplot(bp_pivot[bp_column].dropna(), label=curve_label, fill=True)
plt.title("Distribution of Blood Pressure (Hypertensive Patients)")
plt.xlabel("Blood Pressure (mmHg)")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Density curve of BMI; missing values are dropped before plotting.
plt.figure(figsize=(8, 5))
bmi_values = bmi_obs['VALUE'].dropna()
sns.kdeplot(bmi_values, label="BMI", fill=True, color="purple")
plt.title("Distribution of BMI (Hypertensive Patients)")
plt.xlabel("BMI")
plt.ylabel("Density")
plt.tight_layout()
plt.show()

08_compare_bp_bmi_hypertensive_vs_non.py

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np

In [None]:
# Configuration: raw inputs and processed outputs, relative to the working directory
DATA_DIR, OUTPUT_DIR = (Path("data") / sub_dir for sub_dir in ("original", "processed"))

In [None]:
# Load the raw conditions and observations tables (gzip-compressed CSV)
print("Loading data...")
conditions_path = DATA_DIR / "conditions.csv.gz"
conditions = pd.read_csv(conditions_path, compression="gzip")
observations_path = DATA_DIR / "observations.csv.gz"
observations = pd.read_csv(observations_path, compression="gzip")

In [None]:
# Preview the first five rows of the conditions table
print("\n--- Sample of Conditions DataFrame ---")
conditions_preview = conditions.head(5)
print(conditions_preview)

In [None]:
# Ensure 'CODE' is string for comparison
# Overwrites conditions["CODE"] in place. astype(str) keeps whatever text form the
# parsed dtype has: a float-parsed code reads like "10509002.0" and a missing value
# becomes the literal string "nan". The code list below must be spelled the same way.
conditions["CODE"] = conditions["CODE"].astype(str).str.strip()

In [None]:
# Hypertension SNOMED codes
# 59621000 is the concept that 07_hypertension_bp_bmi_analysis uses to identify
# hypertension, so both notebooks now define the cohort the same way.
# CODE is compared as text, so both spellings are listed: "59621000" (column parsed
# as integer/text) and "59621000.0" (column parsed as float).
# NOTE(review): confirm the concept id against dictionary_snomed.csv.
hypertensive_codes = ["59621000", "59621000.0"]

In [None]:
# Identify hypertensive patients: distinct PATIENT ids with at least one matching condition row
print("\nIdentifying hypertensive patients...")
has_hypertensive_code = conditions["CODE"].isin(hypertensive_codes)
hypertensive_patients = conditions.loc[has_hypertensive_code, "PATIENT"].unique()
print(f"Number of hypertensive patients: {len(hypertensive_patients)}")

In [None]:
# LOINC codes used to pick out systolic BP, diastolic BP and BMI observations
systolic_code, diastolic_code, bmi_code = "8480-6", "8462-4", "39156-5"

In [None]:
# Make sure 'CODE' is string in observations too
# Overwrites observations["CODE"] in place so the equality tests against the
# LOINC code strings in the following cells compare text with text.
observations["CODE"] = observations["CODE"].astype(str).str.strip()

In [None]:
# Extract one two-column frame (PATIENT + renamed VALUE) per measurement type
patient_value = ["PATIENT", "VALUE"]
bp_sys = observations.loc[observations["CODE"] == systolic_code, patient_value].rename(columns={"VALUE": "SYSTOLIC_BP"})
bp_dia = observations.loc[observations["CODE"] == diastolic_code, patient_value].rename(columns={"VALUE": "DIASTOLIC_BP"})
bmi = observations.loc[observations["CODE"] == bmi_code, patient_value].rename(columns={"VALUE": "BMI"})

In [None]:
# Merge BP readings
# Pair systolic and diastolic values recorded at the same visit. Joining on PATIENT
# alone pairs every systolic row of a patient with every diastolic row of that
# patient, which multiplies the row count and mixes readings from different dates.
# The two sides are rebuilt from `observations` because the join keys are needed.
# Assumes observations has DATE and ENCOUNTER columns (07_hypertension_bp_bmi_analysis
# pivots the same file on them) — TODO confirm.
bp_keys = ["PATIENT", "DATE", "ENCOUNTER"]
bp = pd.merge(
    observations.loc[observations["CODE"] == systolic_code, bp_keys + ["VALUE"]].rename(columns={"VALUE": "SYSTOLIC_BP"}),
    observations.loc[observations["CODE"] == diastolic_code, bp_keys + ["VALUE"]].rename(columns={"VALUE": "DIASTOLIC_BP"}),
    on=bp_keys,
    how="inner",
)

In [None]:
# Merge with BMI
# Join on every identifying key the BP frame carries. When `bp` has DATE/ENCOUNTER,
# each BP reading is matched with the BMI of the same visit; when it only has
# PATIENT, this is a per-patient join (every BP row paired with every BMI row).
join_keys = [key for key in ("PATIENT", "DATE", "ENCOUNTER") if key in bp.columns]
bmi_keyed = observations.loc[observations["CODE"] == bmi_code, join_keys + ["VALUE"]].rename(columns={"VALUE": "BMI"})
bp_bmi = pd.merge(bp, bmi_keyed, on=join_keys, how="inner")

In [None]:
# Tag hypertensive vs non-hypertensive
# Boolean column: True when the row's PATIENT is in hypertensive_patients.
bp_bmi["HYPERTENSIVE"] = bp_bmi["PATIENT"].isin(hypertensive_patients)

In [None]:
# Coerce the three measurement columns to numbers; unparseable values become NaN
for measure in ("SYSTOLIC_BP", "DIASTOLIC_BP", "BMI"):
    bp_bmi[measure] = pd.to_numeric(bp_bmi[measure], errors='coerce')

In [None]:
# Sanity check: non-missing counts of the plotted columns (each shape should be 1-D)
print("\n--- Checking Shapes of Data ---")
for shape_label, column in (("Systolic BP", "SYSTOLIC_BP"), ("BMI", "BMI")):
    print(f"Shape of {shape_label}: {bp_bmi[column].dropna().shape}")

In [None]:
# Plotting
# Ensure we pass numpy arrays for the KDE plot
# NOTE(review): the curves, title and legend are added in later cells. If the
# notebook backend closes figures at the end of each cell, this 14x7 figure is
# shown empty and the curves land on a new default-size figure — confirm, and
# consider creating the figure in the cell that draws.
plt.figure(figsize=(14, 7))

In [None]:
# Plot a reproducible 10% random sample of each cohort to keep the KDE fast
sample_size = 0.1
is_hypertensive = bp_bmi["HYPERTENSIVE"]
hyper_sample = bp_bmi[is_hypertensive].sample(frac=sample_size, random_state=42)
nonhyper_sample = bp_bmi[~is_hypertensive].sample(frac=sample_size, random_state=42)

In [None]:
# Flat NumPy arrays of the non-missing systolic values, one per cohort
hypertensive_systolic_bp = hyper_sample["SYSTOLIC_BP"].dropna().to_numpy().ravel()
non_hypertensive_systolic_bp = nonhyper_sample["SYSTOLIC_BP"].dropna().to_numpy().ravel()

In [None]:
# Density curves of systolic BP: hypertensive in red, non-hypertensive in blue
systolic_curves = (
    (hypertensive_systolic_bp, "Hypertensive", "red"),
    (non_hypertensive_systolic_bp, "Non-Hypertensive", "blue"),
)
for values, curve_label, curve_color in systolic_curves:
    sns.kdeplot(values, label=curve_label, color=curve_color)

In [None]:
# Title, axis labels and legend for the systolic density plot.
# NOTE(review): these calls act on pyplot's current figure. If the backend closes
# figures between cells, that is a new empty figure rather than the one holding
# the curves — confirm, and consider moving these lines into the plotting cell.
plt.title("Systolic BP Distribution (Sampled Data)")
plt.xlabel("Systolic BP")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# BMI Plot: density curves of the sampled BMI values for both cohorts
plt.figure(figsize=(14, 7))
bmi_cohorts = (
    (hyper_sample, "Hypertensive", "red"),
    (nonhyper_sample, "Non-Hypertensive", "blue"),
)
for cohort_sample, curve_label, curve_color in bmi_cohorts:
    sns.kdeplot(np.ravel(cohort_sample["BMI"].dropna().values), label=curve_label, color=curve_color)
plt.title("BMI Distribution (Sampled Data)")
plt.xlabel("BMI")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.show()

09_hypertension_prevalence.py

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw patients, conditions and observations tables.
# gzip compression is inferred from the file extension.
print("Loading data...")
patients = pd.read_csv("data/original/patients.csv.gz", compression="infer")
conditions = pd.read_csv("data/original/conditions.csv.gz", compression="infer")
observations = pd.read_csv("data/original/observations.csv.gz", compression="infer")

In [None]:
# Preview the first five condition rows
print("\n--- Sample of Conditions ---")
conditions_preview = conditions.head(5)
print(conditions_preview)

In [None]:
# Update hypertension codes from sample
# Codes are matched as text against conditions["CODE"], so each entry must be
# spelled exactly as that column reads once it has been converted to strings.
# NOTE(review): 07_hypertension_bp_bmi_analysis identifies hypertension with SNOMED
# 59621000, which is not in this set. Confirm these four codes against
# dictionary_snomed.csv before relying on the cohort or the prevalence figure.
hypertension_codes = {"10509002", "283371005", "444814009", "16114001"}

In [None]:
# Render CODE as text and drop a trailing ".0" left by float parsing.
# An anchored regex is required: str.rstrip(".0") removes ANY run of trailing
# '.' and '0' characters, so "59621000.0" would become "59621" and "1000"
# would become "1".
conditions["CODE"] = conditions["CODE"].astype(str).str.replace(r"\.0$", "", regex=True)

In [None]:
# Distinct PATIENT ids having at least one condition row whose code is in the set
print("\nIdentifying hypertensive patients...")
has_hypertension_code = conditions["CODE"].isin(hypertension_codes)
hypertensive_patients = conditions.loc[has_hypertension_code, "PATIENT"].unique()
print("Number of hypertensive patients:", len(hypertensive_patients))

In [None]:
# Section banner only; the blood-pressure statistics are printed further down,
# once the observations have been filtered, pivoted and split by cohort.
print("\n--- Summary: Blood Pressure ---")

In [None]:
# LOINC code -> output column name for the two blood-pressure components
bp_codes = dict([
    ("8480-6", "SYSTOLIC_BP"),
    ("8462-4", "DIASTOLIC_BP"),
])
# LOINC code of the BMI observation
bmi_code = "39156-5"

In [None]:
# Independent copies of the BP rows and of the BMI rows
is_bp_row = observations["CODE"].isin(bp_codes.keys())
bp_obs = observations[is_bp_row].copy()
is_bmi_row = observations["CODE"] == bmi_code
bmi_obs = observations[is_bmi_row].copy()

In [None]:
# Coerce VALUE to numbers in both frames; unparseable values become NaN
for obs_frame in (bp_obs, bmi_obs):
    obs_frame["VALUE"] = pd.to_numeric(obs_frame["VALUE"], errors="coerce")

In [None]:
# Pivot BP: one row per (PATIENT, DATE) with one column per BP component.
# Several readings of the same code on one date are averaged.
bp_wide = (
    bp_obs
    .pivot_table(index=["PATIENT", "DATE"], columns="CODE", values="VALUE", aggfunc="mean")
    .reset_index()
    .rename(columns=bp_codes)
)

In [None]:
# Keep only the join keys and the measurement, renaming VALUE to BMI so the
# column name stays meaningful after the merge with the BP table.
bmi_clean = bmi_obs[["PATIENT", "DATE", "VALUE"]].rename(columns={"VALUE": "BMI"})

In [None]:
# Outer join keeps dates that have only BP or only BMI; then flag the cohort
data = bp_wide.merge(bmi_clean, on=["PATIENT", "DATE"], how="outer")
data["HYPERTENSION"] = data["PATIENT"].isin(hypertensive_patients)

In [None]:
# Split into independent per-cohort frames using the boolean HYPERTENSION flag
in_hypertension_cohort = data["HYPERTENSION"]
hyper = data[in_hypertension_cohort].copy()
non_hyper = data[~in_hypertension_cohort].copy()

In [None]:
# Coerce the three measurement columns to numbers in both cohort frames
for df in [hyper, non_hyper]:
    for measure in ("SYSTOLIC_BP", "DIASTOLIC_BP", "BMI"):
        df[measure] = pd.to_numeric(df[measure], errors="coerce")

In [None]:
# Summary stats: describe() of both BP components, per cohort
bp_columns = ["SYSTOLIC_BP", "DIASTOLIC_BP"]
print("\nHypertensive BP:\n", hyper[bp_columns].describe())
print("\nNon-Hypertensive BP:\n", non_hyper[bp_columns].describe())

In [None]:
# describe() of BMI, per cohort
print("\n--- Summary: BMI ---")
hyper_bmi_summary = hyper["BMI"].describe()
print("\nHypertensive BMI:\n", hyper_bmi_summary)
non_hyper_bmi_summary = non_hyper["BMI"].describe()
print("\nNon-Hypertensive BMI:\n", non_hyper_bmi_summary)

In [None]:
# --- Plots ---
# Convert each Series to a NumPy array with np.array() before passing it to seaborn's KDE plot
import numpy as np

In [None]:
# Density curves of systolic BP: hypertensive in red, non-hypertensive in blue
for cohort, curve_label, curve_color in ((hyper, "Hypertensive", "red"), (non_hyper, "Non-Hypertensive", "blue")):
    sns.kdeplot(np.array(cohort["SYSTOLIC_BP"].dropna()), label=curve_label, color=curve_color)

In [None]:
# Title, x-label and legend for the systolic density plot.
# NOTE(review): these calls act on pyplot's current figure. If the backend closes
# figures between cells, that is a new empty figure rather than the one holding
# the curves — confirm, and consider moving these lines into the plotting cell.
plt.title("Systolic Blood Pressure Distribution")
plt.xlabel("Systolic BP (mmHg)")
plt.legend()
plt.show()

In [None]:
# Density curves of diastolic BP for both cohorts
for cohort, curve_label, curve_color in ((hyper, "Hypertensive", "red"), (non_hyper, "Non-Hypertensive", "blue")):
    diastolic_values = cohort["DIASTOLIC_BP"].dropna().values
    sns.kdeplot(diastolic_values, label=curve_label, color=curve_color)
plt.title("Diastolic Blood Pressure Distribution")
plt.xlabel("Diastolic BP (mmHg)")
plt.legend()
plt.show()

In [None]:
# Density curves of BMI for both cohorts
for cohort, curve_label, curve_color in ((hyper, "Hypertensive", "red"), (non_hyper, "Non-Hypertensive", "blue")):
    bmi_values = cohort["BMI"].dropna().values
    sns.kdeplot(bmi_values, label=curve_label, color=curve_color)
plt.title("BMI Distribution")
plt.xlabel("BMI (kg/m²)")
plt.legend()
plt.show()

In [None]:
# --- Crude prevalence ---
# Share of all distinct patients that have at least one matching condition row.
print("\n--- Crude Prevalence of Hypertension ---")
total_patients = patients["Id"].nunique()
hypertensive_count = len(hypertensive_patients)
crude_prevalence = hypertensive_count / total_patients
print(f"Crude prevalence: {crude_prevalence:.2%}")

In [None]:
# --- Adjusted prevalence ---
# Placeholder: nothing is computed here. The two prints only record that an
# external population age distribution is needed before this can be estimated.
print("\n--- Adjusted Prevalence (Placeholder) ---")
print("Adjusted prevalence estimation requires UK population age distribution.")