Describe the characteristics of the participants for the table in the Results section.


In [9]:
import pandas as pd
from tqdm import tqdm
import sqlite3
import sys

sys.path.append("../..")

from utils.constants import DatabaseConfig, TableNames
from utils.csv_utils import create_descriptive_stats

In [10]:
# Name of the identifier column; it is selected as the row key and excluded
# from the covariate column list in the queries that follow.
primary_key = "eid"

# Open the SQLite database at the configured path and obtain a cursor
# through which all queries in this notebook are executed.
conn = sqlite3.connect(DatabaseConfig.DB_PATH)
cursor = conn.cursor()

# Count Missing Rates

In [11]:
# * These exclusion criteria may be modified for sensitivity analysis.
exclusion_criteria = "WHERE s.statins = 0 AND s.ecg_hrv_ok = 1 AND s.ecg_before_cvd == 0"

# Read the covariate column names from the table schema. In each
# PRAGMA table_info record, index 1 is the column name. The key column is
# dropped because it is an identifier, not a covariate.
cursor.execute(f"PRAGMA table_info({TableNames.COVARIATES});")
columns = [row[1] for row in cursor.fetchall() if row[1] != primary_key]

# One NULL counter per covariate plus the cohort size. Columns are qualified
# with the covariates alias so the join can never make a name ambiguous, and
# no bare (non-aggregated) column is selected: without GROUP BY such a column
# would only carry the value of one arbitrary row.
select_items = [
    f"SUM(CASE WHEN c.{col} IS NULL THEN 1 ELSE 0 END) AS {col}_null"
    for col in columns
]
select_items.append("COUNT(*) AS total")
select_clause = ",\n       ".join(select_items)

query_sql = f"""
SELECT {select_clause}
FROM {TableNames.COVARIATES} c INNER JOIN {TableNames.STATUS} s ON c.eid = s.eid
{exclusion_criteria}
"""
cursor.execute(query_sql)

# An aggregate query without GROUP BY always yields exactly one row, so a
# single fetch is enough (and a progress bar would be meaningless).
*null_counts, total = cursor.fetchone()

print("Missing rates:")
if total == 0:
    # With no matching rows SUM() is NULL and the percentage is undefined;
    # report that instead of failing on the division below.
    print("No participants match the exclusion criteria.")
else:
    for col, n_missing in zip(columns, null_counts):
        print(f"{col}: {n_missing} / {total} -> {(n_missing / total * 100):.2f}%")

Missing rates:


100%|██████████| 1/1 [00:00<00:00, 11214.72it/s]

age: 0 / 35159 -> 0.00%
sex: 0 / 35159 -> 0.00%
ethnicity: 0 / 35159 -> 0.00%
BMI: 18 / 35159 -> 0.05%
smoking: 68 / 35159 -> 0.19%
diabetes: 68 / 35159 -> 0.19%
systolic_bp: 157 / 35159 -> 0.45%
hypertension_treatment: 183 / 35159 -> 0.52%
total_chol: 2661 / 35159 -> 7.57%
hdl_chol: 4262 / 35159 -> 12.12%
education: 68 / 35159 -> 0.19%
activity: 5764 / 35159 -> 16.39%
max_workload: 0 / 35159 -> 0.00%
max_heart_rate: 1 / 35159 -> 0.00%





# Process Data to Get Descriptive Statistics

In [12]:
# Select one row per included participant: the key, the `event` column from
# the status table, and every covariate column listed in `columns`.
covariate_select = ', '.join(f'{col} AS {col}' for col in columns)
query_sql = f"""
SELECT c.{primary_key}, s.event,
       {covariate_select}
FROM {TableNames.COVARIATES} c INNER JOIN {TableNames.STATUS} s ON c.eid = s.eid
{exclusion_criteria}
"""

cursor.execute(query_sql)

# cursor.description has one entry per result column; item 0 is its name.
result_columns = [meta[0] for meta in cursor.description]
data = cursor.fetchall()
df = pd.DataFrame(data, columns=result_columns)

In [13]:
# Preview the first rows of the queried frame as a quick sanity check.
df.head()

Unnamed: 0,eid,event,age,sex,ethnicity,BMI,smoking,diabetes,systolic_bp,hypertension_treatment,total_chol,hdl_chol,education,activity,max_workload,max_heart_rate
0,1000205,0,40,1,1,21.5595,0.0,0.0,149.0,0.0,4.569,1.228,2.0,0.0,130.0,139.0
1,1000239,0,65,0,1,22.9214,1.0,0.0,137.0,0.0,5.78,2.221,1.0,1.0,60.0,126.0
2,1000677,0,42,0,1,37.892,2.0,0.0,124.0,0.0,5.874,1.323,3.0,1.0,80.0,109.0
3,1000737,0,52,1,1,22.8374,0.0,0.0,148.0,0.0,4.429,,4.0,2.0,110.0,112.0
4,1000779,0,56,1,1,25.0194,0.0,0.0,144.0,0.0,6.258,1.406,3.0,2.0,110.0,112.0


In [14]:
# Give missing hypertension-treatment values an explicit level (-1) so they
# are kept as their own category instead of staying NaN.
df["hypertension_treatment"] = df["hypertension_treatment"].fillna(-1)

# column -> (category codes to keep, whether the codes are ordered).
# Code meanings are noted per column where they are known.
category_spec = {
    'event': ([0, 1], False),  # No=0, Yes=1
    'sex': ([0, 1], False),  # Female=0, Male=1
    'ethnicity': ([1, 2, 3, 4, 5, 6, -1, -3], False),  # White=1, Mixed=2, Asian=3, Black=4, Chinese=5, Other=6
    'smoking': ([0, 1, 2, -3], False),  # Never=0, Previous=1, Current=2
    "diabetes": ([0, 1, -1, -3], False),  # No=0, Yes=1
    # -1 = missing (filled above); 0/1 presumably untreated/treated - TODO confirm
    'hypertension_treatment': ([0, 1, -1], False),
    'education': ([1, 2, 3, 4, 5, 6, -7, -3], True),  # College=1, A levels=2, O levels=3, etc.
    'activity': ([0, 1, 2], True),  # low=0, moderate=1, high=2
}
categorical_columns = list(category_spec)

# Convert each column to a categorical and pin its category set, so levels
# with no observations are still present as categories.
for column, (codes, is_ordered) in category_spec.items():
    df[column] = df[column].astype('category').cat.set_categories(codes, ordered=is_ordered)

We need to process some of the data.

In [15]:
# Order in which variables are passed to the descriptive-statistics helper.
# The list is assembled from small groups that mirror the intended row
# grouping of the table.
_demographic_vars = ['age', 'sex', 'ethnicity', 'education']
_measurement_vars = ['BMI', 'systolic_bp']
_condition_vars = ['diabetes', 'hypertension_treatment']
_lipid_vars = ['total_chol', 'hdl_chol']
_lifestyle_vars = ['smoking', 'activity']

df_order = [
    *_demographic_vars,
    *_measurement_vars,
    *_condition_vars,
    *_lipid_vars,
    *_lifestyle_vars,
]

In [16]:
# Compute per-variable statistics split by the event column. As read below,
# each entry of the returned mapping provides 'stats', 'p_value',
# 'missing_rate' and 'missing_n'.
df_stats = create_descriptive_stats(df, event_col_name="event", order=df_order)
print(f"There are {len(df)} participants in the dataset.")

for variable, summary in df_stats.items():
    print(variable)
    print(summary['stats'])

    # Very small p-values are reported as a threshold rather than rounded to 0.000.
    p_value = summary['p_value']
    p_line = "p-value: <0.001" if p_value < 0.001 else f"p-value: {p_value:.3f}"
    print(p_line)

    print(f"Missing rate: {summary['missing_rate']:.2%}")
    print(f"Missing number: {summary['missing_n']}")
    print("\n")

There are 35159 participants in the dataset.
age
        count  mean  std
event                   
0      30,719  54.8  8.1
1       4,440  59.9  7.1
p-value: <0.001
Missing rate: 0.00%
Missing number: 0


sex
event  sex
0      0      17,664 (57.50%)
       1      13,055 (42.50%)
1      1       2,611 (58.80%)
       0       1,829 (41.20%)
dtype: object
p-value: <0.001
Missing rate: 0.00%
Missing number: 0


ethnicity
event  ethnicity
0      1            26,607 (86.80%)
       3              1,751 (5.70%)
       2              1,494 (4.90%)
       6                402 (1.30%)
       4                249 (0.80%)
       5                164 (0.50%)
       -3                 0 (0.00%)
       -1                 0 (0.00%)
1      1             3,935 (88.90%)
       3                216 (4.90%)
       2                202 (4.60%)
       6                 37 (0.80%)
       4                 24 (0.50%)
       5                 10 (0.20%)
       -1                 0 (0.00%)
       -3              