Count the number of eligible patients in the database.

In [23]:
import sqlite3
import pandas as pd
import sys

sys.path.append("../..")

from utils.constants import DatabaseConfig, TableNames

In [2]:
# Name of the key column used to join tables in later queries.
primary_key = "eid"

# Open the SQLite database at the configured path and create the cursor
# that every query cell in this notebook reuses.
conn = sqlite3.connect(DatabaseConfig.DB_PATH)
cursor = conn.cursor()

## Count Number of Eligible Participants

In [12]:
# Count every row in the processed table.
total_count_sql = f"""
    SELECT COUNT(*) AS Total_count
    FROM {TableNames.PROCESSED}
"""
cursor.execute(total_count_sql)
# The aggregate query yields exactly one single-column row.
(num1,) = cursor.fetchone()
print(num1)

77888


In [13]:
# Count the rows whose ECG_date is populated.
ecg_present_sql = f"""
    SELECT COUNT(*) AS Total_count 
    FROM {TableNames.PROCESSED} 
    WHERE ECG_date IS NOT NULL;
"""
cursor.execute(ecg_present_sql)
# The aggregate query yields exactly one single-column row.
(num2,) = cursor.fetchone()
print(num2)

61927


In [14]:
# Rows excluded at this step: those whose ECG_date is NULL.
num1 - num2

15961

Participants whose ECG_date is NULL have empty or corrupted ECG XML files.

In [15]:
# Count the rows with HRV_available = 1 and a populated ECG_date
# (completed test and valid ECG).
hrv_and_ecg_sql = f"""
    SELECT COUNT(*) AS Total_count 
    FROM {TableNames.PROCESSED} 
    WHERE HRV_available = 1  AND ECG_date IS NOT NULL;
"""
cursor.execute(hrv_and_ecg_sql)
# The aggregate query yields exactly one single-column row.
(num3,) = cursor.fetchone()
print(num3)

42216


In [17]:
# Rows excluded at this step: ECG_date is populated but HRV_available is not 1.
num2 - num3

19711

In [19]:
# Count the rows with statins = 0, HRV_available = 1 and a populated ECG_date
# (completed test, valid ECG, and not taking statins).
statin_free_sql = f"""
    SELECT COUNT(*) AS Total_count
    FROM {TableNames.PROCESSED} 
    WHERE statins = 0 AND HRV_available = 1 AND ECG_date IS NOT NULL;
"""
cursor.execute(statin_free_sql)
# The aggregate query yields exactly one single-column row.
(num4,) = cursor.fetchone()
print(num4)

35891


In [21]:
# Rows excluded at this step: passed the HRV and ECG filters but statins is not 0.
num3 - num4

6325

In [33]:
# Break the statin-free cohort (statins = 0, HRV_available = 1, ECG_date present)
# down by CVD status, and count how many CVD dates fall before vs. after the ECG date.
#
# CVD is selected and the result is ordered by it so that each row is labelled
# from its own CVD value instead of relying on the order in which SQLite happens
# to emit the groups (GROUP BY alone does not guarantee an order).
cursor.execute(f"""
    SELECT CVD,
           COUNT(*) AS Total_count,
           SUM(CASE WHEN ECG_date > CVD_date THEN 1 ELSE 0 END) AS CVD_before_ECG,
           SUM(CASE WHEN ECG_date < CVD_date THEN 1 ELSE 0 END) AS CVD_after_ECG
    FROM {TableNames.PROCESSED}
    WHERE statins = 0 AND HRV_available = 1 AND ECG_date IS NOT NULL
    GROUP BY CVD
    ORDER BY CVD;
""")
cvd_rows = cursor.fetchall()

# NOTE(review): assumes CVD is coded 0 (no CVD) / 1 (CVD); any other value is
# shown with an explicit "CVD=<value>" label rather than being mislabelled.
cvd_labels = {0: "no CVD", 1: "CVD"}
df = pd.DataFrame(
    [row[1:] for row in cvd_rows],
    columns=["Total_count", "CVD_before_ECG", "CVD_after_ECG"],
    index=[cvd_labels.get(row[0], f"CVD={row[0]}") for row in cvd_rows],
)
print(df)

# num6 counts rows whose CVD date precedes the ECG date; they are subtracted
# from the total below. num7 counts rows whose CVD date follows the ECG date.
# NOTE(review): a row with ECG_date equal to CVD_date is counted in neither
# column, so it stays in the population without being counted as incident — confirm.
num5 = df["Total_count"].sum()
num6 = df["CVD_before_ECG"].sum()
num7 = df["CVD_after_ECG"].sum()
print(f"The final study population has {num5 - num6} participants, out of which {num7} have incident CVD")

        Total_count  CVD_before_ECG  CVD_after_ECG
no CVD        30719               0              0
CVD            5172             732           4440
The final study population has 35159 participants, out of which 4440 have incident CVD


12.6% (4440 / 35159) of eligible participants have incident CVD; the remaining participants have no recorded CVD event up to the censoring date.

The censoring date is 2022-10-31.

## Count Missing Rates

In [10]:
# Field IDs of the baseline variables whose missingness is measured; each one
# is read from the column named "<ID>-0.0" of the variables table (alias v).
selected_columns_ID = [
    34,
    52,
    20116,
    31,
    2443,
    4080,
    21000,
    50,
    21002,
    6177,
    6153,
    6150,
    30690,
    30760,
    6138,
    22032
]

baseline_columns = [f"v.`{column_ID}-0.0`" for column_ID in selected_columns_ID]

# Maps eid -> list of baseline columns that are NULL for that participant.
missing_values_dict = {}

# The processed table is referenced through TableNames.PROCESSED, as in the
# other query cells, instead of a variable that no visible cell defines.
# NOTE(review): `variables_table_name` is likewise not defined in any visible
# cell, so this cell raises NameError on a fresh kernel until it is defined —
# confirm the intended table (presumably another TableNames member).
# NOTE(review): the eligibility counts in this notebook filter on
# HRV_available = 1, whereas this query filters on test_status = 1 — confirm
# that both are meant to select the same cohort.
query_sql = f"""SELECT p.eid, {', '.join(baseline_columns)} 
FROM {TableNames.PROCESSED} p INNER JOIN {variables_table_name} v on p.{primary_key} = v.{primary_key}
WHERE p.statins = 0 AND p.test_status = 1 AND p.ECG_date IS NOT NULL AND (p.CVD = 0 OR p.ECG_date < p.CVD_date);"""
cursor.execute(query_sql)

# cnt is the number of rows returned by the query; it serves as the
# denominator when missing rates are reported.
cnt = 0
for eid, *values in cursor:
    cnt += 1
    # values are in the same order as baseline_columns, so zip pairs each
    # column name with its value for this participant.
    missing_columns = [col for col, value in zip(baseline_columns, values) if value is None]
    if missing_columns:
        missing_values_dict[eid] = missing_columns

In [11]:
# Fields 6153 and 6177 are only applicable to a single gender, so a NULL there
# is not a genuine missing value; they are removed from the per-participant lists.
columns_to_remove = ['6153-0.0', '6177-0.0']

# Rebuild the dictionary without the excluded columns, dropping participants
# that have no missing values left afterwards.
filtered_missing = {}
for eid, missing_columns in missing_values_dict.items():
    kept_columns = [col for col in missing_columns
                    if not any(col_id in col for col_id in columns_to_remove)]
    if kept_columns:
        filtered_missing[eid] = kept_columns
missing_values_dict = filtered_missing

# Number of participants with a missing value, per baseline column.
# The excluded columns keep their keys and therefore always report 0 here;
# that 0 reflects the exclusion above, not a measured absence of missing values.
missing_values_count = {col: 0 for col in baseline_columns}
for eid, missing_columns in missing_values_dict.items():
    for col in missing_columns:
        missing_values_count[col] += 1

for col, cnt_missing in missing_values_count.items():
    # Guard against an empty cohort so the report does not raise ZeroDivisionError.
    missing_rate = cnt_missing / cnt * 100 if cnt else float("nan")
    print(f"{col}: {cnt_missing}/{cnt} -> {missing_rate:.2f}%")

v.`34-0.0`: 0/41274
v.`52-0.0`: 0/41274
v.`20116-0.0`: 85/41274
v.`31-0.0`: 0/41274
v.`2443-0.0`: 85/41274
v.`4080-0.0`: 196/41274
v.`21000-0.0`: 85/41274
v.`50-0.0`: 20/41274
v.`21002-0.0`: 1/41274
v.`6177-0.0`: 0/41274
v.`6153-0.0`: 0/41274
v.`6150-0.0`: 85/41274
v.`30690-0.0`: 3156/41274
v.`30760-0.0`: 5006/41274
v.`6138-0.0`: 85/41274
v.`22032-0.0`: 6740/41274
