Count the number of eligible participants in the database.

In [1]:
import sqlite3
import sys

sys.path.append("../..")

from utils.constants import db_file, processed_table_name, variables_table_name, primary_key

In [2]:
# Open the SQLite database at `db_file` and create a cursor for running queries.
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

In [4]:
# Number of rows in the processed table, before any eligibility filter
total_sql = f"""
    SELECT COUNT(*) AS Total_count
    FROM {processed_table_name}
"""
cursor.execute(total_sql)
total_rows = cursor.fetchall()
print(total_rows)

[(77888,)]


In [7]:
# Number of rows whose ECG_date is not NULL
ecg_sql = f"""
    SELECT COUNT(*) AS Total_count 
    FROM {processed_table_name} 
    WHERE ECG_date IS NOT NULL;
"""
cursor.execute(ecg_sql)
ecg_rows = cursor.fetchall()
print(ecg_rows)

[(61927,)]


In [10]:
# Rows with a NULL ECG_date: total count minus the count with ECG_date IS NOT NULL
77888 - 61927

15961

Participants whose `ECG_date` is NULL have empty ECG XML files.

In [8]:
# Rows with test_status = 1 (complete test) and a non-NULL ECG_date
complete_ecg_sql = f"""
    SELECT COUNT(*) AS Total_count 
    FROM {processed_table_name} 
    WHERE test_status = 1  AND ECG_date IS NOT NULL;
"""
cursor.execute(complete_ecg_sql)
complete_ecg_rows = cursor.fetchall()
print(complete_ecg_rows)

[(49608,)]


In [11]:
# Rows with an ECG date that do not pass the test_status = 1 filter
61927 - 49608

12319

In [12]:
# Rows with statins = 0, test_status = 1 (complete test) and a non-NULL ECG_date
no_statin_sql = f"""
    SELECT COUNT(*) AS Total_count
    FROM {processed_table_name} 
    WHERE statins = 0 AND test_status = 1 AND ECG_date IS NOT NULL;
"""
cursor.execute(no_statin_sql)
no_statin_rows = cursor.fetchall()
print(no_statin_rows)

[(42143,)]


In [13]:
# Rows with a complete test and an ECG date that are dropped by the statins = 0 filter
49608 - 42143

7465

In [14]:
# Among rows with statins = 0, test_status = 1 and a non-NULL ECG_date, report per
# CVD group: the group size and how many rows have ECG_date earlier than CVD_date.
# ORDER BY CVD makes the row order explicit (ascending CVD value); without it SQLite
# leaves the order of the returned rows undefined, and the cells below read the
# result positionally.
cursor.execute(f"""
    SELECT COUNT(*) AS Total_count, SUM(CASE WHEN ECG_date < CVD_date THEN 1 ELSE 0 END) AS ECG_before_CVD FROM {processed_table_name} 
    WHERE statins = 0 AND test_status = 1 AND ECG_date IS NOT NULL
    GROUP BY CVD
    ORDER BY CVD;
""")
print(cursor.fetchall())

[(36060, 0), (6083, 5214)]


In [12]:
# Eligible cohort size: the group with no ECG_before_CVD rows (36060) plus the
# rows whose ECG_date is earlier than CVD_date (5214) ...
print(36060 + 5214)
# ... and the share of that cohort with ECG_date earlier than CVD_date.
print(5214 / (36060 + 5214))

41274
0.12632650094490477


In [15]:
# Rows in the second CVD group that were not counted as ECG_date < CVD_date
6083 - 5214

869

12.6% of eligible participants (5,214 of 41,274) have incident CVD, i.e. a CVD date after their ECG date; the remaining 36,060 have not experienced CVD so far.

The censoring date is 2022-10-31.

In [10]:
# Count missing baseline values for eligible participants.
#
# Eligible rows are those with statins = 0, test_status = 1, a non-NULL ECG_date
# and either CVD = 0 or ECG_date earlier than CVD_date.
#
# Produces (all read by the next cell):
#   baseline_columns    - the selected column expressions, in SELECT order
#   missing_values_dict - participant key -> list of baseline columns that are NULL
#   cnt                 - number of eligible rows scanned
#
# The `conn`/`cursor` opened at the top of the notebook are reused: opening a second
# connection here would rebind `conn` and leave the first handle unclosed.

# Field IDs whose `<ID>-0.0` column is checked for missing values
selected_columns_ID = [
    34,
    52,
    20116,
    31,
    2443,
    4080,
    21000,
    50,
    21002,
    6177,
    6153,
    6150,
    30690,
    30760,
    6138,
    22032
]

# Column expressions qualified with the variables-table alias `v`
baseline_columns = [f"v.`{column_ID}-0.0`" for column_ID in selected_columns_ID]

missing_values_dict = {}

# Select the participant key through `primary_key`, the same column the join uses,
# rather than a hard-coded column name.
query_sql = f"""SELECT p.{primary_key}, {', '.join(baseline_columns)} 
FROM {processed_table_name} p INNER JOIN {variables_table_name} v on p.{primary_key} = v.{primary_key}
WHERE p.statins = 0 AND p.test_status = 1 AND p.ECG_date IS NOT NULL AND (p.CVD = 0 OR p.ECG_date < p.CVD_date);"""
cursor.execute(query_sql)

cnt = 0
for row in cursor:
    cnt += 1
    # First field is the participant key; the rest line up with baseline_columns.
    eid, *values = row
    missing_columns = [col for col, value in zip(baseline_columns, values) if value is None]
    if missing_columns:
        missing_values_dict[eid] = missing_columns

In [11]:
# Fields 6153 and 6177 are each applicable to a single gender only, so a NULL there
# is not a genuinely missing value. Strip them from every participant's list.
columns_to_remove = ['6153-0.0', '6177-0.0']

# Iterate over a snapshot of the keys because entries are deleted during the loop.
for eid in list(missing_values_dict):
    remaining = [
        col for col in missing_values_dict[eid]
        if not any(name in col for name in columns_to_remove)
    ]
    if remaining:
        missing_values_dict[eid] = remaining
    else:
        # Nothing is missing for this participant once the two fields are ignored.
        del missing_values_dict[eid]

# Tally, per baseline column, how many participants are missing it. Every column
# starts at 0 so that columns with no missing values are still printed.
missing_values_count = {col: 0 for col in baseline_columns}
for missing_columns in missing_values_dict.values():
    for col in missing_columns:
        missing_values_count[col] += 1

# `cnt` is the number of eligible rows scanned by the previous cell.
for col, cnt_missing in missing_values_count.items():
    print(f"{col}: {cnt_missing}/{cnt}")

v.`34-0.0`: 0/41274
v.`52-0.0`: 0/41274
v.`20116-0.0`: 85/41274
v.`31-0.0`: 0/41274
v.`2443-0.0`: 85/41274
v.`4080-0.0`: 196/41274
v.`21000-0.0`: 85/41274
v.`50-0.0`: 20/41274
v.`21002-0.0`: 1/41274
v.`6177-0.0`: 0/41274
v.`6153-0.0`: 0/41274
v.`6150-0.0`: 85/41274
v.`30690-0.0`: 3156/41274
v.`30760-0.0`: 5006/41274
v.`6138-0.0`: 85/41274
v.`22032-0.0`: 6740/41274


In [13]:
# Missing rate of column 30760-0.0 among the 41274 eligible participants
5006/41274

0.12128700877065465

In [1]:
# Missing rate of column 22032-0.0 among the 41274 eligible participants
6740/41274

0.16329892910791297