Using the database, we determine whether each patient has cardiovascular disease (CVD) based on their ICD-10 codes.

In [1]:
import sqlite3
from tqdm import tqdm
import sys

sys.path.append("..")

import importlib
module = importlib.import_module('utils.constants')
importlib.reload(module)

from utils.constants import db_file, ICD_table_name, processed_table_name, primary_key

## Determine whether the patient has CVD using ICD-10 codes

In [7]:
# Open the SQLite database and get a cursor for the queries below
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# PRAGMA table_info returns one record per column; field 1 is the column name
cursor.execute(f"PRAGMA table_info({ICD_table_name});")
columns = []
for column_info in cursor.fetchall():
    column_name = column_info[1]
    # Keep only the columns whose name starts with 41270
    if column_name.startswith('41270'):
        columns.append(column_name)

# Backtick-quote every name so it can be embedded in SQL (the names contain '-' and '.')
columns_escaped = [f"`{name}`" for name in columns]
print(columns_escaped)

['`41270-0.0`', '`41270-0.1`', '`41270-0.2`', '`41270-0.3`', '`41270-0.4`', '`41270-0.5`', '`41270-0.6`', '`41270-0.7`', '`41270-0.8`', '`41270-0.9`', '`41270-0.10`', '`41270-0.11`', '`41270-0.12`', '`41270-0.13`', '`41270-0.14`', '`41270-0.15`', '`41270-0.16`', '`41270-0.17`', '`41270-0.18`', '`41270-0.19`', '`41270-0.20`', '`41270-0.21`', '`41270-0.22`', '`41270-0.23`', '`41270-0.24`', '`41270-0.25`', '`41270-0.26`', '`41270-0.27`', '`41270-0.28`', '`41270-0.29`', '`41270-0.30`', '`41270-0.31`', '`41270-0.32`', '`41270-0.33`', '`41270-0.34`', '`41270-0.35`', '`41270-0.36`', '`41270-0.37`', '`41270-0.38`', '`41270-0.39`', '`41270-0.40`', '`41270-0.41`', '`41270-0.42`', '`41270-0.43`', '`41270-0.44`', '`41270-0.45`', '`41270-0.46`', '`41270-0.47`', '`41270-0.48`', '`41270-0.49`', '`41270-0.50`', '`41270-0.51`', '`41270-0.52`', '`41270-0.53`', '`41270-0.54`', '`41270-0.55`', '`41270-0.56`', '`41270-0.57`', '`41270-0.58`', '`41270-0.59`', '`41270-0.60`', '`41270-0.61`', '`41270-0.62`', '

In [8]:
# Create the CVD flag column only when it is missing, so that re-running this
# cell does not fail with "duplicate column name".
cursor.execute(f"PRAGMA table_info({processed_table_name});")
if "CVD" not in [info[1] for info in cursor.fetchall()]:
    cursor.execute(f"ALTER TABLE {processed_table_name} ADD COLUMN CVD INTEGER DEFAULT 0;")


def is_cvd_code(code):
    """Return True when an ICD-10 code falls in one of the CVD groups used here.

    Groups, matched on the digits that follow the leading "I":
      * I20-I25  Ischemic heart disease
      * I44-I49  Arrhythmia
      * I60-I69  Cerebrovascular disease
      * I50      Heart failure
      * I420, I428, I429

    Empty values, the "nan" placeholder, non-string values and codes whose
    digits cannot be parsed are never counted as CVD.
    """
    if not isinstance(code, str) or not code.startswith("I"):
        return False
    try:
        two_digits = int(code[1:3])
    except ValueError:
        return False

    if 20 <= two_digits <= 25 or 44 <= two_digits <= 49 or 60 <= two_digits <= 69:
        return True
    if two_digits == 50:
        return True
    if two_digits == 42:
        # Only three specific sub-codes of I42 are included
        try:
            return int(code[1:4]) in (420, 428, 429)
        except ValueError:
            return False
    return False


select_query = f"SELECT {primary_key}, {', '.join(columns_escaped)} FROM {ICD_table_name};"
cursor.execute(select_query)

# The statement text is loop-invariant, so build it once
update_query = f"UPDATE {processed_table_name} SET CVD = ? WHERE {primary_key} = ?"

for row in tqdm(cursor.fetchall()):
    # The first field is the participant id; the remaining fields are ICD-10 codes
    eid = row[0]
    values = row[1:]

    # any() short-circuits on the first matching code
    CVD = int(any(is_cvd_code(value) for value in values))
    if CVD:
        # Rows without a matching code keep the column default of 0
        cursor.execute(update_query, (CVD, eid))

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 89793.86it/s]


In [10]:
import pandas as pd

# Fresh connection and cursor for reading the CVD flag back
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# fetchall() yields one 1-tuple per row of the processed table
CVD_list = cursor.execute(f"SELECT CVD FROM {processed_table_name};").fetchall()

In [None]:
# Tally how many rows carry each CVD value
df = pd.DataFrame(data=CVD_list)
df.value_counts()

0    61641
1    16247
Name: count, dtype: int64

## Determine the date of each patient's earliest CVD diagnosis

In [22]:
# Reconnect and get a cursor
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

cursor.execute(f"PRAGMA table_info({ICD_table_name});")
# Keep the columns whose name begins with 41270 or with 41280
# (str.startswith accepts a tuple of prefixes)
wanted_prefixes = ('41270', '41280')
columns = [info[1] for info in cursor.fetchall() if info[1].startswith(wanted_prefixes)]
columns_escaped = [f"`{name}`" for name in columns]
print(columns_escaped)

['`41270-0.0`', '`41270-0.1`', '`41270-0.2`', '`41270-0.3`', '`41270-0.4`', '`41270-0.5`', '`41270-0.6`', '`41270-0.7`', '`41270-0.8`', '`41270-0.9`', '`41270-0.10`', '`41270-0.11`', '`41270-0.12`', '`41270-0.13`', '`41270-0.14`', '`41270-0.15`', '`41270-0.16`', '`41270-0.17`', '`41270-0.18`', '`41270-0.19`', '`41270-0.20`', '`41270-0.21`', '`41270-0.22`', '`41270-0.23`', '`41270-0.24`', '`41270-0.25`', '`41270-0.26`', '`41270-0.27`', '`41270-0.28`', '`41270-0.29`', '`41270-0.30`', '`41270-0.31`', '`41270-0.32`', '`41270-0.33`', '`41270-0.34`', '`41270-0.35`', '`41270-0.36`', '`41270-0.37`', '`41270-0.38`', '`41270-0.39`', '`41270-0.40`', '`41270-0.41`', '`41270-0.42`', '`41270-0.43`', '`41270-0.44`', '`41270-0.45`', '`41270-0.46`', '`41270-0.47`', '`41270-0.48`', '`41270-0.49`', '`41270-0.50`', '`41270-0.51`', '`41270-0.52`', '`41270-0.53`', '`41270-0.54`', '`41270-0.55`', '`41270-0.56`', '`41270-0.57`', '`41270-0.58`', '`41270-0.59`', '`41270-0.60`', '`41270-0.61`', '`41270-0.62`', '

In [17]:
from sqlite3 import OperationalError

# Date columns 41280-0.215 ... 41280-0.258 are removed from the ICD table.
# NOTE(review): presumably this makes the number of 41280 columns equal to the
# number of 41270 columns (a later cell splits the row at index 215) - confirm.
columns = [f'"41280-0.{i}"' for i in range(215, 259)]

# Use the configured table name, as every other cell does, instead of a
# hard-coded "ICD10".
sql_statements = [f"ALTER TABLE {ICD_table_name} DROP COLUMN {col};" for col in columns]

# Still best effort (a failing statement does not stop the loop), but failures
# are recorded and reported instead of being silently discarded.
failed_drops = []
for sql in tqdm(sql_statements):
    try:
        cursor.execute(sql)
    except OperationalError as err:
        failed_drops.append((sql, str(err)))

conn.commit()

if failed_drops:
    print(f"{len(failed_drops)} of {len(sql_statements)} DROP COLUMN statements failed:")
    for failed_sql, reason in failed_drops:
        print(f"  {failed_sql} -> {reason}")

100%|██████████| 44/44 [12:09<00:00, 16.58s/it]


In [33]:
from datetime import datetime

# Make sure the target column exists; it is only added when missing, so the
# cell works on a fresh database and can also be re-run.
cursor.execute(f"PRAGMA table_info({processed_table_name});")
if "CVD_date" not in [info[1] for info in cursor.fetchall()]:
    cursor.execute(f"ALTER TABLE {processed_table_name} ADD COLUMN CVD_date DATE;")


def is_cvd_code(code):
    """Return True when an ICD-10 code falls in one of the CVD groups used here.

    Groups, matched on the digits that follow the leading "I":
      * I20-I25  Ischemic heart disease
      * I44-I49  Arrhythmia
      * I60-I69  Cerebrovascular disease
      * I50      Heart failure
      * I420, I428, I429

    Empty values, non-string values and codes whose digits cannot be parsed
    are never counted as CVD.
    """
    if not isinstance(code, str) or not code.startswith("I"):
        return False
    try:
        two_digits = int(code[1:3])
    except ValueError:
        return False

    if 20 <= two_digits <= 25 or 44 <= two_digits <= 49 or 60 <= two_digits <= 69:
        return True
    if two_digits == 50:
        return True
    if two_digits == 42:
        # Only three specific sub-codes of I42 are included
        try:
            return int(code[1:4]) in (420, 428, 429)
        except ValueError:
            return False
    return False


# Pair every 41270 (code) column with the 41280 (date) column that has the same
# instance/array suffix, instead of relying on a hard-coded split position.
code_columns = [col for col in columns_escaped if col.startswith("`41270")]
date_columns = [col.replace("41270", "41280", 1) for col in code_columns]
unmatched = [col for col in date_columns if col not in columns_escaped]
if not code_columns or unmatched:
    raise ValueError(
        "columns_escaped must contain the 41270 columns and their matching 41280 columns; "
        f"found {len(code_columns)} code columns, missing date columns: {unmatched[:5]}"
    )
n_codes = len(code_columns)
selected_columns = code_columns + date_columns

# Only participants already flagged with CVD = 1 are considered
query_sql = f"""
SELECT i.{primary_key}, {', '.join(f'i.{col}' for col in selected_columns)}
FROM {ICD_table_name} i INNER JOIN {processed_table_name} p ON i.{primary_key} = p.{primary_key}
WHERE p.CVD = 1;
"""
cursor.execute(query_sql)

update_query = f"UPDATE {processed_table_name} SET CVD_date = ? WHERE {primary_key} = ?"
eids_without_date = []

# For each participant, find the earliest date attached to a CVD code
for row in tqdm(cursor.fetchall()):
    eid = row[0]
    values = row[1:]
    values_41270 = values[:n_codes]
    values_41280 = values[n_codes:]

    date_list = []
    for code, raw_date in zip(values_41270, values_41280):
        if not is_cvd_code(code):
            continue
        try:
            date_list.append(datetime.strptime(raw_date, "%Y-%m-%d").date())
        except (TypeError, ValueError):
            # Missing or unparseable date (e.g. "nan"): report it and skip this
            # code, so that no date belonging to another code is used in its place.
            print(code, raw_date)

    if not date_list:
        # No CVD code with a usable date: leave CVD_date unset for this participant
        eids_without_date.append(eid)
        continue

    date_earliest = min(date_list)
    # Store the ISO string explicitly instead of relying on sqlite3's implicit
    # date adapter (deprecated since Python 3.12); the stored text is the same.
    cursor.execute(update_query, (date_earliest.isoformat(), eid))

conn.commit()

if eids_without_date:
    print(f"{len(eids_without_date)} participants flagged with CVD have no usable CVD date")

 90%|█████████ | 14658/16247 [00:00<00:00, 19308.22it/s]

I10 nan
I209 nan
I251 nan
I259 nan
I451 nan
I48 nan
I501 nan


100%|██████████| 16247/16247 [00:00<00:00, 19266.59it/s]


Note that some patients were diagnosed with CVD before the baseline and need to be excluded.

In [47]:
# Reconnect and get a cursor
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Restricted to rows with test_status = 1 and a non-NULL ECG_date, and grouped
# by the CVD flag: the group size, and the number of rows whose ECG_date is
# strictly earlier than CVD_date. The group value itself is not part of the
# selected columns.
query_sql = f"""
SELECT COUNT(*) AS total,
       SUM(CASE WHEN ECG_date < CVD_date THEN 1 ELSE 0 END) as ECG_before_CVD
       FROM {processed_table_name}
       WHERE test_status = 1 AND ECG_date IS NOT NULL
       GROUP BY CVD;
"""
print(cursor.execute(query_sql).fetchall())

[(40531, 0), (9077, 6805)]


Some patients were also taking statins at baseline and need to be excluded.

In [7]:
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Medication codes that are treated as statins
medication_encoding = [
    1141146234, # Atorvastatin
    1141192410, # Rosuvastatin
    1140861958, # Simvastatin
    1140888648, # Pravastatin
    1140888594  # Fluvastatin
]
# Set copy for constant-time membership tests inside the loop.
# NOTE(review): matching relies on the stored values comparing equal to these
# integers; values stored as text would never match - confirm the column type.
statin_codes = set(medication_encoding)

cursor.execute(f"PRAGMA table_info({ICD_table_name});")
# Choose all columns whose name starts with 20003-0 (the baseline medication columns)
columns = [info[1] for info in cursor.fetchall() if info[1].startswith('20003-0')]
columns_escaped = [f"`{col}`" for col in columns]

# Add the statins flag only when it is missing, so that re-running this cell
# does not fail with "duplicate column name".
cursor.execute(f"PRAGMA table_info({processed_table_name});")
if "statins" not in [info[1] for info in cursor.fetchall()]:
    cursor.execute(f"ALTER TABLE {processed_table_name} ADD COLUMN statins INTEGER DEFAULT 0;")

query_sql = f"""
SELECT {primary_key}, {', '.join(columns_escaped)} FROM {ICD_table_name};
"""
cursor.execute(query_sql)

# The statement text is loop-invariant, so build it once
update_query = f"UPDATE {processed_table_name} SET statins = 1 WHERE {primary_key} = ?"

for row in tqdm(cursor.fetchall()):
    # The first field is the participant id; the remaining fields are medication codes
    eid = row[0]
    values = row[1:]
    # any() stops at the first statin found, so each participant triggers at
    # most one UPDATE even when several statins are recorded
    if any(value in statin_codes for value in values):
        cursor.execute(update_query, (eid,))

conn.commit()

100%|██████████| 77888/77888 [00:01<00:00, 69353.55it/s]


Finally, we choose the latest date recorded in the ICD-10 date fields as the censoring date.

In [7]:
# Reconnect and get a cursor
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Field 1 of each PRAGMA table_info record is the column name
cursor.execute(f"PRAGMA table_info({ICD_table_name});")
all_column_names = [info[1] for info in cursor.fetchall()]

# Keep only the columns whose name starts with 41280
columns = [name for name in all_column_names if name.startswith('41280')]
columns_escaped = [f"`{name}`" for name in columns]
print(columns_escaped)

['`41280-0.0`', '`41280-0.1`', '`41280-0.2`', '`41280-0.3`', '`41280-0.4`', '`41280-0.5`', '`41280-0.6`', '`41280-0.7`', '`41280-0.8`', '`41280-0.9`', '`41280-0.10`', '`41280-0.11`', '`41280-0.12`', '`41280-0.13`', '`41280-0.14`', '`41280-0.15`', '`41280-0.16`', '`41280-0.17`', '`41280-0.18`', '`41280-0.19`', '`41280-0.20`', '`41280-0.21`', '`41280-0.22`', '`41280-0.23`', '`41280-0.24`', '`41280-0.25`', '`41280-0.26`', '`41280-0.27`', '`41280-0.28`', '`41280-0.29`', '`41280-0.30`', '`41280-0.31`', '`41280-0.32`', '`41280-0.33`', '`41280-0.34`', '`41280-0.35`', '`41280-0.36`', '`41280-0.37`', '`41280-0.38`', '`41280-0.39`', '`41280-0.40`', '`41280-0.41`', '`41280-0.42`', '`41280-0.43`', '`41280-0.44`', '`41280-0.45`', '`41280-0.46`', '`41280-0.47`', '`41280-0.48`', '`41280-0.49`', '`41280-0.50`', '`41280-0.51`', '`41280-0.52`', '`41280-0.53`', '`41280-0.54`', '`41280-0.55`', '`41280-0.56`', '`41280-0.57`', '`41280-0.58`', '`41280-0.59`', '`41280-0.60`', '`41280-0.61`', '`41280-0.62`', '

In [9]:
from datetime import datetime

query_sql = f"""
SELECT {primary_key}, {', '.join(columns_escaped)} FROM {ICD_table_name};
"""
cursor.execute(query_sql)

# Running maximum over every date seen so far, kept in its original string form
date_largest = None

# Iterate over the cursor directly so rows are consumed one at a time
for row in tqdm(cursor):
    eid, values = row[0], row[1:]

    for value in values:
        # The first "nan" ends the scan of this row
        if value == "nan":
            break
        # Skip empty values
        if not value:
            continue
        # The very first date becomes the initial maximum
        if not date_largest:
            date_largest = value
            continue
        # Compare as parsed dates; report every time the maximum moves forward
        if datetime.strptime(value, "%Y-%m-%d") > datetime.strptime(date_largest, "%Y-%m-%d"):
            date_largest = value
            print(f"Largest date updated: {date_largest}")

662it [00:00, 3324.05it/s]

Largest date updated: 2021-12-03
Largest date updated: 2022-01-21
Largest date updated: 2022-08-20
Largest date updated: 2022-08-22
Largest date updated: 2022-09-10
Largest date updated: 2022-10-25
Largest date updated: 2022-10-29


1379it [00:00, 3496.45it/s]

Largest date updated: 2022-10-31


77888it [00:25, 3102.12it/s]


Therefore, the censoring date is 2022-10-31.