Using database, we determine whether each patient has CVD using ICD-10 codes.

In [8]:
import sqlite3
from tqdm import tqdm
from datetime import datetime
import sys

sys.path.append("../..")

from utils.constants import DatabaseConfig, TableNames

In [5]:
conn = sqlite3.connect(DatabaseConfig.DB_PATH)
cursor = conn.cursor()
primary_key = "eid"

## Determine whether the patient has CVD using ICD-10 codes

+ Create `CVD` column in `Processed` table to record if the subject has CVD diagnosed using ICD-10 codes.

In [8]:
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN CVD INTEGER DEFAULT 0;
""")

# We will use data field 41270.
cursor.execute(f"PRAGMA table_info({TableNames.ICD});")
columns = [row[1] for row in cursor.fetchall() if row[1].startswith('41270')]
columns_escaped = [f"`{col}`" for col in columns]

query_sql = f"""
SELECT {primary_key}, {', '.join(columns_escaped)} 
FROM {TableNames.ICD};
"""
cursor.execute(query_sql)

for row in tqdm(cursor.fetchall()):
    # Cursor will not include the column name
    eid = row[0]
    values = row[1:]

    CVD = 0
    for value in values:
        if value == "nan" or not value:
            continue
        if value.startswith("I"):
            # select first 2 digits
            value_two = int(value[1:3])
           
            # Ischamic Heart Disease
            if value_two >= 20 and value_two <= 25:
                CVD = 1
                break
            # Arrhythmia
            if value_two >= 44 and value_two <= 49:
                CVD = 1
                break
            # Cerebrovascular Disease
            if value_two >= 60 and value_two <= 69:
                CVD = 1
                break
            # Heart Failure
            if value_two == 50:
                CVD = 1
                break
            if value_two == 42:
                value_three = int(value[1:4])
                if value_three in [420, 428, 429]:
                    CVD = 1
                    break
    if CVD:
        # * Here we use UPDATE, since we don't have the entire column available at once.
        update_query = f"UPDATE {TableNames.PROCESSED} SET CVD = ? WHERE {primary_key} = ?"
        cursor.execute(update_query, (CVD, eid))

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 89793.86it/s]


## Determine the time that patients got the CVD

In [33]:
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN CVD_date DATE;
""")

# We will use data field 41270 and 41280.
cursor.execute(f"PRAGMA table_info({TableNames.ICD});")
columns = [row[1] for row in cursor.fetchall() if row[1].startswith('41270') or row[1].startswith('41280')]
columns_escaped = [f"`{col}`" for col in columns]

# For those with CVD, we need to find the earliest time of the CVD and record them
query_sql = f"""
SELECT i.{primary_key}, {', '.join(f'i.{col}' for col in columns_escaped)} 
FROM {TableNames.ICD} i INNER JOIN {TableNames.PROCESSED} p ON i.{primary_key} = p.{primary_key}
WHERE p.CVD = 1;
"""
cursor.execute(query_sql)

# For each row, find the earliest time of the CVD
for row in tqdm(cursor.fetchall()):
    eid = row[0]
    values = row[1:]
    values_41270 = values[:215]
    values_41280 = values[215:]

    date_list = []
    for i, value in enumerate(values_41270):
        if not value:
            continue
        if not value.startswith("I"):
            continue
        # select first 2 digits
        value_two = int(value[1:3])
        try:
            date_i = datetime.strptime(values_41280[i], "%Y-%m-%d").date()
        except ValueError:
            print(value, values_41280[i])
        
        # Ischamic Heart Disease
        if value_two >= 20 and value_two <= 25:
            date_list.append(date_i)
        # Arrhythmia
        if value_two >= 44 and value_two <= 49:
            date_list.append(date_i)
        # Cerebrovascular Disease
        if value_two >= 60 and value_two <= 69:
            date_list.append(date_i)
        # Heart Failure
        if value_two == 50:
            date_list.append(date_i)
        if value_two == 42:
            value_three = int(value[1:4])
            if value_three in [420, 428, 429]:
                date_list.append(date_i)

    date_earliest = min(date_list)

    # * Here we use UPDATE, since we don't have the entire column available at once.
    update_query = f"UPDATE {TableNames.PROCESSED} SET CVD_date = ? WHERE {primary_key} = ?"
    cursor.execute(update_query, (date_earliest, eid))

conn.commit()

 90%|█████████ | 14658/16247 [00:00<00:00, 19308.22it/s]

I10 nan
I209 nan
I251 nan
I259 nan
I451 nan
I48 nan
I501 nan


100%|██████████| 16247/16247 [00:00<00:00, 19266.59it/s]


# Determine whether the patient has statins at baseline

+ Create `statins` column in `Processed` table to record if the subject has statins at baseline. These patients need to be excluded.


In [None]:
medication_encoding = [
    1141146234,  # Atorvastatin
    1141192410,  # Rosuvastatin
    1140861958,  # Simvastatin
    1140888648,  # Pravastatin
    1140888594   # Fluvastatin
]

In [7]:
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN statins INTEGER DEFAULT 0;
""")

# We will use data field 20003
cursor.execute(f"PRAGMA table_info({TableNames.ICD});")
columns = [row[1] for row in cursor.fetchall() if row[1].startswith('20003-0')]
columns_escaped = [f"`{col}`" for col in columns]

query_sql = f"""
SELECT {primary_key}, {', '.join(columns_escaped)} 
FROM {TableNames.ICD};
"""
cursor.execute(query_sql)

for row in tqdm(cursor.fetchall()):
    eid = row[0]
    values = row[1:]
    for value in values:
        if value in medication_encoding:
            update_query = f"UPDATE {TableNames.PROCESSED} SET statins = 1 WHERE {primary_key} = ?"
            cursor.execute(update_query, (eid,))
            break

conn.commit()

100%|██████████| 77888/77888 [00:01<00:00, 69353.55it/s]


# Determine the censoring date
+ Create `Censor_date` column in `Processed` table to record the censoring date, which will be the latest date recorded in ICD-10 field.

In [9]:
cursor.execute(f"PRAGMA table_info({TableNames.ICD});")
# We will use data field 41280
columns = [row[1] for row in cursor.fetchall() if row[1].startswith('41280')]
columns_escaped = [f"`{col}`" for col in columns]

query_sql = f"""
SELECT {primary_key}, {', '.join(columns_escaped)} 
FROM {TableNames.ICD};
"""
cursor.execute(query_sql)

date_largest = None

for row in tqdm(cursor):
    eid = row[0]
    values = row[1:]
    
    for value in values:
        if value == "nan":
            break
        if value:
            if not date_largest:
                date_largest = value
            else:
                if datetime.strptime(value, "%Y-%m-%d") > datetime.strptime(date_largest, "%Y-%m-%d"):
                    date_largest = value
                    print(f"Largest date updated: {date_largest}")

0it [00:00, ?it/s]

661it [00:00, 6586.45it/s]

Largest date updated: 2021-12-03
Largest date updated: 2022-01-21
Largest date updated: 2022-08-20
Largest date updated: 2022-08-22
Largest date updated: 2022-09-10
Largest date updated: 2022-10-25
Largest date updated: 2022-10-29
Largest date updated: 2022-10-31


77888it [00:11, 6608.17it/s]


Therefore, the censoring date is 2022-10-31.

In [10]:
# * Since the censoring date should be the same for all participants, we will use INSERT instead of UPDATE.
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN Censor_date DATE DEFAULT '2022-10-31';
""")
conn.commit()