We analyze two types of variables here:
+ Confounding variables (such as age and BMI)
+ Process variables (such as the maximum load and the maximum heart rate reached during the fitness test)

In [7]:
import sqlite3
from tqdm import tqdm
import sys

sys.path.append("../..")

from utils.constants import DatabaseConfig, TableNames

In [8]:
# Name of the subject-identifier column used by the SELECT statements below.
primary_key = "eid"

# Open the database once; the same connection and cursor are reused by the
# cells that follow.
conn = sqlite3.connect(DatabaseConfig.DB_PATH)
cursor = conn.cursor()

# Confounding variables 
+ Create `ethnicity` column in `Processed` table to record the ethnicity of the subject. We only use the major categories (such as White or Mixed) rather than the minor sub-categories (such as British or Irish).
+ Create `BMI` column in `Processed` table to record the BMI of the subject.
+ Create `birth_date` column in `Processed` table to record the date of birth of the subject. Note this only includes the year and month, as the day is a restricted variable.
+ Create `hypertension_treatment` column in `Processed` table to record whether the subject self-reports hypertension diagnosed by a doctor, or is taking antihypertensive medication.

In [12]:
# Add an `ethnicity` column to the processed table and fill it with the
# major (top-level) ethnic group of every subject.
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN ethnicity INTEGER;
""")

query_sql = f"""
SELECT {primary_key}, `21000-0.0` 
FROM {TableNames.CONFOUNDERS};
"""
cursor.execute(query_sql)

# The statement does not change between rows, so build it once.
update_sql = f"UPDATE {TableNames.PROCESSED} SET ethnicity = ? WHERE {primary_key} = ?;"

for eid, ethnicity_full in tqdm(cursor.fetchall()):
    if ethnicity_full is None:
        # No answer recorded for this subject.
        ethnicity_major = None
    else:
        # Normalise first so that 1001, 1001.0 and "1001" are treated alike.
        ethnicity_code = int(float(ethnicity_full))
        if ethnicity_code < 0:
            # Negative values are special answers (-1 "Do not know",
            # -3 "Prefer not to answer" in UK Biobank data-coding 1001), not
            # ethnic groups, so they are stored as NULL.
            # NOTE(review): confirm NULL is preferred over keeping the raw code.
            ethnicity_major = None
        else:
            # The major group is the LEADING digit: 1 stays 1 and 1002 -> 1.
            # Taking the last digit would turn 1002 into group 2.
            ethnicity_major = int(str(ethnicity_code)[0])
    cursor.execute(update_sql, (ethnicity_major, eid))

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 103970.61it/s]


In [18]:
# Add a `BMI` column to the processed table and derive its value from the
# height (`50-0.0`) and body weight (`21002-0.0`) fields of each subject.
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN BMI REAL;
""")

query_sql = f"""
SELECT {primary_key}, `50-0.0` AS height, `21002-0.0` AS body_weight
FROM {TableNames.CONFOUNDERS};
"""
cursor.execute(query_sql)

for subject_id, height_raw, weight_raw in tqdm(cursor.fetchall()):
    # Without both measurements the BMI is left as NULL.
    measurements_present = height_raw is not None and weight_raw is not None
    # Height is divided by 100 before squaring (presumably cm -> m); the
    # result is kept to four decimal places.
    bmi_value = (
        round(weight_raw / (height_raw / 100) ** 2, 4)
        if measurements_present
        else None
    )
    cursor.execute(
        f"UPDATE {TableNames.PROCESSED} SET BMI = ? WHERE eid = ?;",
        (bmi_value, subject_id),
    )

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 109550.43it/s]


In [6]:
# Add a `birth_date` column to the processed table, built from the year
# (`34-0.0`) and month (`52-0.0`) of birth of each subject.
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN birth_date DATE;
""")

query_sql = f"""
SELECT {primary_key}, `34-0.0`, `52-0.0` 
FROM {TableNames.CONFOUNDERS};
"""
cursor.execute(query_sql)

# The statement does not change between rows, so build it once.
update_sql = f"UPDATE {TableNames.PROCESSED} SET birth_date = ? WHERE {primary_key} = ?;"

for eid, birth_year, birth_month in tqdm(cursor.fetchall()):
    if birth_year is None or birth_month is None:
        # A partial date cannot be formatted; store NULL instead of crashing
        # on the month or writing a string such as "None-05-01".
        birth_date = None
    else:
        # * A date needs a day component, but only year and month are
        #   available. We use 01 as the placeholder.
        # int() keeps the format spec valid if the values come back as REAL.
        birth_date = f"{int(birth_year):04d}-{int(birth_month):02d}-01"
    cursor.execute(update_sql, (birth_date, eid))

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 128652.33it/s]


In [22]:
# Add a `hypertension_treatment` column to the processed table. It is 1 when
# the subject reports high blood pressure or blood pressure medication,
# 0 otherwise, and NULL when the answers are not informative (see below).
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN hypertension_treatment INTEGER DEFAULT 0;
""")

# We will use data field 6177, 6153 and 6150
DISEASE_FIELD = "6150-0"
MEDICATION_FEMALE_FIELD = "6153-0"
MEDICATION_MALE_FIELD = "6177-0"
field_prefixes = (DISEASE_FIELD, MEDICATION_FEMALE_FIELD, MEDICATION_MALE_FIELD)

cursor.execute(f"PRAGMA table_info({TableNames.CONFOUNDERS});")
table_columns = [row[1] for row in cursor.fetchall()]

# Group the answer columns per field so that the row can be sliced by the
# number of columns actually found, not by hard-coded offsets.
columns_by_field = {
    prefix: [col for col in table_columns if col.startswith(prefix)]
    for prefix in field_prefixes
}
columns = [col for prefix in field_prefixes for col in columns_by_field[prefix]]
columns_escaped = [f"`{col}`" for col in columns]
print(columns_escaped)

query_sql = f"""
SELECT {primary_key}, {', '.join(columns_escaped)} 
FROM {TableNames.CONFOUNDERS};
"""
cursor.execute(query_sql)

# Row layout: eid, then 6150 columns, then 6153 columns, then 6177 columns.
disease_end = 1 + len(columns_by_field[DISEASE_FIELD])
female_end = disease_end + len(columns_by_field[MEDICATION_FEMALE_FIELD])

# The statement does not change between rows, so build it once.
update_sql = f"UPDATE {TableNames.PROCESSED} SET hypertension_treatment = ? WHERE {primary_key} = ?;"

for row in tqdm(cursor.fetchall()):
    eid = row[0]

    disease_reported = row[1:disease_end]  # 6150
    medication_female = row[disease_end:female_end]  # 6153
    medication_male = row[female_end:]  # 6177

    # medication_male and medication_female are incompatible:
    # at least one of the two lists should only have None values.
    male_all_none = all(x is None for x in medication_male)
    female_all_none = all(x is None for x in medication_female)
    if not (male_all_none or female_all_none):
        raise ValueError(
            f"Subject {eid} has answers in both medication fields (6153 and 6177)."
        )

    # 6177:
    # 1	Cholesterol lowering medication
    # 2	Blood pressure medication
    # 3	Insulin
    # -7	None of the above
    # -1	Do not know
    # -3	Prefer not to answer

    # 6153:
    # 1	Cholesterol lowering medication
    # 2	Blood pressure medication
    # 3	Insulin
    # 4	Hormone replacement therapy
    # 5	Oral contraceptive pill or minipill
    # -7	None of the above
    # -1	Do not know
    # -3	Prefer not to answer
    takes_bp_medication = 2 in medication_male or 2 in medication_female

    # 6150:
    # 1	Heart attack
    # 2	Angina
    # 3	Stroke
    # 4	High blood pressure
    # -7	None of the above
    # -3	Prefer not to answer
    reports_high_bp = 4 in disease_reported

    hypertension_treatment = 1 if (takes_bp_medication or reports_high_bp) else 0

    # Store NULL when the medication answer is unknown AND the disease answer
    # is -3 or -7.
    # NOTE(review): -7 is "None of the above", i.e. a definite answer rather
    # than an unknown one; confirm that treating it as unknown is intended.
    medication_unknown = any(code in (medication_male + medication_female) for code in (-1, -3))
    disease_unknown = any(code in disease_reported for code in (-3, -7))
    if medication_unknown and disease_unknown:
        hypertension_treatment = None

    cursor.execute(update_sql, (hypertension_treatment, eid))

conn.commit()

['`6150-0.0`', '`6150-0.1`', '`6150-0.2`', '`6150-0.3`', '`6153-0.0`', '`6153-0.1`', '`6153-0.2`', '`6153-0.3`', '`6177-0.0`', '`6177-0.1`', '`6177-0.2`']


100%|██████████| 77888/77888 [00:00<00:00, 94393.62it/s] 


# Process variables 

+ Create `test_status` column in `Processed` table to record the status of the fitness test. It will be True if the subject completed the test using the bike.

In [15]:
# Add a `test_status` column to the processed table. Every row starts at the
# column default of 1 and is switched off for subjects whose fitness test
# does not qualify.
cursor.execute(f"""
ALTER TABLE {TableNames.PROCESSED} ADD COLUMN test_status INTEGER DEFAULT 1;
""")

# We will use data field 6019 and 6020
query_sql = f"""
SELECT {primary_key}, `6019-0.0`, `6020-0.0` FROM {TableNames.ECG};
"""
cursor.execute(query_sql)

for subject_id, method_code, completion_code in tqdm(cursor.fetchall()):
    # 6019 - ECG/Bike method for fitness test.
    # Only code 1.0 is accepted (2.0: resting only).
    method_ok = method_code == 1.0
    # 6020 - Completion status of fitness test. Only code 1.0 is accepted:
    # 31.0: participant wanted to stop early
    # 32.0: participant reported chest-pain or other discomfort
    # 33.0: heart rate too high
    completion_ok = completion_code == 1.0

    if method_ok and completion_ok:
        # Nothing to write: the column default already marks the test as valid.
        continue

    # Exclude the subject: the bike was not used or the test was not completed.
    cursor.execute(
        f"UPDATE {TableNames.PROCESSED} SET test_status = ? WHERE eid = ?;",
        (False, subject_id),
    )

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 245921.61it/s]
