We analyze two types of variables here:
+ Confounding variables (such as age and BMI)
+ Process variables (such as maximum load and maximum heart rate during fitness test)

In [1]:
import sqlite3
import json
from tqdm import tqdm
import sys
sys.path.append("..")
from utils.sql_utils import print_table_info

# reimport the sql_utils
import importlib
module = importlib.import_module('utils.sql_utils')
importlib.reload(module)
from utils.sql_utils import print_table_info, update_table_from_csv

In [3]:
# Notebook-wide configuration.
# Path of the SQLite database opened with sqlite3.connect() in later cells.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere without editing it.
db_file = "/work/users/y/u/yuukias/BIOS-Material/BIOS992/data/ukbiobank.db"
# Table that the SELECT queries in this notebook read from.
variables_table_name = 'Variables'
# Table that receives the derived `test_status` flag in the "Process variables" section.
processed_table_name = 'Processed'
# Name of the participant-ID column; not referenced by the cells visible in this part of the notebook.
primary_key = 'eid'

In [20]:
# Call the project helper on the Variables table with the ID list [189].
# NOTE(review): presumably this loads the columns of field 189 from a CSV export — confirm in utils/sql_utils.py.
# The recorded output reports that 0 new columns were added.
update_table_from_csv(variables_table_name, [189])

[]
There will be 0 new columns added to the table: Variables
['eid']


# Confounding variables 

We determine whether there are any missing values in the variables table.

In [3]:
# Show the schema of the Variables table via the project helper; the recorded
# output lists one tuple per column (index, name, declared type, ...).
print_table_info(variables_table_name)

(0, 'eid', 'INTEGER', 0, None, 1)
(1, '31-0.0', '', 0, None, 0)
(2, '34-0.0', '', 0, None, 0)
(3, '50-0.0', '', 0, None, 0)
(4, '50-1.0', '', 0, None, 0)
(5, '50-2.0', '', 0, None, 0)
(6, '50-3.0', '', 0, None, 0)
(7, '52-0.0', '', 0, None, 0)
(8, '2443-0.0', '', 0, None, 0)
(9, '2443-1.0', '', 0, None, 0)
(10, '2443-2.0', '', 0, None, 0)
(11, '2443-3.0', '', 0, None, 0)
(12, '4080-0.0', '', 0, None, 0)
(13, '4080-0.1', '', 0, None, 0)
(14, '4080-1.0', '', 0, None, 0)
(15, '4080-1.1', '', 0, None, 0)
(16, '4080-2.0', '', 0, None, 0)
(17, '4080-2.1', '', 0, None, 0)
(18, '4080-3.0', '', 0, None, 0)
(19, '4080-3.1', '', 0, None, 0)
(20, '5983-0.0', '', 0, None, 0)
(21, '5983-0.1', '', 0, None, 0)
(22, '5983-0.2', '', 0, None, 0)
(23, '5983-0.3', '', 0, None, 0)
(24, '5983-0.4', '', 0, None, 0)
(25, '5983-0.5', '', 0, None, 0)
(26, '5983-0.6', '', 0, None, 0)
(27, '5983-0.7', '', 0, None, 0)
(28, '5983-0.8', '', 0, None, 0)
(29, '5983-0.9', '', 0, None, 0)
(30, '5983-0.10', '', 0, None, 0

In [22]:
# For every participant, find which baseline (`-0.0`) confounder columns are NULL.
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

# Field IDs of the candidate confounding variables. Only the `<ID>-0.0`
# column of each field is inspected.
selected_columns_ID = [
    34,
    52,
    20116,
    31,
    2443,
    4080,
    21000,
    50,
    21002,
    6177,
    6153,
    6150,
    30690,
    30760,
    6138,
    22032
]

# Column names such as 34-0.0 are not valid bare SQL identifiers, so they are
# back-tick quoted. Later cells rely on the quoted form being stored as-is.
baseline_columns = [f"`{column_ID}-0.0`" for column_ID in selected_columns_ID]

# eid -> list of (back-tick quoted) column names that are NULL for that participant.
# Participants without any NULL are not added.
missing_values_dict = {}

try:
    # Read the row count from the table instead of hard-coding it, so the
    # progress bar stays correct when the table content changes.
    cursor.execute(f"SELECT COUNT(*) FROM {variables_table_name};")
    total_rows = cursor.fetchone()[0]

    query_sql = f"SELECT eid, {', '.join(baseline_columns)} FROM {variables_table_name};"
    cursor.execute(query_sql)

    with tqdm(total=total_rows) as pbar:
        # Iterate the cursor directly so rows are streamed rather than loaded at once.
        for row in cursor:
            eid = row[0]
            # row[1:] is aligned with baseline_columns, so positions map directly to names.
            missing_columns_index = [i for i, value in enumerate(row[1:]) if value is None]
            missing_columns = [baseline_columns[i] for i in missing_columns_index]
            if missing_columns:
                missing_values_dict[eid] = missing_columns

            pbar.update(1)
finally:
    # The "Process variables" section opens its own connection, so release
    # this one instead of leaving it dangling when `conn` is rebound.
    conn.close()

  0%|          | 0/77888 [00:00<?, ?it/s]

100%|██████████| 77888/77888 [01:28<00:00, 880.48it/s] 


In [29]:
# Fields 6153 and 6177 are each applicable to a single gender only, so a NULL
# there is expected rather than missing. Drop them from every participant's list.
columns_to_remove = ['6153-0.0', '6177-0.0']

# Iterate over a snapshot of the keys because entries are deleted inside the loop.
for eid in list(missing_values_dict.keys()):
    # Compare the whole column name (back-ticks stripped) against the list above.
    # A substring test on the field ID would also drop unrelated columns whose
    # name merely contains those digits.
    missing_values_dict[eid] = [col for col in missing_values_dict[eid]
                                if col.strip('`') not in columns_to_remove]

    # A participant whose only gaps were in the removed columns no longer
    # counts as having missing values.
    if not missing_values_dict[eid]:
        del missing_values_dict[eid]

In [30]:
# Report how many participants still have gaps, then list the affected field
# IDs (column name without back-ticks, text before the first '-') per participant.
print(f"There are {len(missing_values_dict)} participants with missing values.")
for eid, missing_columns in missing_values_dict.items():
    field_ids = []
    for quoted_name in missing_columns:
        bare_name = quoted_name.replace('`', '')
        field_ids.append(bare_name.split('-')[0])
    print(f"eid: {eid}, Missing columns: {field_ids}")

There are 22634 participants with missing values.
eid: 1000361, Missing columns: ['30690', '30760']
eid: 1000587, Missing columns: ['30690', '30760']
eid: 1000659, Missing columns: ['22032']
eid: 1000737, Missing columns: ['30760']
eid: 1000928, Missing columns: ['30690', '30760']
eid: 1001249, Missing columns: ['22032']
eid: 1001324, Missing columns: ['22032']
eid: 1002260, Missing columns: ['30760']
eid: 1002613, Missing columns: ['22032']
eid: 1002778, Missing columns: ['30760', '22032']
eid: 1002863, Missing columns: ['22032']
eid: 1002908, Missing columns: ['30690', '30760']
eid: 1003273, Missing columns: ['22032']
eid: 1003286, Missing columns: ['22032']
eid: 1003295, Missing columns: ['22032']
eid: 1003328, Missing columns: ['30760', '22032']
eid: 1003534, Missing columns: ['30760']
eid: 1003727, Missing columns: ['30760']
eid: 1003821, Missing columns: ['22032']
eid: 1003870, Missing columns: ['22032']
eid: 1003952, Missing columns: ['30760']
eid: 1004276, Missing columns: ['22

In [32]:
# Tally, for each baseline column, how many participants are missing it.
# Every column starts at zero so that complete columns are reported as well.
missing_values_count = dict.fromkeys(baseline_columns, 0)
for column_list in missing_values_dict.values():
    for column_name in column_list:
        missing_values_count[column_name] = missing_values_count[column_name] + 1

# One "<column>: <count>" line per column, in the order of baseline_columns.
for column_name in missing_values_count:
    print(f"{column_name}: {missing_values_count[column_name]}")

`34-0.0`: 0
`52-0.0`: 0
`20116-0.0`: 209
`31-0.0`: 0
`2443-0.0`: 209
`4080-0.0`: 458
`21000-0.0`: 209
`50-0.0`: 162
`21002-0.0`: 223
`6177-0.0`: 0
`6153-0.0`: 0
`6150-0.0`: 209
`30690-0.0`: 6238
`30760-0.0`: 9657
`6138-0.0`: 209
`22032-0.0`: 14241


Column `22032` (IPAQ activity group) has by far the most missing values: 14,241 of the 77,888 participants. Therefore, we will not use it.

# Process variables 

In [12]:
# Open a connection and cursor for the "Process variables" section; the cells
# below use `cursor` for their queries and `conn` to commit updates.
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

In [15]:
# Flag participants whose fitness test cannot be used. `test_status` keeps its
# column default (1) unless the participant did not use the bike or did not
# complete the test, in which case it is set to 0.

# Create the flag column on first run so the cell also works on a fresh
# database (previously this required un-commenting an ALTER TABLE by hand).
cursor.execute(f"PRAGMA table_info({processed_table_name});")
existing_columns = {column_info[1] for column_info in cursor.fetchall()}  # index 1 is the column name
if "test_status" not in existing_columns:
    cursor.execute(f"ALTER TABLE {processed_table_name} ADD COLUMN test_status INTEGER DEFAULT 1;")

query_sql = f"SELECT eid, `6019-0.0`, `6020-0.0` FROM {variables_table_name};"
cursor.execute(query_sql)

# NOTE(review): the comparisons below assume the values come back as numbers.
# If they were stored as text, every participant would be flagged — confirm.
failed_eids = []
for row in tqdm(cursor.fetchall()):
    eid = row[0]
    # ECG/Bike method for fitness test: anything other than 1.0 is excluded
    # (e.g. 2.0: resting only). NULL is excluded as well.
    used_bike = row[1] == 1.0
    # Completion status of fitness test: anything other than 1.0 is excluded
    # 31.0: participant wanted to stop early
    # 32.0: participant reported chest-pain or other discomfort
    # 33.0: heart rate too high
    completed_test = row[2] == 1.0

    if not (used_bike and completed_test):
        failed_eids.append(eid)

# Write all failures in one batch instead of one execute() call per participant.
update_query = f"UPDATE {processed_table_name} SET test_status = ? WHERE eid = ?;"
cursor.executemany(update_query, [(False, failed_eid) for failed_eid in failed_eids])

conn.commit()

100%|██████████| 77888/77888 [00:00<00:00, 245921.61it/s]


The following columns each have three indices (0 to 2), e.g. `5991-0.0`, `5991-0.1` and `5991-0.2`.

In [None]:
# Issue the same single-column SELECT for indices 0-2 of the phase-name field
# (5991) and then of the phase-duration field (5992), in that order.
# NOTE(review): nothing is fetched between the calls, and each execute()
# supersedes the cursor's previous result set, so only the rows of the last
# query remain readable afterwards — confirm whether that is intended.
for field_id in (5991, 5992):
    for array_index in range(3):
        cursor.execute(f"SELECT `{field_id}-0.{array_index}` FROM {variables_table_name}")