Describe the characteristics of the participants for the table in result section.

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import sys
sys.path.append("../..")

from utils.constants import db_file, processed_table_name, variables_table_name, primary_key

# Read from Database

In [2]:
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

In [18]:
query_sql = f"""
SELECT 
    p.`{primary_key}` AS eid,
    -- Demographics
    v.`34-0.0` AS birth_year,
    v.`52-0.0` AS birth_month,
    v.`31-0.0` AS sex,
    v.`21000-0.0` AS ethnicity,
    v.`6138-0.0` AS education,
    
    -- Clinical Parameters
    v.`50-0.0` AS height,
    v.`21002-0.0` AS weight,
    v.`4080-0.0` AS systolic_bp,
    v.`2443-0.0` AS diabetes_status,
    
    -- Medications
    v.`6177-0.0` AS med_male,
    v.`6153-0.0` AS med_female,
    v.`6150-0.0` AS diagnosed_problem,
    
    -- Laboratory Values
    v.`30690-0.0` AS total_cholesterol,
    v.`30760-0.0` AS hdl_cholesterol,
    
    -- Lifestyle
    `20116-0.0` AS smoking_status,
    `22032-0.0` AS ipaq_group,

    -- CVD Status
    p.CVD AS CVD,
    p.CVD_date AS CVD_date,
    p.ECG_date AS ECG_date
FROM {variables_table_name} v
INNER JOIN {processed_table_name} p ON v.`{primary_key}` = p.`{primary_key}`
WHERE p.statins = 0 
AND p.test_status = 1 
AND p.ECG_date IS NOT NULL 
AND (p.CVD_date IS NULL OR p.ECG_date < p.CVD_date);
"""
cursor.execute(query_sql)
df_raw = cursor.fetchall()

In [25]:
columns = [
    # Demographics
    'eid',                  # 参与者ID
    'birth_year',          # 出生年份
    'birth_month',         # 出生月份
    'sex',                 # 性别
    'ethnicity',           # 种族
    'education',           # 教育水平
    
    # Clinical Parameters
    'height',              # 身高
    'weight',              # 体重
    'systolic_bp',         # 收缩压
    'diabetes_status',     # 糖尿病状态
    
    # Medications
    'med_male',            # 男性用药
    'med_female',          # 女性用药
    'diagnosed_problem',   # 诊断问题
    
    # Laboratory Values
    'total_cholesterol',   # 总胆固醇
    'hdl_cholesterol',     # 高密度脂蛋白胆固醇
    
    # Lifestyle
    'smoking_status',      # 吸烟状态
    'ipaq_group',           # 体力活动水平组

    # CVD Status
    'CVD',
    'CVD_date',
    'ECG_date'
]
df = pd.DataFrame(df_raw, columns=columns)
df.shape

df.to_csv("../../data/ukbiobank_step0_participant_characteristics.csv")

(41274, 20)

# Process Data to Get Descriptive Statistics

In [101]:
df = pd.read_csv("../../data/ukbiobank_step0_participant_characteristics.csv", index_col=0)

In [3]:
df

Unnamed: 0,eid,birth_year,birth_month,sex,ethnicity,education,height,weight,systolic_bp,diabetes_status,med_male,med_female,diagnosed_problem,total_cholesterol,hdl_cholesterol,smoking_status,ipaq_group,CVD,CVD_date,ECG_date
0,1000205,1970,1,1,1001.0,2.0,188.0,76.2,149.0,0.0,-7.0,,-7.0,4.569,1.228,0.0,0.0,0,,2010-04-22
1,1000239,1944,6,0,1001.0,1.0,163.0,60.9,137.0,0.0,,-7.0,-7.0,5.780,2.221,1.0,1.0,0,,2010-02-18
2,1000361,1954,6,0,1001.0,1.0,155.0,49.2,116.0,0.0,,-7.0,-7.0,,,2.0,1.0,0,,2010-06-25
3,1000677,1967,6,0,1001.0,3.0,157.0,93.4,124.0,0.0,,-7.0,-7.0,5.874,1.323,2.0,1.0,0,,2010-04-07
4,1000737,1957,4,1,1001.0,4.0,170.0,66.0,148.0,0.0,-7.0,,-7.0,4.429,,0.0,2.0,0,,2010-03-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41269,6025697,1961,5,0,3001.0,4.0,156.0,56.5,153.0,0.0,,-7.0,-7.0,5.266,1.136,0.0,1.0,0,,2010-04-21
41270,6025872,1948,1,0,1001.0,5.0,158.0,61.7,161.0,0.0,,-7.0,-7.0,7.404,1.948,0.0,2.0,0,,2010-02-26
41271,6025947,1942,1,0,1003.0,1.0,169.0,108.7,143.0,0.0,,-7.0,-7.0,,,0.0,2.0,0,,2010-07-05
41272,6026055,1942,8,0,1001.0,-7.0,166.0,58.3,110.0,0.0,,-7.0,-7.0,7.064,1.636,2.0,1.0,0,,2009-12-11


In [102]:
# include missing values
df["ethnicity"].value_counts(dropna=False)

ethnicity
 1001.0    34355
 1003.0     1976
 1002.0     1428
 4001.0      708
 3001.0      690
 6.0         532
 4002.0      388
 5.0         196
 3004.0      188
-3.0         155
 3002.0      123
 2003.0      120
 2004.0      107
 NaN          85
 2001.0       82
 2002.0       52
 1.0          40
-1.0          19
 3003.0       15
 4003.0        9
 2.0           4
 4.0           2
Name: count, dtype: int64

In [103]:
categorical_columns = [
    'sex',              
    'ethnicity',        
    'education',       
    'diabetes_status',  
    'smoking_status',   
    'ipaq_group',       
    'med_male',         
    'med_female',       
    'diagnosed_problem',
    'CVD'
]

df[categorical_columns] = df[categorical_columns].astype('category')

df['sex'] = df['sex'].cat.set_categories([0, 1], ordered=False)  # Female=0, Male=1

df['smoking_status'] = df['smoking_status'].cat.set_categories([0, 1, 2, -3], ordered=False)  # Never=0, Previous=1, Current=2

df['diabetes_status'] = df['diabetes_status'].cat.set_categories([0, 1, -1, -3], ordered=False)  # No=0, Yes=1

# for ethnicity, we only need the last digits
df['ethnicity'] = df['ethnicity'].astype(str).apply(lambda x: x.split('.')[0] if x in ['-1.0', '-3.0'] else x[-3]).replace('n', '-2').astype(int).replace(-2, pd.NA).astype('category')  # need specical treatment for NaN values
df['ethnicity'] = df['ethnicity'].cat.set_categories([1, 2, 3, 4, 5, 6, -1, -3], ordered=False)  # White=1, Mixed=2, Asian=3, Black=4, Chinese=5, Other=6

df['education'] = df['education'].cat.set_categories([1, 2, 3, 4, 5, 6, -7, -3], ordered=True)  # College=1, A levels=2, O levels=3, etc.

df['ipaq_group'] = df['ipaq_group'].cat.set_categories([0, 1, 2], ordered=True)  # low=0, moderate=1, high=2

df['med_male'] = df['med_male'].cat.set_categories([1, 2, 3, -7, -1, -3], ordered=False)  # Cholesterol=1, BP=2, Insulin=3

df['med_female'] = df['med_female'].cat.set_categories([1, 2, 3, 4, 5, -7, -1, -3], ordered=False)

df['diagnosed_problem'] = df['diagnosed_problem'].cat.set_categories([1, 2, 3, 4, -7, -3], ordered=False)  # Heart attack=1, Angina=2, etc.

df['CVD'] = df['CVD'].cat.set_categories([0, 1], ordered=False)  # No=0, Yes=1

We need to process some of data.

In [104]:
df["age"] = pd.to_datetime(df["ECG_date"]) - pd.to_datetime(df["birth_year"].astype(str) + "-" + df["birth_month"].astype(str))
df["age"] = df["age"].dt.days // 365

df["BMI"] = df["weight"] / (df["height"] / 100) ** 2

In [105]:
df_order = [
    'age',
    'sex',
    'ethnicity',
    'education',
    
    'BMI',
    'systolic_bp',
    
    'diabetes_status',
    'diagnosed_problem',
    
    'total_cholesterol',
    'hdl_cholesterol',
    
    'med_male',
    'med_female',
    
    'smoking_status',
    'ipaq_group'
]

In [106]:
import scipy.stats as stats
import warnings

def create_descriptive_stats(df, filter_cols = ["eid", "CVD_date", "ECG_date"], order=None):
    """
    Create descriptive statistics tables for continuous and categorical variables
    """
    descriptive_stats = {}

    if filter_cols is not None:
        df = df.drop(columns=filter_cols)

    continuous_cols = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        # * Here we will analyze by the CVD status
        for col in continuous_cols:
            # remove NaN values
            num_na = df[col].isna().sum()

            numeric_stats = df.groupby('CVD')[col].agg(['count', 'mean', 'std']).round(1)
            numeric_stats['count'] = numeric_stats['count'].apply(lambda x: f"{int(x):,d}")  # add comma as thousand separator
            no_cvd = df[df['CVD'] == 0][col].dropna()
            cvd = df[df['CVD'] == 1][col].dropna()
            _, p_val = stats.ttest_ind(no_cvd, cvd, equal_var=False)  # two sample t-test for independent samples -> statistic, pvalue
            descriptive_stats[col] = {
                'stats': numeric_stats,
                'p_value': p_val,
                'missing_rate': num_na/len(df),
                'missing_n': num_na
            }
        
        for col in categorical_cols:
            # remove NaN values
            num_na = df[col].isna().sum()

            categorical_counts = df.groupby('CVD')[col].value_counts()
            categorical_freqs = df.groupby('CVD')[col].value_counts(normalize=True).mul(100).round(1)  # frequency in percentage
            
            categorical_stats = pd.concat([categorical_counts, categorical_freqs], axis=1).apply(
                lambda x: f"{int(x[0]):,d} ({x[1]:.2f}%)", axis=1
            )
                
            contingency = pd.crosstab(df[col], df['CVD'])
            _, p_val, _, _ = stats.chi2_contingency(contingency)  # Chi-square test ->statistic, pvalue, dof, expected_freq
            descriptive_stats[col] = {
                'stats': categorical_stats,
                'p_value': p_val,
                'missing_rate': num_na/len(df),
                'missing_n': num_na
            }

    if order:
        descriptive_stats_ordered = {}
        for col in order:
            descriptive_stats_ordered[col] = descriptive_stats[col]
        descriptive_stats = descriptive_stats_ordered
    
    return descriptive_stats

The above code has been verified using R.

In [108]:
df_stats = create_descriptive_stats(df, order=df_order)
print(f"There are {len(df)} participants in the dataset.")
for col, result in df_stats.items():
    print(col)
    print(result['stats'])
    if result['p_value'] < 0.001:
        print("p-value: <0.001")
    else:
        print(f"p-value: {result['p_value']:.3f}")
    print(f"Missing rate: {result['missing_rate']:.2%}")
    print(f"Missing number: {result['missing_n']}")
    print("\n")

There are 41274 participants in the dataset.
age
      count  mean  std
CVD                   
0    36,060  54.8  8.1
1     5,214  60.0  7.1
p-value: <0.001
Missing rate: 0.00%
Missing number: 0


sex
CVD  sex
0    0      20,675 (57.30%)
     1      15,385 (42.70%)
1    1       3,083 (59.10%)
     0       2,131 (40.90%)
dtype: object
p-value: <0.001
Missing rate: 0.00%
Missing number: 0


ethnicity
CVD  ethnicity
0    1            31,250 (86.80%)
     3              1,900 (5.30%)
     2              1,751 (4.90%)
     6                483 (1.30%)
     4                273 (0.80%)
     5                186 (0.50%)
     -3               134 (0.40%)
     -1                16 (0.00%)
1    1             4,625 (89.00%)
     2                244 (4.70%)
     3                220 (4.20%)
     6                 49 (0.90%)
     4                 24 (0.50%)
     -3                21 (0.40%)
     5                 10 (0.20%)
     -1                 3 (0.10%)
dtype: object
p-value: <0.001
Missing r

In [57]:
df["birth_year"].isna().sum()

0