In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import os
from utils import show_pvalue, mkdir_if_needed

In [None]:
# Root folder for this notebook's outputs; the comparison tables are written
# to a sub-folder derived from it.
res_path_root = 'results/non_insomnia'

In [None]:
# Read the cohort table that every comparison below is drawn from.
df = pd.read_csv(os.path.join('data', 'ukb_ecg_data_balanced_cluster.csv'))
print(df.columns)

# Folder that receives the comparison CSVs; create it up front.
res_path = res_path_root + '/compare_groups'
os.makedirs(res_path, exist_ok=True)

# Size of each cluster in the full table.
cluster_sizes = df['cluster_assign'].value_counts()
print(cluster_sizes)

In [None]:
# Independent copies of the two insomnia_score strata.
# NOTE(review): later cells label insomnia_score == 0 as 'Insomnia' and
# insomnia_score == 1 as 'Non_insomnia'; confirm that "control"/"case" below
# match that coding.
df0 = df.loc[df['insomnia_score'].eq(0)].copy()  # control
df1 = df.loc[df['insomnia_score'].eq(1)].copy()  # case

# Cluster sizes within each stratum.
for stratum in (df0, df1):
    print(stratum['cluster_assign'].value_counts())

In [None]:
# Variables summarised in every group comparison; each is looked up as a
# column of the group frames.
biomarkers = ['event', 'Sex', 'Age', 'TDI', 'BMI', 'SBP', 'DBP', 'CHOL', 'CRP', 'LDLc', 'HDLc', 'TG', 'Glucose', 'TyG', 'smoking_healthy', 'alcohol_healthy']

# NOTE(review): this targets 'results/compare_groups', while the comparison
# tables are saved under res_path -- presumably a leftover; confirm before
# removing.
mkdir_if_needed('results/compare_groups')

# Display metadata (human-readable field name and units) per biomarker.
df_biomaker_info = pd.read_csv('data/biomarkers_list.csv')
df_biomaker_info = df_biomaker_info[['Biomarker', 'Field', 'Units']]

# Metadata for the demographic, lifestyle and derived variables, appended in
# one concat instead of repeated row-by-row enlargement (which can raise
# SettingWithCopyWarning on a column-subset frame).
_extra_biomarker_info = pd.DataFrame(
    [
        ['Age', 'Age', 'years'],
        ['Sex', 'Sex (Female %)', '%'],
        ['TDI', 'Townsend Deprivation Index', '/'],
        ['smoking_healthy', 'Smoking healthy (1 not smoked)', '%'],
        ['alcohol_healthy', 'Alcohol drinking Healthy (1 not drunk)', '%'],
        ['TyG', 'Triglyceride-glucose index', '/'],
        ['event', 'CVD events', '1: Yes, 0: No'],
    ],
    columns=['Biomarker', 'Field', 'Units'],
)
df_biomaker_info = pd.concat([df_biomaker_info, _extra_biomarker_info], ignore_index=True)

In [None]:
def compare_two_groups(g1, g2, g1_name, g2_name, save_name,
                       biomarker_list=None, biomarker_info=None,
                       out_dir=None, format_pvalue=None):
    """Summarise each biomarker in two groups and test the difference.

    For every biomarker one row is produced holding a per-group summary
    (formatted to two decimals) and the p-value of a two-sample t-test on the
    non-missing values. The table is enriched with 'Field' and 'Units' from
    the metadata frame, written to CSV, and returned.

    Per-group summary:
      * 'event'                                   -> column sum
      * 'Sex', 'smoking_healthy', 'alcohol_healthy' -> 100 - 100 * mean,
        i.e. the percentage of rows coded 0
      * anything else                             -> column mean

    Parameters
    ----------
    g1, g2 : pandas.DataFrame
        The two groups; each must contain every biomarker column.
    g1_name, g2_name : str
        Column headers used for the two groups in the output table.
    save_name : str
        File name of the CSV written inside ``out_dir``.
    biomarker_list : sequence of str, optional
        Columns to compare. Defaults to the notebook-level ``biomarkers``.
    biomarker_info : pandas.DataFrame, optional
        Frame with 'Biomarker', 'Field' and 'Units' columns. Defaults to the
        notebook-level ``df_biomaker_info``.
    out_dir : str, optional
        Output folder (created if missing). Defaults to the notebook-level
        ``res_path``.
    format_pvalue : callable, optional
        Turns a p-value into its table representation. Defaults to the
        imported ``show_pvalue``.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Biomarker', 'Field', 'Units', g1_name, g2_name, 'p-value'.
    """
    # Fall back to the notebook-level objects so existing five-argument calls
    # behave exactly as before.
    if biomarker_list is None:
        biomarker_list = biomarkers
    if biomarker_info is None:
        biomarker_info = df_biomaker_info
    if out_dir is None:
        out_dir = res_path
    if format_pvalue is None:
        format_pvalue = show_pvalue

    # Binary 0/1 columns reported as "percent coded 0". For 'Sex' the sex-split
    # cells label code 0 as Female, so this is the female percentage.
    # NOTE(review): for the smoking/alcohol flags this is the share coded 0,
    # presumably the *unhealthy* share -- confirm it matches the 'Field' label.
    percent_of_zero = ('Sex', 'smoking_healthy', 'alcohol_healthy')

    rows = []
    for biomarker in biomarker_list:
        values1, values2 = g1[biomarker], g2[biomarker]

        if biomarker == 'event':
            # Events are reported as counts rather than rates.
            summary1, summary2 = values1.sum(), values2.sum()
        elif biomarker in percent_of_zero:
            summary1 = 100 - values1.mean() * 100
            summary2 = 100 - values2.mean() * 100
        else:
            summary1, summary2 = values1.mean(), values2.mean()

        # Two-sample t-test on the raw (non-missing) values, for every variable.
        # NOTE(review): binary variables are also t-tested; a chi-square or
        # proportion test may be more appropriate -- confirm with the analysis plan.
        p = stats.ttest_ind(values1.dropna(), values2.dropna())[1]

        rows.append([biomarker, f'{summary1:.2f}', f'{summary2:.2f}', format_pvalue(p)])

    # Build the table once instead of enlarging a DataFrame row by row.
    table = pd.DataFrame(rows, columns=['Biomarker', g1_name, g2_name, 'p-value'])

    # Left join keeps every compared biomarker even when it has no metadata.
    table = table.merge(biomarker_info[['Biomarker', 'Field', 'Units']], on='Biomarker', how='left')
    table = table[['Biomarker', 'Field', 'Units', g1_name, g2_name, 'p-value']]

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    table.to_csv(os.path.join(out_dir, save_name), index=False)
    return table

In [None]:
# Whole-table comparison of the two insomnia_score strata (score 0 vs score 1).
# NOTE(review): score 0 is labelled 'Insomnia' here while the df0/df1 split
# tags the same rows as "control" -- confirm the coding.
g1 = df.loc[df['insomnia_score'].eq(0)]
g2 = df.loc[df['insomnia_score'].eq(1)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'Insomnia (N={N_cluster0:,})', f'Non_insomnia (N={N_cluster1:,})', 'comp_case_control.csv')

In [None]:
# Compare the two clusters over the whole table: cluster 0 is reported as T+,
# cluster 1 as T-.
g1 = df.loc[df['cluster_assign'].eq(0)]
g2 = df.loc[df['cluster_assign'].eq(1)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'T+ (N={N_cluster0:,})', f'T- (N={N_cluster1:,})', 'comp_2subtypes.csv')

In [None]:
# Compare the two clusters (T+ = cluster 0, T- = cluster 1) inside df0,
# the insomnia_score == 0 stratum tagged as the control group.
g1 = df0.loc[df0['cluster_assign'].eq(0)]
g2 = df0.loc[df0['cluster_assign'].eq(1)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'T+ (N={N_cluster0:,})', f'T- (N={N_cluster1:,})', 'comp_2subtype_control.csv')

In [None]:
# Compare the two clusters (T+ = cluster 0, T- = cluster 1) inside df1,
# the insomnia_score == 1 stratum tagged as the case group.
g1 = df1.loc[df1['cluster_assign'].eq(0)]
g2 = df1.loc[df1['cluster_assign'].eq(1)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'T+ (N={N_cluster0:,})', f'T- (N={N_cluster1:,})', 'comp_2subtype_cases.csv')

In [None]:
# Compare the two clusters separately by sex: rows with Sex == 0, which this
# cell reports as Female.
g1 = df.loc[df['cluster_assign'].eq(0) & df['Sex'].eq(0)]
g2 = df.loc[df['cluster_assign'].eq(1) & df['Sex'].eq(0)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
print(N_cluster0, N_cluster1)
compare_two_groups(g1, g2, f'Female T+ (N={N_cluster0:,})', f'Female T- (N={N_cluster1:,})', 'comp_2subtype_female.csv')

In [None]:
# Compare the two clusters separately by sex: rows with Sex == 1, which this
# cell reports as Male.
g1 = df.loc[df['cluster_assign'].eq(0) & df['Sex'].eq(1)]
g2 = df.loc[df['cluster_assign'].eq(1) & df['Sex'].eq(1)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
print(N_cluster0, N_cluster1)
compare_two_groups(g1, g2, f'Male T+ (N={N_cluster0:,})', f'Male T- (N={N_cluster1:,})', 'comp_2subtype_male.csv')

In [None]:
# Build the combined table: the rows already in df plus the rows from the
# "left" file (presumably participants left out of the balanced set -- TODO confirm).
df_left = pd.read_csv('data/ukb_ecg_data_left_cluster.csv')
# Derive TyG for the extra rows as log(Glucose * TG * 88.57 * 18 / 2).
# NOTE(review): 88.57 and 18 look like the usual mmol/L -> mg/dL factors for
# triglycerides and glucose respectively; they are written next to the
# opposite variable here, which leaves the product unchanged. Confirm that
# df's own TyG column was derived the same way.
df_left['TyG'] = np.log(df_left['Glucose'] * 88.57 * df_left['TG'] * 18 / 2)
# These rows carry no real propensity score; -1 is a sentinel that only makes
# the column exist so the column alignment below succeeds.
df_left['propensity_score'] = -1 # placeholder for propensity score
# Keep exactly df's columns, in df's order, so the two frames stack cleanly.
df_left = df_left[df.columns]
df_all = pd.concat([df, df_left], ignore_index=True)
# Persist the combined table.
df_all.to_csv('data/ukb_ecg_data_all_cluster.csv', index=False)

In [None]:
# Same insomnia_score 0-vs-1 comparison, this time on the combined table df_all.
# NOTE(review): score 0 is labelled 'Insomnia' here while the df0/df1 split
# tags score 0 as "control" -- confirm the coding.
g1 = df_all.loc[df_all['insomnia_score'].eq(0)]
g2 = df_all.loc[df_all['insomnia_score'].eq(1)]
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'Insomnia (N={N_cluster0:,})', f'Non_insomnia (N={N_cluster1:,})', 'comp_case_control_all.csv')

In [None]:
# Within the T+ subgroup (cluster 0), compare the insomnia and non-insomnia strata.
g1 = df0.loc[df0['cluster_assign'].eq(0)]  # insomnia T+
g2 = df1.loc[df1['cluster_assign'].eq(0)]  # non-insomnia T+
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'Insomnia T+ (N={N_cluster0:,})', f'Non_insomnia T+ (N={N_cluster1:,})', 'comp_case_control_Tplus.csv')

In [None]:
# Within the T- subgroup (cluster 1), compare the insomnia and non-insomnia strata.
g1 = df0.loc[df0['cluster_assign'].eq(1)]  # insomnia T-
g2 = df1.loc[df1['cluster_assign'].eq(1)]  # non-insomnia T-
N_cluster0, N_cluster1 = len(g1), len(g2)  # group sizes shown in the column headers
compare_two_groups(g1, g2, f'Insomnia T- (N={N_cluster0:,})', f'Non_insomnia T- (N={N_cluster1:,})', 'comp_case_control_Tminus.csv')