In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from utils import psm, calc_smd
from scipy.stats import ttest_ind

In [None]:
# Load the input table from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/ukb_ecg_data_imputed.csv')

In [None]:
# Frequency table of the `insomnia` column; dropna=False keeps missing values
# as their own row so they are visible in the output.
df.loc[:, 'insomnia'].value_counts(dropna=False)

In [None]:
# Keep only the rows that have an `insomnia_score`, then print how many rows
# remain for each score value.
# NOTE(review): this rebinds `df`, so re-running earlier cells afterwards will
# show the filtered frame rather than the freshly loaded one.
df = df[df['insomnia_score'].notna()]
print(df['insomnia_score'].value_counts())

In [None]:
# Show all column labels currently present in `df`.
print(df.columns)

In [None]:
# Covariates passed to the matching step and checked for balance.
# TDI = Townsend deprivation index.
covars = [
    'Sex',
    'Age',
    'Ethnicity',
    'TDI',
    'BMI',
    'SBP',
    'DBP',
    'CRP',
    'Glucose',
    'HDLc',
    'LDLc',
    'CHOL',
    'TG',
    'TyG',
    'smoking_healthy',
    'alcohol_healthy',
]

In [None]:
# Covariate balance BEFORE matching: split rows by `insomnia_score` (1 = case,
# 0 = control) and, for every covariate, print both group means, a two-sample
# t-test p-value and the value returned by `calc_smd`.
cases = df.loc[df['insomnia_score'] == 1]
controls = df.loc[df['insomnia_score'] == 0]
for c in covars:
    mean_case, mean_control = cases[c].mean(), controls[c].mean()
    # Only the p-value of the test result is reported.
    # NOTE(review): this relies on SciPy's defaults (equal variances assumed,
    # NaN propagated) while `.mean()` skips NaN -- confirm that is intended.
    pval = ttest_ind(cases[c], controls[c]).pvalue
    # `calc_smd` comes from utils; presumably the standardized mean difference
    # between the two `insomnia_score` groups -- verify against utils.
    smd = calc_smd(df, c, type='insomnia_score')
    print(f'{c}: Case mean = {mean_case:.2f}, Control mean = {mean_control:.2f}, p-value = {pval:.4f}, SMD = {smd:.4f}')

In [None]:
# Run the matching helper from utils on the filtered frame. It returns two
# values; the first is discarded and the second is kept as the balanced frame.
# NOTE(review): `psm` is presumably propensity-score matching -- confirm in utils.
_, df_balanced = psm(
    df,
    type='insomnia_score',
    covars=covars,
    need_shuffle=True,
    seed=42,
)

In [None]:
# Covariate balance AFTER matching: report the matched sample size, the size
# of each group, and then the same per-covariate summary as before matching.
print('Number of rows after matching:', len(df_balanced))
cases = df_balanced.loc[df_balanced['insomnia_score'] == 1]
controls = df_balanced.loc[df_balanced['insomnia_score'] == 0]
print(f'Number of cases: {len(cases)}', f'Number of controls: {len(controls)}')

for c in covars:
    mean_case, mean_control = cases[c].mean(), controls[c].mean()
    # Only the p-value of the test result is reported.
    # NOTE(review): this relies on SciPy's defaults (equal variances assumed,
    # NaN propagated) while `.mean()` skips NaN -- confirm that is intended.
    pval = ttest_ind(cases[c], controls[c]).pvalue
    # `calc_smd` comes from utils; presumably the standardized mean difference
    # between the two `insomnia_score` groups -- verify against utils.
    smd = calc_smd(df_balanced, c, type='insomnia_score')
    print(f'{c}: Case mean = {mean_case:.2f}, Control mean = {mean_control:.2f}, p-value = {pval:.4f}, SMD = {smd:.4f}')

In [None]:
# Persist the matched sample; index=False leaves the row index out of the file.
df_balanced.to_csv(path_or_buf='data/ukb_ecg_data_balanced.csv', index=False)

In [None]:
# Frequency table of the `event` column in the matched sample.
df_balanced.loc[:, 'event'].value_counts()