# Reading and calculations

In [None]:
import h5py
import utils
# Location of the HDF5 file holding the recordings.
# Forward slash: a backslash here forms the invalid escape sequence "\M" and
# the path only resolves on Windows; "/" works on every platform and matches
# the other paths used in this notebook.
file_path = "our_processing/ML_Data.h5"

def read_data(file_path, group_name, signal_name="filtered_intervals"):
    """Read one dataset out of an HDF5 file.

    Parameters
    ----------
    file_path : path of the HDF5 file; it is opened read-only.
    group_name : name of the group to look up inside the file.
    signal_name : name of the dataset inside that group
        (defaults to "filtered_intervals").

    Returns
    -------
    The result of indexing the dataset with ``()``, fetched while the file is
    still open so it remains usable after the file has been closed.
    """
    with h5py.File(file_path, "r") as h5_file:
        return h5_file[group_name][signal_name][()]

# Names of the HDF5 groups to load, one per recording, in ascending order.
# Written as a whitespace-separated block and split into a list of strings.
group_names = """
    19070921 19072205 19072214 19072938 19072939 19072940
    19080106 19080715 19081506 19082406 19090308 19090320
    19101607 19101619 19102102 19102103 19102524 19102622
    19112609 19120302 19120323 19120704 19120723 19121303
    19121735 20010826 20010827 20011712 20050628 20052606
    20061729 20092226 20092535 20101424 20101822 20102029
    20120116 20120922 20121033 20121716 20121718 20122932
    20123017
""".split()

# Load the default signal of every listed group, keyed by group name.
patient_arrays = dict(
    (group_name, read_data(file_path, group_name))
    for group_name in group_names
)

patient_arrays

In [None]:
import pandas as pd
# Compute the metrics of every recording and stack them into one table.
# The per-recording frames are collected first and concatenated once:
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration and starts from an empty frame, which can disturb dtype inference.
metric_frames = []
for patient_id, signal in patient_arrays.items():  # not `id`: that shadows the builtin
    # presumably returns a DataFrame of metrics for this signal; see utils
    individual_metrics = utils.patients_metrics(signal)
    individual_metrics['id'] = patient_id
    print(individual_metrics)
    metric_frames.append(individual_metrics)

# pd.concat raises on an empty list, so fall back to an empty frame.
metrics_df = pd.concat(metric_frames, ignore_index=True) if metric_frames else pd.DataFrame()

Save the processed metrics_df

Read the clinical indicators

In [None]:
# Load the clinical indicators, expose the 'number' column under the name 'id',
# and store it as strings so it can serve as the merge key.
clin_indic = (
    utils.df_from_excel('actionable_data/Clinical indicators.xlsx')
    .rename(columns={'number': 'id'})
    .assign(id=lambda frame: frame['id'].astype(str))
)
display(clin_indic.columns)

Merge the computed metrics with the clinical indicators and save the result

In [None]:
# Join metrics_df and clin_indic on 'id'.
# The key is cast to str so both sides of the merge compare like with like.
# astype returns a new Series, so the result has to be assigned back;
# calling it on its own line leaves the column unchanged.
metrics_df["id"] = metrics_df["id"].astype(str)
merged_data = metrics_df.merge(clin_indic, on='id', how='inner')

# Display the merged dataframe
merged_data.head()

In [None]:
# Persist the merged table in two formats: a pickle (reloaded by the analysis
# section of this notebook) and a CSV written without the index column.
merged_data.to_pickle(path='our_processing/Waj_05_02_2025.pkl')
merged_data.to_csv(path_or_buf='our_processing/Waj_05_02_2025.csv', index=False)

merged_data.head(n=6)

# Analysis

Keep only individuals who have either no diabetic complications or diabetic peripheral neuropathy as their only complication.

In [1]:
import pandas as pd

# Reload the merged table from the pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source.
data = pd.read_pickle('our_processing/Waj_05_02_2025.pkl')
# Preview the first rows.
data.head()

Unnamed: 0,t_start,t_end,SDRR,RMSSD,pNN50 (%),Mean HR (bpm),ULF Power,ULF Peak Frequency,VLF Power,VLF Peak Frequency,...,UMA (mg),UCr (g),UACR (mg/g),Diabetic Complications,Diabetic nephropathy,Diabetic retinopathy and cataract,Diabetic peripheral neuropathy,Coronary artery disease and cardiac insufficiency,Lower extremity atherosclerosis or stenosis,Carotid plaque
0,0,300,57.334978,38.317409,5.596107,82.370347,0.0,0.0,1.849318,0.030068,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300,600,85.691235,33.612203,12.064343,74.785043,0.0,0.0,6.452692,0.023383,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,600,900,76.251431,30.704001,8.991826,73.642222,0.0,0.0,2.295485,0.020062,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,900,1200,51.471703,34.198269,12.680115,69.584228,0.0,0.0,4.614101,0.03676,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200,1500,71.01453,63.691131,15.12605,71.547532,0.0,0.0,4.093889,0.033405,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Keep rows with no recorded complication, or with diabetic peripheral neuropathy.
# NOTE(review): the second condition keeps every row flagged with DPN, including
# rows that also have other complications; confirm this is the intended cohort.
dpn_column = "Diabetic peripheral neuropathy"
keep_mask = (data["Diabetic Complications"] == 0) | (data[dpn_column] == 1)

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the unfiltered table (SettingWithCopyWarning).
data = data[keep_mask].copy()

# A missing flag is treated as "no DPN": a bare astype(bool) converts NaN to
# True, which would label such rows as DPN.
data[dpn_column] = data[dpn_column].fillna(0).astype(bool)

In [3]:
# Label every row with its cohort: 'DPN' when the neuropathy flag is truthy,
# 'Diabetes' otherwise.
dpn_flags = data['Diabetic peripheral neuropathy']
data['Condition'] = ['DPN' if has_dpn else 'Diabetes' for has_dpn in dpn_flags]
data.head()

Unnamed: 0,t_start,t_end,SDRR,RMSSD,pNN50 (%),Mean HR (bpm),ULF Power,ULF Peak Frequency,VLF Power,VLF Peak Frequency,...,UCr (g),UACR (mg/g),Diabetic Complications,Diabetic nephropathy,Diabetic retinopathy and cataract,Diabetic peripheral neuropathy,Coronary artery disease and cardiac insufficiency,Lower extremity atherosclerosis or stenosis,Carotid plaque,Condition
0,0,300,57.334978,38.317409,5.596107,82.370347,0.0,0.0,1.849318,0.030068,...,,,0.0,0.0,0.0,False,0.0,0.0,0.0,Diabetes
1,300,600,85.691235,33.612203,12.064343,74.785043,0.0,0.0,6.452692,0.023383,...,,,0.0,0.0,0.0,False,0.0,0.0,0.0,Diabetes
2,600,900,76.251431,30.704001,8.991826,73.642222,0.0,0.0,2.295485,0.020062,...,,,0.0,0.0,0.0,False,0.0,0.0,0.0,Diabetes
3,900,1200,51.471703,34.198269,12.680115,69.584228,0.0,0.0,4.614101,0.03676,...,,,0.0,0.0,0.0,False,0.0,0.0,0.0,Diabetes
4,1200,1500,71.01453,63.691131,15.12605,71.547532,0.0,0.0,4.093889,0.033405,...,,,0.0,0.0,0.0,False,0.0,0.0,0.0,Diabetes


In [18]:


# Rename the ratio column to a label without '/'. Rebinding `data` instead of
# using inplace=True keeps the cell free of in-place mutation.
data = data.rename(columns={'LF/HF Ratio': 'LF HF ratio'})

# Select the metric columns by label, from 'SDRR' through 'LF HF ratio'
# inclusive, rather than by the positions 2:15. A positional slice silently
# picks the wrong columns if a column is added or reordered upstream; a label
# slice either follows the change or fails loudly.
indicator_columns = data.loc[:, 'SDRR':'LF HF ratio'].columns
print(indicator_columns)

Index(['SDRR', 'RMSSD', 'pNN50 (%)', 'Mean HR (bpm)', 'ULF Power',
       'ULF Peak Frequency', 'VLF Power', 'VLF Peak Frequency', 'LF Power',
       'LF Peak Frequency', 'HF Power', 'HF Peak Frequency', 'LF HF ratio'],
      dtype='object')


We save all the plots as HTML files in the [html_plots](html_plots) folder.

In [None]:
from population_metric_plotter import population_metric_plotter
# One plotter call per metric column; 'Condition' is passed as the name of the
# grouping column.
for metric_name in indicator_columns:
    population_metric_plotter(data, metric_name, 'Condition')

We can then run a two-sample t-test for two independent means (Diabetes vs. DPN) on each metric.

In [35]:
from scipy.stats import ttest_ind

# Split the table once: the two cohorts do not depend on the metric under test.
group1 = data[data['Condition'] == 'Diabetes']
group2 = data[data['Condition'] == 'DPN']

# Collect one record per metric and build the table in a single step instead
# of growing a DataFrame with pd.concat on every iteration.
ttest_records = []
for column in indicator_columns:
    # nan_policy='omit' ignores missing values in either group.
    t_stat, p_val = ttest_ind(group1[column], group2[column], nan_policy='omit')
    # A NaN p-value (test undefined for this column) compares False below.
    ttest_records.append(
        {'metric': column, 't-statistic': t_stat, 'p-value': p_val, 'p<0.05': p_val < 0.05}
    )

# Explicit column list so the frame has the expected layout even with no records.
result_df = pd.DataFrame(
    ttest_records, columns=['metric', 't-statistic', 'p-value', 'p<0.05']
).set_index('metric')
result_df.columns.name = 'Student T Test'

# NOTE: this is a session-wide setting; it changes how floats are rendered in
# every DataFrame displayed after this cell.
pd.options.display.float_format = '{:.2e}'.format

display("First Group: Diabetes")
display("Second Group: DPN")
display(result_df)



'First Group: Diabetes'

'Second Group: DPN'

Student T Test,t-statistic,p-value,p<0.05
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SDRR,20.5,1.6e-90,True
RMSSD,13.1,7.84e-39,True
pNN50 (%),13.9,2.5100000000000002e-43,True
Mean HR (bpm),-16.0,2.22e-56,True
ULF Power,,,False
ULF Peak Frequency,,,False
VLF Power,10.6,5.6e-26,True
VLF Peak Frequency,5.62,1.97e-08,True
LF Power,12.5,1.53e-35,True
LF Peak Frequency,0.559,0.576,False


In [34]:
from scipy.stats import mannwhitneyu

# Split the table once: the two cohorts do not depend on the metric under test.
group1 = data[data['Condition'] == 'Diabetes']
group2 = data[data['Condition'] == 'DPN']

# Collect one record per metric and build the table in a single step instead
# of growing a DataFrame with pd.concat on every iteration.
mwu_records = []
for column in indicator_columns:
    # Drop missing values per group so this test handles NaN the same way as
    # the t-test cell, which passes nan_policy='omit'.
    u_stat, p_val = mannwhitneyu(
        group1[column].dropna(), group2[column].dropna(), alternative='two-sided'
    )
    mwu_records.append(
        {'metric': column, 'U-statistic': u_stat, 'p-value': p_val, 'p<0.05': p_val < 0.05}
    )

# Explicit column list so the frame has the expected layout even with no records.
result_df = pd.DataFrame(
    mwu_records, columns=['metric', 'U-statistic', 'p-value', 'p<0.05']
).set_index('metric')
result_df.columns.name = 'Mann Whitney U Test'

# NOTE: this is a session-wide setting; it changes how floats are rendered in
# every DataFrame displayed after this cell.
pd.options.display.float_format = '{:.2e}'.format

display("First Group: Diabetes")
display("Second Group: DPN")
display(result_df)

'First Group: Diabetes'

'Second Group: DPN'

Mann Whitney U Test,U-statistic,p-value,p<0.05
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SDRR,4540000.0,2.23e-130,True
RMSSD,4100000.0,1.11e-63,True
pNN50 (%),4000000.0,1.83e-52,True
Mean HR (bpm),2110000.0,6.950000000000001e-64,True
ULF Power,3100000.0,1.0,False
ULF Peak Frequency,3100000.0,1.0,False
VLF Power,4660000.0,1.66e-153,True
VLF Peak Frequency,3390000.0,1.04e-06,True
LF Power,4730000.0,1.47e-166,True
LF Peak Frequency,3170000.0,0.264,False


In [None]:
# Split the table into the two cohorts by their 'Condition' label.
condition = data['Condition']
group1 = data.loc[condition.eq('Diabetes')]
group2 = data.loc[condition.eq('DPN')]

In [37]:
# Summary statistics of the numeric columns for group1 (rows labelled 'Diabetes').
group1.describe()

Unnamed: 0,t_start,t_end,SDRR,RMSSD,pNN50 (%),Mean HR (bpm),ULF Power,ULF Peak Frequency,VLF Power,VLF Peak Frequency,...,LDL-C (mmol/L),UMA (mg),UCr (g),UACR (mg/g),Diabetic Complications,Diabetic nephropathy,Diabetic retinopathy and cataract,Coronary artery disease and cardiac insufficiency,Lower extremity atherosclerosis or stenosis,Carotid plaque
count,5660.0,5660.0,5660.0,5660.0,5660.0,5660.0,5660.0,5660.0,5660.0,5660.0,...,4480.0,4510.0,3930.0,3930.0,5660.0,5660.0,5660.0,5660.0,5660.0,5660.0
mean,39600.0,39900.0,52.2,34.3,9.65,75.0,0.0,0.0,3.8,0.0282,...,2.81,97.5,12.2,7.05,0.0,0.0,0.0,0.0,0.0,0.0
std,24000.0,24000.0,30.1,28.8,14.4,12.8,0.0,0.0,7.7,0.00747,...,0.865,119.0,5.65,10.8,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,300.0,5.61,4.72,0.0,48.5,0.0,0.0,0.0064,0.00333,...,0.86,2.93,3.89,0.534,0.0,0.0,0.0,0.0,0.0,0.0
25%,19200.0,19500.0,30.9,16.4,0.52,65.3,0.0,0.0,0.574,0.0234,...,2.35,9.68,6.32,1.11,0.0,0.0,0.0,0.0,0.0,0.0
50%,38400.0,38700.0,44.7,26.0,2.97,74.1,0.0,0.0,1.44,0.0301,...,2.83,37.6,12.5,1.77,0.0,0.0,0.0,0.0,0.0,0.0
75%,58500.0,58800.0,65.6,43.0,12.4,83.9,0.0,0.0,3.57,0.0335,...,3.45,145.0,15.4,9.32,0.0,0.0,0.0,0.0,0.0,0.0
max,96900.0,97200.0,285.0,360.0,79.8,135.0,0.0,0.0,113.0,0.0369,...,4.17,411.0,22.3,39.2,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Summary statistics of the numeric columns for group2 (rows labelled 'DPN').
group2.describe()