# Reading and calculations

In [1]:
import h5py
import utils
file_path = "our_processing\ML_Data.h5"

def read_data(file_path, group_name, signal_name= "filtered_intervals"):
    with h5py.File(file_path, "r") as hdf:
        group = hdf[group_name]
        data = group[signal_name][()]
    return data

group_names = [
"19070921", "19072205", "19072214",
"19072938", "19072939", "19072940",
"19080106", "19080715", "19081506",
"19082406", "19090308", "19090320",
"19101607", "19101619", "19102102",
"19102103", "19102524", "19102622",
"19112609", "19120302", "19120323",
"19120704", "19120723", "19121303",
"19121735", "20010826", "20010827",
"20011712", "20050628", "20052606",
"20061729", "20092226", "20092535",
"20101424", "20101822", "20102029",
"20120116", "20120922", "20121033",
"20121716", "20121718", "20122932", "20123017"]

patient_arrays = {group_name: read_data(file_path, group_name) for group_name in group_names}

patient_arrays

{'19070921': array([736., 728., 756., ..., 692., 700., 708.]),
 '19072205': array([832., 804., 768., ..., 784., 824., 904.]),
 '19072214': array([620., 580., 616., ..., 620., 632., 712.]),
 '19072938': array([ 704.,  664.,  700., ...,  652.,  688., 1100.]),
 '19072939': array([676., 680., 692., ..., 648., 672., 668.]),
 '19072940': array([668., 660., 648., ..., 704., 680., 772.]),
 '19080106': array([852., 896., 888., ..., 812., 812., 812.]),
 '19080715': array([ 636., 1032.,  824., ...,  908.,  948.,  816.]),
 '19081506': array([992., 876., 828., ..., 840., 836., 892.]),
 '19082406': array([664., 656., 632., ..., 732., 868., 580.]),
 '19090308': array([636., 628., 612., ..., 756., 776., 732.]),
 '19090320': array([712., 720., 692., ..., 656., 640., 608.]),
 '19101607': array([ 952.,  928.,  960., ..., 1108.,  892.,  548.]),
 '19101619': array([936., 964., 944., ..., 744., 752., 736.]),
 '19102102': array([800., 796., 852., ..., 648., 648., 660.]),
 '19102103': array([740., 740., 748.,

In [2]:
import pandas as pd
metrics_df = pd.DataFrame()

for id, signal in patient_arrays.items():
    individual_metrics = utils.patients_metrics(signal)
    individual_metrics['id'] = id
    print(individual_metrics)
    metrics_df = pd.concat([metrics_df, individual_metrics], ignore_index=True)

     t_start  t_end        SDRR       RMSSD  pNN50 (%)  Mean HR (bpm)  \
0          0    300   57.334978   38.317409   5.596107      82.370347   
1        300    600   85.691235   33.612203  12.064343      74.785043   
2        600    900   76.251431   30.704001   8.991826      73.642222   
3        900   1200   51.471703   34.198269  12.680115      69.584228   
4       1200   1500   71.014530   63.691131  15.126050      71.547532   
..       ...    ...         ...         ...        ...            ...   
277    83100  83400   55.779412   33.980682   6.516291      79.857322   
278    83400  83700   63.793593   61.594752   8.396947      78.807355   
279    83700  84000   59.391012   45.112282   6.923077      78.257389   
280    84000  84300   54.625639   30.256542   6.770833      76.883138   
281    84300  84600  117.737284  141.195631  24.226804      77.798963   

     ULF Power  ULF Peak Frequency  VLF Power  VLF Peak Frequency    LF Power  \
0          0.0                 0.0   1.849

Save the processed metrics_df

Read the clinical indicators

In [11]:
clin_indic = utils.df_from_excel('actionable_data/Clinical indicators.xlsx')
clin_indic.rename(columns={'number': 'id'}, inplace=True)
clin_indic['id'] = clin_indic['id'].astype(str)
display(clin_indic.columns)

Index(['id', 'gender', 'age', 'height', 'weight', 'admission FBG (mmol/L)',
       'Discharge FBG (mmol/L)', 'HbA1c (%)', 'SBP (mmHg)', 'DBP (mmHg)',
       'WBC (×109/L)', 'N% (%)', 'Hb (g/L)', 'PLT (×109/L)', 'CRP (mg/L)',
       'ALT (U/L)', ' AST (U/L)', 'AST/ALT', 'GGT (U/L)', 'BUN (mmol/L)',
       ' UA (mmol/L)', 'TG (mmol/L)', 'HDL-C (mmol/L)', 'LDL-C (mmol/L)',
       'UMA (mg)', 'UCr (g)', 'UACR (mg/g)', 'Diabetic Complications',
       'Diabetic nephropathy', 'Diabetic retinopathy and cataract',
       'Diabetic peripheral neuropathy',
       'Coronary artery disease and cardiac insufficiency',
       'Lower extremity atherosclerosis or stenosis', 'Carotid plaque'],
      dtype='object')

Merge and save both

In [12]:
# Join combined_metrics and clin_indic on 'id'
metrics_df["id"].astype(str)
merged_data = metrics_df.merge(clin_indic, on='id', how='inner')

# Display the merged dataframe
merged_data.head()

Unnamed: 0,t_start,t_end,SDRR,RMSSD,pNN50 (%),Mean HR (bpm),ULF Power,ULF Peak Frequency,VLF Power,VLF Peak Frequency,...,UMA (mg),UCr (g),UACR (mg/g),Diabetic Complications,Diabetic nephropathy,Diabetic retinopathy and cataract,Diabetic peripheral neuropathy,Coronary artery disease and cardiac insufficiency,Lower extremity atherosclerosis or stenosis,Carotid plaque
0,0,300,57.334978,38.317409,5.596107,82.370347,0.0,0.0,1.849318,0.030068,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300,600,85.691235,33.612203,12.064343,74.785043,0.0,0.0,6.452692,0.023383,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,600,900,76.251431,30.704001,8.991826,73.642222,0.0,0.0,2.295485,0.020062,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,900,1200,51.471703,34.198269,12.680115,69.584228,0.0,0.0,4.614101,0.03676,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200,1500,71.01453,63.691131,15.12605,71.547532,0.0,0.0,4.093889,0.033405,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
merged_data.to_pickle('our_processing/Waj_05_02_2025.pkl')
merged_data.to_csv('our_processing/Waj_05_02_2025.csv', index=False)

merged_data.head(6)

Unnamed: 0,t_start,t_end,SDRR,RMSSD,pNN50 (%),Mean HR (bpm),ULF Power,ULF Peak Frequency,VLF Power,VLF Peak Frequency,...,UMA (mg),UCr (g),UACR (mg/g),Diabetic Complications,Diabetic nephropathy,Diabetic retinopathy and cataract,Diabetic peripheral neuropathy,Coronary artery disease and cardiac insufficiency,Lower extremity atherosclerosis or stenosis,Carotid plaque
0,0,300,57.334978,38.317409,5.596107,82.370347,0.0,0.0,1.849318,0.030068,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300,600,85.691235,33.612203,12.064343,74.785043,0.0,0.0,6.452692,0.023383,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,600,900,76.251431,30.704001,8.991826,73.642222,0.0,0.0,2.295485,0.020062,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,900,1200,51.471703,34.198269,12.680115,69.584228,0.0,0.0,4.614101,0.03676,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1200,1500,71.01453,63.691131,15.12605,71.547532,0.0,0.0,4.093889,0.033405,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1500,1800,45.39678,30.151204,9.798271,69.678969,0.0,0.0,1.206096,0.033455,...,411.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Analysis

Keep only individuals with no complications or diabetic neuropathy only.

In [29]:
data = merged_data[ (merged_data["Diabetic Complications"] == 0)| (merged_data["Diabetic peripheral neuropathy"] == 1)]
data["Diabetic peripheral neuropathy"] = data["Diabetic peripheral neuropathy"].astype(bool)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
import plotly.express as px

fig = px.violin(data, x='id', y='SDRR', color='Diabetic peripheral neuropathy', box=True, points=None, title='SDRR Distribution by Diabetic Complications')

fig.update_layout(showlegend=False)
fig.show()

In [31]:

fig = px.box(data, x='id', y='SDRR', color='Diabetic peripheral neuropathy', title='SDRR Distribution by Diabetic Complications')

fig.update_layout(showlegend=True, legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5))
fig.show()