## NOTE: This notebook was adapted from `explore_cgm.ipynb` in the ai-readi-notebooks repository.

In [1]:
import json
from datetime import datetime, timedelta
from calculate_cgm_features import compute_spike_metrics, get_aligned_df

import matplotlib.dates as mdates  # to use ConciseDateFormatter
import matplotlib.pyplot as plt  # to make plots
import pandas as pd
import numpy as np

In [2]:
# Root folder of the local AI-READI dataset copy. Other cells build file paths by plain string
# concatenation onto this value, so the trailing path separator must be kept.
data_root = "C:\\Users\\preet\\Documents\\AI_READI\\"  # change this to your own path

# Read the manifest

In [3]:
# Load manifest.tsv (tab-separated) from the CGM folder into `dfm`.
manifest_path = f"{data_root}wearable_blood_glucose\\manifest.tsv"
print(manifest_path)
dfm = pd.read_table(manifest_path)  # read_table defaults to a tab separator
print(dfm.columns)

C:\Users\preet\Documents\AI_READI\wearable_blood_glucose\manifest.tsv
Index(['participant_id', 'glucose_filepath', 'glucose_level_record_count',
       'average_glucose_level_mg_dl', 'glucose_sensor_sampling_duration_days',
       'glucose_sensor_id', 'manufacturer', 'manufacturer_model_name'],
      dtype='object')


In [4]:
# Sanity check: count the distinct participant ids listed in the manifest.
dfm["participant_id"].nunique()  # number of unique participants

1049

In [5]:
# Narrow the manifest to a handful of columns for a quick look (optional).
key_columns = ["participant_id", "average_glucose_level_mg_dl", "glucose_sensor_sampling_duration_days"]

dfm.loc[:, key_columns].head(2)

Unnamed: 0,participant_id,average_glucose_level_mg_dl,glucose_sensor_sampling_duration_days
0,1001,123.304272,11
1,1002,116.446203,11


In [6]:
def convert_time_string_to_datetime(t_str):
    """Parse a UTC timestamp string into a naive datetime.

    The trailing "Z" is matched as a literal character; no timezone is attached
    and no conversion to local time takes place.

    Args:
        t_str (str): UTC time string such as 2023-08-01T20:39:33Z
    Returns: datetime object
    """
    utc_format = "%Y-%m-%dT%H:%M:%SZ"  # %Y is the 4 digit year
    return datetime.strptime(t_str, utc_format)

def flatten_json(y):
    """Flatten a nested structure of dicts and lists into a single-level dict.

    Nested dict keys and list positions are joined with "_" to form the output
    key, e.g. ``{"a": {"b": [10, 20]}}`` becomes ``{"a_b_0": 10, "a_b_1": 20}``.

    Args:
        y: A dict, list, or scalar (typically the result of ``json.load``).
            Dict and list subclasses are traversed as well, and non-string
            dict keys are converted with ``str()`` when building the key.
    Returns:
        dict: Maps each flattened key path to its leaf value. Empty dicts and
        empty lists contribute no entries; a scalar passed at the top level is
        stored under the empty-string key.
    """
    out = {}

    def flatten(x, name=""):
        # isinstance (rather than an exact type check) so that dict/list
        # subclasses such as OrderedDict are descended into, not stored whole.
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, f"{name}{key}_")
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, f"{name}{index}_")
        else:
            # Strip the trailing "_" separator added by the parent level.
            out[name[:-1]] = x

    flatten(y)
    return out

def get_glucose_list(df):
    """Return the "blood_glucose_value" column of ``df`` as a NumPy array."""
    return df["blood_glucose_value"].to_numpy()



In [7]:
def replace_alt(val, low_value, high_value):
    """Swap the "Low"/"High" text readings for numeric stand-ins.

    Returns ``low_value`` when ``val`` equals "Low", ``high_value`` when it
    equals "High", and ``val`` itself for anything else.
    """
    if val == "Low":
        return low_value
    if val == "High":
        return high_value
    return val

def _load_cgm_records(cgm_path):
    """Read one participant's CGM JSON file and return its flattened observations.

    Every entry of ``data["body"]["cgm"]`` is passed through ``flatten_json``.
    """
    with open(cgm_path, "r") as f:
        data = json.load(f)
    return [flatten_json(observation) for observation in data["body"]["cgm"]]


# Output column name -> key in the dict returned by compute_spike_metrics.
_SPIKE_METRIC_COLUMNS = {
    "spike_resolutions": "avg_spike_resolution_min",
    "expected_daily_spikes": "expected_daily_spikes",
    "mean_glucose": "mean_glucose",
    "relative_spikes": "expected_max_spike_relative_value",  # unitless ratio
    "hyper_time_pcts": "hyper_time_pct",
    "nocturnal_hypoglycemias": "nocturnal_hypoglycemia",
}


def create_df_with_glucose_values(dfm, required_num_samples=2016, low_value=20, high_value=500):
    """Clean up the CGM data and build one feature row per participant.

    Does two tasks: first, drops participants who do not have a week of CGM
    data (``required_num_samples`` readings, 2016 by default) and keeps only
    the first ``required_num_samples`` readings for participants who have more.
    Second, replaces the textual "Low"/"High" readings with ``low_value`` /
    ``high_value``.

    Files are located by concatenating the notebook-level ``data_root`` with
    each manifest row's ``glucose_filepath``.

    Args:
        dfm (pd.DataFrame): Manifest with ``participant_id`` and
            ``glucose_filepath`` columns; one file is read per row.
        required_num_samples (int): Minimum number of readings a participant
            must have, and the length each glucose list is truncated to.
        low_value: Replacement for readings recorded as "Low".
        high_value: Replacement for readings recorded as "High".

    Returns:
        pd.DataFrame: Columns ``patient_ids``, ``Glucose Lists`` (array of the
        first ``required_num_samples`` readings) and one column per entry of
        ``_SPIKE_METRIC_COLUMNS``. The spike metrics are computed by
        ``compute_spike_metrics`` on the participant's full (untruncated)
        recording.

    Side effects:
        Prints "Skipping <participant_id>" for every participant dropped.
    """
    columns = {"patient_ids": [], "Glucose Lists": []}
    columns.update({name: [] for name in _SPIKE_METRIC_COLUMNS})

    # Iterate rows directly instead of re-filtering the manifest for every id.
    for pid, pid_cgm in zip(dfm["participant_id"], dfm["glucose_filepath"]):
        records = _load_cgm_records(data_root + pid_cgm)

        # Decide whether to keep the participant before doing any parsing.
        # Strictly "<": a participant with exactly one week of readings has
        # enough data and must not be dropped. This also skips empty files.
        if len(records) < required_num_samples:
            print("Skipping", pid)
            continue

        df = pd.DataFrame.from_records(records).rename(
            columns={
                "effective_time_frame_time_interval_start_date_time": "start_time",
                "effective_time_frame_time_interval_end_date_time": "end_time",
            }
        )
        df["start_dtime"] = df["start_time"].map(convert_time_string_to_datetime)
        # NOTE(review): the alignment step below was already disabled; left as found.
        # df = get_aligned_df(df, time_col="start_time")

        # Clip the values: map the "Low"/"High" readings onto numbers.
        df["blood_glucose_value"] = df["blood_glucose_value"].map(
            lambda value: replace_alt(value, low_value, high_value)
        )

        columns["patient_ids"].append(pid)
        columns["Glucose Lists"].append(get_glucose_list(df)[:required_num_samples])

        spike_metrics = compute_spike_metrics(df)
        for name, metric_key in _SPIKE_METRIC_COLUMNS.items():
            columns[name].append(spike_metrics[metric_key])

    # Assign column by column so each glucose array stays a single cell.
    dataframe_with_glucose_values = pd.DataFrame()
    for name, values in columns.items():
        dataframe_with_glucose_values[name] = values
    return dataframe_with_glucose_values


In [8]:
# Build the per-participant feature table. The shapes printed before and after show how many
# manifest rows were dropped; each dropped participant is also reported as "Skipping <id>".
print(dfm.shape)
dataframe_with_glucose_values = create_df_with_glucose_values(dfm)
print(dataframe_with_glucose_values.shape)

(1049, 8)
Skipping 1063
Skipping 1095
Skipping 1111
Skipping 1185
Skipping 1304
Skipping 1324
Skipping 1351
Skipping 1361
Skipping 4004
Skipping 4028
Skipping 4034
Skipping 4107
Skipping 4114
Skipping 4158
Skipping 4232
Skipping 4241
Skipping 7008
Skipping 7009
Skipping 7016
Skipping 7032
Skipping 7052
Skipping 7114
Skipping 7144
Skipping 7150
Skipping 7153
Skipping 7164
Skipping 7173
Skipping 7212
Skipping 7226
Skipping 7270
Skipping 7271
Skipping 7286
Skipping 7287
Skipping 7294
Skipping 7300
Skipping 7346
Skipping 7392
Skipping 7408
(1011, 8)


In [9]:
# Attach each participant's recommended split and an integer study-group label.
# The path is built from data_root so that only one location needs editing.
patients = pd.read_csv(data_root + "participants.tsv", sep="\t")
print(patients.columns)
print(patients.head())

# Mapping dictionary: study_group name -> integer class id.
# Any study_group value missing from this dict becomes NaN in study_group_id.
mapping = {
    "healthy": 0,
    "pre_diabetes_lifestyle_controlled": 1,
    "oral_medication_and_or_non_insulin_injectable_medication_controlled": 2,
    "insulin_dependent": 3
}

# Build the label table as one chain, so no column is assigned on a slice of `patients`.
r = (
    patients[["recommended_split", "participant_id", "study_group"]]
    .rename(columns={"participant_id": "patient_ids"})
    .assign(study_group_id=lambda labels: labels["study_group"].map(mapping))
    .drop(columns=["study_group"])
)

# Drop label columns left by an earlier run of this cell before merging, so that
# re-running it does not create recommended_split_x / recommended_split_y duplicates.
label_columns = [column for column in r.columns if column != "patient_ids"]
dataframe_with_glucose_values = (
    dataframe_with_glucose_values
    .drop(columns=label_columns, errors="ignore")
    .merge(r, on="patient_ids", how="left")
)
print(dataframe_with_glucose_values.columns)

Index(['participant_id', 'clinical_site', 'study_group', 'age',
       'study_visit_date', 'recommended_split', 'cardiac_ecg', 'clinical_data',
       'environment', 'retinal_flio', 'retinal_oct', 'retinal_octa',
       'retinal_photography', 'wearable_activity_monitor',
       'wearable_blood_glucose'],
      dtype='object')
   participant_id clinical_site  \
0            1001            UW   
1            1002            UW   
2            1003            UW   
3            1004            UW   
4            1005            UW   

                                         study_group  age study_visit_date  \
0                  pre_diabetes_lifestyle_controlled   69       2023-07-27   
1                                            healthy   69       2023-08-01   
2  oral_medication_and_or_non_insulin_injectable_...   82       2023-08-02   
3  oral_medication_and_or_non_insulin_injectable_...   61       2023-08-08   
4                                  insulin_dependent   58       2023-08

In [10]:
# Cache the merged table as a pickle; the relative path resolves against the current working directory.
dataframe_with_glucose_values.to_pickle(r"dataframe_with_glucose_info.pkl")


In [11]:
# Reload the cached table, keep only the scalar features, and export them to CSV.
# Only unpickle files this notebook wrote itself: unpickling can execute arbitrary code.
dataframe_with_glucose_values = (
    pd.read_pickle(r"dataframe_with_glucose_info.pkl")
    .drop(columns=['Glucose Lists'])
    .rename(columns={'patient_ids': 'participant_id'})
)
dataframe_with_glucose_values.to_csv(r"dataframe_glucose_feats.csv")
dataframe_with_glucose_values.head()

Unnamed: 0,participant_id,spike_resolutions,expected_daily_spikes,mean_glucose,relative_spikes,hyper_time_pcts,nocturnal_hypoglycemias,recommended_split,study_group_id
0,1001,30.131579,3.8,123.302871,0.39199,8.333333,106.7,train,1
1,1002,29.318182,2.75,106.862518,0.317751,9.423347,86.7,train,0
2,1003,42.166667,3.0,209.019258,0.444408,93.662465,175.5,train,2
3,1004,33.859649,5.7,188.659664,0.407537,90.581232,153.1,val,2
4,1005,36.2,2.5,275.486695,0.405447,97.163866,183.6,val,3
