In [32]:
import pandas as pd
import numpy as np
import os
import glob

In [33]:
# Collect every participant CSV in the data folder. glob returns matches in
# arbitrary (filesystem-dependent) order, so sort the paths to keep the row
# order of merged_df reproducible across machines and re-runs.
csv_files = sorted(glob.glob("data/CGMacros-*.csv"))

# Fail early with a clear message; otherwise the problem only surfaces at
# pd.concat below as a generic "No objects to concatenate" error.
if not csv_files:
    raise FileNotFoundError("No files matching data/CGMacros-*.csv were found")

# columns we want to keep
selected_columns = ["Timestamp", "Libre GL", "Dexcom GL", "HR", "Calories", "Meal Type", "Carbs", "Protein", "Fat", "Fiber"]


# list storing data frames per participant
data_list = []

for file in csv_files:
    df = pd.read_csv(file)
    # The subject id is the integer between the last "-" in the path and the
    # first "." after it (e.g. ".../CGMacros-012.csv" -> 12).
    df["subject"] = int(file.split("-")[-1].split(".")[0])
    # Put "subject" first and drop every column not listed above.
    df = df[["subject"] + selected_columns]
    data_list.append(df)

# Stack all participants into one long frame with a fresh 0..n-1 index.
merged_df = pd.concat(data_list, ignore_index=True)

In [34]:
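# NOTE(review): this disabled export is the only visible producer of
# all_data.csv, which the next cell reads. Presumably it was run once and then
# commented out -- uncomment to regenerate the file on a fresh checkout.
# merged_df.to_csv("all_data.csv", index=False)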

In [35]:
# Read all_data.csv into the working frame. This rebinds `df`, which was
# previously the per-file loop variable in the loading cell.
df = pd.read_csv(filepath_or_buffer="all_data.csv")

In [36]:
# Display the loaded frame; pandas elides the middle rows of a long frame,
# so only the first and last few rows are rendered.
df

Unnamed: 0,subject,Timestamp,Libre GL,Dexcom GL,HR,Calories,Meal Type,Carbs,Protein,Fat,Fiber
0,1,2020-05-01 10:30:00,84.000000,,56.0,,,,,,
1,1,2020-05-01 10:31:00,84.133333,,56.0,,,,,,
2,1,2020-05-01 10:32:00,84.266667,,57.0,,,,,,
3,1,2020-05-01 10:33:00,84.400000,,54.0,,,,,,
4,1,2020-05-01 10:34:00,84.533333,,55.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
687575,49,2025-05-22 00:03:00,262.000000,,,,,,,,
687576,49,2025-05-22 00:04:00,261.600000,,,,,,,,
687577,49,2025-05-22 00:05:00,261.200000,,,,,,,,
687578,49,2025-05-22 00:06:00,260.800000,,,,,,,,


In [37]:
# Read bio.csv; the next cell uses its "subject" and "A1c PDL (Lab)" columns.
bio_df = pd.read_csv(filepath_or_buffer="bio.csv")

In [38]:
# Keep only the subject id and the lab A1c value from bio.csv, exposed as "HbA1c".
# Rename first, then select: rename() ignores labels that are absent, so this
# cell can be re-run after bio_df has already been reduced. Selecting
# "A1c PDL (Lab)" first raised KeyError on a second run.
bio_df = bio_df.rename(columns={"A1c PDL (Lab)": "HbA1c"})[["subject", "HbA1c"]]

# Left-merge so every row of df is kept; rows whose subject has no match in
# bio_df get NaN in HbA1c. validate="many_to_one" raises MergeError if bio_df
# lists a subject more than once, instead of silently duplicating rows.
merged_data = df.merge(bio_df, on="subject", how="left", validate="many_to_one")


# classify participants into Diabetes status groups
def classify_diabetes(hba1c):
    """Map an HbA1c value to a diabetes-status label.

    Parameters
    ----------
    hba1c : float
        HbA1c value for one row; may be missing (NaN / None).

    Returns
    -------
    str or None
        "No Diabetes" for values below 5.7, "Pre-Diabetes" for values from
        5.7 through 6.4 (both inclusive), "Type 2 Diabetes" for values above
        6.4, and None when the value is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # value falls through to the last branch and is labelled "Type 2 Diabetes".
    if pd.isna(hba1c):
        return None
    if hba1c < 5.7:
        return "No Diabetes"
    if hba1c <= 6.4:
        return "Pre-Diabetes"
    return "Type 2 Diabetes"


# Label every row with the status derived from its HbA1c value.
merged_data["Diabetes Status"] = merged_data["HbA1c"].apply(classify_diabetes)


# One dataframe per status label, selected with a boolean row mask.
no_diabetes_df = merged_data.loc[merged_data["Diabetes Status"].eq("No Diabetes")]
pre_diabetes_df = merged_data.loc[merged_data["Diabetes Status"].eq("Pre-Diabetes")]
t2d_df = merged_data.loc[merged_data["Diabetes Status"].eq("Type 2 Diabetes")]



In [39]:
# Display the merged frame, which now carries the HbA1c and
# "Diabetes Status" columns; pandas elides the middle rows.
merged_data

Unnamed: 0,subject,Timestamp,Libre GL,Dexcom GL,HR,Calories,Meal Type,Carbs,Protein,Fat,Fiber,HbA1c,Diabetes Status
0,1,2020-05-01 10:30:00,84.000000,,56.0,,,,,,,5.4,No Diabetes
1,1,2020-05-01 10:31:00,84.133333,,56.0,,,,,,,5.4,No Diabetes
2,1,2020-05-01 10:32:00,84.266667,,57.0,,,,,,,5.4,No Diabetes
3,1,2020-05-01 10:33:00,84.400000,,54.0,,,,,,,5.4,No Diabetes
4,1,2020-05-01 10:34:00,84.533333,,55.0,,,,,,,5.4,No Diabetes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
687575,49,2025-05-22 00:03:00,262.000000,,,,,,,,,7.2,Type 2 Diabetes
687576,49,2025-05-22 00:04:00,261.600000,,,,,,,,,7.2,Type 2 Diabetes
687577,49,2025-05-22 00:05:00,261.200000,,,,,,,,,7.2,Type 2 Diabetes
687578,49,2025-05-22 00:06:00,260.800000,,,,,,,,,7.2,Type 2 Diabetes


In [40]:
# Count the distinct subjects in each diabetes-status group.
(
    merged_data
    .groupby("Diabetes Status")["subject"]
    .nunique()
)


Diabetes Status
No Diabetes        15
Pre-Diabetes       16
Type 2 Diabetes    14
Name: subject, dtype: int64