## Combine CGM

In [16]:
import os
import pandas as pd

# Define directories
standardized_folder = "../../data/processed/cgm/"  # Folder containing standardized files
output_file = "data/processed/combined_cgm.csv"

# Ensure the output folder exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Initialize an empty list to store DataFrames
dataframes = []

# Define standard column names
standard_columns = ["ID", "time", "glc"]

# Process each file
for filename in os.listdir(standardized_folder):
    if filename.endswith(".csv"):
        file_path = os.path.join(standardized_folder, filename)
        print(f"Processing file: {filename}")
        
        # Extract the prefix from the filename (everything before the first "_")
        prefix = filename.split("_")[0] + "_"
        
        # Read the file
        df = pd.read_csv(file_path)
        
        # Rename columns to standard names
        df.rename(columns={"id": "ID", "gl": "glc"}, inplace=True)
        
        # Ensure the file has the required columns
        missing_columns = [col for col in standard_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"File {filename} is missing columns: {missing_columns}")
        
        # Add the prefix to the ID column
        df["ID"] = prefix + df["ID"].astype(str)
        
        # Reorder columns to standard order
        df = df[standard_columns]
        
        # Add the DataFrame to the list
        dataframes.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(dataframes, ignore_index=True)



Processing file: aleppo_cgm_processed.csv
Processing file: chase_cgm_processed.csv
Processing file: weinstock_cgm_processed.csv
Processing file: omalley_cgm_processed.csv
Processing file: tamborlane_processed_cgm.csv
Processing file: lynch_cgm_preprocessed.csv


In [17]:
combined_df.sort_values(by=["ID", "time"], inplace=True)

In [18]:
combined_df

Unnamed: 0,ID,time,glc
1388288,aleppo_110,2015-05-22 11:14:20,136.0
1388287,aleppo_110,2015-05-22 12:32:37,157.0
1388286,aleppo_110,2015-05-22 12:33:45,168.0
1407754,aleppo_110,2015-05-22 12:36:12,155.0
1407753,aleppo_110,2015-05-22 12:41:12,149.0
...,...,...,...
3373005,weinstock_99,1990-01-15 08:10:00,163.0
3373006,weinstock_99,1990-01-15 08:15:00,170.0
3373007,weinstock_99,1990-01-15 08:20:00,172.0
3373008,weinstock_99,1990-01-15 08:25:00,163.0


## Clean CGM

In [19]:
import pandas as pd
import numpy as np

df = combined_df

# Ensure `time` is in datetime format
df["time"] = pd.to_datetime(df["time"], errors="coerce")

# Round `time` to the nearest 5 minutes
df["time"] = df["time"].dt.round("5min")

# Drop duplicate rows after rounding
df = df.drop_duplicates(subset=["ID", "time"])

# Sort values by ID and time
df = df.sort_values(by=["ID", "time"])

# Ensure glucose values are numeric
df["glc"] = pd.to_numeric(df["glc"], errors="coerce")
#df = df.dropna(subset=["glc"])


## Reduce to 15mins

In [20]:
# Ensure timestamps are in datetime format
df["time"] = pd.to_datetime(df["time"])

# Round timestamps to the nearest 15-minute mark
df["timestamp_rounded"] = df["time"].dt.round("15min")

# Group by patient ID and rounded timestamp, then take the mean of glucose values
df_resampled = df.groupby(["ID", "timestamp_rounded"], as_index=False).mean()

# Drop time col
df_resampled = df_resampled.drop(columns=['time'])

# Rename the timestamp column to match expected output
df_resampled = df_resampled.rename(columns={"timestamp_rounded": "time"})

df_resampled['glc'] = df_resampled['glc'].round(2)

In [21]:
df_resampled

Unnamed: 0,ID,time,glc
0,aleppo_110,2015-05-22 11:15:00,136.00
1,aleppo_110,2015-05-22 12:30:00,157.00
2,aleppo_110,2015-05-22 12:45:00,149.00
3,aleppo_110,2015-05-22 13:00:00,158.67
4,aleppo_110,2015-05-22 13:15:00,163.00
...,...,...,...
16068384,weinstock_99,1990-01-15 07:30:00,146.33
16068385,weinstock_99,1990-01-15 07:45:00,161.00
16068386,weinstock_99,1990-01-15 08:00:00,161.33
16068387,weinstock_99,1990-01-15 08:15:00,168.33


In [22]:
df_resampled

Unnamed: 0,ID,time,glc
0,aleppo_110,2015-05-22 11:15:00,136.00
1,aleppo_110,2015-05-22 12:30:00,157.00
2,aleppo_110,2015-05-22 12:45:00,149.00
3,aleppo_110,2015-05-22 13:00:00,158.67
4,aleppo_110,2015-05-22 13:15:00,163.00
...,...,...,...
16068384,weinstock_99,1990-01-15 07:30:00,146.33
16068385,weinstock_99,1990-01-15 07:45:00,161.00
16068386,weinstock_99,1990-01-15 08:00:00,161.33
16068387,weinstock_99,1990-01-15 08:15:00,168.33


## Remove missing periods

In [23]:
# Import dt 
df_resampled['time'] = pd.to_datetime(df_resampled['time'])

# Create a `date` column for daily grouping
df_resampled["date"] = df_resampled["time"].dt.date

# Define the minimum threshold for valid readings (90% of 29 = 259)
min_readings = int(96 * 0.9)

# Function to resample per day
def resample_per_day(group):
    # Count only non-NaN glucose values
    non_nan_count = group["glc"].notna().sum()
    
    # Check if the group has at least the minimum number of valid readings
    if non_nan_count < min_readings:
        print(f"Skipping day {group['date'].iloc[0]} for participant {group['ID'].iloc[0]}: {non_nan_count} readings.")
        return pd.DataFrame()  # Skip this day entirely
    
    # Create a full day of 5-minute intervals
    full_day_index = pd.date_range(
        start=f"{group['time'].iloc[0].date()} 00:00:00",
        end=f"{group['time'].iloc[0].date()} 23:45:00",
        freq="15min"
    )
    
    # Resample to ensure exactly 288 readings for the day
    resampled = group.set_index("time").reindex(full_day_index)
    resampled.index.name = "time"
    
    
    # Add back metadata
    resampled["ID"] = group["ID"].iloc[0]
    resampled["date"] = group["date"].iloc[0]
    return resampled.reset_index()

# Apply the function per participant and date
df_resampled = df_resampled.groupby(["ID", "date"]).apply(resample_per_day).reset_index(drop=True)


Skipping day 2015-05-22 for participant aleppo_110: 46 readings.
Skipping day 2015-05-29 for participant aleppo_110: 80 readings.
Skipping day 2015-06-05 for participant aleppo_110: 79 readings.
Skipping day 2015-06-18 for participant aleppo_110: 78 readings.
Skipping day 2015-07-02 for participant aleppo_110: 73 readings.
Skipping day 2015-07-09 for participant aleppo_110: 55 readings.
Skipping day 2015-07-17 for participant aleppo_110: 77 readings.
Skipping day 2015-07-24 for participant aleppo_110: 79 readings.
Skipping day 2015-07-30 for participant aleppo_110: 67 readings.
Skipping day 2015-08-09 for participant aleppo_110: 58 readings.
Skipping day 2015-08-15 for participant aleppo_110: 47 readings.
Skipping day 2015-08-30 for participant aleppo_110: 84 readings.
Skipping day 2015-09-04 for participant aleppo_110: 77 readings.
Skipping day 2015-09-09 for participant aleppo_110: 83 readings.
Skipping day 2015-09-13 for participant aleppo_110: 31 readings.
Skipping day 2015-09-20 f

In [None]:
df_resampled

Unnamed: 0,time,ID,glc,date
0,2015-05-23 00:00:00,aleppo_110,141.67,2015-05-23
1,2015-05-23 00:15:00,aleppo_110,140.00,2015-05-23
2,2015-05-23 00:30:00,aleppo_110,132.67,2015-05-23
3,2015-05-23 00:45:00,aleppo_110,123.00,2015-05-23
4,2015-05-23 01:00:00,aleppo_110,101.00,2015-05-23
...,...,...,...,...
691291,2016-02-04 22:45:00,aleppo_81,115.00,2016-02-04
691292,2016-02-04 23:00:00,aleppo_81,122.00,2016-02-04
691293,2016-02-04 23:15:00,aleppo_81,124.33,2016-02-04
691294,2016-02-04 23:30:00,aleppo_81,120.00,2016-02-04


## Normalise

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Normalize glucose values
scaler = MinMaxScaler(feature_range=(0, 1))  # Use z-score if preferred
df_resampled["glc"] = scaler.fit_transform(df_resampled["glc"].values.reshape(-1, 1))

# After scaling, set missing values to -1
df_resampled["glc"] = df_resampled["glc"].fillna(-1)  


In [None]:
import joblib
# ✅ Save the fitted scaler
joblib.dump(scaler, "data/processed/scaler_all.pkl")


['data/processed/scaler_lynch.pkl']

## Windowing

In [None]:


# ✅ Validate daily counts
daily_counts = df_resampled.groupby(["ID", "date"]).size()
print(f"Daily counts distribution:\n{daily_counts.value_counts()}")

# ✅ Define window size and stride for 15-min intervals
window_size = 96  # Full 24-hour window (96 readings)
stride = 48  # 50% overlap (12 hours)

# ✅ Create sliding windows
def create_windows(group):
    group = group.sort_values("time")  # Ensure correct order

    # ✅ Check if the group has enough data points
    if len(group) < window_size:
        print(f"Skipping ID {group['ID'].iloc[0]} with only {len(group)} points.")
        return pd.DataFrame()

    # ✅ Extract glucose values and timestamps
    values = group["glc"].values
    times = group["time"].values

    # ✅ Create sliding windows
    windows = []
    for i in range(0, len(values) - window_size + 1, stride):
        if i + window_size <= len(values):  # Ensure full window fits
            window = {
                "ID": group["ID"].iloc[0],
                "start_time": times[i],
                **{f"glc_{j}": values[i + j] for j in range(window_size)},
            }
            windows.append(window)

    return pd.DataFrame(windows)

# ✅ Apply windowing per participant
windows = df_resampled.groupby("ID").apply(create_windows).reset_index(drop=True)


windows = windows.drop(columns=["ID", "start_time"])

# ✅ Validate window sizes
window_sizes = windows.apply(lambda x: len(x), axis=1)
print(f"Window sizes distribution:\n{window_sizes.value_counts()}")



Daily counts distribution:
96    45609
Name: count, dtype: int64


  windows = df_resampled.groupby("ID").apply(create_windows).reset_index(drop=True)


Window sizes distribution:
96    90778
Name: count, dtype: int64


In [None]:
windows.shape

(14364, 96)

## Positional encoding

In [None]:
import numpy as np
import pandas as pd

# ✅ Parameters
window_size = 96  # 24-hour window with 15-min intervals
embed_dim = 32  # Positional encoding dimensions

# ✅ Positional encoding function (without flattening)
def positional_encoding(window_size, embed_dim):
    positions = np.arange(window_size).reshape(-1, 1)
    div_terms = np.exp(np.arange(0, embed_dim, 2) * -(np.log(10000.0) / embed_dim))
    sinusoidals = np.zeros((window_size, embed_dim))
    sinusoidals[:, 0::2] = np.sin(positions * div_terms)  # Sin for even indices
    sinusoidals[:, 1::2] = np.cos(positions * div_terms)  # Cos for odd indices
    return sinusoidals

# ✅ Generate PE for 96 time steps
pe = positional_encoding(window_size, embed_dim)  # Shape: (96, 32)

# ✅ Column naming convention
glucose_colnames = [f"glc_{t}" for t in range(window_size)]
pe_colnames = [f"pe_{t}_{d}" for t in range(window_size) for d in range(embed_dim)]

df = windows

# ✅ Extract glucose data
glucose_columns = [col for col in df.columns if col.startswith("glc_")]
glucose_data = df[glucose_columns].values  # Shape: (num_windows, 96)

# ✅ Repeat PE for all rows
pe_repeated = np.tile(pe, (len(glucose_data), 1, 1))  # Shape: (num_windows, 96, 32)
pe_reshaped = pe_repeated.reshape(len(glucose_data), -1)  # Flatten to (num_windows, 96*32)

# ✅ Convert to DataFrames
glucose_df = pd.DataFrame(glucose_data, columns=glucose_colnames)
pe_df = pd.DataFrame(pe_reshaped, columns=pe_colnames)

# ✅ Combine glucose + PE
final_df = pd.concat([glucose_df, pe_df], axis=1)  # Keep metadata

print("Final shape before saving:", final_df.shape)


Final shape before saving: (90778, 3168)


In [None]:
final_df

Unnamed: 0,glc_0,glc_1,glc_2,glc_3,glc_4,glc_5,glc_6,glc_7,glc_8,glc_9,...,pe_95_22,pe_95_23,pe_95_24,pe_95_25,pe_95_26,pe_95_27,pe_95_28,pe_95_29,pe_95_30,pe_95_31
0,0.222229,0.218750,0.203479,0.183333,0.137500,0.140271,0.168063,0.164583,0.162500,0.164583,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
1,0.360417,0.366667,0.373604,0.381937,0.393062,0.403479,0.387500,0.326396,0.304854,0.281250,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
2,0.270833,0.358333,0.364583,0.393750,0.412500,0.427083,0.442354,0.414583,0.386812,0.380562,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
3,0.169437,0.146521,0.116667,0.094437,0.072229,0.059021,0.108333,0.142354,0.150687,0.152083,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
4,0.131250,0.118750,0.114583,0.101396,0.088187,0.088187,0.088187,0.079854,0.083333,0.097229,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14359,0.098604,0.084021,0.063187,0.040271,0.033333,0.082646,0.122229,0.172229,0.206937,0.215979,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
14360,0.267354,0.209729,0.129167,0.081937,0.082646,0.090979,0.088187,0.090979,0.069437,0.061813,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
14361,0.252771,0.237500,0.222229,0.213188,0.213896,0.219438,0.227771,0.224312,0.227771,0.229854,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857
14362,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0.070833,-1.000000,-1.000000,-1.000000,-1.000000,...,0.168134,0.985764,0.094857,0.995491,0.053397,0.998573,0.030037,0.999549,0.016893,0.999857


## Make masks

In [None]:

# ✅ Use existing final_df in Jupyter Notebook (remove file I/O)
df = final_df.copy()  # Ensure we work on a separate copy

# ✅ Masking parameters
mask_prob = 0.2  # Probability of masking

# ✅ Extract glucose and positional encoding columns
glucose_columns = [col for col in df.columns if "glc_" in col]
pos_enc_columns = [col for col in df.columns if "pe_" in col]

glucose_data = df[glucose_columns].values  # Extract glucose values
positional_encodings = df[pos_enc_columns].values  # Extract positional encodings

print(f"Glucose data shape: {glucose_data.shape}")
print(f"Positional encoding shape: {positional_encodings.shape}")

# ✅ Masking function
def mask_values(window, mask_prob=0.2):
    mask = np.random.rand(*window.shape) < mask_prob  # Create mask (match window shape)
    masked_window = window.copy()
    masked_window[mask] = -1  # Replace masked positions with -1
    return masked_window, mask

# ✅ Apply masking to the entire dataset
masked_data = []
mask_labels = []

for i in range(len(glucose_data)):
    masked_window, mask = mask_values(glucose_data[i], mask_prob)
    combined_window = np.hstack([masked_window, positional_encodings[i]])  # Combine masked glucose + PE
    masked_data.append(combined_window)
    mask_labels.append(mask)

# ✅ Convert to NumPy arrays
masked_data = np.array(masked_data, dtype=np.float32)  # Ensure consistent dtype
mask_labels = np.array(mask_labels, dtype=np.float32)  # Ensure consistent dtype

# ✅ Verify final shapes
print(f"Masked data shape: {masked_data.shape}")
print(f"Mask labels shape: {mask_labels.shape}")



Glucose data shape: (90778, 96)
Positional encoding shape: (90778, 3072)
Masked data shape: (90778, 3168)
Mask labels shape: (90778, 96)


In [None]:
output_masked_file = "../../data/processed/masked_windows_all.npy"
output_labels_file = "../../data/processed/mask_labels_all.npy"

np.save(output_masked_file, masked_data)
np.save(output_labels_file, mask_labels)