# Predict w/parameter, multiplier

In [1]:
import pandas as pd


# Specify the file path for the .xlsl file
df = pd.read_excel('/Users/bg.lim/Downloads/Texas A&M_Agrilife/Projects/4.New_CGM/Datas/0206_CGM.xlsx', engine='openpyxl')  # 파일 경로 지정


# Display the dataframe
df

Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day
0,1,2022-10-06 01:34:00,78,1
1,1,2022-10-06 01:49:00,76,1
2,1,2022-10-06 02:04:00,78,1
3,1,2022-10-06 02:19:00,75,1
4,1,2022-10-06 02:34:00,75,1
...,...,...,...,...
71712,89,2023-05-09 19:08:00,114,13
71713,89,2023-05-09 19:23:00,136,13
71714,89,2023-05-09 19:38:00,142,13
71715,89,2023-05-09 19:53:00,137,13


In [2]:
print(df.columns)


Index(['StudyID', 'Timestamp', 'Glucose mg/dL', 'Relative_Day'], dtype='object')


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


## Endpoint-parameter, X days, Multiplier(1,1.5SD)

In [125]:
# Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])


# Sort data by StudyID and Timestamp
df = df.sort_values(by=['StudyID', 'Timestamp']).reset_index(drop=True)

## Set - Window_days: 7, SD:1

In [31]:
import pandas as pd
import numpy as np

def calculate_fixed_ge_threshold(df, study_id_col, timestamp_col, glucose_col, window_days=10, sd_multiplier=1.5):
    """
    Function to define the Glucose Excursion (GE) threshold using the first X days' data for calculation.
    The threshold remains constant for the entire dataset based on the initial window_days period.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): The column representing the Study ID (e.g., "StudyID").
        timestamp_col (str): The column containing the timestamp (e.g., "Timestamp").
        glucose_col (str): The column containing glucose values (e.g., "Glucose mg/dL").
        window_days (int): The number of initial days to consider for calculating the threshold.
        sd_multiplier (float): The multiplier for the standard deviation.

    Returns:
        pd.DataFrame: The original dataframe with an added column for GE upper threshold value (GE_Threshold_Upper).
    """
    df = df.copy()

    # Sort by StudyID and Timestamp to ensure chronological order
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Initialize new column for GE upper threshold
    df["GE_Threshold_Upper"] = np.nan

    # Calculate threshold based on the first `window_days` days of data for each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # Determine the first `window_days` days
        initial_data = study_data.iloc[:window_days]  # First `window_days` rows

        if len(initial_data) < window_days:
            continue  # Skip if not enough data

        # Compute mean and standard deviation for the first `window_days` days
        fixed_mean = initial_data[glucose_col].mean()
        fixed_std = initial_data[glucose_col].std()

        # Compute fixed GE upper threshold for all rows of this StudyID
        fixed_upper_threshold = fixed_mean + (sd_multiplier * fixed_std)

        # Ensure all rows have the same GE_Threshold_Upper
        df.loc[df[study_id_col] == study_id, "GE_Threshold_Upper"] = fixed_upper_threshold

    return df

# === ⚡ Example Usage === #
# Setting X days (window_days=10) and standard deviation multiplier (sd_multiplier=1.5)
df_with_fixed_threshold = calculate_fixed_ge_threshold(df, 
                                                       study_id_col="StudyID", 
                                                       timestamp_col="Timestamp", 
                                                       glucose_col="Glucose mg/dL", 
                                                       window_days=7,  # 🔹 First 10 days used for threshold calculation
                                                       sd_multiplier=1)  # 🔹 Modify this to 1.5, 2.0, etc.


# Display the first 50 rows of the DataFrame

df_with_fixed_threshold

Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper
0,1,2022-10-06 01:34:00,78,1,78.094657
1,1,2022-10-06 01:49:00,76,1,78.094657
2,1,2022-10-06 02:04:00,78,1,78.094657
3,1,2022-10-06 02:19:00,75,1,78.094657
4,1,2022-10-06 02:34:00,75,1,78.094657
...,...,...,...,...,...
71712,89,2023-05-09 19:08:00,114,13,118.403038
71713,89,2023-05-09 19:23:00,136,13,118.403038
71714,89,2023-05-09 19:38:00,142,13,118.403038
71715,89,2023-05-09 19:53:00,137,13,118.403038


## Set - Window_days: 14, SD:1.5

In [29]:
import pandas as pd
import numpy as np

def calculate_fixed_ge_threshold(df, study_id_col, timestamp_col, glucose_col, window_days=10, sd_multiplier=1.5):
    """
    Function to define the Glucose Excursion (GE) threshold using the first X days' data for calculation.
    The threshold remains constant for the entire dataset based on the initial window_days period.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): The column representing the Study ID (e.g., "StudyID").
        timestamp_col (str): The column containing the timestamp (e.g., "Timestamp").
        glucose_col (str): The column containing glucose values (e.g., "Glucose mg/dL").
        window_days (int): The number of initial days to consider for calculating the threshold.
        sd_multiplier (float): The multiplier for the standard deviation.

    Returns:
        pd.DataFrame: The original dataframe with an added column for GE upper threshold value (GE_Threshold_Upper).
    """
    df = df.copy()

    # Sort by StudyID and Timestamp to ensure chronological order
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Initialize new column for GE upper threshold
    df["GE_Threshold_Upper"] = np.nan

    # Calculate threshold based on the first `window_days` days of data for each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # Determine the first `window_days` days
        initial_data = study_data.iloc[:window_days]  # First `window_days` rows

        if len(initial_data) < window_days:
            continue  # Skip if not enough data

        # Compute mean and standard deviation for the first `window_days` days
        fixed_mean = initial_data[glucose_col].mean()
        fixed_std = initial_data[glucose_col].std()

        # Compute fixed GE upper threshold for all rows of this StudyID
        fixed_upper_threshold = fixed_mean + (sd_multiplier * fixed_std)

        # Ensure all rows have the same GE_Threshold_Upper
        df.loc[df[study_id_col] == study_id, "GE_Threshold_Upper"] = fixed_upper_threshold

    return df

# === ⚡ Example Usage === #
# Setting X days (window_days=10) and standard deviation multiplier (sd_multiplier=1.5)
df_with_fixed_threshold = calculate_fixed_ge_threshold(df, 
                                                       study_id_col="StudyID", 
                                                       timestamp_col="Timestamp", 
                                                       glucose_col="Glucose mg/dL", 
                                                       window_days=14,  # 🔹 First 10 days used for threshold calculation
                                                       sd_multiplier=1.5)  # 🔹 Modify this to 1.5, 2.0, etc.


# Display the first 50 rows of the DataFrame

df_with_fixed_threshold

Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper
0,1,2022-10-06 01:34:00,78,1,80.436806
1,1,2022-10-06 01:49:00,76,1,80.436806
2,1,2022-10-06 02:04:00,78,1,80.436806
3,1,2022-10-06 02:19:00,75,1,80.436806
4,1,2022-10-06 02:34:00,75,1,80.436806
...,...,...,...,...,...
71712,89,2023-05-09 19:08:00,114,13,121.294747
71713,89,2023-05-09 19:23:00,136,13,121.294747
71714,89,2023-05-09 19:38:00,142,13,121.294747
71715,89,2023-05-09 19:53:00,137,13,121.294747


## Add GE count (14days, 1.5SD)

In [44]:
import pandas as pd
import numpy as np

def calculate_fixed_ge_threshold(df, study_id_col, timestamp_col, glucose_col, window_days=10, sd_multiplier=1.5):
    """
    Compute GE threshold based on the first X days' data for each StudyID.
    The threshold remains constant for the entire dataset.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): Column representing the Study ID.
        timestamp_col (str): Column containing the timestamp.
        glucose_col (str): Column containing glucose values.
        window_days (int): Number of initial days for threshold calculation.
        sd_multiplier (float): Multiplier for standard deviation.

    Returns:
        pd.DataFrame: Updated DataFrame with `GE_Threshold_Upper`.
    """
    df = df.copy()

    # Sort and reset index
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Initialize threshold column
    df["GE_Threshold_Upper"] = np.nan

    # Compute threshold based on first `window_days` days for each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # First `window_days` of data
        initial_data = study_data.iloc[:window_days]

        if len(initial_data) < window_days:
            continue  # Skip if insufficient data

        # Compute mean and standard deviation
        fixed_mean = initial_data[glucose_col].mean()
        fixed_std = initial_data[glucose_col].std()

        # Compute threshold
        fixed_upper_threshold = fixed_mean + (sd_multiplier * fixed_std)

        # Apply threshold to all data for this StudyID
        df.loc[df[study_id_col] == study_id, "GE_Threshold_Upper"] = fixed_upper_threshold

    return df

def add_ge_flag_and_count(df, glucose_col, threshold_col, study_id_col, relative_day_col):
    """
    Add GE flag (0/1) and count unique GE events per `Relative_Day`.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        glucose_col (str): Column containing glucose values.
        threshold_col (str): Column for GE threshold.
        study_id_col (str): Column representing the Study ID.
        relative_day_col (str): Column representing Relative Day.

    Returns:
        pd.DataFrame: Updated DataFrame with `GE_Flag` and `GE_Count`.
    """
    df = df.copy()

    # Compute GE flag (1 if glucose exceeds threshold)
    df["GE_Flag"] = (df[glucose_col] > df[threshold_col]).astype(int)
    df["GE_Count"] = 0

    # Iterate through each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id]

        # Iterate through each Relative_Day
        for day in study_data[relative_day_col].unique():
            daily_data = study_data[study_data[relative_day_col] == day].copy()

            # Detect new GE events (1 followed by 0 starts a new event)
            daily_data["GE_Start"] = daily_data["GE_Flag"].diff().fillna(0) == 1

            # Count unique GE occurrences per Relative_Day
            ge_count = daily_data["GE_Start"].sum()

            # Assign count to dataframe
            df.loc[(df[study_id_col] == study_id) & (df[relative_day_col] == day), "GE_Count"] = ge_count

    return df

# === ⚡ Example Usage === #
# Set window_days and sd_multiplier dynamically
window_days = 14  # Changeable parameter
sd_multiplier = 1.5  # Changeable parameter

# Convert timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Compute GE Threshold dynamically
df_with_threshold = calculate_fixed_ge_threshold(df, 
                                                 study_id_col="StudyID", 
                                                 timestamp_col="Timestamp", 
                                                 glucose_col="Glucose mg/dL", 
                                                 window_days=window_days, 
                                                 sd_multiplier=sd_multiplier)

# Compute GE Flags and Count
df_with_ge_results = add_ge_flag_and_count(df_with_threshold, 
                                           glucose_col="Glucose mg/dL", 
                                           threshold_col="GE_Threshold_Upper", 
                                           study_id_col="StudyID", 
                                           relative_day_col="Relative_Day")

# Display results
df_with_ge_results.head(60)



Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper,GE_Flag,GE_Count
0,1,2022-10-06 01:34:00,78,1,80.436806,0,7
1,1,2022-10-06 01:49:00,76,1,80.436806,0,7
2,1,2022-10-06 02:04:00,78,1,80.436806,0,7
3,1,2022-10-06 02:19:00,75,1,80.436806,0,7
4,1,2022-10-06 02:34:00,75,1,80.436806,0,7
5,1,2022-10-06 02:49:00,78,1,80.436806,0,7
6,1,2022-10-06 03:04:00,73,1,80.436806,0,7
7,1,2022-10-06 03:19:00,68,1,80.436806,0,7
8,1,2022-10-06 03:34:00,76,1,80.436806,0,7
9,1,2022-10-06 03:49:00,75,1,80.436806,0,7


In [45]:
# Randomly select and print 50 rows from df_with_ge_results
df_random_sample = df_with_ge_results.sample(n=50, random_state=42)

# Print the selected sample
df_random_sample

Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper,GE_Flag,GE_Count
11987,19,2022-11-09 07:07:00,40,3,40.0,0,4
22498,40,2022-12-09 19:36:00,77,3,111.597275,0,2
34377,50,2023-02-23 16:55:00,70,3,97.337225,0,6
66581,84,2023-04-21 04:56:00,168,11,257.87279,0,0
71045,89,2023-05-02 20:23:00,87,7,121.294747,0,1
13856,24,2022-11-16 19:25:00,74,7,109.547603,0,1
11527,18,2022-11-10 22:38:00,119,6,117.898825,1,2
36420,52,2023-03-06 19:21:00,119,12,109.546669,1,3
31183,47,2023-02-26 06:41:00,120,9,79.864009,1,1
65979,84,2023-04-14 22:26:00,254,5,257.87279,0,1


## Add GE count (7days, 1SD)

In [42]:
import pandas as pd
import numpy as np

def calculate_fixed_ge_threshold(df, study_id_col, timestamp_col, glucose_col, window_days=10, sd_multiplier=1.5):
    """
    Compute GE threshold based on the first X days' data for each StudyID.
    The threshold remains constant for the entire dataset.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): Column representing the Study ID.
        timestamp_col (str): Column containing the timestamp.
        glucose_col (str): Column containing glucose values.
        window_days (int): Number of initial days for threshold calculation.
        sd_multiplier (float): Multiplier for standard deviation.

    Returns:
        pd.DataFrame: Updated DataFrame with `GE_Threshold_Upper`.
    """
    df = df.copy()

    # Sort and reset index
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Initialize threshold column
    df["GE_Threshold_Upper"] = np.nan

    # Compute threshold based on first `window_days` days for each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # First `window_days` of data
        initial_data = study_data.iloc[:window_days]

        if len(initial_data) < window_days:
            continue  # Skip if insufficient data

        # Compute mean and standard deviation
        fixed_mean = initial_data[glucose_col].mean()
        fixed_std = initial_data[glucose_col].std()

        # Compute threshold
        fixed_upper_threshold = fixed_mean + (sd_multiplier * fixed_std)

        # Apply threshold to all data for this StudyID
        df.loc[df[study_id_col] == study_id, "GE_Threshold_Upper"] = fixed_upper_threshold

    return df

def add_ge_flag_and_count(df, glucose_col, threshold_col, study_id_col, relative_day_col):
    """
    Add GE flag (0/1) and count unique GE events per `Relative_Day`.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        glucose_col (str): Column containing glucose values.
        threshold_col (str): Column for GE threshold.
        study_id_col (str): Column representing the Study ID.
        relative_day_col (str): Column representing Relative Day.

    Returns:
        pd.DataFrame: Updated DataFrame with `GE_Flag` and `GE_Count`.
    """
    df = df.copy()

    # Compute GE flag (1 if glucose exceeds threshold)
    df["GE_Flag"] = (df[glucose_col] > df[threshold_col]).astype(int)
    df["GE_Count"] = 0

    # Iterate through each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id]

        # Iterate through each Relative_Day
        for day in study_data[relative_day_col].unique():
            daily_data = study_data[study_data[relative_day_col] == day].copy()

            # Detect new GE events (1 followed by 0 starts a new event)
            daily_data["GE_Start"] = daily_data["GE_Flag"].diff().fillna(0) == 1

            # Count unique GE occurrences per Relative_Day
            ge_count = daily_data["GE_Start"].sum()

            # Assign count to dataframe
            df.loc[(df[study_id_col] == study_id) & (df[relative_day_col] == day), "GE_Count"] = ge_count

    return df

# === ⚡ Example Usage === #
# Set window_days and sd_multiplier dynamically
window_days = 7  # Changeable parameter
sd_multiplier = 1  # Changeable parameter

# Convert timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Compute GE Threshold dynamically
df_with_threshold = calculate_fixed_ge_threshold(df, 
                                                 study_id_col="StudyID", 
                                                 timestamp_col="Timestamp", 
                                                 glucose_col="Glucose mg/dL", 
                                                 window_days=window_days, 
                                                 sd_multiplier=sd_multiplier)

# Compute GE Flags and Count
df_with_ge_results = add_ge_flag_and_count(df_with_threshold, 
                                           glucose_col="Glucose mg/dL", 
                                           threshold_col="GE_Threshold_Upper", 
                                           study_id_col="StudyID", 
                                           relative_day_col="Relative_Day")

# Display results
df_with_ge_results.head(60)



Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper,GE_Flag,GE_Count
0,1,2022-10-06 01:34:00,78,1,78.094657,0,6
1,1,2022-10-06 01:49:00,76,1,78.094657,0,6
2,1,2022-10-06 02:04:00,78,1,78.094657,0,6
3,1,2022-10-06 02:19:00,75,1,78.094657,0,6
4,1,2022-10-06 02:34:00,75,1,78.094657,0,6
5,1,2022-10-06 02:49:00,78,1,78.094657,0,6
6,1,2022-10-06 03:04:00,73,1,78.094657,0,6
7,1,2022-10-06 03:19:00,68,1,78.094657,0,6
8,1,2022-10-06 03:34:00,76,1,78.094657,0,6
9,1,2022-10-06 03:49:00,75,1,78.094657,0,6


In [43]:
# Randomly select and print 50 rows from df_with_ge_results
df_random_sample = df_with_ge_results.sample(n=50, random_state=42)

# Print the selected sample
df_random_sample

Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper,GE_Flag,GE_Count
11987,19,2022-11-09 07:07:00,40,3,40.0,0,4
22498,40,2022-12-09 19:36:00,77,3,99.635092,0,5
34377,50,2023-02-23 16:55:00,70,3,87.460076,0,7
66581,84,2023-04-21 04:56:00,168,11,249.000111,0,0
71045,89,2023-05-02 20:23:00,87,7,118.403038,0,1
13856,24,2022-11-16 19:25:00,74,7,84.825992,0,5
11527,18,2022-11-10 22:38:00,119,6,88.223165,1,5
36420,52,2023-03-06 19:21:00,119,12,105.329227,1,4
31183,47,2023-02-26 06:41:00,120,9,80.362716,1,2
65979,84,2023-04-14 22:26:00,254,5,249.000111,1,2


# Add 140mg/dl threshold for GE with SD (14days, 1.5SD)

In [46]:
import pandas as pd
import numpy as np

def add_final_ge_columns(df, glucose_col, threshold_col, study_id_col, relative_day_col):
    """
    Adds columns to check if glucose exceeds 140, determine final GE flag, and count final GE occurrences per Relative_Day.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        glucose_col (str): Column containing glucose values.
        threshold_col (str): Column for GE threshold.
        study_id_col (str): Column representing the Study ID.
        relative_day_col (str): Column representing Relative Day.

    Returns:
        pd.DataFrame: Updated DataFrame with `GE_Above_140`, `Final_GE_Flag`, and `Final_GE_Count`.
    """
    df = df.copy()

    # Compute flag for glucose exceeding 140
    df["GE_Above_140"] = (df[glucose_col] > 140).astype(int)

    # Compute final GE flag (1 if both 140 threshold and SD-based threshold are exceeded)
    df["Final_GE_Flag"] = ((df["GE_Above_140"] == 1) & (df["GE_Flag"] == 1)).astype(int)

    # Initialize final GE count column
    df["Final_GE_Count"] = 0

    # Iterate through each StudyID
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id]

        # Iterate through each Relative_Day
        for day in study_data[relative_day_col].unique():
            daily_data = study_data[study_data[relative_day_col] == day].copy()  # Explicitly create a copy

            # Detect new final GE events (1 followed by 0 starts a new event)
            daily_data["Final_GE_Start"] = daily_data["Final_GE_Flag"].diff().fillna(0) == 1

            # Count unique Final GE occurrences per Relative_Day
            final_ge_count = daily_data["Final_GE_Start"].sum()

            # Assign count to dataframe
            df.loc[(df[study_id_col] == study_id) & (df[relative_day_col] == day), "Final_GE_Count"] = final_ge_count

    return df

# === ⚡ Example Usage === #
# Set window_days and sd_multiplier dynamically
window_days = 14  # Changeable parameter
sd_multiplier = 1.5  # Changeable parameter

# Convert timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Compute GE Threshold dynamically
df_with_threshold = calculate_fixed_ge_threshold(df, 
                                                 study_id_col="StudyID", 
                                                 timestamp_col="Timestamp", 
                                                 glucose_col="Glucose mg/dL", 
                                                 window_days=window_days, 
                                                 sd_multiplier=sd_multiplier)

# Compute GE Flags and Count
df_with_ge_results = add_ge_flag_and_count(df_with_threshold, 
                                           glucose_col="Glucose mg/dL", 
                                           threshold_col="GE_Threshold_Upper", 
                                           study_id_col="StudyID", 
                                           relative_day_col="Relative_Day")

# Compute final GE conditions
df_with_final_ge = add_final_ge_columns(df_with_ge_results, 
                                        glucose_col="Glucose mg/dL", 
                                        threshold_col="GE_Threshold_Upper", 
                                        study_id_col="StudyID", 
                                        relative_day_col="Relative_Day")

# Display results
df_with_final_ge.head(60)




Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper,GE_Flag,GE_Count,GE_Above_140,Final_GE_Flag,Final_GE_Count
0,1,2022-10-06 01:34:00,78,1,80.436806,0,7,0,0,0
1,1,2022-10-06 01:49:00,76,1,80.436806,0,7,0,0,0
2,1,2022-10-06 02:04:00,78,1,80.436806,0,7,0,0,0
3,1,2022-10-06 02:19:00,75,1,80.436806,0,7,0,0,0
4,1,2022-10-06 02:34:00,75,1,80.436806,0,7,0,0,0
5,1,2022-10-06 02:49:00,78,1,80.436806,0,7,0,0,0
6,1,2022-10-06 03:04:00,73,1,80.436806,0,7,0,0,0
7,1,2022-10-06 03:19:00,68,1,80.436806,0,7,0,0,0
8,1,2022-10-06 03:34:00,76,1,80.436806,0,7,0,0,0
9,1,2022-10-06 03:49:00,75,1,80.436806,0,7,0,0,0


In [53]:
# Randomly select and print 50 rows from df_with_ge_results
df_random_sample = df_with_final_ge.sample(n=30, random_state=42)

# Print the selected sample
df_random_sample



Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day,GE_Threshold_Upper,GE_Flag,GE_Count,GE_Above_140,Final_GE_Flag,Final_GE_Count
11987,19,2022-11-09 07:07:00,40,3,40.0,0,4,0,0,0
22498,40,2022-12-09 19:36:00,77,3,111.597275,0,2,0,0,0
34377,50,2023-02-23 16:55:00,70,3,97.337225,0,6,0,0,0
66581,84,2023-04-21 04:56:00,168,11,257.87279,0,0,1,0,0
71045,89,2023-05-02 20:23:00,87,7,121.294747,0,1,0,0,0
13856,24,2022-11-16 19:25:00,74,7,109.547603,0,1,0,0,0
11527,18,2022-11-10 22:38:00,119,6,117.898825,1,2,0,0,2
36420,52,2023-03-06 19:21:00,119,12,109.546669,1,3,0,0,0
31183,47,2023-02-26 06:41:00,120,9,79.864009,1,1,0,0,0
65979,84,2023-04-14 22:26:00,254,5,257.87279,0,1,1,0,1


## Final GE Count

In [54]:
import pandas as pd

# Select only necessary columns
df_summary = df_with_final_ge[["StudyID", "Relative_Day", "GE_Count", "Final_GE_Count"]]

# Drop duplicate Relative_Day entries per StudyID (keeping the first occurrence)
df_summary = df_summary.drop_duplicates(subset=["StudyID", "Relative_Day"]).sort_values(by=["StudyID", "Relative_Day"])


df_summary

Unnamed: 0,StudyID,Relative_Day,GE_Count,Final_GE_Count
0,1,1,7,0
96,1,2,2,0
192,1,3,3,0
288,1,4,3,0
384,1,5,1,0
...,...,...,...,...
71237,89,9,6,1
71333,89,10,2,2
71429,89,11,6,3
71525,89,12,2,2


In [55]:
# Save the summarized DataFrame as a CSV file
file_path = "Final_GECounts_SD_140.csv"
df_summary.to_csv(file_path, index=False)


In [72]:
df

Unnamed: 0,StudyID,Timestamp,Glucose mg/dL,Relative_Day
0,1,2022-10-06 01:34:00,78,1
1,1,2022-10-06 01:49:00,76,1
2,1,2022-10-06 02:04:00,78,1
3,1,2022-10-06 02:19:00,75,1
4,1,2022-10-06 02:34:00,75,1
...,...,...,...,...
71712,89,2023-05-09 19:08:00,114,13
71713,89,2023-05-09 19:23:00,136,13
71714,89,2023-05-09 19:38:00,142,13
71715,89,2023-05-09 19:53:00,137,13


# Develop GE Presiction Model
## It seems like overfitting

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def prepare_ge_prediction_data(df, study_id_col, timestamp_col, glucose_col, window_days=10, sd_multiplier=1.5):
    """
    Prepare data for GE prediction based on historical glucose levels.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): Column representing the Study ID.
        timestamp_col (str): Column containing the timestamp.
        glucose_col (str): Column containing glucose values.
        window_days (int): Number of past days for feature extraction.
        sd_multiplier (float): Multiplier for standard deviation.

    Returns:
        pd.DataFrame: Processed DataFrame with features and target labels.
    """
    df = df.copy()

    # Sort and reset index
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Compute rolling mean and standard deviation for each StudyID
    df["GE_Threshold_Upper"] = np.nan
    df["Final_GE_Flag"] = 0

    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # First `window_days` of data
        initial_data = study_data.iloc[:window_days]

        if len(initial_data) < window_days:
            continue

        # Compute mean and standard deviation
        fixed_mean = initial_data[glucose_col].mean()
        fixed_std = initial_data[glucose_col].std()

        # Compute threshold
        fixed_upper_threshold = fixed_mean + (sd_multiplier * fixed_std)

        # Apply threshold
        df.loc[df[study_id_col] == study_id, "GE_Threshold_Upper"] = fixed_upper_threshold

        # Compute final GE flag (both 140 mg/dL and threshold must be exceeded)
        df.loc[(df[study_id_col] == study_id) & 
               (df[glucose_col] > 140) & 
               (df[glucose_col] > fixed_upper_threshold), "Final_GE_Flag"] = 1

    return df

def train_ge_prediction_model(df, feature_cols, target_col):
    """
    Train a model to predict GE occurrences.

    Parameters:
        df (pd.DataFrame): Processed DataFrame with features and target labels.
        feature_cols (list): List of feature column names.
        target_col (str): Target column name.

    Returns:
        model (RandomForestClassifier): Trained machine learning model.
        accuracy (float): Accuracy score of the model.
    """
    # Drop NaN values
    df = df.dropna()

    # Prepare training data
    X = df[feature_cols]
    y = df[target_col]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Train model (Random Forest Classifier)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Accuracy:", accuracy)
    print(classification_report(y_test, y_pred))

    return model, accuracy

# === ⚡ Example Usage === #
# Set window_days and sd_multiplier dynamically
window_days = 14  # Changeable parameter
sd_multiplier = 1.5  # Changeable parameter


# Convert timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Prepare data with GE thresholds
df_with_features = prepare_ge_prediction_data(df, 
                                              study_id_col="StudyID", 
                                              timestamp_col="Timestamp", 
                                              glucose_col="Glucose mg/dL", 
                                              window_days=window_days, 
                                              sd_multiplier=sd_multiplier)

# Define features for prediction
feature_columns = ["Glucose mg/dL", "GE_Threshold_Upper"]  # 추가적인 feature 가능
target_column = "Final_GE_Flag"

# Train GE prediction model
ge_model, ge_accuracy = train_ge_prediction_model(df_with_features, feature_columns, target_column)


Model Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13954
           1       1.00      1.00      1.00       390

    accuracy                           1.00     14344
   macro avg       1.00      1.00      1.00     14344
weighted avg       1.00      1.00      1.00     14344



## Imbalanced Data
### The class distribution was 97% (0) vs. 2% (1), indicating severe imbalance
### SMOTE: data resampling


In [58]:
# Check distribution of Final_GE_Flag
print(df_with_features["Final_GE_Flag"].value_counts(normalize=True))


Final_GE_Flag
0    0.972782
1    0.027218
Name: proportion, dtype: float64


In [63]:
pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4
Note: you may need to restart the kernel to use updated packages.


## The model is still overfitting, even after SMOTE
### Data Might Be Too Simple
### Features Are Too Predictive
### Training & Testing Data Are Too Similar
### Random Forest Might Be Too Strong

In [70]:
# Import necessary libraries
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prepare feature and target variables
X = df_with_features[["Glucose mg/dL", "GE_Threshold_Upper"]]
y = df_with_features["Final_GE_Flag"]

# Display class distribution before resampling
print("Before Resampling:", y.value_counts(normalize=True))

# Split the dataset into training and testing sets while maintaining class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Display class distribution after resampling
print("After Resampling:", pd.Series(y_resampled).value_counts(normalize=True))

# Train a Random Forest classifier with balanced data
model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Before Resampling: Final_GE_Flag
0    0.972782
1    0.027218
Name: proportion, dtype: float64
After Resampling: Final_GE_Flag
0    0.5
1    0.5
Name: proportion, dtype: float64
Model Accuracy: 0.9978388176240937
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13954
           1       0.93      1.00      0.96       390

    accuracy                           1.00     14344
   macro avg       0.96      1.00      0.98     14344
weighted avg       1.00      1.00      1.00     14344



In [71]:
from sklearn.model_selection import cross_val_score

# Perform 5-Fold Cross-Validation
cv_scores = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring="accuracy")

# Display cross-validation results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))


Cross-Validation Accuracy Scores: [0.99847704 0.99829787 0.9982978  0.9991041  0.99883533]
Mean Accuracy: 0.9986024298968283


In [79]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Model Accuracy: 0.9795036252091467
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     13954
           1       0.57      1.00      0.73       390

    accuracy                           0.98     14344
   macro avg       0.79      0.99      0.86     14344
weighted avg       0.99      0.98      0.98     14344



In [80]:
# Drop 'GE_Threshold_Upper' and only use raw glucose-related features


In [83]:
# Drop 'GE_Threshold_Upper' and only use raw glucose-related features
def prepare_features(df, study_id_col, timestamp_col, glucose_col, window_days=10, sd_multiplier=1.5):
    """
    Prepare features for GE prediction: rolling mean, standard deviation, threshold calculations.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): Column representing the Study ID.
        timestamp_col (str): Column containing the timestamp.
        glucose_col (str): Column containing glucose values.
        window_days (int): Number of past days for feature extraction.
        sd_multiplier (float): Multiplier for standard deviation.

    Returns:
        pd.DataFrame: Processed DataFrame with computed features.
    """
    df = df.copy()

    # Sort by StudyID and Timestamp to maintain chronological order
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Initialize new columns
    df["Rolling_Mean"] = np.nan
    df["Rolling_Std"] = np.nan
    df["GE_Above_140"] = (df[glucose_col] > 140).astype(int)
    df["Final_GE_Flag"] = 0

    # Compute rolling mean and standard deviation for each StudyID separately
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # Compute rolling statistics
        study_data["Rolling_Mean"] = study_data[glucose_col].rolling(window=window_days, min_periods=1).mean()
        study_data["Rolling_Std"] = study_data[glucose_col].rolling(window=window_days, min_periods=1).std()

        # Compute upper threshold
        study_data["GE_Threshold_Upper"] = study_data["Rolling_Mean"] + (sd_multiplier * study_data["Rolling_Std"])

        # Assign computed values back to the main dataframe (ensuring correct index alignment)
        df.loc[df[study_id_col] == study_id, "Rolling_Mean"] = study_data["Rolling_Mean"].values
        df.loc[df[study_id_col] == study_id, "Rolling_Std"] = study_data["Rolling_Std"].values
        df.loc[df[study_id_col] == study_id, "GE_Threshold_Upper"] = study_data["GE_Threshold_Upper"].values

        # Define final GE flag (if both conditions are met)
        df.loc[(df[study_id_col] == study_id) & 
               (df[glucose_col] > 140) & 
               (df[glucose_col] > df["GE_Threshold_Upper"]), "Final_GE_Flag"] = 1

    return df



def train_ge_prediction_model(df, feature_cols, target_col, use_random_forest=True):
    """
    Train a machine learning model to predict GE occurrences.

    Parameters:
        df (pd.DataFrame): Processed DataFrame with features and target labels.
        feature_cols (list): List of feature column names.
        target_col (str): Target column name.
        use_random_forest (bool): If True, use Random Forest; if False, use Logistic Regression.

    Returns:
        model: Trained machine learning model.
    """
    # Drop NaN values
    df = df.dropna()

    # Prepare training data
    X = df[feature_cols]
    y = df[target_col]

    # Display class distribution before resampling
    print("Before Resampling:", y.value_counts(normalize=True))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.2, 
                                                        random_state=42, 
                                                        stratify=y)

    # Apply SMOTE to balance the dataset
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Display class distribution after resampling
    print("After Resampling:", pd.Series(y_resampled).value_counts(normalize=True))

    # Train model (Random Forest or Logistic Regression)
    if use_random_forest:
        model = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
    else:
        model = LogisticRegression()

    model.fit(X_resampled, y_resampled)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Evaluate model performance
    print("Model Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    return model

# === ⚡ Example Usage === #
# Set experiment parameters
window_days = 14  # Changeable parameter (X-day rolling window)
sd_multiplier = 1.5  # Changeable parameter (SD multiplier)


# Convert timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Generate features dynamically
df_with_features = prepare_features(df, 
                                    study_id_col="StudyID", 
                                    timestamp_col="Timestamp", 
                                    glucose_col="Glucose mg/dL", 
                                    window_days=window_days, 
                                    sd_multiplier=sd_multiplier)

# Define feature columns and target variable
feature_columns = ["Glucose mg/dL", "Rolling_Mean", "Rolling_Std"]  # Removed 'GE_Threshold_Upper'
target_column = "Final_GE_Flag"

# Train GE prediction model (Random Forest)
ge_model_rf = train_ge_prediction_model(df_with_features, feature_columns, target_column, use_random_forest=True)

# Train GE prediction model (Logistic Regression)
ge_model_lr = train_ge_prediction_model(df_with_features, feature_columns, target_column, use_random_forest=False)



Before Resampling: Final_GE_Flag
0    0.981422
1    0.018578
Name: proportion, dtype: float64
After Resampling: Final_GE_Flag
0    0.5
1    0.5
Name: proportion, dtype: float64
Model Accuracy: 0.9847860981226882
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     14063
           1       0.55      1.00      0.71       266

    accuracy                           0.98     14329
   macro avg       0.77      0.99      0.85     14329
weighted avg       0.99      0.98      0.99     14329

Before Resampling: Final_GE_Flag
0    0.981422
1    0.018578
Name: proportion, dtype: float64
After Resampling: Final_GE_Flag
0    0.5
1    0.5
Name: proportion, dtype: float64
Model Accuracy: 0.9838090585525857
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     14063
           1       0.53      1.00      0.70       266

    accuracy                           0.98     14329
   macro avg       0.77      0.

# 30 min interval GE Prediction Model with X day, SD and Threshold

In [87]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def compute_fixed_ge_threshold(df, study_id_col, timestamp_col, glucose_col, window_days=14, sd_multiplier=1.5, interval_minutes=30):
    """
    Compute GE threshold based on fixed X-day intervals and apply it to predict GE every 30 minutes.

    Parameters:
        df (pd.DataFrame): The CGM dataset.
        study_id_col (str): Column representing the Study ID.
        timestamp_col (str): Column containing the timestamp.
        glucose_col (str): Column containing glucose values.
        window_days (int): Number of days for each fixed interval.
        sd_multiplier (float): Multiplier for standard deviation.
        interval_minutes (int): Interval in minutes for sliding prediction.

    Returns:
        pd.DataFrame: DataFrame with fixed GE threshold applied every 30 minutes.
    """
    df = df.copy()
    
    # Convert timestamp to datetime
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

    # Sort by StudyID and Timestamp
    df = df.sort_values(by=[study_id_col, timestamp_col]).reset_index(drop=True)

    # Initialize columns
    df["GE_Threshold_Upper"] = np.nan
    df["GE_Above_140"] = (df[glucose_col] > 140).astype(int)
    df["Final_GE_Flag"] = 0

    # Compute GE threshold for each StudyID separately
    for study_id in df[study_id_col].unique():
        study_data = df[df[study_id_col] == study_id].copy()

        # Determine unique window intervals based on X-day grouping
        study_data["Interval_Group"] = (study_data[timestamp_col] - study_data[timestamp_col].min()).dt.days // window_days

        # Create 30-minute time intervals
        min_time = study_data[timestamp_col].min()
        max_time = study_data[timestamp_col].max()
        time_intervals = pd.date_range(min_time, max_time, freq=f"{interval_minutes}T")

        for interval in study_data["Interval_Group"].unique():
            interval_data = study_data[study_data["Interval_Group"] == interval]

            if len(interval_data) < 1:
                continue  # Skip if not enough data

            # Compute Mean and SD for the entire X-day interval
            fixed_mean = interval_data[glucose_col].mean()
            fixed_std = interval_data[glucose_col].std()
            fixed_upper_threshold = fixed_mean + (sd_multiplier * fixed_std)

            # Apply threshold to each 30-minute interval
            for current_time in time_intervals:
                closest_record = interval_data[interval_data[timestamp_col] >= current_time].head(1)

                if not closest_record.empty:
                    idx = closest_record.index[0]

                    # Assign computed threshold values
                    df.loc[idx, "GE_Threshold_Upper"] = fixed_upper_threshold

                    # Define GE flag (if both conditions are met)
                    df.loc[idx, "Final_GE_Flag"] = (
                        (df.loc[idx, glucose_col] > 140) & 
                        (df.loc[idx, glucose_col] > fixed_upper_threshold)
                    ).astype(int)

    return df

def train_ge_prediction_model(df, feature_cols, target_col):
    """
    Train a machine learning model to predict GE occurrences.

    Parameters:
        df (pd.DataFrame): Processed DataFrame with features and target labels.
        feature_cols (list): List of feature column names.
        target_col (str): Target column name.

    Returns:
        model (RandomForestClassifier): Trained machine learning model.
    """
    # Drop NaN values
    df = df.dropna()

    # Prepare training data
    X = df[feature_cols]
    y = df[target_col]

    # Display class distribution before resampling
    print("Before Resampling:", y.value_counts(normalize=True))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.2, 
                                                        random_state=42, 
                                                        stratify=y)

    # Apply SMOTE to balance the dataset
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Display class distribution after resampling
    print("After Resampling:", pd.Series(y_resampled).value_counts(normalize=True))

    # Train model (Random Forest Classifier)
    model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Evaluate model performance
    print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return model

# === ⚡ Example Usage === #
# Set experiment parameters
window_days = 14  # Fixed X-day interval
sd_multiplier = 1.5  # SD multiplier (1.0 - 1.5)
interval_minutes = 30  # Predict every 30 minutes

# Convert timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Compute fixed X-day thresholds for GE prediction (every 30 minutes)
df_with_thresholds = compute_fixed_ge_threshold(df, 
                                                study_id_col="StudyID", 
                                                timestamp_col="Timestamp", 
                                                glucose_col="Glucose mg/dL", 
                                                window_days=window_days, 
                                                sd_multiplier=sd_multiplier,
                                                interval_minutes=interval_minutes)

# Define feature columns and target variable
feature_columns = ["Glucose mg/dL", "GE_Threshold_Upper"]
target_column = "Final_GE_Flag"

# Train GE prediction model
ge_model = train_ge_prediction_model(df_with_thresholds, feature_columns, target_column)


Before Resampling: Final_GE_Flag
0    0.972234
1    0.027766
Name: proportion, dtype: float64
After Resampling: Final_GE_Flag
0    0.5
1    0.5
Name: proportion, dtype: float64

Model Accuracy: 0.9990243902439024

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6976
           1       0.97      1.00      0.98       199

    accuracy                           1.00      7175
   macro avg       0.98      1.00      0.99      7175
weighted avg       1.00      1.00      1.00      7175



## Training a Model to Predict GE Every 30 Minutes Based on X-Day Mean, Standard Deviation, and 140 mg/dL Threshold

## However, due to the simple predictive structure using only CGM data, the model continues to experience overfitting.