In [41]:
import pandas as pd
import numpy as np
from scipy import signal

In [42]:
cleaned_file = "../../data/processed/2_cleaned_cgm.csv"  # Path to save the cleaned file
df = pd.read_csv(cleaned_file, nrows=1000)  # Load the cleaned data
df['time'] = pd.to_datetime(df['time'])
df.sort_values(['ID', 'time'], inplace=True)
df.set_index('time', inplace=True)

In [43]:
df_dexcom = df[df['device']== 'intervals_5mins']

In [16]:
df_dexcom = df_dexcom[(df_dexcom['ID'].str.startswith('dexi'))|(df_dexcom['ID'].str.startswith('dexip'))]

In [44]:
# ----- helpers ------------------------------------------------------------- #
def bgi_mgdl(g):
    # Kovatchev 2006 constants for mg/dL
    return 1.509 * (np.log(g) ** 1.084 - 5.381)

def mage_window(x):
    if x.size < 2:          # need at least two points to compute a diff
        return np.nan
    sd = np.std(x)
    peaks, _   = signal.find_peaks(x,  prominence=sd)
    troughs, _ = signal.find_peaks(-x, prominence=sd)
    pts   = np.sort(np.concatenate((peaks, troughs, [0, x.size - 1])))
    diffs = np.abs(np.diff(x[pts]))
    return diffs.mean() if diffs.size else np.nan

# ----- main rolling summary ------------------------------------------------ #
def calculate_metrics(group):
    # ── add the 20 min‐ago glucose ─────────────────────────────────────────────
    group["glc_20_min_ago"] = group["glc"].shift(4)
    
    rolled = group["glc"].rolling("1h")      # no min_periods → every row gets output

    # how many CGM points were in that hour?
    group["samples_1h"] = rolled.count()

    # core summaries
    group["avg_glucose"] = rolled.mean()
    group["sd_glucose"]  = rolled.std()

    # time in range (mg/dL)
    group["time_below_70"]  = rolled.apply(lambda x: (x <  70).mean(), raw=False)
    group["time_70_180"]    = rolled.apply(lambda x: ((x >= 70) & (x <= 180)).mean(), raw=False)
    group["time_above_180"] = rolled.apply(lambda x: (x > 180).mean(),             raw=False)

    # glycaemic risk indices
    group["hbgi"] = rolled.apply(lambda x: 10 * np.mean(np.square(np.maximum(bgi_mgdl(x), 0))),
                                 raw=False)
    group["lbgi"] = rolled.apply(lambda x: 10 * np.mean(np.square(np.minimum(bgi_mgdl(x), 0))),
                                 raw=False)

    # MAGE (1-h window)
    group["mage"] = rolled.apply(mage_window, raw=True)

    return group


In [45]:
from tsfresh.feature_extraction import EfficientFCParameters

# The full default set is 700+ features – far too slow for every 5-min sample.
# EfficientFCParameters gives ~60 low-cost ones.
ts_cfg = EfficientFCParameters()

# If you only want a handful (e.g., autocorr lag-1, skewness, kurtosis) you can
# pass a dict like {"absolute_sum_of_changes": None, "autocorrelation": [{"lag": 1}]}


In [46]:
def build_windows_for_tsfresh(group):
    """
    group: one participant's CGM, indexed by time, columns ['glc', ...]
    returns: DataFrame with cols ['id', 'time', 'value'] ready for tsfresh
    """
    # Create a unique window id for each row = current timestamp
    idx = group.index                      # DateTimeIndex
    window_ids = np.arange(len(idx))       # or use idx.astype("int64")

    # Freeze the glucose slice for each row's PRECEDING hour.
    # We collect them in one list to avoid Python loops.
    series_list = []
    for wid, t_end in zip(window_ids, idx):
        win = group.loc[(t_end - pd.Timedelta("1h")): t_end, "glc"]
        if win.empty:
            continue
        tmp = pd.DataFrame(
            {"id": wid,
             "time": win.index.astype("int64"),   # int time is fine for tsfresh
             "value": win.values}
        )
        series_list.append(tmp)

    return pd.concat(series_list, ignore_index=True)


In [53]:
from tsfresh import extract_features

def tsfresh_features_for_participant(group):
    # build the long table
    long_df = build_windows_for_tsfresh(group)

    # run tsfresh ONCE
    feats = extract_features(
        long_df,
        column_id="id",
        column_sort="time",
        column_kind=None,
        column_value="value",
        default_fc_parameters=ts_cfg,
        n_jobs=0,                      # 0 → use all cores once, not per row
        disable_progressbar=True,
    )

    # feats index is window id; align back to the original timestamps
    feats.index = group.index[:len(feats)]   # same order we created ids

    # concatenate with the simple rolling metrics you already have
    out = pd.concat([group.reset_index(), feats.reset_index(drop=True)], axis=1)
    return out

In [54]:
metrics_df = (
    df.sort_values(["ID", "time"])
      .groupby("ID", group_keys=False)
      .apply(calculate_metrics)          # your pandas-rolling stats + mage
      .groupby("ID", group_keys=False)
      .apply(tsfresh_features_for_participant)
      .reset_index()                     # bring 'time' back if you wish
)

  df.sort_values(["ID", "time"])
  df.sort_values(["ID", "time"])


In [55]:
metrics_df

Unnamed: 0,index,time,ID,glc,device,glc_20_min_ago,samples_1h,avg_glucose,sd_glucose,time_below_70,...,value__fourier_entropy__bins_5,value__fourier_entropy__bins_10,value__fourier_entropy__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__query_similarity_count__query_None__threshold_0.0,value__mean_n_absolute_max__number_of_maxima_7
0,0,2015-05-22 11:14:00,aleppo_110,136.0,intervals_5mins,,1.0,136.000000,,0.0,...,,,,,,,,,,
1,1,2015-05-22 12:33:00,aleppo_110,157.0,intervals_5mins,,1.0,157.000000,,0.0,...,,,,,,,,,,
2,2,2015-05-22 12:34:00,aleppo_110,168.0,intervals_5mins,,2.0,162.500000,7.778175,0.0,...,-0.000000,-0.000000,-0.000000,,,,,,,
3,3,2015-05-22 12:36:00,aleppo_110,155.0,intervals_5mins,,3.0,160.000000,7.000000,0.0,...,0.693147,0.693147,0.693147,-0.000000,,,,,,
4,4,2015-05-22 12:41:00,aleppo_110,149.0,intervals_5mins,136.0,4.0,157.250000,7.932003,0.0,...,0.636514,0.636514,1.098612,0.693147,-0.000000,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2015-05-25 22:56:00,aleppo_110,252.0,intervals_5mins,246.0,11.0,251.363636,3.828126,0.0,...,1.277034,1.747868,1.945910,1.418484,2.043192,2.079442,1.945910,1.791759,,254.857143
996,996,2015-05-25 23:01:00,aleppo_110,253.0,intervals_5mins,248.0,11.0,250.909091,3.207945,0.0,...,1.277034,1.549826,1.945910,1.359237,1.831020,1.906155,1.945910,1.791759,,254.000000
997,997,2015-05-25 23:06:00,aleppo_110,254.0,intervals_5mins,249.0,11.0,251.090909,3.330302,0.0,...,1.153742,1.475076,1.747868,1.088900,1.581094,1.667462,1.747868,1.791759,,253.428571
998,998,2015-05-25 23:11:00,aleppo_110,255.0,intervals_5mins,250.0,12.0,251.416667,3.369875,0.0,...,0.955700,1.153742,1.747868,1.088900,1.303092,1.386294,1.475076,1.560710,,253.857143
