# Import libraries

In [8]:
## Import libraries for machine learning and data processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [9]:
import sys, importlib, site
print("sys.executable:", sys.executable)
print("sys.version:", sys.version)
print("sys.path (first 8):", sys.path[:8])
print("site.getsitepackages():", getattr(site, 'getsitepackages', lambda: None)())
print("USER site:", site.USER_SITE)
print("find tensorflow spec:", importlib.util.find_spec('tensorflow'))

sys.executable: /Users/adityaponnada/Documents/codework/real_time_prompting/real_time_prompting/tfpy/bin/python
sys.version: 3.11.14 (main, Oct  9 2025, 16:16:55) [Clang 17.0.0 (clang-1700.4.4.1)]
sys.path (first 8): ['/opt/homebrew/Cellar/python@3.11/3.11.14_1/Frameworks/Python.framework/Versions/3.11/lib/python311.zip', '/opt/homebrew/Cellar/python@3.11/3.11.14_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11', '/opt/homebrew/Cellar/python@3.11/3.11.14_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/lib-dynload', '', '/Users/adityaponnada/Documents/codework/real_time_prompting/real_time_prompting/tfpy/lib/python3.11/site-packages']
site.getsitepackages(): ['/Users/adityaponnada/Documents/codework/real_time_prompting/real_time_prompting/tfpy/lib/python3.11/site-packages']
USER site: /Users/adityaponnada/Library/Python/3.11/lib/python/site-packages
find tensorflow spec: ModuleSpec(name='tensorflow', loader=<_frozen_importlib_external.SourceFileLoader object at 0x10

# Read data

In [20]:
## import dataset
raw_feature_df_scaled = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/processed_features_rnn.csv')
## Display the first few rows of the dataset
raw_feature_df_scaled.head(5)

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,0.0,0,0.006074,1.0,60.0,981.983333,...,0,0,0,0,0,0,0,0,0,0
1,afflictedrevenueepilepsy@timestudy_com,1,0,,,0,0.005902,1.0,60.0,973.966667,...,0,0,0,0,0,0,0,0,0,0
2,afflictedrevenueepilepsy@timestudy_com,1,0,0.0,0.0,0,0.005426,1.0,60.0,965.933333,...,0,0,0,0,0,0,0,0,0,0
3,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,1.0,0,0.005985,1.0,60.0,947.966667,...,0,0,0,0,0,0,0,0,0,0
4,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,1.0,0,0.0064,1.0,60.0,936.966667,...,0,0,0,0,0,0,0,0,0,0


# Split training and test

In [88]:
def split_train_test_by_users_random(df, id_col='participant_id', n_train_users=10, random_state=None):
    """
    Randomly split a DataFrame into a train set containing all rows for a randomly
    selected set of `n_train_users` participants and a test set containing the
    remaining participants.

    Returns: (train_df, test_df) with indices reset.
    """
    import numpy as np
    import pandas as pd

    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame columns")

    unique_ids = pd.Index(df[id_col].dropna().unique())
    n_unique = len(unique_ids)
    if n_unique == 0:
        raise ValueError('No participant ids found in the DataFrame')
    if n_train_users <= 0 or n_train_users >= n_unique:
        raise ValueError(f'n_train_users must be >0 and < number of unique participants ({n_unique})')

    rng = np.random.default_rng(random_state)
    train_ids = rng.choice(unique_ids, size=n_train_users, replace=False)

    train_df = df[df[id_col].isin(train_ids)].reset_index(drop=True)
    test_df = df[~df[id_col].isin(train_ids)].reset_index(drop=True)

    return train_df, test_df

# Example usage:
train_df, test_df = split_train_test_by_users_random(raw_feature_df_scaled, n_train_users=10, random_state=42)

In [89]:
train_df.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,bartenderradiatorapplied@timestudy_com,0,1,,,1,0.021014,,,828.966667,...,0,0,0,0,0,0,0,0,0,0
1,bartenderradiatorapplied@timestudy_com,0,1,,,1,0.019771,,,816.966667,...,0,0,0,0,0,0,0,0,0,0
2,bartenderradiatorapplied@timestudy_com,0,1,,,1,0.021419,,,808.983333,...,0,0,0,0,0,0,0,0,0,0
3,bartenderradiatorapplied@timestudy_com,0,1,,,1,0.021211,,,800.983333,...,0,0,0,0,0,0,0,0,0,0
4,bartenderradiatorapplied@timestudy_com,1,1,,,0,0.023365,,,773.966667,...,0,0,0,0,0,0,0,0,0,0


# Missing data imputation

For features like location, we will use median imputation. For other features we will forward the last known data (using Linear Interpolation)

In [90]:
# Print the percentage of missing values for each column in train_df
missing_pct_train = train_df.isnull().mean() * 100
print("% Missing values per column in train_df:")
missing_pct_train.sort_values(ascending=False)

% Missing values per column in train_df:


in_battery_saver_mode    52.718143
charging_status          52.718143
dist_from_home           12.220473
is_phone_locked           5.727781
last_phone_usage          5.727781
                           ...    
wake_day_part_12.0        0.000000
wake_day_part_11.0        0.000000
wake_day_part_10.0        0.000000
wake_day_part_9.0         0.000000
mi_wake_day_part_33.0     0.000000
Length: 122, dtype: float64

In [91]:
# Print column dtypes using helper if available, else fallback to direct printing
if 'train_df' not in globals():
    print('train_df not found; run the split cell to create it first.')
else:
    if 'print_column_dtypes' in globals():
        print_column_dtypes(train_df, show_counts=True)
    else:
        print('print_column_dtypes not defined — printing dtypes and non-null counts directly')
        print(train_df.dtypes)

print_column_dtypes not defined — printing dtypes and non-null counts directly
participant_id            object
outcome                    int64
is_weekend                 int64
in_battery_saver_mode    float64
charging_status          float64
                          ...   
mi_wake_day_part_29.0      int64
mi_wake_day_part_30.0      int64
mi_wake_day_part_31.0      int64
mi_wake_day_part_32.0      int64
mi_wake_day_part_33.0      int64
Length: 122, dtype: object


### Fill forward + hybrid imputation for training data

In [92]:
def impute_group_median_then_ffill(df, id_col='participant_id', outcome_col='outcome', mi_prefix='mi_', inplace=False, verbose=False):
    """Impute missing values per participant using group medians and forward-fill."""
    # Steps:
    # 1) Group by participant id.
    # 2) Ignore columns: id_col, outcome_col, and any column starting with mi_prefix.
    # 3) For remaining numeric columns: compute the group's median. If the first value in the
    #    group for that column is NaN, replace it with the group's median (fallback to global median if needed).
    # 4) For remaining NaNs in the group, use forward-fill (LOCF).
    # 5) After group-level processing, compute global medians for the processed columns and return them.
    import pandas as pd
    import numpy as np
    if df is None:
        raise ValueError('df must be a pandas DataFrame')
    if not inplace:
        df = df.copy()
    # Select columns to process (exclude id/outcome/mi_*)
    exclude = {id_col, outcome_col}
    cols_to_process = [c for c in df.columns if c not in exclude and not str(c).startswith(mi_prefix)]
    if verbose:
        print(f'Processing {len(cols_to_process)} columns (excluding {exclude} and prefix)')
    # Work only on numeric columns for median-based imputation; others we will still forward-fill if needed
    numeric_cols = df[cols_to_process].select_dtypes(include=[np.number]).columns.tolist()
    # Compute global medians for numeric columns
    global_medians = df[numeric_cols].median() if numeric_cols else pd.Series(dtype=float)
    # Group by participant and impute per-group
    if id_col in df.columns and numeric_cols:
        grouped = df.groupby(id_col, sort=False)
        for pid, idx in grouped.groups.items():
            for col in numeric_cols:
                s = df.loc[idx, col]
                # group median (may be NaN if group has no non-NaN values)
                try:
                    gm = grouped[col].median().get(pid, np.nan) if hasattr(grouped[col], 'median') else np.nan
                except Exception:
                    gm = np.nan
                if pd.isna(gm):
                    # fallback to global median if group median not available
                    gm = global_medians.get(col, np.nan)
                # If first value is NaN, set it to group median (or global median fallback)
                if not s.empty and pd.isna(s.iloc[0]):
                    if not pd.isna(gm):
                        df.loc[idx[0], col] = gm
                # Forward-fill within the group for remaining NaNs
                # Use transform-style assignment: compute filled series and write back only where original was NaN
                s_after_first = df.loc[idx, col]
                s_filled = s_after_first.fillna(method='ffill')
                mask = s_after_first.isna()
                if mask.any():
                    df.loc[idx, col] = s_filled
    else:
        # If id_col not present or no numeric columns, fall back to global strategies
        for col in numeric_cols:
            # if first value is NaN, replace with global median
            if df[col].isna().iloc[0]:
                gm = global_medians.get(col, np.nan)
                if not pd.isna(gm):
                    df.iloc[0, df.columns.get_loc(col)] = gm
            # forward-fill the rest
            df[col] = df[col].fillna(method='ffill')
    # After group-level imputation, any remaining NaNs in numeric_cols -> fill with global medians
    for col in numeric_cols:
        if df[col].isna().any():
            gm = global_medians.get(col, np.nan)
            if not pd.isna(gm):
                df[col] = df[col].fillna(gm)
    # Build medians dataframe to return (global medians for processed numeric columns)
    if not numeric_cols:
        medians_df = pd.DataFrame()
    else:
        medians_df = pd.DataFrame(global_medians).T.rename(index={0: 'global_median'})
    return df, medians_df

# Example usage: apply the imputer to train_df if available
if 'train_df' in globals():
    train_df, medians = impute_group_median_then_ffill(train_df, verbose=True)
    print('Returned medians (preview):')
    print(medians.head())
else:
    print('train_df not found; run the split cell to create it before imputing.')

Processing 60 columns (excluding {'participant_id', 'outcome'} and prefix)


  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')


Returned medians (preview):
               is_weekend  in_battery_saver_mode  charging_status  screen_on  \
global_median         0.0                    0.0              0.0        0.0   

               dist_from_home  is_phone_locked  last_phone_usage  \
global_median        0.015463              1.0               9.1   

               closeness_to_sleep_time  closeness_to_wake_time  mims_5min  \
global_median               464.916667              467.066667  30.571034   

               ...  wake_day_part_24.0  wake_day_part_25.0  \
global_median  ...                 0.0                 0.0   

               wake_day_part_26.0  wake_day_part_27.0  wake_day_part_28.0  \
global_median                 0.0                 0.0                 0.0   

               wake_day_part_29.0  wake_day_part_30.0  wake_day_part_31.0  \
global_median                 0.0                 0.0                 0.0   

               wake_day_part_32.0  wake_day_part_33.0  
global_median               

  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')
  s_filled = s_after_first.fillna(method='ffill')


In [93]:
train_df.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,0.021014,1.0,9.1,828.966667,...,0,0,0,0,0,0,0,0,0,0
1,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,0.019771,1.0,9.1,816.966667,...,0,0,0,0,0,0,0,0,0,0
2,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,0.021419,1.0,9.1,808.983333,...,0,0,0,0,0,0,0,0,0,0
3,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,0.021211,1.0,9.1,800.983333,...,0,0,0,0,0,0,0,0,0,0
4,bartenderradiatorapplied@timestudy_com,1,1,0.0,0.0,0,0.023365,1.0,9.1,773.966667,...,0,0,0,0,0,0,0,0,0,0


In [94]:
medians

Unnamed: 0,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,...,wake_day_part_24.0,wake_day_part_25.0,wake_day_part_26.0,wake_day_part_27.0,wake_day_part_28.0,wake_day_part_29.0,wake_day_part_30.0,wake_day_part_31.0,wake_day_part_32.0,wake_day_part_33.0
global_median,0.0,0.0,0.0,0.0,0.015463,1.0,9.1,464.916667,467.066667,30.571034,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Impute test data

In [95]:
# ---------------------------------------------------------------------------
# Impute test data using provided medians + forward-fill per participant
# ---------------------------------------------------------------------------
def impute_test_with_medians_and_ffill(df, medians_df, id_col='participant_id', outcome_col='outcome', mi_prefix='mi_', inplace=False, verbose=False):
    """Impute missing values in a test DataFrame using a medians DataFrame for first-imputation

    For numeric columns present in `medians_df`, any NaN observations in `df` will be
    filled with the corresponding median value from `medians_df`. For remaining NaNs
    (and for non-numeric columns), imputation within each participant is done with
    forward-fill (LOCF). The function groups by `id_col` and performs group-wise
    forward-fill so sequence continuity is preserved per participant.

    Parameters
    - df: pandas.DataFrame (test set)
    - medians_df: pandas.DataFrame with column names matching df columns and at least
      one row containing medians (e.g., the `medians` returned by the train imputer).
    - id_col: column name for participant id (default 'participant_id')
    - outcome_col: column name to skip (default 'outcome')
    - mi_prefix: prefix for missingness indicator columns to skip (default 'mi_')
    - inplace: if False (default) operate on a copy and return it
    - verbose: print progress when True

    Returns the imputed DataFrame (same shape as input).
    """
    import pandas as pd
    import numpy as np

    if df is None:
        raise ValueError('df must be a pandas DataFrame')
    if medians_df is None or medians_df.empty:
        raise ValueError('medians_df must be a non-empty DataFrame')

    if not inplace:
        df = df.copy()

    # Columns to exclude from imputation
    exclude = {id_col, outcome_col}
    cols_to_process = [c for c in df.columns if c not in exclude and not str(c).startswith(mi_prefix)]
    if verbose:
        print(f'Imputing {len(cols_to_process)} columns (excluding {exclude} and prefix "{mi_prefix}")')

    # Determine medians mapping: flatten medians_df to a single row mapping if needed
    # Prefer a column-wise lookup: medians_df may have index like 'global_median'
    if medians_df.shape[0] == 1:
        med_map = medians_df.iloc[0].to_dict()
    else:
        # If multiple rows exist, try to find a row named 'global_median', else take first row
        if 'global_median' in medians_df.index:
            med_map = medians_df.loc['global_median'].to_dict()
        else:
            med_map = medians_df.iloc[0].to_dict()

    # First pass: where df[col] is NaN and med_map has a median, fill with that median
    for col in cols_to_process:
        if col in med_map and pd.notna(med_map.get(col)):
            try:
                # only operate on positions that are NaN
                mask = df[col].isna()
                if mask.any():
                    df.loc[mask, col] = med_map.get(col)
                    if verbose:
                        print(f'Filled {mask.sum()} NaNs in column "{col}" with median {med_map.get(col)}')
            except KeyError:
                # column not present (shouldn't happen since cols_to_process derived from df)
                continue

    # Second pass: group-wise forward-fill for remaining NaNs (per participant)
    if id_col in df.columns:
        grouped = df.groupby(id_col, sort=False)
        for col in cols_to_process:
            # Only proceed if there are NaNs remaining in the column
            if not df[col].isna().any():
                continue
            try:
                # Compute a forward-filled series aligned to the original index per group
                filled = grouped[col].transform(lambda s: s.fillna(method='ffill'))
            except Exception:
                # fallback to a global forward-fill if group transform fails
                filled = df[col].fillna(method='ffill')

            # Only write back values where original was NaN (to avoid overwriting valid data)
            mask = df[col].isna()
            if mask.any():
                df.loc[mask, col] = filled[mask]
                if verbose:
                    print(f'After group-ffill, filled {mask.sum()} remaining NaNs in column "{col}"')
    else:
        # No participant id column: simple forward-fill across the whole df
        for col in cols_to_process:
            if df[col].isna().any():
                before = df[col].isna().sum()
                df[col] = df[col].fillna(method='ffill')
                after = df[col].isna().sum()
                if verbose:
                    print(f'Global ffill {col}: {before-after} values filled')

    # Final pass: if any NaNs remain in columns, fill with med_map fallback
    for col in cols_to_process:
        if df[col].isna().any() and col in med_map and pd.notna(med_map.get(col)):
            before = df[col].isna().sum()
            df[col] = df[col].fillna(med_map.get(col))
            after = df[col].isna().sum()
            if verbose:
                print(f'Filled {before-after} remaining NaNs in "{col}" with median fallback')

    return df

# Example usage: apply to test_df if medians are available
if 'test_df' in globals() and 'medians' in globals():
    test_df = impute_test_with_medians_and_ffill(test_df, medians, verbose=True)
    print('Test set imputation complete. Preview:')
    print(test_df.head())
else:
    print('test_df or medians not available; run previous cells first to create them.')


Imputing 60 columns (excluding {'participant_id', 'outcome'} and prefix "mi_")
Filled 518170 NaNs in column "in_battery_saver_mode" with median 0.0
Filled 518170 NaNs in column "charging_status" with median 0.0
Filled 161838 NaNs in column "dist_from_home" with median 0.0154626683537372
Filled 132247 NaNs in column "is_phone_locked" with median 1.0
Filled 134667 NaNs in column "last_phone_usage" with median 9.1
Filled 679 NaNs in column "closeness_to_sleep_time" with median 464.9166666666667
Filled 679 NaNs in column "closeness_to_wake_time" with median 467.06666666666666
Filled 50460 NaNs in column "mims_5min" with median 30.571034085945772
Test set imputation complete. Preview:
                           participant_id  outcome  is_weekend  \
0  afflictedrevenueepilepsy@timestudy_com        0           0   
1  afflictedrevenueepilepsy@timestudy_com        1           0   
2  afflictedrevenueepilepsy@timestudy_com        1           0   
3  afflictedrevenueepilepsy@timestudy_com      

In [96]:
test_df.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,0.0,0,0.006074,1.0,60.0,981.983333,...,0,0,0,0,0,0,0,0,0,0
1,afflictedrevenueepilepsy@timestudy_com,1,0,0.0,0.0,0,0.005902,1.0,60.0,973.966667,...,0,0,0,0,0,0,0,0,0,0
2,afflictedrevenueepilepsy@timestudy_com,1,0,0.0,0.0,0,0.005426,1.0,60.0,965.933333,...,0,0,0,0,0,0,0,0,0,0
3,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,1.0,0,0.005985,1.0,60.0,947.966667,...,0,0,0,0,0,0,0,0,0,0
4,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,1.0,0,0.0064,1.0,60.0,936.966667,...,0,0,0,0,0,0,0,0,0,0


In [97]:
medians.head()

Unnamed: 0,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,...,wake_day_part_24.0,wake_day_part_25.0,wake_day_part_26.0,wake_day_part_27.0,wake_day_part_28.0,wake_day_part_29.0,wake_day_part_30.0,wake_day_part_31.0,wake_day_part_32.0,wake_day_part_33.0
global_median,0.0,0.0,0.0,0.0,0.015463,1.0,9.1,464.916667,467.066667,30.571034,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
medians.columns

Index(['is_weekend', 'in_battery_saver_mode', 'charging_status', 'screen_on',
       'dist_from_home', 'is_phone_locked', 'last_phone_usage',
       'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min',
       'days_in_study', 'completion_24h', 'completion_1h',
       'time_between_prompts', 'time_since_last_answered',
       'completion_since_wake', 'completion_since_start',
       'time_of_day_Afternoon', 'time_of_day_Early Morning',
       'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning',
       'time_of_day_Night', 'location_category_Home',
       'location_category_Other', 'location_category_School',
       'location_category_Transit', 'location_category_Work',
       'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0',
       'wake_day_part_3.0', 'wake_day_part_4.0', 'wake_day_part_5.0',
       'wake_day_part_6.0', 'wake_day_part_7.0', 'wake_day_part_8.0',
       'wake_day_part_9.0', 'wake_day_part_10.0', 'wake_day_part_11.0',
       'w

In [99]:
from pathlib import Path
import pandas as pd
out_path = Path('/Users/adityaponnada/Downloads/time_study_data/general_rnn_medians.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
if 'medians' in globals() and isinstance(medians, pd.DataFrame) and not medians.empty:
    medians.to_csv(out_path, index=True)
    print(f'Wrote medians to {out_path}')
else:
    print('medians DataFrame not found or empty; nothing written.')

Wrote medians to /Users/adityaponnada/Downloads/time_study_data/general_rnn_medians.csv


## Scale features for training data

In [100]:
# ---------------------------------------------------------------------------
# Z-normalization helper (mean=0, std=1) for selected columns
# ---------------------------------------------------------------------------

def z_normalize_columns(df, cols_to_scale, id_col='participant_id', inplace=False, ddof=0, verbose=False):
    """Z-normalize specified columns per-participant (grouped by `id_col`).

    For each participant, subtract the participant mean and divide by the participant std.
    Groups with zero or undefined std use 1.0 to avoid division by zero.

    Returns:
      - df_out: DataFrame with the specified columns z-normalized
      - means_df: single-row DataFrame containing global means (not grouped by participant)
    """
    import pandas as pd
    import numpy as np

    if df is None:
        raise ValueError('df must be a pandas DataFrame')
    if not isinstance(cols_to_scale, (list, tuple)):
        raise ValueError('cols_to_scale must be a list or tuple of column names')
    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame")

    if not inplace:
        df = df.copy()

    # Ensure requested columns exist
    cols = [c for c in cols_to_scale if c in df.columns]
    missing = [c for c in cols_to_scale if c not in df.columns]
    if missing and verbose:
        print(f'Warning: the following requested columns were not found and will be skipped: {missing}')

    if not cols:
        return df, pd.DataFrame()

    # Coerce scaling columns to numeric where possible (non-convertible become NaN)
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    # Compute global means (single-row) BEFORE normalization so they represent original training means
    global_means_series = df[cols].mean()
    means_df = pd.DataFrame(global_means_series).T
    means_df.index = ['global_mean']
    means_df.index.name = None  # keep a simple single-row DF; caller code accepts single-row DF

    # Group by participant
    grouped = df.groupby(id_col, sort=False)

    # Per-row group means and stds (aligned to df index)
    group_means_per_row = grouped[cols].transform('mean')
    group_stds_per_row = grouped[cols].transform(lambda s: s.std(ddof=ddof))

    # Replace zero or NaN std with 1.0 to avoid division by zero
    group_stds_per_row = group_stds_per_row.fillna(0.0).replace({0.0: 1.0})

    # Perform z-normalization per participant
    df.loc[:, cols] = (df[cols] - group_means_per_row) / group_stds_per_row

    if verbose:
        print('Z-normalized columns (per participant):', cols)
        print('Returned global means shape:', means_df.shape)

    return df, means_df

# Example usage: define the columns to scale and run on train_df if present
cols_to_scale = ['dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time',
                 'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts',
                 'time_since_last_answered', 'completion_since_wake', 'completion_since_start']

if 'train_df' in globals():
    train_df, global_means = z_normalize_columns(train_df, cols_to_scale, inplace=False, verbose=True)
    print('Applied z-normalization to train_df; preview of means:')
    print(global_means)
else:
    print('train_df not available; run split cell first to create train_df.')


Z-normalized columns (per participant): ['dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start']
Returned global means shape: (1, 11)
Applied z-normalization to train_df; preview of means:
             dist_from_home  last_phone_usage  closeness_to_sleep_time  \
global_mean       26.758968         20.221428               450.684061   

             closeness_to_wake_time  mims_5min  completion_24h  completion_1h  \
global_mean              499.838291  46.112531        0.757542       0.738929   

             time_between_prompts  time_since_last_answered  \
global_mean              47.15425                145.570644   

             completion_since_wake  completion_since_start  
global_mean               0.692321                0.710762  


In [101]:
train_df.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,-0.178787,1.0,0.0,1.069253,...,0,0,0,0,0,0,0,0,0,0
1,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,-0.178795,1.0,0.0,1.038789,...,0,0,0,0,0,0,0,0,0,0
2,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,-0.178784,1.0,0.0,1.018522,...,0,0,0,0,0,0,0,0,0,0
3,bartenderradiatorapplied@timestudy_com,0,1,0.0,0.0,1,-0.178786,1.0,0.0,0.998212,...,0,0,0,0,0,0,0,0,0,0
4,bartenderradiatorapplied@timestudy_com,1,1,0.0,0.0,0,-0.178771,1.0,0.0,0.929624,...,0,0,0,0,0,0,0,0,0,0


In [102]:
global_means

Unnamed: 0,dist_from_home,last_phone_usage,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
global_mean,26.758968,20.221428,450.684061,499.838291,46.112531,0.757542,0.738929,47.15425,145.570644,0.692321,0.710762


In [105]:
from pathlib import Path
import pandas as pd

# Write global_means (from training z-normalization) to CSV for later reuse
out_path = Path('/Users/adityaponnada/Downloads/time_study_data/global_means_general_rnn.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)

if 'global_means' in globals():
    gm = global_means
    # Accept Series or DataFrame; convert Series -> single-row DataFrame for consistent saving
    if isinstance(gm, pd.Series):
        gm_df = gm.to_frame().T
    elif isinstance(gm, pd.DataFrame):
        gm_df = gm.copy()
    else:
        print('global_means exists but is not a pandas Series/DataFrame; not written.')
        gm_df = None

    if gm_df is not None and not gm_df.empty:
        gm_df.to_csv(out_path, index=True)
        print(f'Wrote global_means to {out_path}')
    else:
        print('global_means found but empty; nothing written.')
else:
    print('global_means not found in the notebook namespace; run the training scaling cell first.')


Wrote global_means to /Users/adityaponnada/Downloads/time_study_data/global_means_general_rnn.csv


## Scale features for test data

In [103]:
# ---------------------------------------------------------------------------
# Z-normalize test data using global means (train) and per-participant std
# Only scale a fixed set of allowed columns to avoid accidental scaling elsewhere
# ---------------------------------------------------------------------------

def z_normalize_test_using_global_mean(df, global_means_df, cols_to_scale=None, id_col='participant_id', ddof=0, inplace=False, verbose=False):
    """Z-normalize selected columns in `df` for test data.

    Behavior:
    - Centers each column using the mean provided in `global_means_df` (train global means).
    - Scales by the per-participant standard deviation computed on `df` (fallbacks used when needed).
    - Grouped by `id_col` so scaling preserves per-participant time-series structure.
    - Only a predetermined set of allowed columns will be scaled regardless of input.

    Returns the scaled DataFrame.
    """
    import pandas as pd
    import numpy as np

    if df is None:
        raise ValueError('df must be a pandas DataFrame')
    if global_means_df is None or global_means_df.empty:
        raise ValueError('global_means_df must be a non-empty DataFrame containing training means')
    if not inplace:
        df = df.copy()

    # Strict allowed columns (per your request)
    allowed_cols = ['dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time',
                    'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts',
                    'time_since_last_answered', 'completion_since_wake', 'completion_since_start']

    # Resolve global means mapping (accept single-row DataFrame or index 'global_mean')
    if global_means_df.shape[0] == 1:
        gm_map = global_means_df.iloc[0].to_dict()
    else:
        if 'global_mean' in global_means_df.index:
            gm_map = global_means_df.loc['global_mean'].to_dict()
        else:
            gm_map = global_means_df.iloc[0].to_dict()

    # Determine which columns to operate on: intersect provided list (if any) with allowed_cols
    if cols_to_scale is None:
        requested = allowed_cols
    else:
        requested = list(cols_to_scale)

    cols = [c for c in requested if c in df.columns and c in allowed_cols]
    missing = [c for c in allowed_cols if c not in df.columns]
    if missing and verbose:
        print(f'Allowed columns not present in df and skipped: {missing}')
    if not cols:
        if verbose:
            print('No allowed columns found in DataFrame; returning original df')
        return df

    # Coerce to numeric where appropriate
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    # Compute per-participant stds for the selected columns
    if id_col in df.columns:
        part_std = df.groupby(id_col)[cols].std(ddof=ddof)
    else:
        part_std = pd.DataFrame()

    # Overall fallback stds computed on df
    overall_std = df[cols].std(ddof=ddof)

    # Apply scaling per participant
    if id_col in df.columns:
        grouped = df.groupby(id_col, sort=False)
        for pid, idx in grouped.groups.items():
            # for this participant, get stds row (may be missing or NaN)
            if pid in part_std.index:
                stds_row = part_std.loc[pid]
            else:
                stds_row = pd.Series({c: np.nan for c in cols})

            for col in cols:
                gm = gm_map.get(col, np.nan)
                if pd.isna(gm):
                    # if no global mean available, fallback to column mean from df
                    gm = df[col].mean()
                    if verbose:
                        print(f'Global mean for {col} not found; using test-set mean {gm:.4f} as fallback')

                std_val = stds_row.get(col, np.nan)
                if pd.isna(std_val) or std_val == 0:
                    std_val = overall_std.get(col, np.nan)
                if pd.isna(std_val) or std_val == 0:
                    std_val = 1.0

                # Apply z-normalization for this participant and column
                try:
                    df.loc[idx, col] = (df.loc[idx, col] - gm) / std_val
                except Exception:
                    # fallback: vectorized operation ensures alignment by index
                    col_vals = df.loc[idx, col].to_numpy(dtype=float)
                    df.loc[idx, col] = (col_vals - gm) / std_val
    else:
        # No participant id: apply global centering with gm_map and overall std
        for col in cols:
            gm = gm_map.get(col, np.nan)
            if pd.isna(gm):
                gm = df[col].mean()
            std_val = overall_std.get(col, np.nan)
            if pd.isna(std_val) or std_val == 0:
                std_val = 1.0
            df[col] = (df[col] - gm) / std_val

    if verbose:
        print('Completed z-normalization of test data for columns:', cols)

    return df

# Example usage (if global_means and test_df exist):
if 'test_df' in globals() and 'global_means' in globals():
    test_df = z_normalize_test_using_global_mean(test_df, global_means, cols_to_scale=None, verbose=True)
    print('Scaled test_df preview:')
    print(test_df[[c for c in ['distance_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start'] if c in test_df.columns]].head())
else:
    print('test_df or global_means not found; run previous cells to produce them before scaling test data.')


Completed z-normalization of test data for columns: ['dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start']
Scaled test_df preview:
   last_phone_usage  closeness_to_sleep_time  closeness_to_wake_time  \
0          1.864088                 2.017567               -1.865121   
1          1.864088                 1.987124               -1.834719   
2          1.864088                 1.956618               -1.804255   
3          1.864088                 1.888391               -1.736120   
4          1.864088                 1.846619               -1.694405   

   mims_5min  completion_24h  completion_1h  time_between_prompts  \
0  -0.706761      -13.933019      -4.140509             -0.174017   
1  -0.373796      -13.933019      -4.140509             -0.144432   
2  -0.803382       -4.736813      -1.3388

In [104]:
test_df.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,0.0,0,-1.791324,1.0,1.864088,2.017567,...,0,0,0,0,0,0,0,0,0,0
1,afflictedrevenueepilepsy@timestudy_com,1,0,0.0,0.0,0,-1.791336,1.0,1.864088,1.987124,...,0,0,0,0,0,0,0,0,0,0
2,afflictedrevenueepilepsy@timestudy_com,1,0,0.0,0.0,0,-1.791367,1.0,1.864088,1.956618,...,0,0,0,0,0,0,0,0,0,0
3,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,1.0,0,-1.79133,1.0,1.864088,1.888391,...,0,0,0,0,0,0,0,0,0,0
4,afflictedrevenueepilepsy@timestudy_com,0,0,0.0,1.0,0,-1.791302,1.0,1.864088,1.846619,...,0,0,0,0,0,0,0,0,0,0


## RNN Training

In [107]:
print(train_df.shape[0])
print(train_df.shape[1])

93998
122


In [108]:
## Number of observations per participant
# Compute and print the number of observations per participant_id in train_df
try:
    counts = train_df['participant_id'].value_counts().sort_index()
    print('Number of observations per participant_id:')
    print(counts.to_string())
    # store the maximum count in obs_len
    obs_len = int(counts.max()) if not counts.empty else 0
    print(f'obs_len (max observations per participant) = {obs_len}')
except NameError:
    print('train_df is not defined. Run the split to create train_df first.')
    obs_len = None
except Exception as e:
    print('Error computing observation counts:', e)
    obs_len = None


Number of observations per participant_id:
participant_id
bartenderradiatorapplied@timestudy_com      4692
brinkaminounframed@timestudy_com           11267
defilinganywayimmovable@timestudy_com       8060
headwearskirmishantidote@timestudy_com     15798
pettytransfixedsolubly@timestudy_com        4751
remoldexcludingaffair@timestudy_com         6426
retrialgraftedsturdy@timestudy_com          3707
superiorpassablecosmic@timestudy_com       11674
urchinvariablytrend@timestudy_com          15868
washboardceramicsenticing@timestudy_com    11755
obs_len (max observations per participant) = 15868


In [109]:
num_users = train_df['participant_id'].nunique()
print(num_users)

10


In [110]:
print(train_df.columns)


Index(['participant_id', 'outcome', 'is_weekend', 'in_battery_saver_mode',
       'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked',
       'last_phone_usage', 'closeness_to_sleep_time',
       ...
       'mi_wake_day_part_24.0', 'mi_wake_day_part_25.0',
       'mi_wake_day_part_26.0', 'mi_wake_day_part_27.0',
       'mi_wake_day_part_28.0', 'mi_wake_day_part_29.0',
       'mi_wake_day_part_30.0', 'mi_wake_day_part_31.0',
       'mi_wake_day_part_32.0', 'mi_wake_day_part_33.0'],
      dtype='object', length=122)


In [111]:
## Print the shape of train_df and count feature columns excluding id/outcome
try:
    n_cols_total = train_df.shape[1]
    print('Total columns in train_df:', n_cols_total)
    # define which columns to exclude from feature count
    exclude_cols = ['participant_id', 'outcome']
    feature_cols = [c for c in train_df.columns if c not in exclude_cols]
    n_feature_cols = len(feature_cols)
    print(f'Number of columns excluding {exclude_cols}: {n_feature_cols}')
except NameError:
    print('train_df is not defined. Run the split to create train_df first.')
    n_feature_cols = None
except Exception as e:
    print('Error computing column counts:', e)
    n_feature_cols = None


Total columns in train_df: 122
Number of columns excluding ['participant_id', 'outcome']: 120


### Prepare 3D shapes for TF

In [113]:
# Utilities to build grouped sequences and pad them (reusable to avoid repeating logic)
def infer_feature_cols(df, id_col='participant_id', outcome_col='outcome'):
    """Return list of feature columns by excluding id/outcome."""
    exclude = {id_col, outcome_col}
    return [c for c in df.columns if c not in exclude]

def build_grouped_sequences(df, feature_cols=None, id_col='participant_id', sort_col=None):
    """
    Build a list of 2D arrays (timesteps x features) grouped by id_col.
    Returns (participant_ids, sequences).
    Note: uses df[feature_cols].to_numpy() so no extra imports required here.
    """
    if feature_cols is None:
        feature_cols = infer_feature_cols(df, id_col=id_col)
    if sort_col is not None and sort_col in df.columns:
        grouped = df.sort_values(sort_col).groupby(id_col, sort=True)
    else:
        grouped = df.groupby(id_col, sort=True)

    pids = []
    seqs = []
    for pid, group in grouped:
        pids.append(pid)
        # to_numpy yields ndarray; dtype casting to float32 can be done at pad time if needed
        seqs.append(group[feature_cols].to_numpy())
    return pids, seqs

def pad_seqs_list(seqs, maxlen, pad_value=-999.0):
    """
    Pad a list of ragged 2D arrays into a 3D array (n_items, maxlen, n_features)
    and return (padded_array, mask) where mask is boolean (True = valid timestep).
    This function expects `pad_sequences` and `np` to be available in the caller's scope
    (they can be imported later in the same cell).
    """
    # If there are no sequences, return empty structures
    if not seqs:
        return None, None

    # keras pad_sequences expects a list of 2D arrays; ensure dtype=float32
    seqs_float32 = [s.astype('float32') for s in seqs]
    padded = pad_sequences(seqs_float32, maxlen=int(maxlen), dtype='float32',
                           padding='post', truncating='post', value=pad_value)
    mask = np.any(padded != pad_value, axis=-1)
    return padded, mask

def build_and_pad_from_dfs(train_df, test_df=None, obs_len=None, pad_value=-999.0,
                           id_col='participant_id', outcome_col='outcome', sort_col=None):
    """
    Convenience wrapper: build grouped sequences for train/test, decide MAX_LENGTH
    (prefer obs_len when provided), pad sequences and return a dict with results.
    Returned keys: feature_cols, train_pids, test_pids, train_seqs, test_seqs,
                   X_train_padded, X_test_padded, train_mask, test_mask, MAX_LENGTH
    """
    feature_cols = infer_feature_cols(train_df, id_col=id_col, outcome_col=outcome_col)
    train_pids, train_seqs = build_grouped_sequences(train_df, feature_cols, id_col, sort_col)
    test_pids, test_seqs = ([], []) if test_df is None else build_grouped_sequences(test_df, feature_cols, id_col, sort_col)

    # Determine MAX_LENGTH
    if obs_len is not None and int(obs_len) > 0:
        MAX_LENGTH = int(obs_len)
    else:
        all_lengths = [len(s) for s in train_seqs] + [len(s) for s in test_seqs]
        if not all_lengths:
            raise RuntimeError('No sequences found to infer MAX_LENGTH')
        MAX_LENGTH = int(max(all_lengths))

    X_train_padded, train_mask = pad_seqs_list(train_seqs, MAX_LENGTH, pad_value)
    X_test_padded, test_mask = (None, None)
    if test_seqs:
        X_test_padded, test_mask = pad_seqs_list(test_seqs, MAX_LENGTH, pad_value)

    return {
        'feature_cols': feature_cols,
        'train_pids': train_pids,
        'test_pids': test_pids,
        'train_seqs': train_seqs,
        'test_seqs': test_seqs,
        'X_train_padded': X_train_padded,
        'X_test_padded': X_test_padded,
        'train_mask': train_mask,
        'test_mask': test_mask,
        'MAX_LENGTH': MAX_LENGTH
    }
# Force a fixed MAX_LENGTH for all participants (270 days * 24 hours * 4 timesteps per hour)
obs_len = 270 * 24 * 4
print(f'Setting obs_len (fixed MAX_LENGTH) = {obs_len}')
# Assumes `train_df` and optionally `test_df` exist and are already scaled.
# We group rows by `participant_id`, collect feature columns (exclude id/outcome),
# optionally sort by a timestamp column if present, then pad to obs_len (or inferred).

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

PADDING_VALUE = -999.0
ID_COL = 'participant_id'
OUTCOME_COL = 'outcome'

# Verify train_df exists
if 'train_df' not in globals() or train_df is None:
    raise RuntimeError('train_df not found. Run the split/preprocessing cells before padding.')

# Choose feature columns: all except ID_COL and OUTCOME_COL
exclude_cols = {ID_COL, OUTCOME_COL}
feature_cols = [c for c in train_df.columns if c not in exclude_cols]
if not feature_cols:
    raise RuntimeError(f'No feature columns found after excluding {exclude_cols}')

# If there's a timestamp-like column, use it to sort within each participant
timestamp_candidates = ['timestamp','time','start_time','started_at','created_at','event_time']
sort_col = next((c for c in timestamp_candidates if c in train_df.columns), None)

def _build_sequences_from_df(df):
    """Return a list of 2D numpy arrays (timesteps x features), one per participant in df."""
    seqs = []
    # group preserves order only if df is already ordered; we'll sort if a sort_col exists
    if sort_col is not None:
        grouped = df.sort_values(sort_col).groupby(ID_COL, sort=True)
    else:
        grouped = df.groupby(ID_COL, sort=True)

    for pid, group in grouped:
        # extract feature columns as a 2D array
        arr = group[feature_cols].to_numpy(dtype=np.float32)
        seqs.append(arr)
    return seqs

# Build train sequences
X_train_seqs = _build_sequences_from_df(train_df)
print(f'Built {len(X_train_seqs)} training sequences. Example lengths (first 5):', [len(s) for s in X_train_seqs[:5]])

# Build test sequences if test_df exists
X_test_seqs = None
if 'test_df' in globals() and test_df is not None:
    X_test_seqs = _build_sequences_from_df(test_df)
    print(f'Built {len(X_test_seqs)} test sequences. Example lengths (first 5):', [len(s) for s in X_test_seqs[:5]])

# Determine MAX_LENGTH: prefer obs_len, else infer from sequences
if 'obs_len' in globals() and obs_len is not None and int(obs_len) > 0:
    MAX_LENGTH = int(obs_len)
else:
    all_lengths = [len(s) for s in X_train_seqs]
    if X_test_seqs is not None:
        all_lengths += [len(s) for s in X_test_seqs]
    if not all_lengths:
        raise RuntimeError('No sequences found to infer MAX_LENGTH')
    MAX_LENGTH = int(max(all_lengths))

print(f'Using MAX_LENGTH = {MAX_LENGTH} for padding')

# Pad sequences into 3D arrays (n_users, time_steps, n_features)
try:
    X_train_padded = pad_sequences(X_train_seqs, maxlen=MAX_LENGTH, dtype='float32', padding='post', truncating='post', value=PADDING_VALUE)
    print('X_train_padded shape:', X_train_padded.shape)
except Exception as e:
    X_train_padded = None
    print('Failed to pad training sequences:', e)

X_test_padded = None
if X_test_seqs is not None:
    try:
        X_test_padded = pad_sequences(X_test_seqs, maxlen=MAX_LENGTH, dtype='float32', padding='post', truncating='post', value=PADDING_VALUE)
        print('X_test_padded shape:', X_test_padded.shape)
    except Exception as e:
        X_test_padded = None
        print('Failed to pad test sequences:', e)

# Create boolean masks (True = valid timestep)
train_mask = None
test_mask = None
if X_train_padded is not None:
    train_mask = np.any(X_train_padded != PADDING_VALUE, axis=-1)
    print('train_mask shape:', train_mask.shape)
if X_test_padded is not None:
    test_mask = np.any(X_test_padded != PADDING_VALUE, axis=-1)
    print('test_mask shape:', test_mask.shape)

# Expose useful names to the notebook globals
# X_train_seqs, X_test_seqs, X_train_padded, X_test_padded, train_mask, test_mask, feature_cols, MAX_LENGTH


Setting obs_len (fixed MAX_LENGTH) = 25920
Built 10 training sequences. Example lengths (first 5): [4692, 11267, 8060, 15798, 4751]
Built 90 test sequences. Example lengths (first 5): [13964, 13567, 12279, 10845, 14494]
Using MAX_LENGTH = 25920 for padding
X_train_padded shape: (10, 25920, 120)
Built 90 test sequences. Example lengths (first 5): [13964, 13567, 12279, 10845, 14494]
Using MAX_LENGTH = 25920 for padding
X_train_padded shape: (10, 25920, 120)
X_test_padded shape: (90, 25920, 120)
train_mask shape: (10, 25920)
X_test_padded shape: (90, 25920, 120)
train_mask shape: (10, 25920)
test_mask shape: (90, 25920)
test_mask shape: (90, 25920)
