## Import libraries

In [1]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import gc
import json
import time
from tensorflow.keras.models import load_model

In [2]:
## Device selection: CPU only, or GPU with on-demand memory growth
# Set USE_CPU_ONLY to False to let TensorFlow use the visible GPU(s).

USE_CPU_ONLY = True 

if USE_CPU_ONLY:
    print("\n[STABILITY MODE] Disabling GPU to prevent Metal compilation hangs...")
    try:
        tf.config.set_visible_devices([], 'GPU')
    except RuntimeError as e:
        # Device visibility cannot be changed once the TensorFlow runtime has been
        # initialized (e.g. this cell is re-run after an op has executed).
        # Report it instead of aborting the cell, mirroring the GPU branch below.
        print(f"Could not hide GPU devices (restart the kernel and run this cell first): {e}")
else:
    # Enable memory growth so TensorFlow allocates GPU memory as needed
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("[GPU MODE] Memory growth enabled.")
        except RuntimeError as e:
            print(f"Memory growth setting failed: {e}")


[STABILITY MODE] Disabling GPU to prevent Metal compilation hangs...


## Import data files

In [3]:
## Read the global median file (passed to impute_within_participant below as the fallback medians)
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
global_median = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/hybrid_rnn_medians.csv')

In [4]:
## Inspect the median table: column names, shape, then a rendered preview
print(global_median.columns)
print(global_median.shape)
# Kept as the cell's last expression: a bare expression in the middle of a cell
# is evaluated but never displayed.
global_median.head()

Index(['Unnamed: 0', 'is_weekend', 'in_battery_saver_mode', 'charging_status',
       'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage',
       'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min',
       'days_in_study', 'completion_24h', 'completion_1h',
       'time_between_prompts', 'time_since_last_answered',
       'completion_since_wake', 'completion_since_start',
       'time_of_day_Afternoon', 'time_of_day_Early Morning',
       'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning',
       'time_of_day_Night', 'location_category_Home',
       'location_category_Other', 'location_category_School',
       'location_category_Transit', 'location_category_Work',
       'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0',
       'wake_day_part_3.0'],
      dtype='object')
(1, 33)


In [5]:
## Read the global means file (supplies the per-feature means used for z-normalization)
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
global_means = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/global_means_hybrid_rnn.csv')
print(global_means.columns)
print(global_means.shape)
# Kept as the cell's last expression: a bare expression in the middle of a cell
# is evaluated but never displayed.
global_means.head()

Index(['Unnamed: 0', 'dist_from_home', 'last_phone_usage',
       'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min',
       'completion_24h', 'completion_1h', 'time_between_prompts',
       'time_since_last_answered', 'completion_since_wake',
       'completion_since_start'],
      dtype='object')
(1, 12)


In [6]:
# Show the global means table in full (the recorded shape above is a single row).
global_means

Unnamed: 0.1,Unnamed: 0,dist_from_home,last_phone_usage,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
0,global_mean,88.030981,15.070819,475.398093,458.360935,48.62911,0.849871,0.828608,39.792924,69.221791,0.808598,0.81387


In [7]:
## Read the processed features file (withdrew data set)
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
withdrew_features = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/processed_features_withdrew.csv')
print(withdrew_features.shape)
# Last expression of the cell, so this preview is what the notebook renders.
withdrew_features.head()

(235071, 66)


Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,0.060436,1.0,11.1,980.95,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059622,0.0,0.0,830.9,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,1,0,,,1,0.042405,0.0,0.0,487.783333,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,0.008069,0.0,0.0,242.95,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059189,0.0,0.0,186.583333,...,0,0,0,0,0,0,0,0,0,0


## Impute missing data

In [8]:
def impute_within_participant(withdrew_df, global_median, id_col='participant_id'):
    """Impute missing feature values by forward-filling within each participant.

    Steps:
      1. Forward-fill every imputable column inside each `id_col` group, so a value
         is never carried over from one participant to the next.
      2. Fill whatever is still missing (values that precede a participant's first
         valid observation) with the matching entry of `global_median`. When no
         usable entry exists, fall back to the column median of the forward-filled
         frame. Columns with no usable median at all are left with their NaNs.

    Columns that are never modified: `id_col`, `outcome` / `outcomes`
    (case-insensitive) and every column whose name starts with "mi_".

    Args:
        withdrew_df (pd.DataFrame): input features dataframe; rows are assumed to be
            ordered per participant (forward-fill follows row order).
        global_median (pd.Series | pd.DataFrame | Mapping): column -> median lookup.
            Supported layouts:
              * Series or plain mapping keyed by column name;
              * DataFrame with one column per feature (first non-null value is used);
              * DataFrame indexed by feature name, with a 'median' column or, failing
                that, the first non-null value of the feature's row.
        id_col (str): participant id column name (default: 'participant_id').

    Returns:
        pd.DataFrame: imputed copy of `withdrew_df`; the input is not modified.

    Raises:
        ValueError: if `id_col` is not a column of `withdrew_df`.
    """
    from collections.abc import Mapping

    import pandas as pd
    import numpy as np

    if id_col not in withdrew_df.columns:
        raise ValueError(f"id_col '{id_col}' not found in withdrew_df columns")

    df = withdrew_df.copy()

    # Exclude participant id and outcome columns (case-insensitive) and the
    # missing-indicator ("mi_") columns. str() keeps non-string labels from crashing.
    exclude = {id_col.lower(), 'outcome', 'outcomes'}
    cols_to_impute = [
        c for c in df.columns
        if str(c).lower() not in exclude and not str(c).lower().startswith('mi_')
    ]
    if not cols_to_impute:
        # Nothing to impute; avoid an empty-column groupby/assignment.
        return df

    # Forward-fill within each participant.
    try:
        df[cols_to_impute] = df.groupby(id_col, sort=False)[cols_to_impute].ffill()
    except (TypeError, ValueError, NotImplementedError):
        # Fallback for frames the block-wise fill cannot handle: fill one column at a
        # time. Each result is a Series indexed like `df`, so assignment stays aligned.
        for col in cols_to_impute:
            df[col] = df.groupby(id_col, sort=False)[col].ffill()

    def _first_non_null(values):
        """Collapse a scalar / Series / DataFrame lookup result to its first non-null entry (NaN if none)."""
        flat = pd.Series(np.asarray(values, dtype=object).ravel()).dropna()
        return flat.iloc[0] if len(flat) else np.nan

    def _lookup_global_median(col):
        """Best-effort lookup of the global median for `col`; NaN when unavailable."""
        try:
            if isinstance(global_median, pd.Series):
                return _first_non_null(global_median.get(col, np.nan))
            if isinstance(global_median, pd.DataFrame):
                # Layout 1: one column per feature.
                if col in global_median.columns:
                    value = _first_non_null(global_median[col])
                    if pd.notna(value):
                        return value
                if col in global_median.index:
                    # Layout 2: features on the index with an explicit 'median' column.
                    if 'median' in global_median.columns:
                        return _first_non_null(global_median.loc[col, 'median'])
                    # Layout 3: features on the index, take the first non-null in the row.
                    return _first_non_null(global_median.loc[col])
                return np.nan
            if isinstance(global_median, Mapping):
                return _first_non_null(global_median.get(col, np.nan))
        except (KeyError, IndexError, TypeError, ValueError):
            # A malformed lookup table must not abort imputation; the caller falls
            # back to the column median computed from the data.
            pass
        return np.nan

    # Fill remaining NaNs (leading NaNs) with global median (or df median as fallback).
    for col in cols_to_impute:
        if not df[col].isna().any():
            continue
        med = _lookup_global_median(col)
        if pd.isna(med):
            try:
                med = df[col].median(skipna=True)
            except (TypeError, ValueError):
                # Non-numeric column: no median available, leave its NaNs in place.
                med = np.nan
        if pd.notna(med):
            df[col] = df[col].fillna(med)

    return df


In [9]:
# Impute missing values: forward-fill within each participant, then fall back to the global medians.
# NOTE(review): this rebinds `withdrew_features`, so the un-imputed frame is no longer available afterwards.
withdrew_features = impute_within_participant(withdrew_features, global_median, id_col='participant_id')
# NOTE(review): the bare `.columns` expression below is not the cell's last expression, so it displays nothing.
withdrew_features.columns
withdrew_features.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,0.060436,1.0,11.1,980.95,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059622,0.0,0.0,830.9,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.042405,0.0,0.0,487.783333,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,0.008069,0.0,0.0,242.95,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059189,0.0,0.0,186.583333,...,0,0,0,0,0,0,0,0,0,0


## Z-normalization

In [10]:
def z_normalize_within_participant(withdrew_df, global_means, id_col='participant_id', cols=None):
    """Z-normalize selected feature columns using externally supplied (global) statistics.

    Every selected column is rescaled as ``(value - mean) / std``. The same mean and
    std are used for all rows, so the scaling is done in one vectorized pass;
    `id_col` is only validated and excluded from scaling.

    Statistics are resolved per column as follows:
      * mean: taken from `global_means`; if missing or non-numeric, the column mean
        of `withdrew_df` is used.
      * std: taken from `global_means` (a 'std' column on a feature-indexed frame,
        or a '<col>_std' column) when it is a finite, positive number. Otherwise the
        column std of `withdrew_df` is used, and 1.0 if that is also NaN or zero
        (the column is then only centered).

    Args:
        withdrew_df (pd.DataFrame): input dataframe containing features.
        global_means (pd.Series or pd.DataFrame): mapping of column -> mean (and
            optionally std). Supported layouts: Series keyed by column name;
            DataFrame indexed by feature with 'mean' (and optionally 'std') columns;
            DataFrame with one column per feature (first non-null value is used).
        id_col (str): participant id column name.
        cols (list[str] | None): columns to normalize. If None, a default list of
            continuous features is used. Columns absent from `withdrew_df`, the id
            column and `outcome` / `outcomes` are skipped.

    Returns:
        pd.DataFrame: a copy of `withdrew_df` with selected columns z-normalized;
        the input is not modified.

    Raises:
        ValueError: if `id_col` is not a column of `withdrew_df`.
    """
    import pandas as pd
    import numpy as np

    default_cols = [
        'dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time',
        'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts',
        'time_since_last_answered', 'completion_since_wake', 'completion_since_start'
    ]
    if cols is None:
        cols = default_cols

    if id_col not in withdrew_df.columns:
        raise ValueError(f"id_col '{id_col}' not found in withdrew_df columns")

    df = withdrew_df.copy()

    # Exclude id and outcome columns explicitly, even if the caller lists them.
    exclude = {id_col.lower(), 'outcome', 'outcomes'}
    cols_to_scale = [c for c in cols if (c in df.columns) and (str(c).lower() not in exclude)]

    def _as_float(value):
        """Coerce a lookup result to float; NaN for missing, non-numeric or non-scalar values."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    def _first_non_null(values):
        """Collapse a scalar / Series / DataFrame lookup result to its first non-null entry (NaN if none)."""
        flat = pd.Series(np.asarray(values, dtype=object).ravel()).dropna()
        return flat.iloc[0] if len(flat) else np.nan

    def _is_positive(value):
        """True when `value` can be used as a divisor for scaling."""
        return bool(np.isfinite(value) and value > 0)

    def _global_mean(col):
        """Mean for `col` from `global_means`, or NaN when none is available."""
        try:
            if isinstance(global_means, pd.Series):
                return _as_float(_first_non_null(global_means.get(col, np.nan)))
            if isinstance(global_means, pd.DataFrame):
                # Feature-indexed frame with an explicit 'mean' column.
                if 'mean' in global_means.columns and col in global_means.index:
                    return _as_float(_first_non_null(global_means.loc[col, 'mean']))
                # One column per feature: take its first non-null value.
                if col in global_means.columns:
                    value = _as_float(_first_non_null(global_means[col]))
                    if not np.isnan(value):
                        return value
                # Feature-indexed frame without a 'mean' column: first non-null in the row.
                if col in global_means.index:
                    return _as_float(_first_non_null(global_means.loc[col]))
        except (KeyError, IndexError, TypeError, ValueError):
            # Malformed lookup table: fall back to the data-derived mean.
            pass
        return np.nan

    def _global_std(col):
        """Usable (finite, positive) std for `col` from `global_means`, or NaN."""
        try:
            if isinstance(global_means, pd.DataFrame):
                if 'std' in global_means.columns and col in global_means.index:
                    value = _as_float(_first_non_null(global_means.loc[col, 'std']))
                    # A NaN/zero entry means "not provided": keep looking instead of
                    # returning it, so the data-derived fallback can still apply.
                    if _is_positive(value):
                        return value
                # Std may also be provided as a column named '<col>_std'.
                std_col = f"{col}_std"
                if std_col in global_means.columns:
                    value = _as_float(_first_non_null(global_means[std_col]))
                    if _is_positive(value):
                        return value
        except (KeyError, IndexError, TypeError, ValueError):
            pass
        return np.nan

    for col in cols_to_scale:
        mean = _global_mean(col)
        if np.isnan(mean):
            # Fallback to the column mean from df.
            try:
                mean = df[col].mean(skipna=True)
            except (TypeError, ValueError):
                mean = 0.0

        std = _global_std(col)
        if not _is_positive(std):
            # Fallback: std computed from the data being normalized.
            try:
                std = _as_float(df[col].std(skipna=True))
            except (TypeError, ValueError):
                std = np.nan
        if not _is_positive(std):
            # Constant or empty column: center only, never divide by zero/NaN.
            std = 1.0

        # The constants are identical for every participant, so no grouping is needed.
        df[col] = (df[col] - mean) / std

    return df


In [11]:
# Scale the default continuous feature columns; `global_means` supplies the means.
# NOTE(review): `withdrew_features` is overwritten, so re-running this cell would normalize
# already-normalized values — re-run from the load cell instead.
withdrew_features = z_normalize_within_participant(withdrew_features, global_means, id_col='participant_id')
withdrew_features.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,-0.529561,1.0,-0.178185,1.869864,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.529565,0.0,-0.676281,1.31488,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.529669,0.0,-0.676281,0.045809,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,-0.529876,0.0,-0.676281,-0.859746,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.529568,0.0,-0.676281,-1.068227,...,0,0,0,0,0,0,0,0,0,0


## Keep only relevant columns

In [12]:
## Load the feature columns to keep: one name per line, blank lines skipped
with open('/Users/adityaponnada/Downloads/time_study_data/processed_feature_columns.txt', 'r') as f:
    column_list = [name for name in map(str.strip, f) if name]


In [13]:
# Show the selected column names, then how many there are (one item per line).
print(column_list, len(column_list), sep='\n')

['participant_id', 'outcome', 'is_weekend', 'in_battery_saver_mode', 'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'days_in_study', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start', 'time_of_day_Afternoon', 'time_of_day_Early Morning', 'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning', 'time_of_day_Night', 'location_category_Home', 'location_category_Other', 'location_category_School', 'location_category_Transit', 'location_category_Work', 'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0', 'wake_day_part_3.0', 'mi_in_battery_saver_mode', 'mi_charging_status', 'mi_dist_from_home', 'mi_is_phone_locked', 'mi_last_phone_usage', 'mi_closeness_to_sleep_time', 'mi_closeness_to_wake_time', 'mi_mims_5min']
42


In [14]:
## Keep only the columns named in column_list (in that order) in withdrew_features
# Raises KeyError if any listed column is missing from the frame.
# NOTE(review): rebinding `withdrew_features` drops the other columns for all later cells.
withdrew_features = withdrew_features[column_list]
withdrew_features.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,wake_day_part_2.0,wake_day_part_3.0,mi_in_battery_saver_mode,mi_charging_status,mi_dist_from_home,mi_is_phone_locked,mi_last_phone_usage,mi_closeness_to_sleep_time,mi_closeness_to_wake_time,mi_mims_5min
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,-0.529561,1.0,-0.178185,1.869864,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.529565,0.0,-0.676281,1.31488,...,0,0,0,0,0,0,0,0,0,1
2,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.529669,0.0,-0.676281,0.045809,...,1,0,1,1,0,0,0,0,0,1
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,-0.529876,0.0,-0.676281,-0.859746,...,0,1,0,0,0,0,0,0,0,1
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.529568,0.0,-0.676281,-1.068227,...,0,1,0,0,0,0,0,0,0,1


In [15]:
# Count the distinct participant ids in the filtered table.
withdrew_features['participant_id'].nunique()

58