## Import libraries

In [2]:
import gc
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
# Device toggle: True hides every GPU from TensorFlow for this session;
# False keeps the GPUs visible and asks TensorFlow to grow their memory on demand.
USE_CPU_ONLY = True

if not USE_CPU_ONLY:
    # GPU path: enable memory growth on each physical GPU that TensorFlow reports
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("[GPU MODE] Memory growth enabled.")
        except RuntimeError as e:
            # Reported instead of raised so the notebook keeps running
            print(f"Memory growth setting failed: {e}")
else:
    # CPU path: an empty visible-device list for 'GPU' leaves only the CPU available
    print("\n[STABILITY MODE] Disabling GPU to prevent Metal compilation hangs...")
    tf.config.set_visible_devices([], 'GPU')


[STABILITY MODE] Disabling GPU to prevent Metal compilation hangs...


## Read heldout processed data

In [4]:
# Location of the scaled held-out CSV. Set the HELDOUT_CSV_PATH environment variable to
# point at another copy of the file; when it is unset the original local path is used.
HELDOUT_CSV_PATH = os.environ.get(
    'HELDOUT_CSV_PATH',
    '/Users/adityaponnada/Downloads/time_study_data/general_heldout_scaled.csv',
)
heldout_df = pd.read_csv(HELDOUT_CSV_PATH)
heldout_df.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,wake_day_part_2.0,wake_day_part_3.0,mi_in_battery_saver_mode,mi_charging_status,mi_dist_from_home,mi_is_phone_locked,mi_last_phone_usage,mi_closeness_to_sleep_time,mi_closeness_to_wake_time,mi_mims_5min
0,animateshowerclothes@timestudy_com,1,0,1.0,0.0,1,-40.137687,1.0,-0.589123,1.842273,...,0,0,0,0,0,1,1,0,0,0
1,animateshowerclothes@timestudy_com,0,0,1.0,1.0,1,-40.136734,1.0,-0.589123,1.739076,...,0,0,0,0,0,1,1,0,0,0
2,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,-40.137375,1.0,-0.589123,1.694101,...,0,0,0,0,0,1,1,0,0,0
3,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,-40.133405,1.0,-0.589123,1.6492,...,0,0,0,0,0,1,1,0,0,0
4,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,-40.132982,1.0,-0.589123,1.487557,...,0,0,0,0,0,1,1,0,0,0


In [5]:
def max_observations_per_participant(df, participant_col='participant_id'):
    """Return the largest number of rows that any single participant has in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per observation.
    participant_col : str, default 'participant_id'
        Name of the column that identifies the participant.

    Returns
    -------
    int
        Row count of the most frequent value in `participant_col`, or 0 when
        the column holds no countable values (for example an empty frame).

    Raises
    ------
    ValueError
        If `df` is None or does not contain `participant_col`.
    """
    if df is None or participant_col not in df.columns:
        raise ValueError('DataFrame must contain the participant column')

    # value_counts() ignores missing identifiers by default, so NaN ids are not counted
    counts = df[participant_col].value_counts()
    if counts.empty:
        return 0
    # Cast so callers receive a plain Python int rather than a NumPy scalar
    return int(counts.max())

# Longest per-participant sequence in the held-out data (later assigned to MAX_TIME_SLOTS)
max_obs = max_observations_per_participant(heldout_df, participant_col='participant_id')
print('max observations per participant:', max_obs)
max_obs

max observations per participant: 15169


15169

In [6]:
# Every column other than the participant identifier and the outcome counts as a feature
excluded = {'participant_id', 'outcome'}
feature_cols = list(filter(lambda name: name not in excluded, heldout_df.columns))
print('number of feature columns (excluding participant_id and outcome):', len(feature_cols))
# Peek at the leading feature names as a quick sanity check
print('example feature columns:', feature_cols[:10])

number of feature columns (excluding participant_id and outcome): 40
example feature columns: ['is_weekend', 'in_battery_saver_mode', 'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min']


## Define constants

In [7]:
# Number of time steps in one chunk; prep_tensors_from_df pads each sequence to a
# multiple of this and reshapes it into chunks of this length.
L_CHUNK = 3967
# Chunk count for a full-length sequence: NUM_CHUNKS * L_CHUNK = 15,868 time steps.
# NOTE(review): presumably chosen to cover the longest participant sequence (max_obs) - confirm.
NUM_CHUNKS = 4
# Longest per-participant sequence in the held-out data, as computed above.
MAX_TIME_SLOTS = max_obs 
# Count of columns left after dropping participant_id and outcome.
NUM_FEATURES = len(feature_cols) 
# Fill value written into padded time steps, for both features and outcomes.
SENTINEL_VALUE = 999.0
# NOTE(review): not used in the cells shown here; presumably the optimizer step size
# and epoch count for a later fine-tuning stage - confirm against the training cells.
LEARNING_RATE = 1e-4
FINE_TUNE_EPOCHS = 10

## Split user data

In [9]:
def split_user_data_temporally(df, split_ratio=0.1):
    """
    Split each participant's rows into a leading "train" slice and a trailing "test" slice.

    For every participant the first ``int(n_rows * split_ratio)`` rows go to the train
    frame and the remaining rows go to the test frame. With the default ratio this is
    the 10% snapshot (train) and the 90% future window (test), taken before padding.

    Rows are used in the order they already have in `df`; nothing is sorted by time
    here, so `df` must be in chronological order within each participant.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'participant_id' column.
    split_ratio : float, default 0.1
        Fraction of each participant's rows assigned to the train slice (0 to 1).

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both use a fresh 0..n-1 index. A participant with too few rows for
        ``int(n_rows * split_ratio)`` to reach 1 contributes no train rows.

    Raises
    ------
    ValueError
        If `split_ratio` is outside [0, 1]. A negative ratio would otherwise slice
        from the end of each group and silently invert the split.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f'split_ratio must be between 0 and 1, got {split_ratio!r}')

    train_df_list = []
    test_df_list = []

    for _, group in df.groupby('participant_id'):
        # Truncate towards zero so the train slice never exceeds the requested share
        split_idx = int(len(group) * split_ratio)
        train_df_list.append(group.iloc[:split_idx])
        test_df_list.append(group.iloc[split_idx:])

    if not train_df_list:
        # No groups at all (empty frame or only missing ids): pd.concat([]) would raise,
        # so hand back two empty frames that keep the input's columns.
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    # Combine the per-participant slices into two distinct DataFrames
    train_df = pd.concat(train_df_list, ignore_index=True)
    test_df = pd.concat(test_df_list, ignore_index=True)

    return train_df, test_df

In [10]:
# First 10% of each participant's rows -> train_df, remaining 90% -> test_df
train_df, test_df = split_user_data_temporally(
    heldout_df,
    split_ratio=0.1,
)

## Prepare tensors

In [13]:
def prep_tensors_from_df(df, target_chunks):
    """
    Convert a pre-scaled DataFrame into 4D feature and outcome tensors.

    Each participant becomes one sequence. Sequences are padded at the end with
    SENTINEL_VALUE up to ``target_chunks * L_CHUNK`` time steps and then reshaped
    into chunks of L_CHUNK steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'participant_id' and 'outcome'; every other column is
        treated as a feature and there must be NUM_FEATURES of them.
    target_chunks : int
        Number of chunks per participant: 1 for training (10%), 4 for testing (90%).

    Returns
    -------
    (X, Y, p_ids)
        X: float32 tensor of shape (participants, target_chunks, L_CHUNK, NUM_FEATURES).
        Y: float32 tensor of shape (participants, target_chunks, L_CHUNK, 1).
        p_ids: participant ids in the same order as the first tensor axis.

    Raises
    ------
    ValueError
        If any participant has more rows than ``target_chunks * L_CHUNK``.
    """
    # Total padded length needed for the requested chunk structure
    required_len = target_chunks * L_CHUNK

    X_list, Y_list, p_ids = [], [], []
    for p_id, group in df.groupby('participant_id'):
        p_ids.append(p_id)
        X_list.append(group.drop(columns=['participant_id', 'outcome']).values)
        Y_list.append(group['outcome'].values.astype('float32').reshape(-1, 1))

    # pad_sequences truncates over-long sequences without warning (from the front by
    # default), which would discard the earliest observations. Fail loudly instead.
    longest = max((len(seq) for seq in X_list), default=0)
    if longest > required_len:
        raise ValueError(
            f'Longest participant sequence has {longest} time steps but '
            f'target_chunks * L_CHUNK = {required_len}; increase target_chunks or L_CHUNK '
            'so that no observations are truncated.'
        )

    X_p = pad_sequences(X_list, maxlen=required_len, padding='post', value=SENTINEL_VALUE, dtype='float32')
    Y_p = pad_sequences(Y_list, maxlen=required_len, padding='post', value=SENTINEL_VALUE, dtype='float32')

    # Split the padded time axis into (target_chunks, L_CHUNK)
    X_4d = X_p.reshape(len(p_ids), target_chunks, L_CHUNK, NUM_FEATURES)
    Y_4d = Y_p.reshape(len(p_ids), target_chunks, L_CHUNK, 1)

    return tf.cast(X_4d, tf.float32), tf.cast(Y_4d, tf.float32), p_ids

In [14]:
# Training tensor uses 1 chunk (the 10% snapshot)
X_train_4d, Y_train_4d, p_ids = prep_tensors_from_df(train_df, target_chunks=1)

# Testing tensor uses NUM_CHUNKS chunks (the 90% future window)
X_test_4d, Y_test_4d, test_p_ids = prep_tensors_from_df(test_df, target_chunks=NUM_CHUNKS)

# Row i of the train and test tensors must belong to the same participant. A participant
# with too few rows to yield any train rows is absent from train_df, which would shift
# every following row out of alignment.
assert list(p_ids) == list(test_p_ids), (
    'Train and test tensors cover different participants; '
    f'{len(p_ids)} in train vs {len(test_p_ids)} in test.'
)