## Import libraries

In [1]:
## Import libraries for machine learning and data processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Environment diagnostics: show which interpreter and site-packages the kernel
# is using, and whether TensorFlow can be located from this environment.
# `importlib.util` is a submodule and is only guaranteed to be available when it
# is imported explicitly; a bare `import importlib` may leave it unloaded.
import importlib.util
import site
import sys

print("sys.executable:", sys.executable)
print("sys.version:", sys.version)
print("sys.path (first 8):", sys.path[:8])
# Fall back to a no-op when `site.getsitepackages` is not provided by this `site` module.
print("site.getsitepackages():", getattr(site, 'getsitepackages', lambda: None)())
print("USER site:", site.USER_SITE)
# find_spec returns None (rather than raising) when the package is not installed.
print("find tensorflow spec:", importlib.util.find_spec('tensorflow'))

sys.executable: /Users/adityaponnada/Documents/codework/real_time_prompting/real_time_prompting/tfpy/bin/python
sys.version: 3.11.14 (main, Oct  9 2025, 16:16:55) [Clang 17.0.0 (clang-1700.4.4.1)]
sys.path (first 8): ['/opt/homebrew/Cellar/python@3.11/3.11.14_1/Frameworks/Python.framework/Versions/3.11/lib/python311.zip', '/opt/homebrew/Cellar/python@3.11/3.11.14_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11', '/opt/homebrew/Cellar/python@3.11/3.11.14_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/lib-dynload', '', '/Users/adityaponnada/Documents/codework/real_time_prompting/real_time_prompting/tfpy/lib/python3.11/site-packages']
site.getsitepackages(): ['/Users/adityaponnada/Documents/codework/real_time_prompting/real_time_prompting/tfpy/lib/python3.11/site-packages']
USER site: /Users/adityaponnada/Library/Python/3.11/lib/python/site-packages
find tensorflow spec: ModuleSpec(name='tensorflow', loader=<_frozen_importlib_external.SourceFileLoader object at 0x15

## Read data

In [3]:
## Import dataset: read the processed feature table from CSV into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific; the notebook only
# runs as-is on a machine that has this file at exactly this location.
raw_feature_df_scaled = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/processed_features_rnn.csv')
## Display the first five rows of the dataset
raw_feature_df_scaled.head(5)

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,animateshowerclothes@timestudy_com,1,0,1.0,0.0,1,0.000586,,,860.983333,...,0,0,0,0,0,0,0,0,0,0
1,animateshowerclothes@timestudy_com,0,0,1.0,1.0,1,0.001221,,,838.0,...,0,0,0,0,0,0,0,0,0,0
2,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,0.000794,,,827.983333,...,0,0,0,0,0,0,0,0,0,0
3,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,0.003441,,,817.983333,...,0,0,0,0,0,0,0,0,0,0
4,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,0.003723,,,781.983333,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Number of distinct participant IDs in the feature table
raw_feature_df_scaled['participant_id'].nunique()

32

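### Discard missingness indicators for complete features
Drop the `mi_*` indicator columns that are all zero (i.e. the corresponding feature has no missing values) to reduce dimensionality and memory use.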

In [4]:
def drop_zero_mi_columns(df, mi_prefix='mi_', inplace=False, verbose=False):
    """
    Remove missingness-indicator columns that carry no information.

    A column is removed when its name starts with `mi_prefix`, it has at least
    one non-null value, and every non-null value equals 0. Indicator columns
    that are entirely NaN are kept.

    Parameters:
    - df: pandas DataFrame to process.
    - mi_prefix: name prefix that identifies missingness-indicator columns.
    - inplace: if True, columns are dropped from `df` itself; otherwise a copy
      is modified and the caller's frame is left untouched.
    - verbose: if True, print how many and which columns are dropped.

    Returns the resulting DataFrame (`df` itself when `inplace=True`).
    Raises ValueError if `df` is not a pandas DataFrame.
    """
    import pandas as pd
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df must be a pandas DataFrame")

    frame = df if inplace else df.copy()

    def _only_zeros(col_name):
        # True when the column has observed values and all of them are 0
        observed = frame[col_name].dropna()
        return len(observed) > 0 and (observed == 0).all()

    all_zero_cols = [
        col_name
        for col_name in frame.columns
        if str(col_name).startswith(mi_prefix) and _only_zeros(col_name)
    ]

    if not all_zero_cols:
        return frame

    if verbose:
        print(f"Dropping {len(all_zero_cols)} columns: {all_zero_cols}")
    frame.drop(columns=all_zero_cols, inplace=True)
    return frame

# Rebind the name to a copy (inplace=False) with the all-zero mi_* columns removed;
# verbose=True prints the names of the dropped columns.
raw_feature_df_scaled = drop_zero_mi_columns(raw_feature_df_scaled, mi_prefix='mi_', inplace=False, verbose=True)

Dropping 24 columns: ['mi_is_weekend', 'mi_screen_on', 'mi_days_in_study', 'mi_completion_24h', 'mi_completion_1h', 'mi_time_between_prompts', 'mi_time_since_last_answered', 'mi_completion_since_wake', 'mi_completion_since_start', 'mi_time_of_day_Afternoon', 'mi_time_of_day_Early Morning', 'mi_time_of_day_Evening', 'mi_time_of_day_Late Night', 'mi_time_of_day_Morning', 'mi_time_of_day_Night', 'mi_location_category_Home', 'mi_location_category_Other', 'mi_location_category_School', 'mi_location_category_Transit', 'mi_location_category_Work', 'mi_wake_day_part_0.0', 'mi_wake_day_part_1.0', 'mi_wake_day_part_2.0', 'mi_wake_day_part_3.0']


In [5]:
# Preview the first five rows of the current feature table
raw_feature_df_scaled.head(5)

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,wake_day_part_2.0,wake_day_part_3.0,mi_in_battery_saver_mode,mi_charging_status,mi_dist_from_home,mi_is_phone_locked,mi_last_phone_usage,mi_closeness_to_sleep_time,mi_closeness_to_wake_time,mi_mims_5min
0,animateshowerclothes@timestudy_com,1,0,1.0,0.0,1,0.000586,,,860.983333,...,0,0,0,0,0,1,1,0,0,0
1,animateshowerclothes@timestudy_com,0,0,1.0,1.0,1,0.001221,,,838.0,...,0,0,0,0,0,1,1,0,0,0
2,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,0.000794,,,827.983333,...,0,0,0,0,0,1,1,0,0,0
3,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,0.003441,,,817.983333,...,0,0,0,0,0,1,1,0,0,0
4,animateshowerclothes@timestudy_com,0,0,0.0,1.0,1,0.003723,,,781.983333,...,0,0,0,0,0,1,1,0,0,0


## Split into training and test sets

In [6]:
def split_train_test_by_participant(df, id_col='participant_id', train_frac=0.1):
    """
    Split the DataFrame into training and testing sets within each participant.

    Rows are grouped by `id_col`. Within each group the rows are ordered by
    index, the first floor(train_frac * n) rows go to train, and the remaining
    rows go to test.

    Parameters:
    - df: pandas DataFrame containing the column `id_col`.
    - id_col: name of the column that identifies a participant.
    - train_frac: fraction (between 0 and 1) of each participant's rows assigned
      to train. The default of 0.1 puts the first 10% of each participant's rows
      in train and the remaining 90% in test.

    Returns: train_df, test_df (both with a fresh 0..n-1 index).
    Raises: ValueError if train_frac is not between 0 and 1.
    """
    # Reject fractions that would silently produce a nonsensical split
    # (negative values slice from the end; NaN also fails this check).
    if not 0 <= train_frac <= 1:
        raise ValueError(f"train_frac must be between 0 and 1, got {train_frac!r}")

    train_list = []
    test_list = []
    # NOTE: groupby skips rows whose id_col value is missing (pandas default dropna=True),
    # so such rows end up in neither split.
    for _, group in df.groupby(id_col):
        n = len(group)
        split_idx = int(np.floor(train_frac * n))
        group_sorted = group.sort_index()  # order rows by index before the positional cut
        train_list.append(group_sorted.iloc[:split_idx])
        test_list.append(group_sorted.iloc[split_idx:])

    # pd.concat raises on an empty list, so handle "no groups" explicitly
    # by returning two empty frames that keep the input's columns.
    if not train_list:
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    train_df = pd.concat(train_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)
    return train_df, test_df

# Apply the function to split the dataset.
# No train_frac is passed, so the function default (0.1) applies: the first 10% of
# each participant's rows go to train and the rest to test.
train_df, test_df = split_train_test_by_participant(raw_feature_df_scaled)
print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

Train set shape: (35974, 42)
Test set shape: (323943, 42)


In [8]:
# Number of distinct participants present in the train and test splits
train_df['participant_id'].nunique(), test_df['participant_id'].nunique()

(32, 32)

In [7]:
def participant_split_summary(train_df, test_df, id_col='participant_id'):
    """
    Summarise how many rows each participant has in the train and test sets.

    Returns a DataFrame with one row per participant and the columns:
    id_col, n_train, n_test, pct_train, pct_test. A participant that appears
    in only one of the two frames gets a count of 0 for the other. The
    percentages are relative to n_train + n_test and rounded to 2 decimals.
    """
    # Row counts per participant, one Series per split
    per_split_counts = [
        frame.groupby(id_col).size().rename(label)
        for frame, label in ((train_df, 'n_train'), (test_df, 'n_test'))
    ]
    # Align on participant; participants missing from one split get 0
    counts = pd.concat(per_split_counts, axis=1).fillna(0).astype(int)

    n_total = counts['n_train'] + counts['n_test']
    counts = counts.assign(
        pct_train=(counts['n_train'] / n_total * 100).round(2),
        pct_test=(counts['n_test'] / n_total * 100).round(2),
    )

    output_cols = [id_col, 'n_train', 'n_test', 'pct_train', 'pct_test']
    return counts.reset_index()[output_cols]

# Example usage: build the per-participant train/test summary and display it
split_summary = participant_split_summary(train_df, test_df)
split_summary

Unnamed: 0,participant_id,n_train,n_test,pct_train,pct_test
0,animateshowerclothes@timestudy_com,979,8813,10.0,90.0
1,atlanticchefhatchet@timestudy_com,1323,11915,9.99,90.01
2,beavertomatoupscale@timestudy_com,1309,11790,9.99,90.01
3,bondingcoasterdirtiness@timestudy_com,1317,11857,10.0,90.0
4,childhoodmovingmagnify@timestudy_com,1480,13325,10.0,90.0
5,cohesiveprotractfavored@timestudy_com,1428,12860,9.99,90.01
6,collisionmolarbreeze@timestudy_com,829,7463,10.0,90.0
7,congestedculpritsaved@timestudy_com,1154,10388,10.0,90.0
8,congestedtapssneer@timestudy_com,812,7316,9.99,90.01
9,crestedserpentspongy@timestudy_com,1192,10735,9.99,90.01
