## Import libraries
Import libraries to perform one hot coding and rescaling of features

In [1]:
## Import librariries
import sys, os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump, load
import pickle
from typing import List, Tuple
from datetime import datetime
from dateutil import parser

## Import raw features
Import the raw feature file and inspect it.

In [4]:
# Read the raw feature CSV file into a pandas DataFrame
## for raw_features, use this path: /Users/adityaponnada/Downloads/time_study_data/raw_features_rnn.csv
## for heldout data features, use this path: /Users/adityaponnada/Downloads/time_study_data/heldout_raw_features.csv
## for withdrew data features, use this path: /Users/adityaponnada/Downloads/time_study_data/withdrew_raw_features.csv
raw_feature_df = pd.read_csv("/Users/adityaponnada/Downloads/time_study_data/withdrew_raw_features.csv")
print(f"Raw feature DataFrame shape: {raw_feature_df.shape}")
raw_feature_df.head()

Raw feature DataFrame shape: (235071, 23)


Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
0,ambushdollhousegenerous@timestudy_com,2020-11-25 07:24:03,0,0,Early Morning,0.0,1.0,Transit,0,0.060436,...,980.95,114.05,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,ambushdollhousegenerous@timestudy_com,2020-11-25 09:54:06,1,0,Morning,0.0,0.0,Transit,1,0.059622,...,830.9,264.1,,0,0.0,0.0,150.05,0.0,0.0,0.0
2,ambushdollhousegenerous@timestudy_com,2020-11-25 15:37:13,1,0,Afternoon,,,Transit,1,0.042405,...,487.783333,607.216667,,0,0.5,0.0,343.116667,343.116667,0.5,0.5
3,ambushdollhousegenerous@timestudy_com,2020-11-25 19:42:03,0,0,Evening,0.0,1.0,Transit,1,0.008069,...,242.95,852.05,,0,0.666667,0.0,244.833333,244.833333,0.666667,0.666667
4,ambushdollhousegenerous@timestudy_com,2020-11-25 20:38:25,1,0,Night,0.0,0.0,Transit,1,0.059189,...,186.583333,908.416667,,0,0.5,0.0,56.366667,301.2,0.5,0.5


In [5]:
raw_feature_df['participant_id'].nunique()

58

In [6]:
raw_feature_df['wake_day_part'].value_counts()

wake_day_part
2.0    61946
1.0    61485
3.0    55884
0.0    55462
Name: count, dtype: int64

In [7]:
raw_feature_df.columns

Index(['participant_id', 'prompt_time_converted', 'outcome', 'is_weekend',
       'time_of_day', 'in_battery_saver_mode', 'charging_status',
       'location_category', 'screen_on', 'dist_from_home', 'is_phone_locked',
       'last_phone_usage', 'wake_day_part', 'closeness_to_sleep_time',
       'closeness_to_wake_time', 'mims_5min', 'days_in_study',
       'completion_24h', 'completion_1h', 'time_between_prompts',
       'time_since_last_answered', 'completion_since_wake',
       'completion_since_start'],
      dtype='object')

In [8]:
# Print unique participant_id values from raw_feature_df (count + sample)
if 'raw_feature_df' in globals():
    vals = raw_feature_df['participant_id'].dropna().unique()
    print(f'Unique participant_id count: {len(vals)}')
    try:
        full_list = sorted(vals)
    except Exception:
        full_list = list(vals)
    for i, pid in enumerate(full_list, start=1):
        print(f'{i:3d}: {pid}')
else:
    print('raw_feature_df not found. Run the CSV load cell first.')


Unique participant_id count: 58
  1: ambushdollhousegenerous@timestudy_com
  2: anywaymustinesspushiness@timestudy_com
  3: bottledeskworkrequire@timestudy_com
  4: browsingfrisbeepersevere@timestudy_com
  5: buckedstiflestagnant@timestudy_com
  6: busybodyestimatesensitize@timestudy_com
  7: civicexcludingbarcode@timestudy_com
  8: cladlandscapeheave@timestudy_com
  9: confrontcaresssullen@timestudy_com
 10: deitymagnifierdrove@timestudy_com
 11: dimmeddismaylegume@timestudy_com
 12: dizzinesscatatoniceconomist@timestudy_com
 13: enjoyingretreathandled@timestudy_com
 14: euphemismfederalconfusing@timestudy_com
 15: generouswidthcoasting@timestudy_com
 16: gushyenstir@timestudy_com
 17: hacksawscoldingdares@timestudy_com
 18: hazingdiscolorsuffering@timestudy_com
 19: himationlalospheres@timestudy_com
 20: huntingevergreendeparted@timestudy_com
 21: iodinegrapemonstrous@timestudy_com
 22: itunesgurgleexchange@timestudy_com
 23: lappedvastlydebating@timestudy_com
 24: legalsaddledresemb

In [9]:
# Observations per participant (placed immediately after raw_feature_df is created)
from IPython.display import display

def observations_per_participant(df, id_col='participant_id'):
    """Return participant-level observation counts as a DataFrame with columns [id_col, 'n_obs']"""
    if df is None:
        print('Provided DataFrame is None')
        return pd.DataFrame(columns=[id_col, 'n_obs'])
    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame")
    counts = df.groupby(id_col).size().reset_index(name='n_obs')
    # Sort by participant id for deterministic ordering
    counts = counts.sort_values(by=id_col, ascending=True).reset_index(drop=True)
    return counts

# Compute and display counts using the freshly-loaded `raw_feature_df`
obs_counts = observations_per_participant(raw_feature_df)
print(f"Participants: {obs_counts.shape[0]} | Total observations: {len(raw_feature_df)}")
display(obs_counts.tail(50))

Participants: 58 | Total observations: 235071


Unnamed: 0,participant_id,n_obs
8,confrontcaresssullen@timestudy_com,7809
9,deitymagnifierdrove@timestudy_com,7477
10,dimmeddismaylegume@timestudy_com,6180
11,dizzinesscatatoniceconomist@timestudy_com,2422
12,enjoyingretreathandled@timestudy_com,2784
13,euphemismfederalconfusing@timestudy_com,4885
14,generouswidthcoasting@timestudy_com,2644
15,gushyenstir@timestudy_com,2721
16,hacksawscoldingdares@timestudy_com,8228
17,hazingdiscolorsuffering@timestudy_com,5815


In [10]:
## remove unknown user from df
raw_feature_df = raw_feature_df[raw_feature_df['participant_id'].astype(str).str.lower() != 'unknown_user'].reset_index(drop=True)

In [11]:
# Min and max of days_in_study per participant
from IPython.display import display

def min_max_days_by_participant(df, id_col='participant_id', days_col='days_in_study'):
    """Return a DataFrame with columns [id_col, 'days_min', 'days_max'] for each participant."""
    if df is None:
        print('Provided DataFrame is None')
        return pd.DataFrame(columns=[id_col, 'days_min', 'days_max'])
    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame")
    if days_col not in df.columns:
        raise ValueError(f"days_col '{days_col}' not found in DataFrame")
    # Coerce to numeric, preserve NaNs
    days_numeric = pd.to_numeric(df[days_col], errors='coerce')
    tmp = df.copy()
    tmp[days_col] = days_numeric
    agg = tmp.groupby(id_col)[days_col].agg(['min', 'max']).reset_index().rename(columns={'min':'days_min', 'max':'days_max'})
    # Sort by participant id for deterministic output
    agg = agg.sort_values(by=id_col, ascending=True).reset_index(drop=True)
    return agg

# Compute and display the min/max table using the raw feature DataFrame
minmax_days = min_max_days_by_participant(raw_feature_df)
print(f"Participants: {minmax_days.shape[0]} | Total observations: {len(raw_feature_df)}")
display(minmax_days.tail(50))

Participants: 58 | Total observations: 235071


Unnamed: 0,participant_id,days_min,days_max
8,confrontcaresssullen@timestudy_com,0,225
9,deitymagnifierdrove@timestudy_com,0,300
10,dimmeddismaylegume@timestudy_com,0,238
11,dizzinesscatatoniceconomist@timestudy_com,0,74
12,enjoyingretreathandled@timestudy_com,0,85
13,euphemismfederalconfusing@timestudy_com,0,188
14,generouswidthcoasting@timestudy_com,0,86
15,gushyenstir@timestudy_com,0,88
16,hacksawscoldingdares@timestudy_com,0,219
17,hazingdiscolorsuffering@timestudy_com,0,164


In [12]:
from IPython.display import display

def print_participant_head_tail(df, participant_id, id_col: str = 'participant_id', n: int = 5):
    """
    Simple utility: print the head and tail of `df` for the given participant_id.

    Parameters
    - df: pandas DataFrame containing participant rows
    - participant_id: value of the participant id to filter
    - id_col: name of the participant id column (default 'participant_id')
    - n: number of rows to show from head and tail (default 5)
    """
    if df is None:
        print('Provided DataFrame is None')
        return
    if id_col not in df.columns:
        print(f"id_col '{id_col}' not found in DataFrame columns")
        return
    sub = df[df[id_col] == participant_id]
    if sub.empty:
        print(f"No rows found for {id_col}={participant_id}")
        return
    print(f"--- HEAD ({n}) for participant {participant_id} ---")
    display(sub.head(n))
    print(f"--- TAIL ({n}) for participant {participant_id} ---")
    display(sub.tail(n))

# Example usage (uncomment and replace with a real id):
print_participant_head_tail(raw_feature_df, 'arrivejanitoruniformly@timestudy_com', n=5)


No rows found for participant_id=arrivejanitoruniformly@timestudy_com


In [13]:
def missing_value_table(df):
    skip_cols = ['participant_id', 'prompt_time_converted', 'outcome']
    cols = [col for col in df.columns if col.lower() not in skip_cols]
    missing_percent = df[cols].isnull().mean() * 100
    # Also count empty strings as missing
    empty_percent = (df[cols] == '').mean() * 100
    total_missing_percent = missing_percent + empty_percent
    result = pd.DataFrame({
        'missing_%': total_missing_percent.round(2)
    }).sort_values('missing_%', ascending=False)
    return result
    
# Display missing value table
missing_value_table(raw_feature_df)

Unnamed: 0,missing_%
in_battery_saver_mode,52.84
charging_status,52.84
last_phone_usage,17.07
is_phone_locked,16.66
dist_from_home,13.53
mims_5min,6.42
time_of_day,2.07
closeness_to_sleep_time,0.13
wake_day_part,0.13
closeness_to_wake_time,0.13


In [14]:
def missing_data_by_participant(df):
    skip_cols = ['prompt_time_converted', 'outcome']
    cols = [col for col in df.columns if col.lower() not in skip_cols and col.lower() != 'participant_id']
    # Create a boolean DataFrame for missing values (NaN or empty string)
    missing_bool = df[cols].isnull() | (df[cols] == '')
    # Group by participant_id and calculate % missing for each column
    missing_percent = missing_bool.groupby(df['participant_id']).mean() * 100
    missing_percent = missing_percent.round(2)
    # Reset index to have participant_id as a column
    missing_percent = missing_percent.reset_index()
    return missing_percent

# Print % missing data for each participant and variable
missing_data_by_participant(raw_feature_df)

Unnamed: 0,participant_id,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,is_phone_locked,last_phone_usage,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
0,ambushdollhousegenerous@timestudy_com,0.0,0.0,52.17,52.17,0.0,0.0,9.52,0.0,0.0,...,0.0,0.0,4.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,anywaymustinesspushiness@timestudy_com,0.0,0.0,60.59,60.59,0.0,0.0,0.21,0.0,0.0,...,0.0,0.0,22.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bottledeskworkrequire@timestudy_com,0.0,0.0,52.98,52.98,0.0,0.0,6.96,0.0,0.0,...,0.0,0.0,4.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,browsingfrisbeepersevere@timestudy_com,0.0,0.0,47.5,47.5,0.0,0.0,2.3,0.0,0.0,...,0.0,0.0,5.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,buckedstiflestagnant@timestudy_com,0.0,0.0,55.01,55.01,0.0,0.0,1.94,0.0,0.0,...,0.74,0.74,5.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,busybodyestimatesensitize@timestudy_com,0.0,0.0,53.95,53.95,0.0,0.0,13.16,0.0,0.0,...,0.0,0.0,6.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,civicexcludingbarcode@timestudy_com,0.0,0.0,57.42,57.42,0.0,0.0,7.13,0.0,0.0,...,0.0,0.0,13.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,cladlandscapeheave@timestudy_com,0.0,0.0,44.86,44.86,0.0,0.0,3.78,0.0,0.0,...,0.0,0.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,confrontcaresssullen@timestudy_com,0.0,0.0,51.79,51.79,0.0,0.0,25.43,100.0,100.0,...,0.0,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,deitymagnifierdrove@timestudy_com,0.0,0.0,52.86,52.86,0.0,0.0,14.1,0.0,0.0,...,0.0,0.0,11.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## one-hot encoding
Convert the categorical variables into one-hot dummy codes

In [15]:
def one_hot_encode_features(df, columns):
    """
    One-hot encode specified categorical columns in the DataFrame.
    Returns a new DataFrame with one-hot encoded columns as 0/1 integers and original columns dropped.
    """
    df_encoded = df.copy()
    df_encoded = pd.get_dummies(df_encoded, columns=columns, prefix=columns, drop_first=False)
    # Ensure all new one-hot columns are int (0/1)
    for col in df_encoded.columns:
        if any(col.startswith(f'{c}_') for c in columns):
            df_encoded[col] = df_encoded[col].astype(int)
    return df_encoded

# Example usage:
categorical_vars = ['time_of_day', 'location_category', 'wake_day_part']
raw_feature_df_encoded = one_hot_encode_features(raw_feature_df, categorical_vars)
raw_feature_df_encoded.head()

Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,...,time_of_day_Night,location_category_Home,location_category_Other,location_category_School,location_category_Transit,location_category_Work,wake_day_part_0.0,wake_day_part_1.0,wake_day_part_2.0,wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,2020-11-25 07:24:03,0,0,0.0,1.0,0,0.060436,1.0,11.1,...,0,0,0,0,1,0,1,0,0,0
1,ambushdollhousegenerous@timestudy_com,2020-11-25 09:54:06,1,0,0.0,0.0,1,0.059622,0.0,0.0,...,0,0,0,0,1,0,1,0,0,0
2,ambushdollhousegenerous@timestudy_com,2020-11-25 15:37:13,1,0,,,1,0.042405,0.0,0.0,...,0,0,0,0,1,0,0,0,1,0
3,ambushdollhousegenerous@timestudy_com,2020-11-25 19:42:03,0,0,0.0,1.0,1,0.008069,0.0,0.0,...,0,0,0,0,1,0,0,0,0,1
4,ambushdollhousegenerous@timestudy_com,2020-11-25 20:38:25,1,0,0.0,0.0,1,0.059189,0.0,0.0,...,1,0,0,0,1,0,0,0,0,1


In [16]:
raw_feature_df_encoded['participant_id'].nunique()

58

In [17]:
## List the column names in the DataFrame
def list_column_names(df: pd.DataFrame) -> List[str]:
    """
    List all column names in the DataFrame.
    """
    return df.columns.tolist()

## Use the function to get column names
column_names = list_column_names(raw_feature_df_encoded)
print("Column names in the DataFrame:")
print(column_names)

Column names in the DataFrame:
['participant_id', 'prompt_time_converted', 'outcome', 'is_weekend', 'in_battery_saver_mode', 'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'days_in_study', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start', 'time_of_day_Afternoon', 'time_of_day_Early Morning', 'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning', 'time_of_day_Night', 'location_category_Home', 'location_category_Other', 'location_category_School', 'location_category_Transit', 'location_category_Work', 'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0', 'wake_day_part_3.0']


## Normalize features [Skip this step]

In [27]:
from sklearn.preprocessing import MinMaxScaler

def min_max_scale_by_participant(df, columns, group_col='participant_id'):
    """
    Apply min-max scaling (0-1) to specified columns, grouped by participant_id.
    Returns a new DataFrame with scaled columns (original columns replaced).
    """
    df_scaled = df.copy()
    scaler = MinMaxScaler()
    # Apply scaling for each participant
    for pid, group in df.groupby(group_col):
        idx = group.index
        scaled_values = scaler.fit_transform(group[columns])
        df_scaled.loc[idx, columns] = scaled_values
    return df_scaled

# Example usage:
scale_columns = ['dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 
                 'mims_5min', 'time_between_prompts', 'time_since_last_answered']
raw_feature_df_scaled = min_max_scale_by_participant(raw_feature_df_encoded, scale_columns)
raw_feature_df_scaled[scale_columns + ['participant_id']].head()

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis

Unnamed: 0,dist_from_home,last_phone_usage,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,time_between_prompts,time_since_last_answered,participant_id
0,2e-06,0.0,0.187031,0.807168,0.15051,0.0,0.0,arrivejanitoruniformly@timestudy_com
1,2e-06,0.0,0.180555,0.814221,0.177391,0.000856,0.0,arrivejanitoruniformly@timestudy_com
2,2e-06,0.0,0.17605,0.819127,0.220735,0.000528,0.0,arrivejanitoruniformly@timestudy_com
3,2e-06,0.0,0.17206,0.823473,0.165295,0.001388,0.0,arrivejanitoruniformly@timestudy_com
4,2e-06,0.0,0.104721,0.896812,0.240907,0.001191,0.001191,arrivejanitoruniformly@timestudy_com


## FixedMax scaling for days in study

In [18]:
# Fixed-max scaling for days_in_study with fixed_max = 365

def fixed_max_scale_days_in_study_365(df: pd.DataFrame,
                                     group_col: str = 'participant_id',
                                     days_col: str = 'days_in_study',
                                     fixed_max: float = 365.0,
                                     inplace: bool = False) -> pd.DataFrame:
    """Scale the `days_col` to [0,1] using a fixed maximum value (default 365).

    This variant overwrites the original `days_col` with the scaled values
    (in-place if requested, otherwise on a copy).

    Parameters
    - df: pandas DataFrame containing the days column
    - group_col: kept for API compatibility (not used)
    - days_col: name of the days-in-study column to scale (will be overwritten)
    - fixed_max: maximum days value to use for scaling
    - inplace: if True, modify `df` in-place and return it; otherwise work on a copy

    Returns
    - pandas DataFrame with `days_col` replaced by scaled values in [0,1]
    """
    if df is None:
        raise ValueError('df must be a pandas DataFrame, got None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f'df must be a pandas DataFrame, got {type(df)}')
    if days_col not in df.columns:
        raise ValueError(f"days_col '{days_col}' not found in DataFrame")

    # Work on a copy unless inplace requested
    if not inplace:
        df = df.copy()

    # Coerce to numeric (invalid values -> NaN)
    coerced = pd.to_numeric(df[days_col], errors='coerce')

    # Overwrite the original column with clipped & scaled values; NaNs preserved
    df[days_col] = coerced.clip(lower=0, upper=float(fixed_max)) / float(fixed_max)
    df[days_col] = df[days_col].astype(float)

    return df

raw_feature_df_encoded = fixed_max_scale_days_in_study_365(raw_feature_df_encoded)

In [19]:
## Show the new days_in_study column along with participant_id for preview
raw_feature_df_encoded[['participant_id', 'days_in_study']]

Unnamed: 0,participant_id,days_in_study
0,ambushdollhousegenerous@timestudy_com,0.000000
1,ambushdollhousegenerous@timestudy_com,0.000000
2,ambushdollhousegenerous@timestudy_com,0.000000
3,ambushdollhousegenerous@timestudy_com,0.000000
4,ambushdollhousegenerous@timestudy_com,0.000000
...,...,...
235066,unwrappedsnaggedepiphany@timestudy_com,0.441096
235067,unwrappedsnaggedepiphany@timestudy_com,0.441096
235068,unwrappedsnaggedepiphany@timestudy_com,0.441096
235069,unwrappedsnaggedepiphany@timestudy_com,0.441096


In [20]:
raw_feature_df_encoded['participant_id'].nunique()

58

## Add missingness indicator

In [21]:
def add_missingness_indicators(df, skip_cols=None, inplace=False):
    """Add binary missingness indicator columns for dataframe columns."""
    if skip_cols is None:
        skip_cols = ['participant_id', 'outcome', 'prompt_time_converted']
    if df is None:
        raise ValueError('df must be a pandas DataFrame, got None')
    if not inplace:
        df = df.copy()
    # Only consider original (non-indicator) columns and skip the requested columns
    cols_to_process = [c for c in df.columns if c not in skip_cols and not str(c).startswith('mi_')]
    for c in cols_to_process:
        new_col = f'mi_{c}'
        # 1 if NaN, 0 otherwise (preserve existing non-NaN values)
        df[new_col] = df[c].isna().astype(int)
    return df

# Example usage: apply to the encoded DataFrame and show the new mi_ columns
if 'raw_feature_df_encoded' in globals():
    raw_feature_df_encoded = add_missingness_indicators(raw_feature_df_encoded)
    mi_cols = [c for c in raw_feature_df_encoded.columns if c.startswith('mi_')][:20]  # preview up to 20 indicators
    print(f'Added {len(mi_cols)} missingness indicator columns (previewing up to 20):')
    display(raw_feature_df_encoded[['participant_id'] + mi_cols].head())
else:
    print('raw_feature_df_encoded not found; run earlier cells to create it first.')

Added 20 missingness indicator columns (previewing up to 20):


Unnamed: 0,participant_id,mi_is_weekend,mi_in_battery_saver_mode,mi_charging_status,mi_screen_on,mi_dist_from_home,mi_is_phone_locked,mi_last_phone_usage,mi_closeness_to_sleep_time,mi_closeness_to_wake_time,...,mi_days_in_study,mi_completion_24h,mi_completion_1h,mi_time_between_prompts,mi_time_since_last_answered,mi_completion_since_wake,mi_completion_since_start,mi_time_of_day_Afternoon,mi_time_of_day_Early Morning,mi_time_of_day_Evening
0,ambushdollhousegenerous@timestudy_com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
raw_feature_df_encoded['participant_id'].nunique()

58

In [23]:
raw_feature_df_encoded[['participant_id', 'in_battery_saver_mode', 'mi_in_battery_saver_mode']].head()

Unnamed: 0,participant_id,in_battery_saver_mode,mi_in_battery_saver_mode
0,ambushdollhousegenerous@timestudy_com,0.0,0
1,ambushdollhousegenerous@timestudy_com,0.0,0
2,ambushdollhousegenerous@timestudy_com,,1
3,ambushdollhousegenerous@timestudy_com,0.0,0
4,ambushdollhousegenerous@timestudy_com,0.0,0


In [24]:
raw_feature_df_encoded[['participant_id', 'wake_day_part_0.0', 'mi_wake_day_part_0.0']].head()

Unnamed: 0,participant_id,wake_day_part_0.0,mi_wake_day_part_0.0
0,ambushdollhousegenerous@timestudy_com,1,0
1,ambushdollhousegenerous@timestudy_com,1,0
2,ambushdollhousegenerous@timestudy_com,0,0
3,ambushdollhousegenerous@timestudy_com,0,0
4,ambushdollhousegenerous@timestudy_com,0,0


In [25]:
raw_feature_df_encoded.columns

Index(['participant_id', 'prompt_time_converted', 'outcome', 'is_weekend',
       'in_battery_saver_mode', 'charging_status', 'screen_on',
       'dist_from_home', 'is_phone_locked', 'last_phone_usage',
       'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min',
       'days_in_study', 'completion_24h', 'completion_1h',
       'time_between_prompts', 'time_since_last_answered',
       'completion_since_wake', 'completion_since_start',
       'time_of_day_Afternoon', 'time_of_day_Early Morning',
       'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning',
       'time_of_day_Night', 'location_category_Home',
       'location_category_Other', 'location_category_School',
       'location_category_Transit', 'location_category_Work',
       'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0',
       'wake_day_part_3.0', 'mi_is_weekend', 'mi_in_battery_saver_mode',
       'mi_charging_status', 'mi_screen_on', 'mi_dist_from_home',
       'mi_is_phone_l

In [26]:
# Remove `prompt_time_converted` from `raw_feature_df_encoded` if present
if 'raw_feature_df_encoded' in globals():
    if 'prompt_time_converted' in raw_feature_df_encoded.columns:
        raw_feature_df_encoded = raw_feature_df_encoded.drop(columns=['prompt_time_converted'])
        print("Dropped column 'prompt_time_converted' from raw_feature_df_encoded.")
    else:
        print("Column 'prompt_time_converted' not present in raw_feature_df_encoded; nothing to do.")
else:
    print("raw_feature_df_encoded not found in the notebook namespace. Run the one-hot encoding cell first.")

Dropped column 'prompt_time_converted' from raw_feature_df_encoded.


In [27]:
raw_feature_df_encoded['participant_id'].nunique()

58

In [28]:
## Save the raw_features_df_encoded to a file
## for held out dataset, save to: /Users/adityaponnada/Downloads/time_study_data/processed_features_heldout.csv
## for traning data, save to: /Users/adityaponnada/Downloads/time_study_data/processed_features_rnn.csv
## for withdrew data, save to: /Users/adityaponnada/Downloads/time_study_data/processed_features_withdrew.csv
raw_feature_df_encoded.to_csv('/Users/adityaponnada/Downloads/time_study_data/processed_features_withdrew.csv', index=False)