## Import libraries
Import libraries to perform one hot coding and rescaling of features

In [1]:
## Import librariries
import sys, os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump, load
import pickle
from typing import List, Tuple
from datetime import datetime
from dateutil import parser

## Import raw features
Import the raw feature file and inspect it.

In [5]:
# Read the raw feature CSV file into a pandas DataFrame
raw_feature_df = pd.read_csv("/Users/adityaponnada/Downloads/time_study_data/raw_features_rnn.csv")
print(f"Raw feature DataFrame shape: {raw_feature_df.shape}")
raw_feature_df.head()

Raw feature DataFrame shape: (1088856, 23)


Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
0,afflictedrevenueepilepsy@timestudy_com,2021-06-23 06:38:01,0,0,Early Morning,0.0,0.0,Home,0,0.006074,...,981.983333,8.016667,5.545853,0,0.0,0.0,0.0,0.0,0.0,0.0
1,afflictedrevenueepilepsy@timestudy_com,2021-06-23 06:46:02,1,0,Early Morning,,,Home,0,0.005902,...,973.966667,16.033333,24.657388,0,0.0,0.0,8.016667,0.0,0.0,0.0
2,afflictedrevenueepilepsy@timestudy_com,2021-06-23 06:54:04,1,0,Early Morning,0.0,0.0,Home,0,0.005426,...,965.933333,24.066667,0.0,0,0.5,0.5,8.033333,8.033333,0.5,0.5
3,afflictedrevenueepilepsy@timestudy_com,2021-06-23 07:12:02,0,0,Early Morning,0.0,1.0,Home,0,0.005985,...,947.966667,42.033333,1.005139,0,0.666667,0.666667,17.966667,17.966667,0.666667,0.666667
4,afflictedrevenueepilepsy@timestudy_com,2021-06-23 07:23:02,0,0,Early Morning,0.0,1.0,Home,0,0.0064,...,936.966667,53.033333,0.04436,0,0.5,0.5,11.0,28.966667,0.5,0.5


In [8]:
# Print unique participant_id values from raw_feature_df (count + sample)
if 'raw_feature_df' in globals():
    vals = raw_feature_df['participant_id'].dropna().unique()
    print(f'Unique participant_id count: {len(vals)}')
    try:
        full_list = sorted(vals)
    except Exception:
        full_list = list(vals)
    for i, pid in enumerate(full_list, start=1):
        print(f'{i:3d}: {pid}')
else:
    print('raw_feature_df not found. Run the CSV load cell first.')


Unique participant_id count: 100
  1: afflictedrevenueepilepsy@timestudy_com
  2: anagramprobingscrooge@timestudy_com
  3: anthillfastinglucrative@timestudy_com
  4: arrivejanitoruniformly@timestudy_com
  5: attirecrabbinghumbling@timestudy_com
  6: backfirebankedprudishly@timestudy_com
  7: badlandwiltmuseum@timestudy_com
  8: bannisterhardwiredladle@timestudy_com
  9: bartenderradiatorapplied@timestudy_com
 10: brinkaminounframed@timestudy_com
 11: catsupexploitmocker@timestudy_com
 12: caucuscattlemockup@timestudy_com
 13: certifiedembargobartender@timestudy_com
 14: chewingslouchingfailing@timestudy_com
 15: congresscyclistdefender@timestudy_com
 16: copybrickcreative@timestudy_com
 17: coynessculminatebarista@timestudy_com
 18: craftworkattendeeensnare@timestudy_com
 19: debatableuneasyeveryone@timestudy_com
 20: defilinganywayimmovable@timestudy_com
 21: diagramuncoupleoutput@timestudy_com
 22: dissuadecelestialrelic@timestudy_com
 23: distresslitigatemassager@timestudy_com
 24: 

In [10]:
test_pid = 'arrivejanitoruniformly@timestudy_com'
df_user = raw_feature_df[raw_feature_df['participant_id'] == test_pid].copy()
df_user

Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
39810,arrivejanitoruniformly@timestudy_com,2021-02-15 12:33:02,0,0,Afternoon,,,Home,0,0.014321,...,746.966667,183.033333,,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
39811,arrivejanitoruniformly@timestudy_com,2021-02-15 12:46:00,0,0,Afternoon,,,Home,0,0.014677,...,734.000000,196.000000,,0,0.000000,0.000000,12.966667,0.000000,0.000000,0.000000
39812,arrivejanitoruniformly@timestudy_com,2021-02-15 12:54:00,0,0,Afternoon,,,Home,0,0.012370,...,726.000000,204.000000,,0,0.000000,0.000000,8.000000,0.000000,0.000000,0.000000
39813,arrivejanitoruniformly@timestudy_com,2021-02-15 13:15:01,1,0,Afternoon,,,Home,0,0.013669,...,704.983333,225.016667,37.567687,0,0.000000,0.000000,21.016667,0.000000,0.000000,0.000000
39814,arrivejanitoruniformly@timestudy_com,2021-02-15 13:33:03,1,0,Afternoon,0.0,0.0,Home,1,0.018931,...,686.950000,243.050000,35.144637,0,0.250000,0.333333,18.033333,18.033333,0.250000,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50650,arrivejanitoruniformly@timestudy_com,,1,1,,0.0,0.0,Transit,1,3926.818767,...,158.966667,801.033333,41.671243,139,0.931034,1.000000,9.050000,9.050000,0.931034,0.931034
50651,arrivejanitoruniformly@timestudy_com,,1,0,,,,Transit,1,3926.819815,...,122.966667,837.033333,3.311122,140,0.933333,1.000000,28.983333,28.983333,0.933333,0.933333
50652,arrivejanitoruniformly@timestudy_com,,1,0,,,,Transit,1,3926.817126,...,114.983333,845.016667,12.461228,140,0.903226,0.750000,10.000000,38.983333,0.903226,0.903226
50653,arrivejanitoruniformly@timestudy_com,,1,0,,0.0,0.0,Transit,1,3926.818216,...,105.983333,854.016667,75.900521,140,0.875000,0.333333,13.983333,52.966667,0.875000,0.875000


In [4]:
# Observations per participant (placed immediately after raw_feature_df is created)
from IPython.display import display

def observations_per_participant(df, id_col='participant_id'):
    """Return participant-level observation counts as a DataFrame with columns [id_col, 'n_obs']"""
    if df is None:
        print('Provided DataFrame is None')
        return pd.DataFrame(columns=[id_col, 'n_obs'])
    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame")
    counts = df.groupby(id_col).size().reset_index(name='n_obs')
    # Sort by participant id for deterministic ordering
    counts = counts.sort_values(by=id_col, ascending=True).reset_index(drop=True)
    return counts

# Compute and display counts using the freshly-loaded `raw_feature_df`
obs_counts = observations_per_participant(raw_feature_df)
print(f"Participants: {obs_counts.shape[0]} | Total observations: {len(raw_feature_df)}")
display(obs_counts.head(50))

Participants: 100 | Total observations: 1088856


Unnamed: 0,participant_id,n_obs
0,afflictedrevenueepilepsy@timestudy_com,13964
1,anagramprobingscrooge@timestudy_com,13567
2,anthillfastinglucrative@timestudy_com,12279
3,arrivejanitoruniformly@timestudy_com,10845
4,attirecrabbinghumbling@timestudy_com,14494
5,backfirebankedprudishly@timestudy_com,13535
6,badlandwiltmuseum@timestudy_com,12257
7,bannisterhardwiredladle@timestudy_com,12757
8,bartenderradiatorapplied@timestudy_com,4692
9,brinkaminounframed@timestudy_com,11267


In [21]:
# Min and max of days_in_study per participant
from IPython.display import display

def min_max_days_by_participant(df, id_col='participant_id', days_col='days_in_study'):
    """Return a DataFrame with columns [id_col, 'days_min', 'days_max'] for each participant."""
    if df is None:
        print('Provided DataFrame is None')
        return pd.DataFrame(columns=[id_col, 'days_min', 'days_max'])
    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame")
    if days_col not in df.columns:
        raise ValueError(f"days_col '{days_col}' not found in DataFrame")
    # Coerce to numeric, preserve NaNs
    days_numeric = pd.to_numeric(df[days_col], errors='coerce')
    tmp = df.copy()
    tmp[days_col] = days_numeric
    agg = tmp.groupby(id_col)[days_col].agg(['min', 'max']).reset_index().rename(columns={'min':'days_min', 'max':'days_max'})
    # Sort by participant id for deterministic output
    agg = agg.sort_values(by=id_col, ascending=True).reset_index(drop=True)
    return agg

# Compute and display the min/max table using the raw feature DataFrame
minmax_days = min_max_days_by_participant(raw_feature_df)
print(f"Participants: {minmax_days.shape[0]} | Total observations: {len(raw_feature_df)}")
display(minmax_days.head(100))

Participants: 100 | Total observations: 1087353


Unnamed: 0,participant_id,days_min,days_max
0,arrivejanitoruniformly@timestudy_com,0,350
1,atlanticchefhatchet@timestudy_com,0,352
2,badlandwiltmuseum@timestudy_com,0,289
3,bannisterhardwiredladle@timestudy_com,0,350
4,bartenderradiatorapplied@timestudy_com,0,277
...,...,...,...
95,vagabondnumerousflatterer@timestudy_com,0,326
96,whoeverrelightspookily@timestudy_com,0,199
97,wikipediaetchingcrystal@timestudy_com,0,348
98,wrigglecatalyststerility@timestudy_com,0,338


In [22]:
from IPython.display import display

def print_participant_head_tail(df, participant_id, id_col: str = 'participant_id', n: int = 5):
    """
    Simple utility: print the head and tail of `df` for the given participant_id.

    Parameters
    - df: pandas DataFrame containing participant rows
    - participant_id: value of the participant id to filter
    - id_col: name of the participant id column (default 'participant_id')
    - n: number of rows to show from head and tail (default 5)
    """
    if df is None:
        print('Provided DataFrame is None')
        return
    if id_col not in df.columns:
        print(f"id_col '{id_col}' not found in DataFrame columns")
        return
    sub = df[df[id_col] == participant_id]
    if sub.empty:
        print(f"No rows found for {id_col}={participant_id}")
        return
    print(f"--- HEAD ({n}) for participant {participant_id} ---")
    display(sub.head(n))
    print(f"--- TAIL ({n}) for participant {participant_id} ---")
    display(sub.tail(n))

# Example usage (uncomment and replace with a real id):
print_participant_head_tail(raw_feature_df, 'arrivejanitoruniformly@timestudy_com', n=5)


--- HEAD (5) for participant arrivejanitoruniformly@timestudy_com ---


Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
0,arrivejanitoruniformly@timestudy_com,2021-10-23 00:24:01,1,1,Late Night,,,Home,0,0.00999,...,200.983333,819.016667,38.42509,250,0.0,0.0,0.0,0.0,0.0,0.0
1,arrivejanitoruniformly@timestudy_com,2021-10-23 00:37:00,1,1,Late Night,0.0,0.0,Home,0,0.010297,...,188.0,832.0,45.288044,250,0.0,0.0,12.966667,0.0,0.0,0.0
2,arrivejanitoruniformly@timestudy_com,2021-10-23 00:46:02,1,1,Late Night,,,Home,0,0.009733,...,178.966667,841.033333,56.353667,250,0.0,0.0,8.0,0.0,0.0,0.0
3,arrivejanitoruniformly@timestudy_com,2021-10-23 00:54:02,1,1,Late Night,,,Home,0,0.009829,...,170.966667,849.033333,42.199854,250,0.0,0.0,21.016667,0.0,0.0,0.0
4,arrivejanitoruniformly@timestudy_com,2021-10-23 03:09:03,1,1,Late Night,,,Home,0,0.008112,...,35.95,984.05,61.503609,250,0.25,0.333333,18.033333,18.033333,0.25,0.25


--- TAIL (5) for participant arrivejanitoruniformly@timestudy_com ---


Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
10840,arrivejanitoruniformly@timestudy_com,2021-10-12 22:53:00,0,0,Night,0.0,1.0,Home,0,0.010586,...,127.0,698.0,0.0,239,0.829457,0.829457,0.0,0.0,0.0,0.0
10841,arrivejanitoruniformly@timestudy_com,2021-10-12 23:25:01,0,0,Night,0.0,1.0,Home,0,0.010269,...,94.983333,730.016667,0.556412,239,0.830116,0.830116,0.0,0.0,0.0,0.0
10842,arrivejanitoruniformly@timestudy_com,2021-10-12 23:33:01,0,0,Night,0.0,1.0,Home,0,0.010807,...,86.983333,738.016667,0.631078,239,0.830769,0.830769,0.0,0.0,0.0,0.0
10843,arrivejanitoruniformly@timestudy_com,2021-10-12 23:43:01,0,0,Night,,,Home,0,0.010293,...,76.983333,748.016667,0.0,239,0.831418,0.831418,0.0,0.0,0.0,0.0
10844,arrivejanitoruniformly@timestudy_com,2021-10-12 23:53:01,0,0,Night,,,Home,0,0.011239,...,66.983333,758.016667,0.0,239,0.832061,0.832061,0.0,0.0,0.0,0.0


In [33]:
def missing_value_table(df):
    skip_cols = ['participant_id', 'prompt_time_converted', 'outcome']
    cols = [col for col in df.columns if col.lower() not in skip_cols]
    missing_percent = df[cols].isnull().mean() * 100
    # Also count empty strings as missing
    empty_percent = (df[cols] == '').mean() * 100
    total_missing_percent = missing_percent + empty_percent
    result = pd.DataFrame({
        'missing_%': total_missing_percent.round(2)
    }).sort_values('missing_%', ascending=False)
    return result
    
# Display missing value table
missing_value_table(raw_feature_df)

Unnamed: 0,missing_%
in_battery_saver_mode,52.06
charging_status,52.06
last_phone_usage,17.77
is_phone_locked,17.56
dist_from_home,17.05
mims_5min,5.33
time_of_day,0.27
closeness_to_sleep_time,0.19
wake_day_part,0.19
closeness_to_wake_time,0.19


In [24]:
def missing_data_by_participant(df):
    skip_cols = ['prompt_time_converted', 'outcome']
    cols = [col for col in df.columns if col.lower() not in skip_cols and col.lower() != 'participant_id']
    # Create a boolean DataFrame for missing values (NaN or empty string)
    missing_bool = df[cols].isnull() | (df[cols] == '')
    # Group by participant_id and calculate % missing for each column
    missing_percent = missing_bool.groupby(df['participant_id']).mean() * 100
    missing_percent = missing_percent.round(2)
    # Reset index to have participant_id as a column
    missing_percent = missing_percent.reset_index()
    return missing_percent

# Print % missing data for each participant and variable
missing_data_by_participant(raw_feature_df)

Unnamed: 0,participant_id,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,is_phone_locked,last_phone_usage,...,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_1h,time_between_prompts,time_since_last_answered,completion_since_wake,completion_since_start
0,arrivejanitoruniformly@timestudy_com,0.0,2.43,52.67,52.67,0.0,0.0,5.28,0.00,0.00,...,0.00,0.00,1.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,atlanticchefhatchet@timestudy_com,0.0,0.00,50.65,50.65,0.0,0.0,6.51,0.00,0.00,...,0.00,0.00,4.80,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,badlandwiltmuseum@timestudy_com,0.0,0.00,51.86,51.86,0.0,0.0,4.36,0.60,0.67,...,0.00,0.00,4.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bannisterhardwiredladle@timestudy_com,0.0,0.00,52.62,52.62,0.0,0.0,16.95,0.00,0.00,...,0.00,0.00,6.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,bartenderradiatorapplied@timestudy_com,0.0,0.00,69.27,69.27,0.0,0.0,51.92,100.00,100.00,...,1.32,1.32,26.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,vagabondnumerousflatterer@timestudy_com,0.0,0.00,51.35,51.35,0.0,0.0,55.70,22.91,25.51,...,0.54,0.54,7.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,whoeverrelightspookily@timestudy_com,0.0,0.00,62.86,62.86,0.0,0.0,16.42,4.76,4.76,...,0.00,0.00,34.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,wikipediaetchingcrystal@timestudy_com,0.0,0.00,51.45,51.45,0.0,0.0,10.92,0.00,0.00,...,0.00,0.00,1.71,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,wrigglecatalyststerility@timestudy_com,0.0,0.00,52.13,52.13,0.0,0.0,6.98,0.38,0.38,...,0.00,0.00,2.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## one-hot encoding
Convert the categorical variables into one-hot dummy codes

In [25]:
def one_hot_encode_features(df, columns):
    """
    One-hot encode specified categorical columns in the DataFrame.
    Returns a new DataFrame with one-hot encoded columns as 0/1 integers and original columns dropped.
    """
    df_encoded = df.copy()
    df_encoded = pd.get_dummies(df_encoded, columns=columns, prefix=columns, drop_first=False)
    # Ensure all new one-hot columns are int (0/1)
    for col in df_encoded.columns:
        if any(col.startswith(f'{c}_') for c in columns):
            df_encoded[col] = df_encoded[col].astype(int)
    return df_encoded

# Example usage:
categorical_vars = ['time_of_day', 'location_category', 'wake_day_part']
raw_feature_df_encoded = one_hot_encode_features(raw_feature_df, categorical_vars)
raw_feature_df_encoded.head()

Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,...,wake_day_part_24.0,wake_day_part_25.0,wake_day_part_26.0,wake_day_part_27.0,wake_day_part_28.0,wake_day_part_29.0,wake_day_part_30.0,wake_day_part_31.0,wake_day_part_32.0,wake_day_part_33.0
0,arrivejanitoruniformly@timestudy_com,2021-10-23 00:24:01,1,1,,,0,0.00999,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,arrivejanitoruniformly@timestudy_com,2021-10-23 00:37:00,1,1,0.0,0.0,0,0.010297,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,arrivejanitoruniformly@timestudy_com,2021-10-23 00:46:02,1,1,,,0,0.009733,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,arrivejanitoruniformly@timestudy_com,2021-10-23 00:54:02,1,1,,,0,0.009829,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,arrivejanitoruniformly@timestudy_com,2021-10-23 03:09:03,1,1,,,0,0.008112,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
## List the column names in the DataFrame
def list_column_names(df: pd.DataFrame) -> List[str]:
    """
    List all column names in the DataFrame.
    """
    return df.columns.tolist()

## Use the function to get column names
column_names = list_column_names(raw_feature_df_encoded)
print("Column names in the DataFrame:")
print(column_names)

Column names in the DataFrame:
['participant_id', 'prompt_time_converted', 'outcome', 'is_weekend', 'in_battery_saver_mode', 'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'days_in_study', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start', 'time_of_day_Afternoon', 'time_of_day_Early Morning', 'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning', 'time_of_day_Night', 'location_category_Home', 'location_category_Other', 'location_category_School', 'location_category_Transit', 'location_category_Work', 'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0', 'wake_day_part_3.0', 'wake_day_part_4.0', 'wake_day_part_5.0', 'wake_day_part_6.0', 'wake_day_part_7.0', 'wake_day_part_8.0', 'wake_day_part_9.0', 'wake_day_part_10.0', 'wake_day_part_11.0', 'wake_day_part_12.0', 'wake_

## Normalize features [Skip this step]

In [27]:
from sklearn.preprocessing import MinMaxScaler

def min_max_scale_by_participant(df, columns, group_col='participant_id'):
    """
    Apply min-max scaling (0-1) to specified columns, grouped by participant_id.
    Returns a new DataFrame with scaled columns (original columns replaced).
    """
    df_scaled = df.copy()
    scaler = MinMaxScaler()
    # Apply scaling for each participant
    for pid, group in df.groupby(group_col):
        idx = group.index
        scaled_values = scaler.fit_transform(group[columns])
        df_scaled.loc[idx, columns] = scaled_values
    return df_scaled

# Example usage:
scale_columns = ['dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 
                 'mims_5min', 'time_between_prompts', 'time_since_last_answered']
raw_feature_df_scaled = min_max_scale_by_participant(raw_feature_df_encoded, scale_columns)
raw_feature_df_scaled[scale_columns + ['participant_id']].head()

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis

Unnamed: 0,dist_from_home,last_phone_usage,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,time_between_prompts,time_since_last_answered,participant_id
0,2e-06,0.0,0.187031,0.807168,0.15051,0.0,0.0,arrivejanitoruniformly@timestudy_com
1,2e-06,0.0,0.180555,0.814221,0.177391,0.000856,0.0,arrivejanitoruniformly@timestudy_com
2,2e-06,0.0,0.17605,0.819127,0.220735,0.000528,0.0,arrivejanitoruniformly@timestudy_com
3,2e-06,0.0,0.17206,0.823473,0.165295,0.001388,0.0,arrivejanitoruniformly@timestudy_com
4,2e-06,0.0,0.104721,0.896812,0.240907,0.001191,0.001191,arrivejanitoruniformly@timestudy_com


## FixedMax scaling for days in study

In [30]:
# Display head and tail for participant_id, prompt_time_converted, and days_in_study from the encoded DataFrame

from IPython.display import display
cols = ['participant_id', 'prompt_time_converted', 'days_in_study']
if 'raw_feature_df_encoded' in globals():
    df = raw_feature_df_encoded
    missing = [c for c in cols if c not in df.columns]
    if missing:
        print(f'Missing columns in raw_feature_df_encoded: {missing}')
    else:
        print('--- HEAD (participant_id, days_in_study) ---')
        display(df[cols].head())
        print('--- TAIL (participant_id, days_in_study) ---')
        display(df[cols].tail())
else:
    print("raw_feature_df_encoded not found in the notebook namespace. Run the one-hot encoding cell first.")

--- HEAD (participant_id, days_in_study) ---


Unnamed: 0,participant_id,prompt_time_converted,days_in_study
0,arrivejanitoruniformly@timestudy_com,2021-10-23 00:24:01,250
1,arrivejanitoruniformly@timestudy_com,2021-10-23 00:37:00,250
2,arrivejanitoruniformly@timestudy_com,2021-10-23 00:46:02,250
3,arrivejanitoruniformly@timestudy_com,2021-10-23 00:54:02,250
4,arrivejanitoruniformly@timestudy_com,2021-10-23 03:09:03,250


--- TAIL (participant_id, days_in_study) ---


Unnamed: 0,participant_id,prompt_time_converted,days_in_study
1087348,yearlingfiberspotty@timestudy_com,2021-10-12 21:44:03,95
1087349,yearlingfiberspotty@timestudy_com,2021-10-12 21:54:01,95
1087350,yearlingfiberspotty@timestudy_com,2021-10-12 22:30:01,95
1087351,yearlingfiberspotty@timestudy_com,2021-10-12 22:38:04,95
1087352,yearlingfiberspotty@timestudy_com,2021-10-12 22:46:01,95
