## Import libraries

In [28]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import gc
import json
import time

In [29]:
## For CPU only use

USE_CPU_ONLY = True 

if USE_CPU_ONLY:
    print("\n[STABILITY MODE] Disabling GPU to prevent Metal compilation hangs...")
    tf.config.set_visible_devices([], 'GPU')
else:
    # Re-adding the memory growth code for when GPU is enabled
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("[GPU MODE] Memory growth enabled.")
        except RuntimeError as e:
            print(f"Memory growth setting failed: {e}")


[STABILITY MODE] Disabling GPU to prevent Metal compilation hangs...


## Import data files

In [3]:
## Read general meadian file
global_median = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/general_rnn_medians.csv')

In [7]:
global_median.head()
print(global_median.columns)
print(global_median.shape)

Index(['Unnamed: 0', 'is_weekend', 'in_battery_saver_mode', 'charging_status',
       'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage',
       'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min',
       'days_in_study', 'completion_24h', 'completion_1h',
       'time_between_prompts', 'time_since_last_answered',
       'completion_since_wake', 'completion_since_start',
       'time_of_day_Afternoon', 'time_of_day_Early Morning',
       'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning',
       'time_of_day_Night', 'location_category_Home',
       'location_category_Other', 'location_category_School',
       'location_category_Transit', 'location_category_Work',
       'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0',
       'wake_day_part_3.0'],
      dtype='object')
(1, 33)


In [10]:
## Read the global means file
global_means = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/global_means_general_rnn.csv')
global_means.head()
print(global_means.columns)
print(global_means.shape)

Index(['Unnamed: 0', 'dist_from_home', 'last_phone_usage',
       'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min',
       'completion_24h', 'completion_1h', 'time_between_prompts',
       'time_since_last_answered', 'completion_since_wake',
       'completion_since_start'],
      dtype='object')
(1, 12)


In [11]:
## Read the withdrew processed features file
withdrew_features = pd.read_csv('/Users/adityaponnada/Downloads/time_study_data/processed_features_withdrew.csv')
print(withdrew_features.shape)
withdrew_features.head()

(235071, 66)


Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,0.060436,1.0,11.1,980.95,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059622,0.0,0.0,830.9,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,1,0,,,1,0.042405,0.0,0.0,487.783333,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,0.008069,0.0,0.0,242.95,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059189,0.0,0.0,186.583333,...,0,0,0,0,0,0,0,0,0,0


In [12]:
withdrew_features.columns

Index(['participant_id', 'outcome', 'is_weekend', 'in_battery_saver_mode',
       'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked',
       'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time',
       'mims_5min', 'days_in_study', 'completion_24h', 'completion_1h',
       'time_between_prompts', 'time_since_last_answered',
       'completion_since_wake', 'completion_since_start',
       'time_of_day_Afternoon', 'time_of_day_Early Morning',
       'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning',
       'time_of_day_Night', 'location_category_Home',
       'location_category_Other', 'location_category_School',
       'location_category_Transit', 'location_category_Work',
       'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0',
       'wake_day_part_3.0', 'mi_is_weekend', 'mi_in_battery_saver_mode',
       'mi_charging_status', 'mi_screen_on', 'mi_dist_from_home',
       'mi_is_phone_locked', 'mi_last_phone_usage',
 

## Impute missing data
Using fillforward + global median

In [13]:
def impute_within_participant(withdrew_df, global_median, id_col='participant_id'):
    """Impute missing values by forward-filling within each participant.

    - Skips columns starting with "mi_".
    - Excludes `id_col` and outcome columns (`outcome`, `outcomes`).
    - If the first observation for a participant is NaN, fills it using `global_median`.

    Args:
        withdrew_df (pd.DataFrame): input features dataframe (observations ordered per participant).
        global_median (pd.Series or pd.DataFrame): mapping of column -> median value.
        id_col (str): participant id column name (default: 'participant_id').

    Returns:
        pd.DataFrame: imputed copy of `withdrew_df`.
    """
    import pandas as pd
    import numpy as np

    if id_col not in withdrew_df.columns:
        raise ValueError(f"id_col '{id_col}' not found in withdrew_df columns")

    df = withdrew_df.copy()

    # Exclude participant id and outcome columns (case-insensitive)
    exclude = {id_col.lower(), 'outcome', 'outcomes'}

    # Build list of columns to impute
    cols_to_impute = [c for c in df.columns if (c.lower() not in exclude) and (not c.lower().startswith('mi_'))]

    # Forward-fill within each participant
    try:
        df[cols_to_impute] = df.groupby(id_col, sort=False)[cols_to_impute].ffill()
    except Exception:
        # Fallback if grouping/ffill fails for mixed dtypes
        df[cols_to_impute] = df.groupby(id_col, sort=False)[cols_to_impute].apply(lambda g: g.ffill())

    # Helper to extract median from global_median (Series or DataFrame)
    def _get_global_median(col):
        try:
            if isinstance(global_median, pd.Series):
                return global_median.get(col, np.nan)
            if isinstance(global_median, pd.DataFrame):
                # If column exists as DataFrame column, take its first non-null
                if col in global_median.columns:
                    vals = global_median[col].dropna().values
                    if len(vals) > 0:
                        return vals[0]
                # If index contains column names and there's a 'median' column
                if 'median' in global_median.columns and col in global_median.index:
                    return global_median.loc[col, 'median']
                # If index contains column names, return first non-null in that row
                if col in global_median.index:
                    row = global_median.loc[col]
                    if hasattr(row, 'dropna'):
                        vals = row.dropna().values
                        if len(vals) > 0:
                            return vals[0]
        except Exception:
            pass
        return np.nan

    # Fill remaining NaNs (leading NaNs) with global median (or df median as fallback)
    for col in cols_to_impute:
        if df[col].isna().any():
            med = _get_global_median(col)
            if pd.isna(med):
                try:
                    med = df[col].median(skipna=True)
                except Exception:
                    med = np.nan
            if pd.notna(med):
                df[col] = df[col].fillna(med)

    return df


In [14]:
withdrew_features = impute_within_participant(withdrew_features, global_median, id_col='participant_id')
withdrew_features.columns
withdrew_features.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,0.060436,1.0,11.1,980.95,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059622,0.0,0.0,830.9,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.042405,0.0,0.0,487.783333,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,0.008069,0.0,0.0,242.95,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,0.059189,0.0,0.0,186.583333,...,0,0,0,0,0,0,0,0,0,0


## Z-normalization
Using the global mean from pre-training

In [15]:
def z_normalize_within_participant(withdrew_df, global_means, id_col='participant_id', cols=None):
    """Z-normalize selected features using provided global_means.

    - Only normalizes columns in `cols` (defaults to the list requested).
    - Skips `participant_id` and outcome columns.
    - Uses `global_means` to obtain per-column means. For std, tries to read from
      `global_means` (column 'std' or a 'std' entry) and falls back to the column std
      computed from `withdrew_df` if unavailable.
    - The normalization is applied grouped by `id_col` (i.e., transform per participant),
      but uses global mean/std values for scaling.

    Args:
        withdrew_df (pd.DataFrame): input dataframe containing features.
        global_means (pd.Series or pd.DataFrame): mapping of column -> mean (and optionally std).
        id_col (str): participant id column name.
        cols (list[str] | None): list of columns to normalize. If None, uses the default set.

    Returns:
        pd.DataFrame: a copy of `withdrew_df` with selected columns z-normalized.
    """
    import pandas as pd
    import numpy as np

    default_cols = [
        'dist_from_home', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time',
        'mims_5min', 'completion_24h', 'completion_1h', 'time_between_prompts',
        'time_since_last_answered', 'completion_since_wake', 'completion_since_start'
    ]
    if cols is None:
        cols = default_cols

    if id_col not in withdrew_df.columns:
        raise ValueError(f"id_col '{id_col}' not found in withdrew_df columns")

    df = withdrew_df.copy()

    # Exclude id and outcome columns explicitly
    exclude = {id_col.lower(), 'outcome', 'outcomes'}
    cols_to_scale = [c for c in cols if (c in df.columns) and (c.lower() not in exclude)]

    def _get_mean(col):
        try:
            if isinstance(global_means, pd.Series):
                return global_means.get(col, np.nan)
            if isinstance(global_means, pd.DataFrame):
                # If there's a 'mean' column and index contains column names
                if 'mean' in global_means.columns and col in global_means.index:
                    return global_means.loc[col, 'mean']
                # If the DataFrame has the feature as a column, take first non-null
                if col in global_means.columns:
                    vals = global_means[col].dropna().values
                    if len(vals) > 0:
                        return vals[0]
                # If index contains col and any column present, take first non-null in the row
                if col in global_means.index:
                    row = global_means.loc[col]
                    if hasattr(row, 'dropna'):
                        vals = row.dropna().values
                        if len(vals) > 0:
                            return vals[0]
        except Exception:
            pass
        return np.nan

    def _get_std(col):
        try:
            if isinstance(global_means, pd.DataFrame):
                if 'std' in global_means.columns and col in global_means.index:
                    return global_means.loc[col, 'std']
                # If std is provided as a column named like '<col>_std'
                std_col = f"{col}_std"
                if std_col in global_means.columns:
                    vals = global_means[std_col].dropna().values
                    if len(vals) > 0:
                        return vals[0]
            # if global_means is a Series containing tuples or dicts, skip
        except Exception:
            pass
        # Fallback: compute std from withdrew_df
        try:
            s = df[col].std(skipna=True)
            if pd.notna(s) and s > 0:
                return s
        except Exception:
            pass
        return np.nan

    for col in cols_to_scale:
        mean = _get_mean(col)
        std = _get_std(col)
        if pd.isna(mean):
            # fallback to column mean from df
            try:
                mean = df[col].mean(skipna=True)
            except Exception:
                mean = 0.0
        if pd.isna(std) or std == 0:
            std = 1.0
        # apply transform grouped by participant (uses same mean/std for all groups)
        df[col] = df.groupby(id_col, sort=False)[col].transform(lambda x, m=mean, s=std: (x - m) / s)

    return df


In [16]:
withdrew_features = z_normalize_within_participant(withdrew_features, global_means, id_col='participant_id')
withdrew_features.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_time_of_day_Night,mi_location_category_Home,mi_location_category_Other,mi_location_category_School,mi_location_category_Transit,mi_location_category_Work,mi_wake_day_part_0.0,mi_wake_day_part_1.0,mi_wake_day_part_2.0,mi_wake_day_part_3.0
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,-0.160718,1.0,-0.413376,1.961272,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.160723,0.0,-0.91642,1.406289,...,0,0,0,0,0,0,0,0,0,0
2,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.160827,0.0,-0.91642,0.137218,...,0,0,0,0,0,0,0,0,0,0
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,-0.161034,0.0,-0.91642,-0.768337,...,0,0,0,0,0,0,0,0,0,0
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.160726,0.0,-0.91642,-0.976818,...,0,0,0,0,0,0,0,0,0,0


## Keep only relevant columns
We only keep columns used for training

In [18]:
## read column list .txt file
with open('/Users/adityaponnada/Downloads/time_study_data/processed_feature_columns.txt', 'r') as f:
    column_list = [line.strip() for line in f if line.strip()]


In [20]:
print(column_list)
print(len(column_list))

['participant_id', 'outcome', 'is_weekend', 'in_battery_saver_mode', 'charging_status', 'screen_on', 'dist_from_home', 'is_phone_locked', 'last_phone_usage', 'closeness_to_sleep_time', 'closeness_to_wake_time', 'mims_5min', 'days_in_study', 'completion_24h', 'completion_1h', 'time_between_prompts', 'time_since_last_answered', 'completion_since_wake', 'completion_since_start', 'time_of_day_Afternoon', 'time_of_day_Early Morning', 'time_of_day_Evening', 'time_of_day_Late Night', 'time_of_day_Morning', 'time_of_day_Night', 'location_category_Home', 'location_category_Other', 'location_category_School', 'location_category_Transit', 'location_category_Work', 'wake_day_part_0.0', 'wake_day_part_1.0', 'wake_day_part_2.0', 'wake_day_part_3.0', 'mi_in_battery_saver_mode', 'mi_charging_status', 'mi_dist_from_home', 'mi_is_phone_locked', 'mi_last_phone_usage', 'mi_closeness_to_sleep_time', 'mi_closeness_to_wake_time', 'mi_mims_5min']
42


In [23]:
## keep only those columns in withdrew_features
withdrew_features = withdrew_features[column_list]
withdrew_features.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,wake_day_part_2.0,wake_day_part_3.0,mi_in_battery_saver_mode,mi_charging_status,mi_dist_from_home,mi_is_phone_locked,mi_last_phone_usage,mi_closeness_to_sleep_time,mi_closeness_to_wake_time,mi_mims_5min
0,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,0,-0.160718,1.0,-0.413376,1.961272,...,0,0,0,0,0,0,0,0,0,0
1,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.160723,0.0,-0.91642,1.406289,...,0,0,0,0,0,0,0,0,0,1
2,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.160827,0.0,-0.91642,0.137218,...,1,0,1,1,0,0,0,0,0,1
3,ambushdollhousegenerous@timestudy_com,0,0,0.0,1.0,1,-0.161034,0.0,-0.91642,-0.768337,...,0,1,0,0,0,0,0,0,0,1
4,ambushdollhousegenerous@timestudy_com,1,0,0.0,0.0,1,-0.160726,0.0,-0.91642,-0.976818,...,0,1,0,0,0,0,0,0,0,1


In [24]:
withdrew_features.shape

(235071, 42)

In [25]:
withdrew_features['participant_id'].nunique()

58

## Prepare tensors

In [26]:
## Get the number of observations per participant_id ordered by highest to lowest
withdrew_features['participant_id'].value_counts().sort_values(ascending=False)

participant_id
orbsquackysyllabuses@timestudy_com           12458
palmbuggystole@timestudy_com                 10857
lizardcauterizepreplan@timestudy_com          9705
ropetinworkdemote@timestudy_com               8666
skipperdropdowncrawlers@timestudy_com         8317
hacksawscoldingdares@timestudy_com            8228
shadilymanholegreeter@timestudy_com           7933
confrontcaresssullen@timestudy_com            7809
skydiverworriercarton@timestudy_com           7728
smeltingexerciserstabilize@timestudy_com      7577
deitymagnifierdrove@timestudy_com             7477
buckedstiflestagnant@timestudy_com            7286
riftchaosdipper@timestudy_com                 7067
dimmeddismaylegume@timestudy_com              6180
hazingdiscolorsuffering@timestudy_com         5815
rankingkindnessspindle@timestudy_com          5722
lappedvastlydebating@timestudy_com            5487
iodinegrapemonstrous@timestudy_com            5281
itunesgurgleexchange@timestudy_com            5177
euphemismfederal

In [30]:
# --- 1. GLOBAL CONSTANTS ---
L_CHUNK = 3967
NUM_CHUNKS = 4
MAX_TOTAL_STEPS = L_CHUNK * NUM_CHUNKS  # 15,868
SENTINEL_VALUE = 999.0
NUM_FEATURES = 40

In [31]:
def convert_to_4d_tensors(df):
    """
    Converts a pre-scaled/imputed DataFrame into 4D tensors.
    Logic is identical for both Setup 1 and Setup 2.
    """
    print(f"Processing {len(df['participant_id'].unique())} participants...")
    
    X_list = []
    Y_list = []
    participant_ids = []

    # Ensure the dataframe is sorted by participant and time (if applicable)
    # Group by participant to keep sequences intact
    grouped = df.groupby('participant_id')

    for p_id, group in grouped:
        participant_ids.append(p_id)
        
        # 1. Extract Features (X)
        # Drop metadata columns to leave only the 40 features
        x_features = group.drop(columns=['participant_id', 'outcome']).values
        
        # 2. Extract Labels (Y)
        y_labels = group['outcome'].values.astype('float32').reshape(-1, 1)
        
        X_list.append(x_features)
        Y_list.append(y_labels)

    # 3. Padding
    # pad_sequences handles users who withdrew early (shorter than 15,868 steps)
    # 'post' padding ensures the actual behavior is at the start, and sentinel values are at the end
    print("Padding sequences to 15,868 steps...")
    X_padded = pad_sequences(
        X_list, 
        maxlen=MAX_TOTAL_STEPS, 
        padding='post', 
        dtype='float32', 
        value=SENTINEL_VALUE
    )
    
    Y_padded = pad_sequences(
        Y_list, 
        maxlen=MAX_TOTAL_STEPS, 
        padding='post', 
        dtype='float32', 
        value=SENTINEL_VALUE
    )

    # 4. Reshaping to 4D (N, Chunks, L_Chunk, Features)
    num_participants = len(participant_ids)
    
    print(f"Reshaping into 4D tensors...")
    X_4d = X_padded.reshape(num_participants, NUM_CHUNKS, L_CHUNK, NUM_FEATURES)
    Y_4d = Y_padded.reshape(num_participants, NUM_CHUNKS, L_CHUNK, 1)

    # 5. Convert to TF Tensors
    X_tensor = tf.cast(X_4d, tf.float32)
    Y_tensor = tf.cast(Y_4d, tf.float32)

    print(f"Final X Shape: {X_tensor.shape}") # (N, 4, 3967, 40)
    print(f"Final Y Shape: {Y_tensor.shape}") # (N, 4, 3967, 1)
    
    return X_tensor, Y_tensor, participant_ids

In [32]:
X_withdrawn_s1, Y_withdrawn_s1, p_ids = convert_to_4d_tensors(withdrew_features)

Processing 58 participants...
Padding sequences to 15,868 steps...
Reshaping into 4D tensors...
Final X Shape: (58, 4, 3967, 40)
Final Y Shape: (58, 4, 3967, 1)


In [None]:
def calculate_burden_thresholds(X_tensor, Y_tensor, participant_ids, days_idx=0, sentinel=999.0):
    """
    Calculates the 'Burden Tolerance Threshold' and 'Burden Velocity' 
    for each withdrawn participant.
    
    The threshold is defined as the total cumulative count of 'Busy' (outcome=0) 
    moments recorded in the ground truth from Day 1 until withdrawal.
    
    The velocity is calculated as (Total Busy Pings / Actual Days in Study), 
    where days are derived from the 'days_in_study' feature.
    
    Args:
        X_tensor: 4D Tensor (N, 4, 3967, D) containing the scaled features.
        Y_tensor: 4D Tensor (N, 4, 3967, 1) containing labels and padding.
        participant_ids: List of participant IDs.
        days_idx: The column index of 'days_in_study' in the feature tensor.
        sentinel: The padding value to be ignored.
    """
    threshold_results = []
    
    # Ensure we are working with numpy arrays for efficient processing
    X_np = X_tensor.numpy() if hasattr(X_tensor, 'numpy') else X_tensor
    Y_np = Y_tensor.numpy() if hasattr(Y_tensor, 'numpy') else Y_tensor
    
    print(f"Calculating thresholds and temporal velocities for {len(participant_ids)} withdrawn users...")

    for i, p_id in enumerate(participant_ids):
        # 1. Handle Outcomes and Padding Mask
        y_user_flat = Y_np[i].flatten()
        valid_mask = y_user_flat != sentinel
        y_actual = y_user_flat[valid_mask]
        
        # Count total 'Busy' moments (outcome == 0)
        # This represents the user's specific 'Breaking Point'
        busy_moments = np.sum(y_actual == 0.0)
        
        # 2. Handle Study Duration (from Features)
        # We reshape X to flatten chunks and steps, then apply the same mask
        x_user_flat = X_np[i].reshape(-1, X_np.shape[-1])
        x_actual_features = x_user_flat[valid_mask]
        
        # Extract the 'days_in_study' column (which is Fixed-Max scaled by 365)
        scaled_days = x_actual_features[:, days_idx]
        
        # Denormalize to get actual days: Max scaled value * 365
        # Even if the user withdrew, the max value in this sequence is their last day.
        max_scaled_day = np.max(scaled_days)
        actual_days_raw = max_scaled_day * 365.0
        
        # Enforce a minimum of 1 day to ensure realistic velocities and avoid 0 values
        actual_days_in_study = np.maximum(1.0, actual_days_raw)
        
        # 3. Calculate Velocity (Intrusive Pings per Actual Day)
        # Epsilon is now secondary to the 1-day floor logic
        intrusive_velocity = busy_moments / actual_days_in_study
        
        threshold_results.append({
            'participant_id': p_id,
            'burden_threshold': int(busy_moments),
            'actual_days_in_study': round(actual_days_in_study, 2),
            'total_observations': len(y_actual),
            'intrusive_velocity_per_day': intrusive_velocity
        })

    # Create a summary DataFrame
    df_thresholds = pd.DataFrame(threshold_results)
    
    print("\n" + "="*50)
    print("      BURDEN & ATTRITION VELOCITY SUMMARY")
    print("="*50)
    print(f"Mean Threshold (Busy Pings):   {df_thresholds['burden_threshold'].mean():.1f}")
    print(f"Mean Duration (Actual Days):   {df_thresholds['actual_days_in_study'].mean():.1f}")
    print(f"Mean Velocity (Pings/Day):     {df_thresholds['intrusive_velocity_per_day'].mean():.4f}")
    print("="*50)
    
    return df_thresholds

In [39]:
df_breaking_points = calculate_burden_thresholds(X_withdrawn_s1, Y_withdrawn_s1, p_ids)
df_breaking_points.to_csv('/Users/adityaponnada/Downloads/time_study_data/withdrawn_user_thresholds.csv', index=False)

Calculating thresholds and temporal velocities for 58 withdrawn users...

      BURDEN & ATTRITION VELOCITY SUMMARY
Mean Threshold (Busy Pings):   882.5
Mean Duration (Actual Days):   346.1
Mean Velocity (Pings/Day):     15000002.4137


In [40]:
df_breaking_points

Unnamed: 0,participant_id,burden_threshold,actual_days_in_study,total_observations,intrusive_velocity_per_day
0,ambushdollhousegenerous@timestudy_com,958,365.0,3855,2.624658
1,anywaymustinesspushiness@timestudy_com,33,365.0,477,0.09041096
2,bottledeskworkrequire@timestudy_com,662,365.0,3937,1.813699
3,browsingfrisbeepersevere@timestudy_com,765,365.0,2482,2.09589
4,buckedstiflestagnant@timestudy_com,1643,365.0,7286,4.50137
5,busybodyestimatesensitize@timestudy_com,19,0.0,76,190000000.0
6,civicexcludingbarcode@timestudy_com,212,365.0,883,0.5808219
7,cladlandscapeheave@timestudy_com,110,365.0,370,0.3013699
8,confrontcaresssullen@timestudy_com,1676,365.0,7809,4.591781
9,deitymagnifierdrove@timestudy_com,515,365.0,7477,1.410959
