## Import libraries

In [1]:
import pandas as pd 
import numpy as np 


## Read the withdrawn participant file

In [2]:
## Load the participant status tracking sheet into a dataframe
status_file = '/Users/adityaponnada/Downloads/time_study_data/participant_status_tracking_v2.csv'
status_df = pd.read_csv(filepath_or_buffer=status_file)

## Preview the first rows, then list the column names
for preview in (status_df.head(), status_df.columns):
    print(preview)

   Record ID            Visualizer ID Participant Status  Consent Date  \
0       9001       sharpnessnextpouch           Completed    3/17/2020   
1       9002     uniformlyharmfulbush          Unenrolled    3/18/2020   
2       9003     hacksawscoldingdares            Withdrew    3/27/2020   
3       9004    dimnesscranialunheard           Completed    3/28/2020   
4       9005  coynessculminatebarista           Completed     4/8/2020   

  Date participant completed Date participant withdrew  \
0                  3/17/2021                       NaN   
1                        NaN                       NaN   
2                        NaN                 12/4/2020   
3                  3/28/2021                       NaN   
4                   4/8/2021                       NaN   

  Date participant unenrolled Date Devices Mailed ID of device loaned  \
0                         NaN           3/25/2020        C2F9214C2188   
1                  10/20/2020           3/25/2020        C2F

In [3]:
## Keep only the participants whose status is 'Withdrew', retaining just the ID and status columns.
# NOTE: this cell overwrites `status_df`. Re-run the cell that reads the status file before
# re-running this one, because the original column names no longer exist afterwards.
# The status column in the CSV is named 'Participant Status ' (with a trailing space).
withdrew_mask = status_df['Participant Status '] == 'Withdrew'
status_df = (
    # A single .loc selection avoids chained slicing followed by in-place mutation
    status_df.loc[withdrew_mask, ['Visualizer ID', 'Participant Status ']]
    # Rename 'Visualizer ID' -> participant_id and 'Participant Status ' -> status
    .rename(columns={'Visualizer ID': 'participant_id', 'Participant Status ': 'status'})
    # Renumber the rows from 0 after filtering
    .reset_index(drop=True)
    # Append the '@timestudy_com' suffix to every participant_id
    .assign(participant_id=lambda df: df['participant_id'] + '@timestudy_com')
)
## Show the first few rows
print(status_df.head())
# Also print the shape of the dataframe
print(status_df.shape)

                             participant_id    status
0        hacksawscoldingdares@timestudy_com  Withdrew
1               altoironyahoo@timestudy_com  Withdrew
2         wadblatancyflattery@timestudy_com  Withdrew
3  breechingtamenessdreamboat@timestudy_com  Withdrew
4        overstepsadnesscarat@timestudy_com  Withdrew
(90, 2)


In [4]:
# Collect the withdrawn participants' IDs as a plain Python list.
# NOTE(review): the printed list contains an entry with two IDs joined by '/'
# ('floggingnicknamecondone/uprisingdisdaingraveyard@timestudy_com'); it is matched
# as one string downstream - confirm whether it should be split into two IDs.
participant_id_column = status_df['participant_id']
withdrew_ids = participant_id_column.to_list()
print(withdrew_ids)

['hacksawscoldingdares@timestudy_com', 'altoironyahoo@timestudy_com', 'wadblatancyflattery@timestudy_com', 'breechingtamenessdreamboat@timestudy_com', 'overstepsadnesscarat@timestudy_com', 'smeltingexerciserstabilize@timestudy_com', 'shaftbribezippy@timestudy_com', 'splinterimmorallyupward@timestudy_com', 'rankingkindnessspindle@timestudy_com', 'musicvividlybackstage@timestudy_com', 'backyardscapegoatoverrun@timestudy_com', 'floggingnicknamecondone/uprisingdisdaingraveyard@timestudy_com', 'hurricaneshrubscoral@timestudy_com', 'riftchaosdipper@timestudy_com', 'dizzinesscatatoniceconomist@timestudy_com', 'skydiverworriercarton@timestudy_com', 'cladlandscapeheave@timestudy_com', 'unrelatedtweedconcerned@timestudy_com', 'anywaymustinesspushiness@timestudy_com', 'unafraidreproducewad@timestudy_com', 'shadilymanholegreeter@timestudy_com', 'palmbuggystole@timestudy_com', 'ambushdollhousegenerous@timestudy_com', 'skipperdropdowncrawlers@timestudy_com', 'itunesgurgleexchange@timestudy_com', 'ge

In [5]:
def load_withdrew_comp_matrix(holdout_list, base_dir='/Users/adityaponnada/Downloads/time_study_data/compliance_matrix/'):
    """
    Load and stack the compliance-matrix CSVs of the requested participants.

    Every sub-folder of `base_dir` whose name is listed in `holdout_list` is
    visited in sorted order. Inside each folder, all files matching
    `uema_feature_mx_*.csv` are read (also in sorted order) and concatenated.

    Parameters
    ----------
    holdout_list : iterable or str
        Participant folder names to load. Elements are converted with `str()`
        before matching. A single string is treated as one ID rather than
        being iterated character by character.
    base_dir : str
        Directory that contains one sub-folder per participant.

    Returns
    -------
    pandas.DataFrame
        All rows from the matched files with a fresh 0..n-1 index. No extra
        identifying column is added. An empty DataFrame is returned when no
        folder matches or no file could be read.

    Raises
    ------
    FileNotFoundError
        If `base_dir` is not an existing directory.

    Notes
    -----
    IDs without a matching folder and folders without matching files are
    skipped silently. A file that cannot be read is reported with `print`
    and skipped, so one bad file does not abort the whole load.
    """
    import glob
    import os
    import pandas as pd

    if not os.path.isdir(base_dir):
        raise FileNotFoundError(f'Base directory not found: {base_dir}')

    # Iterating a bare string would yield single characters and silently
    # match nothing, so wrap it into a one-element list first.
    if isinstance(holdout_list, str):
        holdout_list = [holdout_list]
    # Ensure elements are strings so they compare equal to folder names
    holdout_set = {str(x) for x in holdout_list}

    # Requested participant folders, visited in name order (sorted once)
    matched_folders = sorted(
        d for d in os.listdir(base_dir)
        if d in holdout_set and os.path.isdir(os.path.join(base_dir, d))
    )

    out_frames = []
    for folder in matched_folders:
        pattern = os.path.join(base_dir, folder, 'uema_feature_mx_*.csv')
        user_frames = []
        for fp in sorted(glob.glob(pattern)):
            try:
                user_frames.append(pd.read_csv(fp))
            except Exception as e:
                # Deliberate best-effort: report the unreadable file and move on
                print(f'Failed to read {fp}: {e}')
        # Folders with no matching or no readable files contribute nothing
        if user_frames:
            out_frames.append(pd.concat(user_frames, ignore_index=True))

    if not out_frames:
        return pd.DataFrame()
    return pd.concat(out_frames, ignore_index=True)

# Load the compliance matrices of all withdrawn participants (requires `withdrew_ids`)
withdrew_df = load_withdrew_comp_matrix(holdout_list=withdrew_ids)
# Report (rows, columns) of the combined dataframe
print(withdrew_df.shape)

(235112, 62)


In [7]:
# Peek at the first five rows of the combined compliance matrix
withdrew_df.head(n=5)

Unnamed: 0,Participant_ID,Initial_Prompt_Date,Prompt_Type,Study_Mode,Initial_Prompt_Local_Time,Answer_Status,Actual_Prompt_Local_Time,First_Question_Completion_Unixtime,UTC_Offset,Reprompt_Num,...,start_time_7min,mims_summary_8min,num_readings_8min,start_time_8min,mims_summary_9min,num_readings_9min,start_time_9min,mims_summary_10min,num_readings_10min,start_time_10min
0,ambushdollhousegenerous@timestudy_com,2020-11-25,Trivia_EMA_Micro,TIME,Wed Nov 25 07:24:03 MST 2020,NeverStarted,Wed Nov 25 07:24:03 MST 2020,-1,GMT-07:00,0,...,2020-11-25 07:16:03.003,0.0,340.0,2020-11-25 07:15:03.003,0.0,400.0,2020-11-25 07:14:03.003,0.0,460.0,2020-11-25 07:13:03.003
1,ambushdollhousegenerous@timestudy_com,2020-11-25,Trivia_EMA_Micro,TIME,Wed Nov 25 09:54:06 MST 2020,Completed,Wed Nov 25 09:54:06 MST 2020,1606323255361,GMT-07:00,0,...,,OB,0.0,,OB,0.0,,OB,0.0,
2,ambushdollhousegenerous@timestudy_com,2020-11-25,Trivia_EMA_Micro,TIME,Wed Nov 25 15:37:13 MST 2020,Completed,Wed Nov 25 15:37:13 MST 2020,1606343842226,GMT-07:00,0,...,,OB,0.0,,OB,0.0,,OB,0.0,
3,ambushdollhousegenerous@timestudy_com,2020-11-25,Trivia_EMA_Micro,TIME,Wed Nov 25 19:42:03 MST 2020,NeverStarted,Wed Nov 25 19:42:03 MST 2020,-1,GMT-07:00,0,...,,OB,0.0,,OB,0.0,,OB,0.0,
4,ambushdollhousegenerous@timestudy_com,2020-11-25,Trivia_EMA_Micro,TIME,Wed Nov 25 20:38:25 MST 2020,Completed,Wed Nov 25 20:38:25 MST 2020,1606361910588,GMT-07:00,0,...,,OB,0.0,,OB,0.0,,OB,0.0,


In [8]:
# Number of distinct (non-null) Participant_ID values in the loaded data
withdrew_df['Participant_ID'].nunique(dropna=True)

59

In [10]:
# List the distinct Participant_ID values to spot unexpected entries
pd.unique(withdrew_df['Participant_ID'])

array(['ambushdollhousegenerous@timestudy_com', 'unknown_user',
       'anywaymustinesspushiness@timestudy_com',
       'bottledeskworkrequire@timestudy_com',
       'browsingfrisbeepersevere@timestudy_com',
       'buckedstiflestagnant@timestudy_com',
       'busybodyestimatesensitize@timestudy_com',
       'civicexcludingbarcode@timestudy_com',
       'cladlandscapeheave@timestudy_com',
       'confrontcaresssullen@timestudy_com',
       'deitymagnifierdrove@timestudy_com',
       'dimmeddismaylegume@timestudy_com',
       'dizzinesscatatoniceconomist@timestudy_com',
       'enjoyingretreathandled@timestudy_com',
       'euphemismfederalconfusing@timestudy_com',
       'generouswidthcoasting@timestudy_com', 'gushyenstir@timestudy_com',
       'hacksawscoldingdares@timestudy_com',
       'hazingdiscolorsuffering@timestudy_com',
       'himationlalospheres@timestudy_com',
       'huntingevergreendeparted@timestudy_com',
       'iodinegrapemonstrous@timestudy_com',
       'itunesgurglee

In [11]:
## Drop the rows whose Participant_ID is the literal value 'unknown_user'
is_known_user = withdrew_df['Participant_ID'].ne('unknown_user')
withdrew_df = withdrew_df.loc[is_known_user]
## Report the remaining shape and the number of distinct participants
print(withdrew_df.shape)
print(withdrew_df['Participant_ID'].nunique())

(235071, 62)
58


In [13]:
## Save the filtered dataframe to a CSV file whose name ends with a _date_time stamp.
import datetime

# Timestamp formatted as YYYYmmdd_HHMMSS, taken when the cell runs
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# Build the path once so the file written and the path reported cannot drift apart
output_path = f'/Users/adityaponnada/Downloads/time_study_data/withdrew_comp_mx_{current_time}.csv'
# index=False keeps the dataframe's row index out of the file
withdrew_df.to_csv(output_path, index=False)
print(f"Withdrew dataframe saved to {output_path}")

Withdrew dataframe saved to /Users/adityaponnada/Downloads/time_study_data/withdrew_comp_mx_20260122_000325.csv
