## Dataset preparation
This notebook imports participant data from the TIME study, keeps only those who completed the study, computes all the features, and then saves two files:
1. Feature set for all the users
2. A sample of users to try different ML algorithms

## Import libraries
Import essential libraries here.

In [2]:
import sys
import numpy as np
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob

## Import participant status
We will import the participant status data so that we can keep only the participants who completed the study.

In [13]:
## Read the participant status tracking sheet into a dataframe
status_file = '/Users/adityaponnada/Downloads/time_study_data/participant_status_tracking_v2.csv'
status_df = pd.read_csv(status_file)

## Preview the first five rows, followed by the column names (one per print line)
print(status_df.head(), status_df.columns, sep='\n')

   Record ID            Visualizer ID Participant Status  Consent Date  \
0       9001       sharpnessnextpouch           Completed    3/17/2020   
1       9002     uniformlyharmfulbush          Unenrolled    3/18/2020   
2       9003     hacksawscoldingdares            Withdrew    3/27/2020   
3       9004    dimnesscranialunheard           Completed    3/28/2020   
4       9005  coynessculminatebarista           Completed     4/8/2020   

  Date participant completed Date participant withdrew  \
0                  3/17/2021                       NaN   
1                        NaN                       NaN   
2                        NaN                 12/4/2020   
3                  3/28/2021                       NaN   
4                   4/8/2021                       NaN   

  Date participant unenrolled Date Devices Mailed ID of device loaned  \
0                         NaN           3/25/2020        C2F9214C2188   
1                  10/20/2020           3/25/2020        C2F

Now only keep the completed participants

In [14]:
## Keep only participants whose status is 'Completed', retaining just the ID and status columns.
## Note: the source column name 'Participant Status ' carries a trailing space.
completed_mask = status_df['Participant Status '] == 'Completed'
status_df = (
    status_df.loc[completed_mask, ['Visualizer ID', 'Participant Status ']]
    # Shorter, snake_case column names for downstream use
    .rename(columns={'Visualizer ID': 'participant_id', 'Participant Status ': 'status'})
    # Renumber the rows 0..n-1 after filtering
    .reset_index(drop=True)
    # Append the @timestudy_com suffix to every participant ID
    .assign(participant_id=lambda frame: frame['participant_id'] + '@timestudy_com')
)
## Preview the first rows, then the (rows, columns) shape
print(status_df.head(), status_df.shape, sep='\n')


                           participant_id     status
0        sharpnessnextpouch@timestudy_com  Completed
1     dimnesscranialunheard@timestudy_com  Completed
2   coynessculminatebarista@timestudy_com  Completed
3  spinstersubatomiccoyness@timestudy_com  Completed
4     sadlyskilledlustfully@timestudy_com  Completed
(136, 2)


Save the completed participants IDs as a list

In [15]:
# Alphabetically sorted list of the completed participants' IDs
completed_participants = status_df.sort_values('participant_id')['participant_id'].to_list()
# Show the full list and how many IDs it holds
print(completed_participants)
print(f"Total completed participants: {len(completed_participants)}")

['afflictedrevenueepilepsy@timestudy_com', 'anagramprobingscrooge@timestudy_com', 'animateshowerclothes@timestudy_com', 'anthillfastinglucrative@timestudy_com', 'arrivejanitoruniformly@timestudy_com', 'atlanticchefhatchet@timestudy_com', 'attirecrabbinghumbling@timestudy_com', 'backfirebankedprudishly@timestudy_com', 'badlandwiltmuseum@timestudy_com', 'bannisterhardwiredladle@timestudy_com', 'bartenderradiatorapplied@timestudy_com', 'beavertomatoupscale@timestudy_com', 'bondingcoasterdirtiness@timestudy_com', 'brinkaminounframed@timestudy_com', 'catsupexploitmocker@timestudy_com', 'caucuscattlemockup@timestudy_com', 'certifiedembargobartender@timestudy_com', 'chewingslouchingfailing@timestudy_com', 'childhoodmovingmagnify@timestudy_com', 'cohesiveprotractfavored@timestudy_com', 'collisionmolarbreeze@timestudy_com', 'congestedculpritsaved@timestudy_com', 'congestedtapssneer@timestudy_com', 'congresscyclistdefender@timestudy_com', 'copybrickcreative@timestudy_com', 'coynessculminatebaris

In [21]:
# Randomly split completed_participants into a training list (at most 100 IDs)
# and a holdout list (everyone not drawn for training).
import random

n_train = min(100, len(completed_participants))

# Start from empty lists so both names exist even when there are no participants.
training_list, holdout_list = [], []
if completed_participants:
    # No seed is set here, so the drawn sample changes from run to run.
    training_list = random.sample(completed_participants, k=n_train)
    # Set of the sampled training IDs, used for fast membership tests below.
    sampled_for_training = set(training_list)
    # Holdout keeps the original ordering of completed_participants.
    holdout_list = [pid for pid in completed_participants if pid not in sampled_for_training]

print(f'Training list size: {len(training_list)}')
print(f'Holdout list size: {len(holdout_list)}')
print('Training sample (first 10):', training_list[:10])
print('Holdout sample (first 10):', holdout_list[:10])

Training list size: 100
Holdout list size: 36
Training sample (first 10): ['headphoneoutsmartunfailing@timestudy_com', 'retrialgraftedsturdy@timestudy_com', 'landlordastrologycopy@timestudy_com', 'bartenderradiatorapplied@timestudy_com', 'backfirebankedprudishly@timestudy_com', 'bannisterhardwiredladle@timestudy_com', 'unfreezefrayingknoll@timestudy_com', 'endlessroamerreconfirm@timestudy_com', 'unmixableresultfidgety@timestudy_com', 'equallustinessuntil@timestudy_com']
Holdout sample (first 10): ['animateshowerclothes@timestudy_com', 'atlanticchefhatchet@timestudy_com', 'beavertomatoupscale@timestudy_com', 'bondingcoasterdirtiness@timestudy_com', 'childhoodmovingmagnify@timestudy_com', 'cohesiveprotractfavored@timestudy_com', 'collisionmolarbreeze@timestudy_com', 'congestedculpritsaved@timestudy_com', 'congestedtapssneer@timestudy_com', 'crestedserpentspongy@timestudy_com']


## Import compliance matrix
We will import the hourly compliance matrix for the participants in the training list (the random sample of completed participants drawn above).

In [22]:
# Load every uema_feature_mx_*.csv file for the participants in training_list and
# stack them into a single dataframe, compliance_matrix.
#
# Side effects on notebook state:
#   - training_list is replaced by its normalized, de-duplicated, pattern-filtered version
#   - participant_dfs holds one dataframe per participant that was loaded
#   - skipped_participants holds (participant_id, reason) for every participant not loaded
import os, gc, glob
from pandas.errors import EmptyDataError

root_folder = '/Users/adityaponnada/Downloads/time_study_data/compliance_matrix/'
chunk_size = 10000

# Normalize (stringify + strip whitespace) and de-duplicate training_list.
# dict.fromkeys keeps the first occurrence of each ID, so the original order is preserved.
training_list = list(dict.fromkeys(str(p).strip() for p in training_list))

# Only keep IDs that match the expected folder pattern (example: *@timestudy_com)
training_list = [p for p in training_list if p.endswith('@timestudy_com')]

# One dataframe per participant; concatenated once at the end.
participant_dfs = []
# Participants that contributed no rows, recorded with the reason so gaps are visible.
skipped_participants = []

for pid in training_list:
    participant_folder = os.path.join(root_folder, pid)
    if not os.path.isdir(participant_folder):
        skipped_participants.append((pid, 'participant folder not found'))
        continue
    # Sorted so files are always read in the same (lexicographic) order.
    files = sorted(glob.glob(os.path.join(participant_folder, 'uema_feature_mx_*.csv')))
    if not files:
        skipped_participants.append((pid, 'no uema_feature_mx_*.csv files'))
        continue
    print(f'Reading participant: {pid} | files: {len(files)}')
    per_parts = []
    for fp in files:
        # Collect this file's chunks separately: if the read fails part-way through,
        # none of the file's rows are kept (avoids silently ingesting a partial file).
        file_chunks = []
        try:
            # The context manager closes the file handle even when a chunk fails to parse.
            with pd.read_csv(fp, chunksize=chunk_size, low_memory=True) as reader:
                file_chunks.extend(reader)
        except EmptyDataError:
            # skip empty files
            continue
        except Exception as e:
            print(f'Failed reading {fp}: {e}')
            continue
        per_parts.extend(file_chunks)
    if not per_parts:
        skipped_participants.append((pid, 'no readable rows in any file'))
        continue
    # Collapse the participant's chunks into one dataframe to reduce the number of objects.
    try:
        df_pid = pd.concat(per_parts, ignore_index=True)
    except ValueError as e:
        skipped_participants.append((pid, f'concat failed: {e}'))
        continue
    # Tag each row with the folder it came from so downstream code knows its origin.
    df_pid['participant_id_source'] = pid
    participant_dfs.append(df_pid)
    # Release the chunk list before moving on to the next participant.
    del per_parts
    gc.collect()

# Final concatenation across participants
if participant_dfs:
    compliance_matrix = pd.concat(participant_dfs, ignore_index=True)
else:
    compliance_matrix = pd.DataFrame()

# report
print('Final compliance_matrix rows,cols:', compliance_matrix.shape)
if not compliance_matrix.empty:
    print('Approx memory (bytes):', compliance_matrix.memory_usage(deep=True).sum())
if skipped_participants:
    print(f'Skipped {len(skipped_participants)} of {len(training_list)} participants:')
    for skipped_pid, reason in skipped_participants:
        print(f'  {skipped_pid}: {reason}')


Reading participant: headphoneoutsmartunfailing@timestudy_com | files: 276
Reading participant: retrialgraftedsturdy@timestudy_com | files: 84
Reading participant: landlordastrologycopy@timestudy_com | files: 225
Reading participant: retrialgraftedsturdy@timestudy_com | files: 84
Reading participant: landlordastrologycopy@timestudy_com | files: 225
Reading participant: bartenderradiatorapplied@timestudy_com | files: 94
Reading participant: backfirebankedprudishly@timestudy_com | files: 251
Reading participant: bartenderradiatorapplied@timestudy_com | files: 94
Reading participant: backfirebankedprudishly@timestudy_com | files: 251
Reading participant: bannisterhardwiredladle@timestudy_com | files: 268
Reading participant: bannisterhardwiredladle@timestudy_com | files: 268
Reading participant: unfreezefrayingknoll@timestudy_com | files: 243
Reading participant: unfreezefrayingknoll@timestudy_com | files: 243
Reading participant: endlessroamerreconfirm@timestudy_com | files: 209
Reading 

In [23]:
## Report the size of compliance_matrix: row count, column count, distinct participants
num_rows, num_cols = compliance_matrix.shape
print(f"Number of rows in compliance_matrix: {num_rows}")
print(f"Number of columns in compliance_matrix: {num_cols}")
# Count distinct values in the Participant_ID column
num_participants = compliance_matrix['Participant_ID'].nunique()
print(f"Number of unique participants in compliance_matrix: {num_participants}")

Number of rows in compliance_matrix: 1088971
Number of columns in compliance_matrix: 63
Number of unique participants in compliance_matrix: 101


In [24]:
# Size of training_list, for comparison with the unique participant count in compliance_matrix
print(f"{len(training_list)}")

100


In [25]:
## Drop every row whose Participant_ID equals "unknown_user"
compliance_matrix = compliance_matrix.loc[
    lambda frame: frame['Participant_ID'] != 'unknown_user'
]

Save the file for later access

In [26]:
## Write compliance_matrix to a CSV whose filename ends in _<date>_<time>.
## current_time is kept as a notebook variable so later cells can reuse the same stamp.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
compliance_matrix_path = f'/Users/adityaponnada/Downloads/time_study_data/compliance_matrix_{current_time}.csv'
compliance_matrix.to_csv(compliance_matrix_path, index=False)
print(f"Compliance matrix saved to {compliance_matrix_path}")

Compliance matrix saved to /Users/adityaponnada/Downloads/time_study_data/compliance_matrix_20251208_183728.csv


In [27]:
## Save the training and holdout participant IDs to separate text files (one ID per line).
## Both filenames reuse current_time, the timestamp used for the compliance matrix CSV.
training_list_path = f'/Users/adityaponnada/Downloads/time_study_data/training_list_{current_time}.txt'
with open(training_list_path, 'w') as out_file:
    out_file.writelines(f"{pid}\n" for pid in training_list)
print(f"Training list saved to {training_list_path}")

holdout_list_path = f'/Users/adityaponnada/Downloads/time_study_data/holdout_list_{current_time}.txt'
with open(holdout_list_path, 'w') as out_file:
    out_file.writelines(f"{pid}\n" for pid in holdout_list)
print(f"Holdout list saved to {holdout_list_path}")


Training list saved to /Users/adityaponnada/Downloads/time_study_data/training_list_20251208_183728.txt
Holdout list saved to /Users/adityaponnada/Downloads/time_study_data/holdout_list_20251208_183728.txt
