## Import libraries
Import the libraries needed for one-hot encoding and rescaling of features.

In [6]:
## Import libraries
import sys, os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump, load
import pickle
from typing import List, Tuple
from datetime import datetime
from dateutil import parser

## Import raw features
Import the raw feature file and inspect it.

In [7]:
# Locate the raw feature CSV. The directory can be overridden with the
# TIME_STUDY_DATA_DIR environment variable so the notebook is not tied to one
# machine; the default is the original local download folder.
DATA_DIR = os.environ.get(
    "TIME_STUDY_DATA_DIR",
    "/Users/adityaponnada/Downloads/time_study_data",
)
RAW_FEATURES_PATH = os.path.join(DATA_DIR, "raw_features_v1.csv")

# Read the raw feature CSV file into a pandas DataFrame
raw_feature_df = pd.read_csv(RAW_FEATURES_PATH)
print(f"Raw feature DataFrame shape: {raw_feature_df.shape}")
# Last expression of the cell: preview the first rows
raw_feature_df.head()

Raw feature DataFrame shape: (108075, 20)


  raw_feature_df = pd.read_csv("/Users/adityaponnada/Downloads/time_study_data/raw_features_v1.csv")


Unnamed: 0,participant_id,prompt_time_converted,outcome,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,is_phone_locked,last_phone_usage,wake_day_part,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_since_wake,completion_since_start
0,certifiedembargobartender@timestudy_com,2021-10-23 07:46:02,1,1,Early Morning,0.0,0.0,Transit,1,936.101448,0.0,0.0,0.0,838.966667,1.033333,8.138573,176,0.0,0.0,0.0
1,certifiedembargobartender@timestudy_com,2021-10-23 07:54:01,1,1,Early Morning,,,Transit,1,936.101448,0.0,0.0,0.0,830.983333,9.016667,0.280337,176,1.0,1.0,1.0
2,certifiedembargobartender@timestudy_com,2021-10-23 08:24:00,0,1,Morning,0.0,1.0,Other,0,,1.0,18.1,0.0,801.0,39.0,0.0,176,1.0,1.0,1.0
3,certifiedembargobartender@timestudy_com,2021-10-23 08:32:00,1,1,Morning,0.0,1.0,Transit,1,936.101448,0.0,0.0,0.0,793.0,47.0,0.0,176,1.0,1.0,1.0
4,certifiedembargobartender@timestudy_com,2021-10-23 08:44:04,1,1,Morning,,,Transit,0,936.231251,1.0,9.0,0.0,780.933333,59.066667,160.934583,176,1.0,1.0,1.0


## Explore missing data
For each feature, we compute the percentage of missing values (NaN or empty string).

In [8]:
def missing_value_table(df):
    """Summarise the percentage of missing values per feature column.

    A value counts as missing when it is null (NaN/None) or an empty string.
    The identifier, timestamp and outcome columns are left out of the summary
    (matched case-insensitively on the column name).

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns should be checked.

    Returns
    -------
    pandas.DataFrame
        One row per checked column (column name as index) with a single
        'missing_%' column, rounded to two decimals and sorted from the most
        to the least missing.
    """
    excluded = {'participant_id', 'prompt_time_converted', 'outcome'}
    feature_cols = [name for name in df.columns if name.lower() not in excluded]
    features = df[feature_cols]

    # A cell is either null or an empty string, never both, so the two rates add up.
    null_rate = features.isnull().mean() * 100
    blank_rate = (features == '').mean() * 100

    table = (null_rate + blank_rate).round(2).to_frame('missing_%')
    return table.sort_values('missing_%', ascending=False)
    
# Display the per-feature missing-percentage table for the raw features,
# sorted from the most to the least missing.
missing_value_table(raw_feature_df)

Unnamed: 0,missing_%
in_battery_saver_mode,52.19
charging_status,52.19
dist_from_home,13.88
last_phone_usage,13.8
is_phone_locked,13.8
mims_5min,4.02
wake_day_part,0.04
closeness_to_sleep_time,0.04
closeness_to_wake_time,0.04
completion_since_wake,0.0


In [9]:
def missing_data_by_participant(df):
    """Compute the percentage of missing values per participant and feature.

    A value counts as missing when it is null (NaN/None) or an empty string.
    The timestamp, outcome and participant-id columns are not evaluated
    (matched case-insensitively on the column name); rows are grouped by the
    'participant_id' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'participant_id' column plus the feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per participant with 'participant_id' as a regular column,
        followed by one column per feature holding the percentage of missing
        values, rounded to two decimals.
    """
    excluded = {'prompt_time_converted', 'outcome', 'participant_id'}
    feature_cols = [name for name in df.columns if name.lower() not in excluded]
    features = df[feature_cols]

    # Boolean mask: True wherever a cell is null or an empty string.
    is_missing = features.isnull() | (features == '')

    # The mean of a boolean mask within each group is the missing fraction.
    per_participant = is_missing.groupby(df['participant_id']).mean()

    # Scale to percent, round, and turn the group key back into a column.
    return (per_participant * 100).round(2).reset_index()

# Show, for every participant, the percentage of missing values in each feature column
missing_data_by_participant(raw_feature_df)

Unnamed: 0,participant_id,is_weekend,time_of_day,in_battery_saver_mode,charging_status,location_category,screen_on,dist_from_home,is_phone_locked,last_phone_usage,wake_day_part,closeness_to_sleep_time,closeness_to_wake_time,mims_5min,days_in_study,completion_24h,completion_since_wake,completion_since_start
0,certifiedembargobartender@timestudy_com,0.0,0.0,53.81,53.81,0.0,0.0,29.09,6.98,6.98,0.0,0.0,0.0,4.74,0.0,0.0,0.0,0.0
1,chewingslouchingfailing@timestudy_com,0.0,0.0,52.62,52.62,0.0,0.0,14.44,100.0,100.0,0.0,0.0,0.0,2.99,0.0,0.0,0.0,0.0
2,enjoyergoofinessgrudge@timestudy_com,0.0,0.0,51.62,51.62,0.0,0.0,13.26,0.89,0.89,0.46,0.46,0.46,3.31,0.0,0.0,0.0,0.0
3,erasuresafeguardravishing@timestudy_com,0.0,0.0,51.58,51.58,0.0,0.0,18.2,0.38,0.38,0.0,0.0,0.0,2.38,0.0,0.0,0.0,0.0
4,neutergoldfishsworn@timestudy_com,0.0,0.0,52.61,52.61,0.0,0.0,7.96,8.2,8.2,0.0,0.0,0.0,5.98,0.0,0.0,0.0,0.0
5,persevereriseswoop@timestudy_com,0.0,0.0,51.47,51.47,0.0,0.0,2.56,0.0,0.0,0.0,0.0,0.0,2.26,0.0,0.0,0.0,0.0
6,pretendedconstrainfraying@timestudy_com,0.0,0.0,52.09,52.09,0.0,0.0,10.96,0.0,0.0,0.0,0.0,0.0,3.99,0.0,0.0,0.0,0.0
7,remoldexcludingaffair@timestudy_com,0.0,0.0,51.81,51.81,0.0,0.0,8.12,0.03,0.03,0.0,0.0,0.0,8.59,0.0,0.0,0.0,0.0
8,slapstickporcupineslacks@timestudy_com,0.0,0.0,52.39,52.39,0.0,0.0,12.37,0.0,0.0,0.0,0.0,0.0,2.53,0.0,0.0,0.0,0.0
9,subtitlegrievousbazooka@timestudy_com,0.0,0.0,51.6,51.6,0.0,0.0,21.54,0.0,0.0,0.0,0.0,0.0,5.11,0.0,0.0,0.0,0.0
