## Import libraries

In [1]:
## Import libraries for machine learning and data processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
## For random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Import data file
The file already contains the one-hot encoded and scaled features.

In [2]:
## Load the processed feature table into a DataFrame
## NOTE(review): absolute, machine-specific path -- consider a configurable data directory
raw_feature_df_scaled = pd.read_csv(
    filepath_or_buffer='/Users/adityaponnada/Downloads/time_study_data/processed_features_v1.csv'
)
## Preview the first few rows to sanity-check the load
raw_feature_df_scaled.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,time_of_day_Night,location_category_Home,location_category_Other,location_category_School,location_category_Transit,location_category_Work,wake_day_part_0.0,wake_day_part_1.0,wake_day_part_2.0,wake_day_part_3.0
0,certifiedembargobartender@timestudy_com,1,1,0.0,0.0,1,0.995567,0.0,0.0,0.891772,...,0,0,0,0,1,0,1,0,0,0
1,certifiedembargobartender@timestudy_com,1,1,,,1,0.995567,0.0,0.0,0.883542,...,0,0,0,0,1,0,1,0,0,0
2,certifiedembargobartender@timestudy_com,0,1,0.0,1.0,0,,1.0,0.301667,0.852631,...,0,0,1,0,0,0,1,0,0,0
3,certifiedembargobartender@timestudy_com,1,1,0.0,1.0,1,0.995567,0.0,0.0,0.844384,...,0,0,0,0,1,0,1,0,0,0
4,certifiedembargobartender@timestudy_com,1,1,,,0,0.995705,1.0,0.15,0.831944,...,0,0,0,0,1,0,1,0,0,0


In [3]:
## Count how many distinct participants appear in the data
num_participants = raw_feature_df_scaled.loc[:, 'participant_id'].nunique()
print(f"Number of unique participants: {num_participants}")

## Report the overall size of the dataset
num_rows = len(raw_feature_df_scaled)
num_cols = len(raw_feature_df_scaled.columns)
print(f"Dataset shape: {num_rows} rows, {num_cols} columns")

## Show the data type of every column (participant_id included)
raw_feature_df_scaled.dtypes

Number of unique participants: 10
Dataset shape: 108075 rows, 31 columns


participant_id                object
outcome                        int64
is_weekend                     int64
in_battery_saver_mode        float64
charging_status              float64
screen_on                      int64
dist_from_home               float64
is_phone_locked              float64
last_phone_usage             float64
closeness_to_sleep_time      float64
closeness_to_wake_time       float64
mims_5min                    float64
days_in_study                float64
completion_24h               float64
completion_since_wake        float64
completion_since_start       float64
time_of_day_Afternoon          int64
time_of_day_Early Morning      int64
time_of_day_Evening            int64
time_of_day_Late Night         int64
time_of_day_Morning            int64
time_of_day_Night              int64
location_category_Home         int64
location_category_Other        int64
location_category_School       int64
location_category_Transit      int64
location_category_Work         int64
w

## Prepare test and training set
We will randomly assign 8 participants to the training set and hold out the remaining 2 participants as the test set.

In [4]:
import numpy as np
def split_train_test_by_participant_ids(df, id_col='participant_id', n_train=8, random_state=42):
    """
    Split a DataFrame into train and test sets by participant, not by row.

    ``n_train`` distinct values of ``id_col`` are drawn at random (without
    replacement) for the training set. Every row whose id was not drawn goes
    to the test set, so the two returned frames always partition ``df`` and
    no row can appear in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``id_col`` column.
    id_col : str, default 'participant_id'
        Name of the column holding the participant identifier.
    n_train : int, default 8
        Number of distinct participants assigned to the training set.
    random_state : int, default 42
        Seed passed to ``np.random.seed`` before sampling.

    Returns
    -------
    train_df, test_df : tuple of pandas.DataFrame
        Row subsets of ``df``, each with its index reset to 0..n-1.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n_train`` is larger than the
        number of distinct participants.
    """
    # Kept deliberately: this reseeds NumPy's *global* RNG, so any later
    # np.random call in the notebook is affected by it as well.
    np.random.seed(random_state)
    unique_ids = df[id_col].unique()
    train_ids = np.random.choice(unique_ids, size=n_train, replace=False)

    # Build the membership mask once and use its complement for the test set.
    # Deriving the test rows from a separately built id list can disagree
    # with `isin` for missing ids (NaN never compares equal to itself), which
    # would put the same rows in both splits.
    in_train = df[id_col].isin(train_ids)
    train_df = df[in_train].reset_index(drop=True)
    test_df = df[~in_train].reset_index(drop=True)
    return train_df, test_df

# Run the participant-level split with its default settings and report the result
train_df, test_df = split_train_test_by_participant_ids(raw_feature_df_scaled)
print(
    f"Train set shape: {train_df.shape}",
    f"Test set shape: {test_df.shape}",
    f"Train participant_ids: {train_df['participant_id'].unique()}",
    f"Test participant_ids: {test_df['participant_id'].unique()}",
    sep="\n",
)

Train set shape: (88178, 31)
Test set shape: (19897, 31)
Train participant_ids: ['certifiedembargobartender@timestudy_com'
 'chewingslouchingfailing@timestudy_com'
 'enjoyergoofinessgrudge@timestudy_com'
 'neutergoldfishsworn@timestudy_com' 'persevereriseswoop@timestudy_com'
 'remoldexcludingaffair@timestudy_com'
 'slapstickporcupineslacks@timestudy_com'
 'subtitlegrievousbazooka@timestudy_com']
Test participant_ids: ['erasuresafeguardravishing@timestudy_com'
 'pretendedconstrainfraying@timestudy_com']


### Inspect the data distribution
We will check the number of observations and the overall completion rate for each participant in the training and test sets.

In [5]:
def participant_summary_stats(train_df, test_df, id_col='participant_id', outcome_col='outcome'):
    """
    Summarize each participant's row count and mean outcome, per split.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing the ``id_col`` and ``outcome_col`` columns.
    id_col : str, default 'participant_id'
        Column to group by.
    outcome_col : str, default 'outcome'
        Column whose per-group mean is reported as ``completion_rate``.

    Returns
    -------
    train_summary, test_summary : tuple of pandas.DataFrame
        One row per participant with the columns ``id_col``,
        ``n_observations`` and ``completion_rate``.
    """
    def summarize(frame):
        # Single grouped pass: group size and mean outcome per participant.
        per_participant = frame.groupby(id_col)[outcome_col].agg(
            n_observations='size',
            completion_rate='mean',
        )
        return per_participant.reset_index()

    return summarize(train_df), summarize(test_df)

# Build the per-participant summaries for both splits and print them
train_summary, test_summary = participant_summary_stats(train_df, test_df)
print('Train set summary:', train_summary, sep='\n')
print('Test set summary:', test_summary, sep='\n')

Train set summary:
                            participant_id  n_observations  completion_rate
0  certifiedembargobartender@timestudy_com           11555         0.804846
1    chewingslouchingfailing@timestudy_com           13283         0.943913
2     enjoyergoofinessgrudge@timestudy_com           10457         0.886488
3        neutergoldfishsworn@timestudy_com            8724         0.885030
4         persevereriseswoop@timestudy_com           12993         0.797045
5      remoldexcludingaffair@timestudy_com            6426         0.737006
6   slapstickporcupineslacks@timestudy_com           11437         0.967387
7    subtitlegrievousbazooka@timestudy_com           13303         0.957604
Test set summary:
                            participant_id  n_observations  completion_rate
0  erasuresafeguardravishing@timestudy_com            4457         0.700247
1  pretendedconstrainfraying@timestudy_com           15440         0.841256
