# Import libraries

In [13]:
## Import libraries for machine learning and data processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Read data

In [14]:
## Load the processed feature table from disk
# NOTE: absolute, machine-specific location; adjust it to wherever the CSV lives locally.
FEATURES_CSV_PATH = '/Users/adityaponnada/Downloads/time_study_data/processed_features_v100.csv'
raw_feature_df_scaled = pd.read_csv(FEATURES_CSV_PATH)
## Preview the first five rows (head() defaults to n=5)
raw_feature_df_scaled.head()

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,wake_day_part_24.0,wake_day_part_25.0,wake_day_part_26.0,wake_day_part_27.0,wake_day_part_28.0,wake_day_part_29.0,wake_day_part_30.0,wake_day_part_31.0,wake_day_part_32.0,wake_day_part_33.0
0,arrivejanitoruniformly@timestudy_com,1,1,,,0,2e-06,0.0,0.0,0.187031,...,0,0,0,0,0,0,0,0,0,0
1,arrivejanitoruniformly@timestudy_com,1,1,0.0,0.0,0,2e-06,0.0,0.0,0.180555,...,0,0,0,0,0,0,0,0,0,0
2,arrivejanitoruniformly@timestudy_com,1,1,,,0,2e-06,0.0,0.0,0.17605,...,0,0,0,0,0,0,0,0,0,0
3,arrivejanitoruniformly@timestudy_com,1,1,,,0,2e-06,0.0,0.0,0.17206,...,0,0,0,0,0,0,0,0,0,0
4,arrivejanitoruniformly@timestudy_com,1,1,,,0,2e-06,0.0,0.0,0.104721,...,0,0,0,0,0,0,0,0,0,0


# Missingness indicator

In [16]:
def add_missingness_indicators(df, id_col='participant_id', outcome_col='outcome', prefix='mi_'):
    """
    Return a copy of `df` with a binary missingness indicator for each feature column.

    Rules:
      - Columns named by `id_col` and `outcome_col` are skipped.
      - Columns whose name already starts with `prefix` are skipped, so calling the
        function on its own output never produces 'mi_mi_X'.
      - For every remaining column X, a column named prefix + X (e.g. 'mi_X') holds
        1 where X is missing (NaN/None) and 0 otherwise.

    Parameters:
    - df: pandas DataFrame to augment. It is not modified.
    - id_col: name of the identifier column to skip.
    - outcome_col: name of the outcome column to skip.
    - prefix: string prepended to a column name to form its indicator name.

    Returns:
      A new DataFrame. Indicator columns that did not exist yet are appended after
      the existing columns, in the order of their source columns; an indicator
      column that already exists is overwritten in its current position.

    Raises:
      ValueError: if `df` is None or has no `copy` method.
    """
    import pandas as pd

    if df is None or not hasattr(df, 'copy'):
        raise ValueError('df must be a pandas DataFrame')

    result = df.copy()
    skip = {id_col, outcome_col}
    source_cols = [
        col for col in result.columns
        if col not in skip and not str(col).startswith(prefix)
    ]
    if not source_cols:
        # Nothing to flag (e.g. only id/outcome/indicator columns are present).
        return result

    # Compute every indicator in one vectorised pass instead of inserting columns
    # one at a time, which fragments wide frames and triggers PerformanceWarning.
    indicators = result[source_cols].isna().astype(int)
    indicators.columns = [f'{prefix}{col}' for col in source_cols]

    already_present = [name for name in indicators.columns if name in result.columns]
    to_append = [name for name in indicators.columns if name not in result.columns]

    # Refresh indicators that already exist, keeping their column position.
    for name in already_present:
        result[name] = indicators[name]

    # Append all new indicators with a single concat.
    if to_append:
        result = pd.concat([result, indicators[to_append]], axis=1)
    return result

# Add an 'mi_<column>' missingness flag for every column except the id and outcome
# columns. The function works on a copy, so the name is rebound to the returned frame.
raw_feature_df_scaled = add_missingness_indicators(raw_feature_df_scaled)
# Spot-check one feature next to its indicator column.
raw_feature_df_scaled[['in_battery_saver_mode', 'mi_in_battery_saver_mode']].head()


Unnamed: 0,in_battery_saver_mode,mi_in_battery_saver_mode
0,,1
1,0.0,0
2,,1
3,,1
4,,1


# Split training and test

In [17]:
def split_train_test_by_users_random(df, id_col='participant_id', n_train_users=10, random_state=None):
    """
    Split `df` by participant: a random sample of `n_train_users` participant ids
    defines the train set (all of their rows); every other row goes to the test set.

    Parameters:
    - df: pandas DataFrame containing the `id_col` column.
    - id_col: column holding the participant identifier.
    - n_train_users: number of participants to draw for training; must be > 0 and
      smaller than the number of distinct non-null ids.
    - random_state: seed passed to numpy's default_rng (None gives a fresh draw).

    Returns: (train_df, test_df) with indices reset.

    Raises: ValueError if `id_col` is absent, no ids exist, or `n_train_users` is out of range.
    """
    import numpy as np
    import pandas as pd

    if id_col not in df.columns:
        raise ValueError(f"id_col '{id_col}' not found in DataFrame columns")

    # Distinct, non-null participant ids in order of first appearance.
    participant_ids = pd.Index(df[id_col].dropna().unique())
    total_participants = len(participant_ids)
    if total_participants == 0:
        raise ValueError('No participant ids found in the DataFrame')
    if n_train_users >= total_participants or n_train_users <= 0:
        raise ValueError(f'n_train_users must be >0 and < number of unique participants ({total_participants})')

    # Sample the training participants without replacement.
    generator = np.random.default_rng(random_state)
    chosen_ids = generator.choice(participant_ids, size=n_train_users, replace=False)

    # One membership mask drives both halves of the split.
    in_train = df[id_col].isin(chosen_ids)
    train_part = df[in_train].reset_index(drop=True)
    test_part = df[~in_train].reset_index(drop=True)

    return train_part, test_part

# Participant-level hold-out: 10 randomly drawn participants (seed 42, so the draw is
# repeatable) supply all training rows; every remaining row goes to the test set.
train_df, test_df = split_train_test_by_users_random(raw_feature_df_scaled, n_train_users=10, random_state=42)

In [19]:
## Show the number of columns in train_df (shape[1] is the column count, not the full shape)
train_df.shape[1]

122

# Missing data imputation

For features like location, we will use median imputation. For other features, we will fill gaps by linear interpolation between the nearest known values.

In [20]:
# Percentage of missing values per column in train_df:
# isnull().mean() is the fraction of missing rows per column; * 100 converts it to percent.
missing_pct_train = train_df.isnull().mean() * 100
print("% Missing values per column in train_df:")
# Displayed as the cell output, with the most incomplete columns first.
missing_pct_train.sort_values(ascending=False)

% Missing values per column in train_df:


in_battery_saver_mode    52.497249
charging_status          52.497249
dist_from_home           12.897827
mims_5min                 5.864054
is_phone_locked           1.442592
                           ...    
wake_day_part_12.0        0.000000
wake_day_part_11.0        0.000000
wake_day_part_10.0        0.000000
wake_day_part_9.0         0.000000
mi_wake_day_part_33.0     0.000000
Length: 122, dtype: float64

In [21]:
def missing_values_table(train_df, show=True):
    """
    Summarize per-column missingness of `train_df`.

    Parameters:
    - train_df: pandas DataFrame to analyze
    - show: if True, print the whole summary with pandas display truncation disabled

    Returns a pandas DataFrame indexed by column name with 'missing_count' and
    'missing_pct' (0-100), ordered by missing_pct descending.

    Raises ValueError if `train_df` is None or has no `isnull` method.
    """
    import pandas as pd

    if train_df is None or not hasattr(train_df, 'isnull'):
        raise ValueError('train_df must be a valid pandas DataFrame')

    total_rows = len(train_df)
    null_counts = train_df.isnull().sum()
    if total_rows > 0:
        null_pct = (null_counts / total_rows) * 100
    else:
        # No rows: report 0.0 for every column rather than dividing by zero.
        null_pct = null_counts * 0.0

    summary = pd.DataFrame(
        {'missing_count': null_counts, 'missing_pct': null_pct}
    ).sort_values('missing_pct', ascending=False)

    if show:
        # Lift the row/column/width limits only for this print.
        with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
            print(summary)

    return summary

# Example usage:
# df_missing = missing_values_table(train_df)
# or to get the Series of percentages: df_missing['missing_pct']


## Linear-interpolation-based imputation

In [22]:
def interpolate_impute(train_df, cols=None, method='linear', limit_direction='both', axis=0, inplace=False, fill_remaining_with='median'):
    """
    Impute missing values in `train_df` by interpolation, with an optional fallback fill.

    Interpolation runs over the row order of the whole frame; rows are not grouped
    (for example by participant) before interpolating.

    Parameters:
    - train_df: pandas DataFrame
    - cols: list of columns to interpolate. If None, all numeric columns are used.
      Requested names that are not in the frame are ignored.
    - method: interpolation method passed to pandas.DataFrame.interpolate (default 'linear').
    - limit_direction: passed to interpolate (default 'both').
    - axis: axis for interpolation (0 for index, 1 for columns).
    - inplace: if True, modify train_df in place and return it; otherwise work on a copy.
    - fill_remaining_with: what to do with NaNs left after interpolation:
      'median' fills each column with its own median; 'ffill' forward-fills then
      backward-fills; None (or any other value) leaves them as-is.

    Returns the imputed DataFrame (the same object as `train_df` when inplace=True).

    Raises:
      ValueError: if `train_df` is None or has no `copy` method.

    Warns:
      RuntimeWarning: if interpolation fails, or if a column's median fill cannot be
      computed; the function then continues best-effort with the data it has.
    """
    import warnings
    import numpy as np

    if train_df is None or not hasattr(train_df, 'copy'):
        raise ValueError('train_df must be a valid pandas DataFrame')

    df = train_df if inplace else train_df.copy()

    if cols is None:
        # default: numeric columns only
        cols = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        # keep only the requested columns that actually exist
        cols = [c for c in cols if c in df.columns]

    if not cols:
        # nothing to do
        return df

    # Interpolate the selected columns; on failure keep going so the fallback fill can still run.
    try:
        df[cols] = df[cols].interpolate(method=method, limit_direction=limit_direction, axis=axis)
    except Exception as e:
        warnings.warn(
            f'Interpolation failed: {e}; returning original or partially-imputed DataFrame',
            RuntimeWarning,
            stacklevel=2,
        )

    # Optionally fill remaining NaNs
    if fill_remaining_with == 'median':
        for c in cols:
            if not df[c].isna().any():
                continue
            try:
                med = df[c].median()
                filled = df[c].fillna(med)
            except (TypeError, ValueError) as e:
                # e.g. a non-numeric column was requested explicitly; skip it but say so.
                warnings.warn(
                    f"Median fill skipped for column '{c}': {e}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            # Assign back to the frame: `df[c].fillna(med, inplace=True)` acts on a
            # temporary Series and does not update `df` under pandas Copy-on-Write.
            df[c] = filled
    elif fill_remaining_with == 'ffill':
        df[cols] = df[cols].ffill().bfill()
    # otherwise leave remaining NaNs as-is

    return df

# Impute the training set. cols=None selects every numeric column, and inplace=True
# means train_df itself is modified (the returned object is that same frame).
train_df = interpolate_impute(train_df, cols=None, method='linear', inplace=True)  # numeric cols only
# To leave the input untouched instead, pass inplace=False and keep the returned copy.


In [23]:
# Preview the first five rows of train_df after imputation.
train_df.head(5)

Unnamed: 0,participant_id,outcome,is_weekend,in_battery_saver_mode,charging_status,screen_on,dist_from_home,is_phone_locked,last_phone_usage,closeness_to_sleep_time,...,mi_wake_day_part_24.0,mi_wake_day_part_25.0,mi_wake_day_part_26.0,mi_wake_day_part_27.0,mi_wake_day_part_28.0,mi_wake_day_part_29.0,mi_wake_day_part_30.0,mi_wake_day_part_31.0,mi_wake_day_part_32.0,mi_wake_day_part_33.0
0,catsupexploitmocker@timestudy_com,0,0,0.0,1.0,0,3e-06,1.0,0.063333,0.923111,...,0,0,0,0,0,0,0,0,0,0
1,catsupexploitmocker@timestudy_com,0,0,0.0,1.0,0,5.3e-05,1.0,0.366667,0.906962,...,0,0,0,0,0,0,0,0,0,0
2,catsupexploitmocker@timestudy_com,0,0,0.0,1.0,0,5e-06,1.0,0.518333,0.898969,...,0,0,0,0,0,0,0,0,0,0
3,catsupexploitmocker@timestudy_com,0,0,0.0,1.0,1,5e-06,0.0,0.0,0.891005,...,0,0,0,0,0,0,0,0,0,0
4,catsupexploitmocker@timestudy_com,0,0,0.0,1.0,0,6e-06,1.0,0.016667,0.883027,...,0,0,0,0,0,0,0,0,0,0
