In [1]:
import numpy as np
import pandas as pd
import re
from darts.timeseries import TimeSeries
from darts.utils.missing_values import fill_missing_values as darts_fill_na

In [2]:
# Load the raw biometrics export into `biometrics`.
# NOTE(review): the following cell rebinds `biometrics` to the concatenation of the
# two 2022 CSV files, so on a top-to-bottom run this frame is discarded before it
# is used -- confirm which source is the intended input.
biometrics = pd.read_csv('data/bq-results-20240418-170723-1713460063623.csv')

In [3]:
# Load the two half-year 2022 exports and stack them into a single frame.
biometrics_2022_Q1_Q2 = pd.read_csv('data/BM_2022_Q1_Q2.csv')
biometrics_2022_Q3_Q4 = pd.read_csv('data/BM_2022_Q3_Q4.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# file contributes its own 0..k labels, so row labels repeat and any later
# label-based alignment (e.g. assigning a Series as a new column) becomes ambiguous.
biometrics = pd.concat([biometrics_2022_Q1_Q2, biometrics_2022_Q3_Q4], ignore_index=True)

# Training

## Process Data

### Biometrics

In [3]:
# A measurement is considered repeated when member, biometric, timestamp and value
# all match; only the first occurrence is kept.
measurement_identity = ['CloudId', 'BiometricName', 'MeasuredOnUTC', 'Value']
biometrics = biometrics.drop_duplicates(subset=measurement_identity, keep='first')

def clean_timestamp(ts_str):
    """Strip fractional seconds from a timestamp string.

    Every ``H:M:S`` group that is optionally followed by ``.digits`` is replaced
    by the bare ``H:M:S`` group; the rest of the string is left untouched.

    Args:
        ts_str: Timestamp text, or a missing value (``None`` / ``NaN``).

    Returns:
        The cleaned string. Non-string input is returned unchanged.
    """
    if not isinstance(ts_str, str):
        # Missing cells reach this function as None/float NaN; re.sub would raise
        # TypeError on them. Pass them through so that the subsequent
        # pd.to_datetime(..., errors='coerce') call can turn them into NaT.
        return ts_str
    return re.sub(r'(\d+:\d+:\d+)(\.\d+)?', r'\1', ts_str)

# Parse the measurement timestamp (fractional seconds removed first) and derive
# date features from it. Unparseable timestamps become NaT (errors='coerce').
biometrics['MeasuredOn'] = pd.to_datetime(
    biometrics['MeasuredOnUTC'].apply(clean_timestamp), errors='coerce'
)
# Compute the ISO calendar once and take BOTH week and year from it. An ISO week
# only identifies a period together with its ISO year: e.g. 2022-01-01 belongs to
# ISO week 52 of ISO year 2021, so pairing the ISO week with the calendar year
# (`.dt.year`) mislabels days around New Year.
iso_calendar = biometrics['MeasuredOn'].dt.isocalendar()
biometrics['MeasuredOnDate'] = biometrics['MeasuredOn'].dt.date
biometrics['MeasuredOnDay'] = biometrics['MeasuredOn'].dt.dayofyear
biometrics['MeasuredOnWeek'] = iso_calendar.week
biometrics['MeasuredOnYear'] = iso_calendar.year
biometrics.sort_values('MeasuredOnWeek', inplace=True)

In [4]:
# Collapse the raw measurements to one mean value per member, biometric and week.
# NOTE(review): the year is dropped before grouping, so measurements that share a
# week number but fall in different years are averaged together -- confirm this
# is acceptable for the data being loaded.
unused_columns = ['Age', 'Gender', 'MeasureProvidedBy', 'MeasuredOnUTC', 'MeasuredOn', 'MeasuredOnDate', 'MeasuredOnYear']
weekly_keys = ['CloudId', 'BiometricName', 'MeasuredOnWeek']
bms = (
    biometrics
    .drop(columns=unused_columns)
    .groupby(weekly_keys)
    .agg({'Value': 'mean'})
    .reset_index()
)
bms.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
47183,1643e9bbc304cd9c8018cea377ac8104b61118cd,Left Arm Fat Perc,52,35.9
93707,2c3e6e4508f71f25e5a9a6d4c358e44049b7c4af,Bone Mass,21,2.3
177285,5438e2211dd515a3dc2614d7c47e63a7950cf30e,Muscle Mass,47,60.3
248092,7534191a8a53c284edbba355e6d2cffda6cf1c67,Standard Fat Perc,25,29.0
61101,1dd2db292181e51bf8ead9f5863ef4e49387a401,Trunk Fat Mass,30,11.7


In [5]:
# For every (CloudId, BiometricName) pair collect the measured weeks (as a list and
# as a count) and the matching weekly values. The result has two-level columns.
pair_keys = ['CloudId', 'BiometricName']
grouped_bms = (
    bms
    .groupby(pair_keys)
    .agg({'MeasuredOnWeek': [list, 'count'], 'Value': list})
    .reset_index()
)
grouped_bms.sample(5)

Unnamed: 0_level_0,CloudId,BiometricName,MeasuredOnWeek,MeasuredOnWeek,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,list,count,list
125937,8990abfd1580b05b734851710aac3d406479902b,Standard Body Weight,[47],1,[63.6]
199747,da4b3aa096da1d47d293acbf72761f1908d9d16e,Weight Control,[27],1,[-6.2]
7768,092241d0f57fa612af9dd5c76c3f95047340a86d,Muscle Mass Balance Leg,"[48, 52]",2,"[1.0, 1.0]"
55418,3b6afc352a07dbbd85e1036186c1143cc0d9cd11,Weight,"[21, 22, 25, 34, 50]",5,"[59.9, 59.5, 59.3, 59.3, 57.3]"
56102,3c6d93c177ec0358b6c5b140fc0665a8e30a315c,Left Arm Fat Perc,[29],1,[29.0]


Use only (CloudId, BiometricName) pairs that have more than 3 weekly measurements and span at least 24 weeks.

In [6]:
# Boolean masks over grouped_bms: first-to-last week span of at least 24, and more
# than 3 weekly measurements.
week_lists = grouped_bms[('MeasuredOnWeek', 'list')]
has_long_interval = week_lists.apply(lambda weeks: max(weeks) - min(weeks) >= 24)
has_many_measurements = grouped_bms[('MeasuredOnWeek', 'count')] > 3
keep_pair = has_long_interval & has_many_measurements

# Report how many pairs satisfy each criterion, as a count and as a share of all pairs.
n_pairs = len(grouped_bms)
n_long = has_long_interval.sum()
n_many = has_many_measurements.sum()
n_both = keep_pair.sum()
print(f'Total number of (CloudId, BiometricName) pairs: {n_pairs}')
print(f'Number of pairs with long interval: {n_long} ({n_long / n_pairs * 100:.2f}%)')
print(f'Number of pairs with many measurements: {n_many} ({n_many / n_pairs * 100:.2f}%)')
print(f'Number of pairs with both long interval and many measurements: {n_both} ({n_both / n_pairs * 100:.2f}%)')

filtered_group_bms = grouped_bms[keep_pair]

Total number of (CloudId, BiometricName) pairs: 234079
Number of pairs with long interval: 38935 (16.63%)
Number of pairs with many measurements: 38991 (16.66%)
Number of pairs with both long interval and many measurements: 25394 (10.85%)


In [7]:
# Unroll the per-pair lists back into one row per (pair, week), drop the helper
# count column and flatten the two-level column index to its first level.
list_columns = [('MeasuredOnWeek', 'list'), ('Value', 'list')]
filtered_bms = (
    filtered_group_bms
    .explode(column=list_columns, ignore_index=True)
    .drop(columns=[('MeasuredOnWeek', 'count')])
)
filtered_bms.columns = filtered_bms.columns.get_level_values(0)
# explode leaves object-typed columns; restore numeric dtypes.
filtered_bms = filtered_bms.astype({'MeasuredOnWeek': 'int32', 'Value': 'float64'})
filtered_bms.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
79388,7128376020c87552d335522da7f60eec195ee242,Left Leg Muscle Mass,20,6.9
90735,80c9b5684dc5cb701825bb926bf5f644bc9375c3,Right Arm Fat Mass,50,0.6
96747,876fed2ad3fb1efb40ca76eeaab7c566f2386a63,Trunk Fat Mass,29,11.2
74626,691882df8eb30ebc1c59caa6a94f6b10378c6346,BMI,39,25.74
32815,2cb553586d3a0c96195adc551c9c8bcba72f839a,Extra Cellular Water,27,14.2


Compute the time series for each (CloudId, Biometric) pair

In [8]:
# One row per retained (CloudId, BiometricName) pair, holding its weeks and values
# as parallel lists.
series_keys = ['CloudId', 'BiometricName']
bms_df = (
    filtered_bms
    .groupby(series_keys)
    .agg({'MeasuredOnWeek': list, 'Value': list})
    .reset_index()
)
bms_df.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
9937,62cd862a49d3b8b21885b64e4bc57ad6a20200a8,Metabolic Age,"[16, 32, 34, 36, 48]","[40.0, 41.0, 41.0, 41.0, 34.0]"
9996,62d50cfdc505a9fe0e9668fd1685ecd5d7af17a2,Right Leg Fat Mass,"[20, 27, 30, 34, 38, 48, 50, 52]","[3.2, 2.6, 0.5, 2.9, 2.45, 2.7, 2.55, 3.1]"
20277,c89696e13e1964d5f2a1613f3ad24f7d17e497d7,Weight,"[16, 25, 34, 39, 41, 48, 50]","[67.099998, 66.199997, 66.599998, 65.900002, 6..."
7692,4ba20ab3ebcd039f5d7cbdc63e957cf900c8e24a,Left Leg Fat Perc,"[20, 22, 25, 34, 41, 50]","[37.3, 39.6, 39.4, 38.3, 38.2, 38.7]"
17530,aa92c7453b25f120441d388ae34900e451d248d3,Trunk Muscle Mass Score,"[20, 27, 29, 38, 48, 50]","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"


In [9]:
# Shorten the column names used from here on.
short_names = {'BiometricName': 'Name', 'MeasuredOnWeek': 'Week', 'Value': 'Value'}
bms_df = bms_df.rename(columns=short_names)

### Static Features

Add gender and age as static covariates

In [10]:
# One (Gender, Age) row per member: the first row encountered for each CloudId.
static_cov_df = biometrics.loc[:, ['CloudId', 'Gender', 'Age']].drop_duplicates(subset='CloudId')

# Attach the static attributes to every biometric series and fix the column order.
reordered_cols = ['CloudId', 'Gender', 'Age', 'Name', 'Week', 'Value']
df = pd.merge(bms_df, static_cov_df, on='CloudId').reindex(columns=reordered_cols)

### Exercise Data

In [11]:
# Load the weekly exercise table and display it.
# Alternative input, currently disabled:
# exercises = pd.read_pickle('data/exercise_muscle_df.pkl')
# NOTE(review): pd.read_pickle can execute arbitrary code while loading; only use
# it on files you produced or trust.
exercises = pd.read_pickle('data/exercise_df.pkl')
exercises

Unnamed: 0,CloudId,Year,Week,total_calories_week,total_minutes_week,cardio_calories_week,cardio_minutes_week,isotonic_calories_week,isotonic_minutes_week,avg_duration_per_workout,avg_calories_per_workout,avg_metsmin_workout,avg_isotonic_workouts,avg_cardio_workouts
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,20,487.0,3775,218.0,1260.0,269.0,2515.0,3775.000000,487.000000,56.900000,9.0,2.0
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,21,1918.0,12326,1120.0,4800.0,798.0,7526.0,4108.666667,639.333333,60.966667,27.0,6.0
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,22,1093.0,8237,553.0,2945.0,540.0,5292.0,4118.500000,546.500000,55.450000,18.0,4.0
6,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,23,321.0,1902,169.0,900.0,152.0,1002.0,1902.000000,321.000000,38.100000,4.0,2.0
7,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,24,1129.0,8075,585.0,3000.0,544.0,5075.0,4037.500000,564.500000,58.500000,18.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364745,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,38,490.0,3928,266.0,1200.0,224.0,2728.0,3928.000000,490.000000,45.100000,10.0,1.0
364746,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,44,262.0,1500,243.0,1200.0,19.0,300.0,1500.000000,262.000000,11.200000,1.0,1.0
364747,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,45,564.0,4549,280.0,1200.0,284.0,3349.0,4549.000000,564.000000,55.900000,13.0,1.0
364748,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,46,351.0,2208,261.0,1200.0,90.0,1008.0,2208.000000,351.000000,24.100000,4.0,1.0


In [12]:
# Normalise exercise column names. Keys that are not present in `exercises` are
# ignored by rename.
# NOTE(review): none of these keys appear in the columns displayed for
# 'data/exercise_df.pkl'; presumably they belong to the alternative
# exercise_muscle_df input -- confirm.
exercise_column_renames = {
    'upper_calories_week': 'upper_body_calories_week',
    'lower_calories_week': 'lower_body_calories_week',
    'core_calories_week': 'core_calories_week',
    'upper_minutes_week': 'upper_body_minutes_week',
    'lower_minutes_week': 'lower_body_minutes_week',
    'core_minutes_week': 'core_minutes_week',
    'avg_core_body_workouts': 'avg_core_workouts_week',
    'total_calories_week_total_body': 'total_body_calories_week',
    'total_minutes_week_total_body': 'total_body_minutes_week',
}
exercises = exercises.rename(columns=exercise_column_renames)

In [13]:
# Collapse the weekly exercise rows into one row per member, with every remaining
# column turned into a list (in row order).
# The columns are chosen by name instead of by position (`columns[2:]`), so the
# result no longer depends on 'CloudId' and 'Year' being the first two columns.
# NOTE(review): 'Year' is not carried into the lists, so week numbers from
# different years cannot be told apart downstream -- confirm the input covers a
# single year.
non_series_columns = ('CloudId', 'Year')
exercise_series_columns = [col for col in exercises.columns if col not in non_series_columns]
exercises_df = exercises.groupby('CloudId').agg(
    {col: list for col in exercise_series_columns}
).reset_index()

In [14]:
# Join each biometric series with its member's exercise lists. Column names that
# exist on both sides receive pandas' default '_x' / '_y' suffixes.
ts_df = pd.merge(df, exercises_df, on='CloudId')

def strip_col_suffix(col):
    """Return ``col`` with one trailing ``_x`` or ``_y`` removed, if present."""
    merge_suffix = re.compile(r'_[xy]$')
    return merge_suffix.sub('', col)

# Give the merged frame two-level column labels: the first three columns are the
# member identifiers, the next three fall under 'Biometrics' and every remaining
# column under 'Exercise'. Merge suffixes are stripped from the second level.
id_columns = [('CloudId', ''), ('Gender', ''), ('Age', '')]
biometric_columns = [('Biometrics', strip_col_suffix(col)) for col in ts_df.columns[3:6]]
exercise_columns = [('Exercise', strip_col_suffix(col)) for col in ts_df.columns[6:]]
ts_df.columns = pd.MultiIndex.from_tuples(id_columns + biometric_columns + exercise_columns)

In [15]:
# Checkpoint the merged biometrics + exercise frame to disk (written relative to
# the current working directory).
ts_df.to_pickle('processed_dataset.pkl')

### Compare time spans

## TimeSeries Creation

Other tasks:
- Exercise features should be encoded as covariates
- Month could also be encoded as covariate
- Consider using other abundant biometrics as covariates

In [16]:
# Reload the checkpoint so this section can be run without redoing the processing
# cells. NOTE(review): pd.read_pickle should only be used on trusted files.
ts_df = pd.read_pickle('processed_dataset.pkl')

In [17]:
# Exercise data repeats on every biometric row of a member, so keep a single row
# per CloudId; the biometric frame keeps one row per series.
exercise_groups = ['CloudId', 'Exercise']
biometric_groups = ['CloudId', 'Gender', 'Age', 'Biometrics']
ex_df = ts_df[exercise_groups].drop_duplicates(subset=('CloudId', ''))
bm_df = ts_df[biometric_groups]

In [18]:
def create_bm_timeseries(row):
    """Build the darts ``TimeSeries`` for one biometric row.

    Args:
        row: A row whose index holds ``('Biometrics', 'Week')``,
            ``('Biometrics', 'Value')``, ``('Biometrics', 'Name')``,
            ``('Gender', '')`` and ``('Age', '')``.

    Returns:
        A single-component series named after the biometric, indexed by week with
        ``freq=1``, carrying Gender and Age as static covariates, passed through
        ``fill_missing_values(fill='auto')`` and cast to float32.
    """
    biometric_name = row[('Biometrics', 'Name')]
    static_covariates = pd.DataFrame(
        data={'Gender': [row[('Gender', '')]], 'Age': [row[('Age', '')]]}
    )
    series = TimeSeries.from_times_and_values(
        times=pd.Index(row[('Biometrics', 'Week')]),
        values=row[('Biometrics', 'Value')],
        columns=[biometric_name],
        static_covariates=static_covariates,
        freq=1,
    )
    return darts_fill_na(series, fill='auto').astype(np.float32)

def create_ex_timeseries(row):
    """Build one multi-component darts ``TimeSeries`` from a row's exercise lists.

    Every entry under the ``'Exercise'`` group of ``row`` except ``'Week'`` (the
    time axis) and ``'TimeSeries'`` (a previously computed result) becomes one
    component. Each component is indexed by ``('Exercise', 'Week')`` with
    ``freq=1``, has missing values replaced by 0 and is cast to float32; the
    components are then stacked in column order.

    Args:
        row: A row whose index is a two-level (group, name) index containing
            ``('Exercise', 'Week')`` and the exercise feature lists.

    Returns:
        The stacked series, or ``None`` when the row has no exercise features.
    """
    times = pd.Index(row[('Exercise', 'Week')])
    # Read the feature names from the row itself rather than from the global
    # `ex_df`: the function then works on any frame with this layout, and it keeps
    # working after the ('Exercise', 'TimeSeries') result column has been inserted.
    feature_names = [
        name for group, name in row.index
        if group == 'Exercise' and name not in ('Week', 'TimeSeries')
    ]
    agg_ts = None
    for name in feature_names:
        ts = TimeSeries.from_times_and_values(
            times=times, values=row[('Exercise', name)], columns=[name], freq=1
        )
        # Weeks without a record are filled with 0.
        ts = darts_fill_na(ts, fill=0.).astype(np.float32)
        agg_ts = ts if agg_ts is None else agg_ts.stack(ts)
    return agg_ts

In [19]:
# Build one TimeSeries per biometric row and append it as the last column.
bm_tms = bm_df.apply(create_bm_timeseries, axis=1)
bm_df.insert(loc=bm_df.shape[1], column=('Biometrics', 'TimeSeries'), value=bm_tms)

In [20]:
# Build one stacked exercise TimeSeries per member and append it as the last column.
ex_tms = ex_df.apply(create_ex_timeseries, axis=1)
ex_df.insert(loc=ex_df.shape[1], column=('Exercise', 'TimeSeries'), value=ex_tms)

Create dataframe with all necessary timeseries

In [21]:
# Keep the identifiers, the biometric name and its series, then replace the
# two-level labels with flat column names.
train_source_columns = [
    ('CloudId', ''),
    ('Gender', ''),
    ('Age', ''),
    ('Biometrics', 'Name'),
    ('Biometrics', 'TimeSeries'),
]
train_df = bm_df.loc[:, train_source_columns].set_axis(
    ['CloudId', 'Gender', 'Age', 'Biometric', 'BiometricTimeSeries'], axis=1
)

In [22]:
# Reduce the exercise frame to the member id and its stacked series, with flat
# column names.
exercise_source_columns = [('CloudId', ''), ('Exercise', 'TimeSeries')]
ex_df_stripped = ex_df.loc[:, exercise_source_columns].set_axis(
    ['CloudId', 'ExercisesTimeSeries'], axis=1
)

In [23]:
# Attach each member's exercise series to all of that member's biometric series
# (inner join on CloudId).
train_df = pd.merge(train_df, ex_df_stripped, on='CloudId')

In [24]:
# Persist the final training frame (one row per biometric series, with the
# member's exercise series) to disk, relative to the current working directory.
train_df.to_pickle('timeseries_dataset.pkl')