In [2]:
import numpy as np
import pandas as pd
import re
from darts.timeseries import TimeSeries
from darts.utils.missing_values import fill_missing_values as darts_fill_na

In [None]:
# Load the biometrics table from a single CSV export.
# NOTE(review): this cell shows no execution count, and `biometrics` is reassigned by the
# Q1-Q2 / Q3-Q4 concat in the following cell, so this load is overwritten when both run.
biometrics = pd.read_csv('data/bq-results-20240418-170723-1713460063623.csv')

In [19]:
# Load the two half-year 2022 biometrics exports and stack them into a single frame.
biometrics_2022_Q1_Q2 = pd.read_csv('data/BM_2022_Q1_Q2.csv')
biometrics_2022_Q3_Q4 = pd.read_csv('data/BM_2022_Q3_Q4.csv')
# Each CSV is read with its own default 0..n-1 row labels, so a plain concat would leave
# duplicate index labels in the combined frame; ignore_index=True renumbers the rows.
biometrics = pd.concat([biometrics_2022_Q1_Q2, biometrics_2022_Q3_Q4], ignore_index=True)

# Training

## Process Data

### Biometrics

In [21]:
# Collapse repeated readings: rows that agree on CloudId, biometric name, timestamp and
# value are reduced to their first occurrence.
duplicate_key = ['CloudId', 'BiometricName', 'MeasuredOnUTC', 'Value']
biometrics = biometrics.drop_duplicates(subset=duplicate_key, keep='first')

def clean_timestamp(ts_str):
    """Strip fractional seconds from the time part of a timestamp string.

    Every ``digits:digits:digits`` group that is optionally followed by
    ``.digits`` is replaced by the ``digits:digits:digits`` part alone; the
    rest of the string is left untouched.

    Parameters
    ----------
    ts_str : str or any
        Raw timestamp text. Non-string values (e.g. ``None`` or a float NaN
        standing for a missing cell) are returned unchanged instead of raising
        ``TypeError`` from ``re.sub``, so that a subsequent
        ``pd.to_datetime(..., errors='coerce')`` can map them to ``NaT``.

    Returns
    -------
    str or any
        The cleaned string, or the input itself when it is not a string.
    """
    if not isinstance(ts_str, str):
        return ts_str
    return re.sub(r'(\d+:\d+:\d+)(\.\d+)?', r'\1', ts_str)

# Parse the measurement timestamps (fractional seconds are stripped first; anything that
# still fails to parse becomes NaT) and derive the calendar fields from the parsed values.
measured_on = pd.to_datetime(
    biometrics['MeasuredOnUTC'].apply(clean_timestamp),
    errors='coerce',
)
biometrics['MeasuredOn'] = measured_on
biometrics['MeasuredOnDate'] = measured_on.dt.date
biometrics['MeasuredOnDay'] = measured_on.dt.dayofyear
# NOTE(review): this is the ISO-8601 week, which belongs to the ISO year, whereas
# MeasuredOnYear below is the calendar year. Dates in the first days of January can carry
# week 52/53 (e.g. 2022-01-01 is ISO week 52) — confirm this is the intended week index.
biometrics['MeasuredOnWeek'] = measured_on.dt.isocalendar().week
biometrics['MeasuredOnYear'] = measured_on.dt.year
biometrics = biometrics.sort_values('MeasuredOnWeek')

In [22]:
# Average the readings that fall in the same week for each (CloudId, BiometricName) pair.
# NOTE(review): MeasuredOnYear is dropped and the grouping key is the week number only, so
# readings sharing a week number across different years would be averaged together.
unused_cols = ['Age', 'Gender', 'MeasureProvidedBy', 'MeasuredOnUTC', 'MeasuredOn', 'MeasuredOnDate', 'MeasuredOnYear']
bms = (
    biometrics
    .drop(columns=unused_cols)
    .groupby(['CloudId', 'BiometricName', 'MeasuredOnWeek'])
    .agg({'Value': 'mean'})
    .reset_index()
)
bms.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
2074557,642043ea5442d09b283aa4efb96b123afe4e76c3,Basal Metabolic Rate,12,1988.0
4032393,c1dcd96a5278f6455d848e2939103b2947b7abe1,Muscle Mass,36,36.15
4182660,c80bab50506275fb6aa6200fdf108040966dbd7d,Fat Mass Perc of Ideal Left Arm,15,62.22
4262857,cbe07e997d959992706a1313cede77df7ed59bb5,Trunk Fat Perc Score,13,-1.0
3037080,8fb89ddffcf5a8a0ea815faf4e97e176985a6f3c,Fat Mass,3,21.9


In [23]:
# One row per (CloudId, BiometricName) pair: the list of weeks with a reading, how many
# such weeks there are, and the matching list of weekly mean values.
pair_aggregations = {'MeasuredOnWeek': [list, 'count'], 'Value': list}
grouped_bms = (
    bms
    .groupby(['CloudId', 'BiometricName'])
    .agg(pair_aggregations)
    .reset_index()
)
grouped_bms.sample(5)

Unnamed: 0_level_0,CloudId,BiometricName,MeasuredOnWeek,MeasuredOnWeek,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,list,count,list
204463,7b8d30c72d5c49c9955be6635bc769cbdc2ac43e,Standard Body Weight,"[6, 8, 11, 18, 24, 36, 39, 44]",8,"[52.9, 52.9, 52.9, 52.9, 52.9, 52.9, 52.9, 52.9]"
153772,5d278a308e8374b1517284185d24843ae7bc5fb4,Growth Score,"[13, 19, 30, 36, 40, 46]",6,"[66.0, 67.0, 69.0, 69.0, 69.0, 68.0]"
285493,ace83dffa2ec853f9bb4aeb6505131e7068026f8,Muscle Mass Balance Arm,"[1, 2, 5, 6, 7, 9, 10, 12, 14, 15, 19, 20, 21,...",32,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, ..."
82906,315e85264517f4c5db78a059a15b29ba6937eee7,Basal Metabolic Rate Score,"[16, 28, 31, 39, 50]",5,"[10.0, 10.0, 10.0, 11.0, 10.0]"
204729,7bb2fbff14eabcc30a01282c520f8f02a1e95944,Skeletal Muscle Mass,"[2, 7, 18, 25, 31, 37, 45]",7,"[33.1, 32.7, 32.7, 32.0, 33.0, 33.2, 33.6]"


Keep only (CloudId, BiometricName) pairs that have more than 3 weekly measurements and span at least 24 weeks.

In [24]:
# Span criterion: last recorded week minus first recorded week is at least 24.
week_lists = grouped_bms[('MeasuredOnWeek', 'list')]
has_long_interval = week_lists.apply(lambda weeks: max(weeks) - min(weeks) >= 24)
# Count criterion: strictly more than 3 weekly readings.
has_many_measurements = grouped_bms[('MeasuredOnWeek', 'count')].gt(3)

# Report how many pairs satisfy each criterion, and both together.
print(f'Total number of (CloudId, BiometricName) pairs: {len(grouped_bms)}')
print(f'Number of pairs with long interval: {has_long_interval.sum()} ({has_long_interval.sum() / len(grouped_bms) * 100:.2f}%)')
print(f'Number of pairs with many measurements: {has_many_measurements.sum()} ({has_many_measurements.sum() / len(grouped_bms) * 100:.2f}%)')
print(f'Number of pairs with both long interval and many measurements: {(has_long_interval & has_many_measurements).sum()} ({(has_long_interval & has_many_measurements).sum() / len(grouped_bms) * 100:.2f}%)')

# Keep only the pairs that satisfy both criteria.
filtered_group_bms = grouped_bms.loc[has_long_interval & has_many_measurements]

Total number of (CloudId, BiometricName) pairs: 424623
Number of pairs with long interval: 369983 (87.13%)
Number of pairs with many measurements: 357822 (84.27%)
Number of pairs with both long interval and many measurements: 348755 (82.13%)


In [25]:
# Unpack the per-pair week/value lists back into one row per (pair, week).
exploded = filtered_group_bms.explode(
    column=[('MeasuredOnWeek', 'list'), ('Value', 'list')],
    ignore_index=True,
)
# The measurement count was only needed for filtering.
exploded = exploded.drop(columns=[('MeasuredOnWeek', 'count')])
# Flatten the two-level column labels down to their first level.
exploded.columns = [top for top, _ in exploded.columns]
# explode leaves object-typed columns; restore numeric dtypes.
filtered_bms = exploded.astype({'MeasuredOnWeek': 'int32', 'Value': 'float64'})
filtered_bms.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
3160308,9c2def4ffcf8bf2a57cfb4d0381bd2f463a1e653,Muscle Mass Balance Leg,8,1.0
1807959,5a66ca699c0e3b3d142a966de374d14a8b0e9ab5,Muscle Mass,43,66.2
3597355,b16e6b6b6a0df5e9fd7a0450e6f65fa0840a9c8f,Right Leg Fat Perc Score,38,-3.0
2519795,7c79593036ca52755ae3033d2979b8ac6050a7b8,Right Arm Fat Mass,44,0.4
1662964,52b03e692a177e5cf1f2a0b75392bca5605b5ac8,Right Arm Muscle Mass,3,2.5


Compute the time series for each (CloudId, BiometricName) pair.

In [26]:
# Re-collect the filtered long-format rows into one row per (CloudId, BiometricName),
# with the weeks and the values each gathered into a list.
bms_df = (
    filtered_bms
    .groupby(['CloudId', 'BiometricName'])
    .agg({'MeasuredOnWeek': list, 'Value': list})
    .reset_index()
)
bms_df.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
240946,b265f013db26add70d14cda113f59ffab2c402a1,Right Leg Muscle Mass,"[6, 20, 23, 26, 28, 30, 38, 41, 46]","[5.6, 5.6, 5.5, 5.9, 5.9, 5.6, 5.4, 5.5, 5.4]"
203066,9680ed34cb2a3183b35b760b2c8bb812ae0c0c53,Left Arm Fat Mass,"[5, 9, 15, 19, 25, 39, 45, 50]","[0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6]"
216580,a0bc0f53bcc2bedc365445d2ca8ced3459b53e12,Total Body Water,"[6, 24, 36, 39, 42]","[43.9, 45.2, 43.2, 42.2, 43.2]"
313504,e6fa5b818efb992be24876a5d2305c002ac7cbe2,Height,"[3, 8, 10, 14, 16, 18, 20, 21, 22, 25, 26, 27,...","[175.0, 175.0, 175.0, 175.0, 175.0, 175.0, 175..."
268478,c7069279ded47b3d692c5fc8f0542a4cd41210d0,Body cell mass,"[12, 20, 27, 40, 52]","[38.8, 39.1, 39.3, 38.8, 39.2]"


In [27]:
# Shorten the column names used for the time-series frame ('Value' maps to itself).
bms_column_names = {'BiometricName': 'Name', 'MeasuredOnWeek': 'Week', 'Value': 'Value'}
bms_df = bms_df.rename(columns=bms_column_names)

### Static Features

Add gender and age as static covariates

In [31]:
# One Gender/Age row per CloudId. drop_duplicates keeps the first row it encounters for
# each CloudId, so if Gender or Age differ across a CloudId's rows the first one wins.
static_cov_df = (
    biometrics[['CloudId', 'Gender', 'Age']]
    .drop_duplicates(subset='CloudId')
)

# Attach the static covariates to every biometric series and put the columns in a fixed order.
reordered_cols = ['CloudId', 'Gender', 'Age', 'Name', 'Week', 'Value']
df = (
    bms_df
    .merge(static_cov_df, on='CloudId')
    .reindex(columns=reordered_cols)
)

### Exercise Data

In [6]:
# Load the per-CloudId, per-week exercise table from a local pickle and display it.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
# Alternative input file, kept for reference:
# exercises = pd.read_pickle('data/exercise_muscle_df.pkl')
exercises = pd.read_pickle('data/exercise_df.pkl')
exercises

Unnamed: 0,CloudId,Year,Week,total_calories_week,total_minutes_week,cardio_calories_week,cardio_minutes_week,isotonic_calories_week,isotonic_minutes_week,upper_calories_week,...,total_minutes_week_total_body,avg_duration_per_workout,avg_calories_per_workout,avg_metsmin_workout,avg_isotonic_workouts,avg_cardio_workouts,avg_upper_body_workouts,avg_lower_body_workouts,avg_core_body_workouts,avg_total_body_workouts
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022,13,202.0,2112,0.0,0.0,202.0,2112.0,124.0,...,0.0,2112.000000,202.0,28.8,7.0,0.0,5.0,0.0,2.0,0.0
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022,14,2001.0,12203,1422.0,6825.0,579.0,5378.0,250.0,...,6825.0,4067.666667,667.0,39.9,18.0,5.0,10.0,4.0,4.0,5.0
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022,15,825.0,8187,133.0,1209.0,692.0,6978.0,372.0,...,1209.0,2729.000000,275.0,35.4,23.0,2.0,15.0,2.0,6.0,2.0
7,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022,16,372.0,3492,84.0,605.0,288.0,2887.0,124.0,...,605.0,3492.000000,372.0,43.1,9.0,1.0,5.0,2.0,2.0,1.0
8,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2022,17,846.0,8067,68.0,604.0,778.0,7463.0,372.0,...,604.0,2689.000000,282.0,39.5,25.0,1.0,15.0,4.0,6.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395613,fffb154edf22df2daca3c00042c99166f3f55fb7,2022,47,586.0,3693,295.0,1200.0,291.0,2493.0,87.0,...,1200.0,3693.000000,586.0,68.0,11.0,1.0,4.0,4.0,3.0,1.0
395614,fffb154edf22df2daca3c00042c99166f3f55fb7,2022,48,625.0,3860,392.0,1800.0,233.0,2060.0,69.0,...,1800.0,3860.000000,625.0,63.6,9.0,2.0,3.0,4.0,2.0,2.0
395615,fffb154edf22df2daca3c00042c99166f3f55fb7,2022,49,619.0,2646,619.0,2646.0,0.0,0.0,0.0,...,2646.0,1323.000000,309.5,15.8,0.0,3.0,0.0,0.0,0.0,3.0
395617,fffb154edf22df2daca3c00042c99166f3f55fb7,2022,50,274.0,1200,274.0,1200.0,0.0,0.0,0.0,...,1200.0,1200.000000,274.0,10.3,0.0,1.0,0.0,0.0,0.0,1.0


In [11]:
# Rename the body-region exercise columns. Entries that map a name to itself are no-ops
# and are kept only to list the full set of region columns in one place.
# NOTE(review): 'avg_core_workouts_week' gains a '_week' suffix that the other avg_* workout
# columns shown in the table above do not have — confirm the intended name.
exercise_column_renames = {
    'upper_calories_week': 'upper_body_calories_week',
    'lower_calories_week': 'lower_body_calories_week',
    'core_calories_week': 'core_calories_week',
    'upper_minutes_week': 'upper_body_minutes_week',
    'lower_minutes_week': 'lower_body_minutes_week',
    'core_minutes_week': 'core_minutes_week',
    'avg_core_body_workouts': 'avg_core_workouts_week',
    'total_calories_week_total_body': 'total_body_calories_week',
    'total_minutes_week_total_body': 'total_body_minutes_week',
}
exercises = exercises.rename(columns=exercise_column_renames)

In [17]:
# Collapse the weekly exercise rows into one row per CloudId: every remaining column
# (Week and all exercise features) becomes a list, in the order the rows appear.
# The columns are selected by name instead of by position (previously `columns[2:]`), so
# the result no longer depends on CloudId and Year happening to be the first two columns.
# NOTE(review): Year is not carried over, so week numbers from different years would end up
# in the same list — fine while the data covers a single year.
list_columns = [col for col in exercises.columns if col not in ('CloudId', 'Year')]
exercises_df = exercises.groupby('CloudId').agg(
    {col: list for col in list_columns}
).reset_index()

In [32]:
# Attach each CloudId's exercise lists to every biometric row of that CloudId.
# `merge` defaults to an inner join, so CloudIds missing from either frame are dropped, and
# non-key columns present in both frames (e.g. Week) come back with _x / _y suffixes.
ts_df = df.merge(exercises_df, on='CloudId')

def strip_col_suffix(col):
    """Return `col` without a trailing ``_x`` or ``_y`` merge suffix.

    Names that do not end in one of those two suffixes are returned unchanged;
    only a suffix at the very end of the name is removed.
    """
    stripped = re.sub(r'_[xy]$', '', col)
    return stripped

# Re-label the merged frame with two-level column names. The grouping is positional:
# the first three columns are the identifiers, the next three form the 'Biometrics' group
# and everything after that forms the 'Exercise' group.
id_labels = [('CloudId', ''), ('Gender', ''), ('Age', '')]
biometric_labels = [('Biometrics', strip_col_suffix(name)) for name in ts_df.columns[3:6]]
exercise_labels = [('Exercise', strip_col_suffix(name)) for name in ts_df.columns[6:]]
ts_df.columns = pd.MultiIndex.from_tuples(id_labels + biometric_labels + exercise_labels)

In [33]:
# Persist the merged biometrics + exercise frame; the TimeSeries Creation section reloads
# it from this file.
ts_df.to_pickle('processed_dataset.pkl')

### Compare time spans

## TimeSeries Creation

Other tasks:
- Exercise features should be encoded as covariates
- Month could also be encoded as covariate
- Consider using other abundant biometrics as covariates

In [35]:
# Reload the frame written by the Process Data section, so this section can start from
# the saved file.
# NOTE(review): unpickling can execute arbitrary code — only load pickles you produced yourself.
ts_df = pd.read_pickle('processed_dataset.pkl')

In [36]:
# Exercise columns: keep a single row per CloudId (deduplicated on the CloudId column only).
exercise_part = ts_df[['CloudId', 'Exercise']]
ex_df = exercise_part.drop_duplicates(subset=('CloudId', ''))

# Biometric columns: every row is kept, together with the identifier and static columns.
bm_df = ts_df[['CloudId', 'Gender', 'Age', 'Biometrics']]

In [37]:
def create_bm_timeseries(row):
    """Build the biometric ``TimeSeries`` for one row of the biometrics frame.

    Parameters
    ----------
    row : pandas.Series
        A row carrying the labels ``('Biometrics', 'Week')``,
        ``('Biometrics', 'Value')``, ``('Biometrics', 'Name')``,
        ``('Gender', '')`` and ``('Age', '')``.

    Returns
    -------
    TimeSeries
        A single-component series named after the biometric, indexed by week
        number, with Gender and Age attached as static covariates. Missing
        values are filled with ``fill='auto'`` and the result is cast to float32.
    """
    week_index = pd.Index(row[('Biometrics', 'Week')])
    static_covariates = pd.DataFrame(
        data={'Gender': [row[('Gender', '')]], 'Age': [row[('Age', '')]]}
    )
    # NOTE(review): freq=1 is presumably what makes darts expand the sparse week numbers to
    # a step-1 index with gaps to fill — confirm against the installed darts version.
    series = TimeSeries.from_times_and_values(
        times=week_index,
        values=row[('Biometrics', 'Value')],
        columns=[row[('Biometrics', 'Name')]],
        static_covariates=static_covariates,
        freq=1,
    )
    return darts_fill_na(series, fill='auto').astype(np.float32)

def create_ex_timeseries(row):
    """Build one multi-component exercise ``TimeSeries`` for one row.

    The feature names are taken from the ``'Exercise'`` entries of `row` itself
    rather than from the module-level ``ex_df``, so the function only depends
    on its argument.

    Parameters
    ----------
    row : pandas.Series
        A row with two-level labels whose ``'Exercise'`` group holds a
        ``'Week'`` list plus one list per exercise feature.

    Returns
    -------
    TimeSeries or None
        The per-feature series stacked in label order, each with missing
        values filled with 0 and cast to float32; ``None`` when the row has no
        exercise feature besides ``'Week'``.
    """
    times = pd.Index(row[('Exercise', 'Week')])
    # Partial indexing on the first label level yields the second-level labels.
    feature_names = [col for col in row['Exercise'].index if col != 'Week']
    agg_ts = None
    for col in feature_names:
        values = row[('Exercise', col)]
        ts = TimeSeries.from_times_and_values(times=times, values=values, columns=[col], freq=1)
        # Missing weeks are filled with 0 rather than interpolated.
        ts = darts_fill_na(ts, fill=0.).astype(np.float32)
        agg_ts = ts if agg_ts is None else agg_ts.stack(ts)
    return agg_ts

In [38]:
# Build one biometric TimeSeries per row and append it as the last column.
bm_tms = bm_df.apply(create_bm_timeseries, axis=1)
bm_df.insert(
    loc=len(bm_df.columns),
    column=('Biometrics', 'TimeSeries'),
    value=bm_tms,
)

In [39]:
# Build one stacked exercise TimeSeries per CloudId row and append it as the last column.
ex_tms = ex_df.apply(create_ex_timeseries, axis=1)
ex_df.insert(
    loc=len(ex_df.columns),
    column=('Exercise', 'TimeSeries'),
    value=ex_tms,
)

Create dataframe with all necessary timeseries

In [44]:
# Keep the identifiers, the static covariates, the biometric name and its TimeSeries,
# then replace the two-level column labels with flat names.
biometric_keep = [
    ('CloudId', ''),
    ('Gender', ''),
    ('Age', ''),
    ('Biometrics', 'Name'),
    ('Biometrics', 'TimeSeries'),
]
train_df = bm_df.loc[:, biometric_keep]
train_df.columns = ['CloudId', 'Gender', 'Age', 'Biometric', 'BiometricTimeSeries']

In [45]:
# From the exercise frame keep only the CloudId and its stacked TimeSeries, with flat names.
exercise_keep = [('CloudId', ''), ('Exercise', 'TimeSeries')]
ex_df_stripped = ex_df.loc[:, exercise_keep]
ex_df_stripped.columns = ['CloudId', 'ExercisesTimeSeries']

In [46]:
# Give every biometric row the exercise TimeSeries of its CloudId (default inner join on CloudId).
train_df = train_df.merge(ex_df_stripped, on='CloudId')

In [47]:
# Persist the final training frame (one biometric and one exercise TimeSeries per row).
train_df.to_pickle('timeseries_dataset.pkl')