In [1]:
import numpy as np
import pandas as pd
import re
from darts.timeseries import TimeSeries
from darts.utils.missing_values import fill_missing_values as darts_fill_na

In [2]:
# Raw CSV exports, keyed by the export's file name without directory and extension.
csv_dfs = {
    'bq-results-20240418-170723-1713460063623': pd.read_csv('data/bq-results-20240418-170723-1713460063623.csv'),
}

# Training

## Process Data

### Biometrics

In [3]:
# Rows that agree on all of these columns are treated as the same measurement.
dedup_keys = ['CloudId', 'BiometricName', 'MeasuredOnUTC', 'Value']

# `biometrics` is the same object as the entry in `csv_dfs`, and the
# de-duplication happens in place, so the stored raw frame is modified too.
biometrics = csv_dfs['bq-results-20240418-170723-1713460063623']
biometrics.drop_duplicates(subset=dedup_keys, keep='first', inplace=True)

def clean_timestamp(ts_str):
    """Strip the fractional-seconds part from a timestamp string.

    Any ``H:M:S.fraction`` occurrence is reduced to ``H:M:S``; everything else
    in the string is left as it is.

    Args:
        ts_str: Timestamp text. Non-string values (e.g. ``None`` or a float
            NaN standing in for a missing timestamp) are accepted.

    Returns:
        The cleaned string, or ``ts_str`` unchanged when it is not a string.
    """
    # `re.sub` raises TypeError on non-strings, which would abort a whole
    # `Series.apply` on the first missing value. Pass such values through so a
    # later `pd.to_datetime(..., errors='coerce')` can turn them into NaT.
    if not isinstance(ts_str, str):
        return ts_str
    return re.sub(r'(\d+:\d+:\d+)(\.\d+)?', r'\1', ts_str)

# Parse the raw timestamps after removing fractional seconds; values that
# cannot be parsed become NaT because of errors='coerce'.
stripped_utc = biometrics['MeasuredOnUTC'].apply(clean_timestamp)
biometrics['MeasuredOn'] = pd.to_datetime(stripped_utc, errors='coerce')

# Calendar features derived from the parsed timestamp.
# NOTE(review): the ISO week number carries no year, so measurements from
# different years would share the same week value - confirm the export covers
# a single year.
measured_dt = biometrics['MeasuredOn'].dt
biometrics['MeasuredOnDate'] = measured_dt.date
biometrics['MeasuredOnDay'] = measured_dt.dayofyear
biometrics['MeasuredOnWeek'] = measured_dt.isocalendar().week
biometrics['MeasuredOnYear'] = measured_dt.year

biometrics.sort_values(by='MeasuredOnWeek', inplace=True)

In [4]:
# Columns that play no role in the weekly aggregation.
unused_cols = ['Age', 'Gender', 'MeasureProvidedBy', 'MeasuredOnUTC', 'MeasuredOn', 'MeasuredOnDate', 'MeasuredOnYear']
weekly_keys = ['CloudId', 'BiometricName', 'MeasuredOnWeek']

# One row per (CloudId, BiometricName, week) holding the mean of that week's values.
bms = (
    biometrics
    .drop(columns=unused_cols)
    .groupby(weekly_keys)
    .agg({'Value': 'mean'})
    .reset_index()
)
bms.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
402701,bf84021b3dc2645d68b9dea3c9ab69d158ac7041,Left Arm Fat Mass,38,0.5
84896,288c0ebf5a89144647b965f2049eb9c787a9d408,Fat Mass,27,19.1
193929,5cc7db6f92307e445f7bac9617b3abd68ec891eb,Left Arm Fat Mass,16,0.7
3280,021108a51cd9ae7ef15e91d6745028aeaadc746d,Intra Cellular Water,20,18.5
459752,db5b4948951215fd4333742e949e614a0515995b,Fat Mass,38,9.2


In [5]:
# Per (CloudId, BiometricName) pair: the list of weeks, how many weeks there
# are, and the list of weekly values. The result has two-level column labels
# such as ('MeasuredOnWeek', 'list') and ('MeasuredOnWeek', 'count').
pair_aggregations = {'MeasuredOnWeek': [list, 'count'], 'Value': list}
grouped_bms = (
    bms
    .groupby(['CloudId', 'BiometricName'])
    .agg(pair_aggregations)
    .reset_index()
)
grouped_bms.sample(5)

Unnamed: 0_level_0,CloudId,BiometricName,MeasuredOnWeek,MeasuredOnWeek,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,list,count,list
16882,12f7e5071657d4c6c08d24d190344467eff7ff1a,Leg Muscle Score,[20],1,[93.0]
127262,8b24cf838e2ce1cc823defb76eefb28b5dc86b35,Extra Cellular Water,"[16, 21, 23, 25, 30, 32, 34, 41, 46, 48, 50]",11,"[17.4, 17.3, 17.1, 17.1, 16.9, 16.9, 16.9, 16...."
33705,2564ac29f9066963aea3f3d4cb564d0c56ff9d4d,Total Body Water,"[18, 34, 39]",3,"[41.0, 41.7, 42.1]"
129194,8cfa114f0b6e4b15dd82d9bcc91275acfc76acc1,Right Arm Fat Mass,[34],1,[0.5]
46245,3128deeb4e115522ea4afa7d428c39354e4e2a59,Trunk Fat Free Mass,[48],1,[27.7]


Use only (CloudId, BiometricName) pairs that have more than 3 weekly measurements and span at least 24 weeks.

In [6]:
week_lists = grouped_bms[('MeasuredOnWeek', 'list')]
week_counts = grouped_bms[('MeasuredOnWeek', 'count')]

# A pair qualifies when its first and last week are at least 24 weeks apart
# and it has more than 3 weekly values.
has_long_interval = week_lists.apply(lambda weeks: max(weeks) - min(weeks) >= 24)
has_many_measurements = week_counts > 3
keep_pair = has_long_interval & has_many_measurements

# Compute each count once and reuse it in the report below.
n_pairs = len(grouped_bms)
n_long = has_long_interval.sum()
n_many = has_many_measurements.sum()
n_both = keep_pair.sum()

print(f'Total number of (CloudId, BiometricName) pairs: {n_pairs}')
print(f'Number of pairs with long interval: {n_long} ({n_long / n_pairs * 100:.2f}%)')
print(f'Number of pairs with many measurements: {n_many} ({n_many / n_pairs * 100:.2f}%)')
print(f'Number of pairs with both long interval and many measurements: {n_both} ({n_both / n_pairs * 100:.2f}%)')

filtered_group_bms = grouped_bms[keep_pair]

Total number of (CloudId, BiometricName) pairs: 234079
Number of pairs with long interval: 38935 (16.63%)
Number of pairs with many measurements: 38991 (16.66%)
Number of pairs with both long interval and many measurements: 25394 (10.85%)


In [7]:
# Turn the per-pair lists back into one row per (pair, week).
list_columns = [('MeasuredOnWeek', 'list'), ('Value', 'list')]
exploded = filtered_group_bms.explode(column=list_columns, ignore_index=True)
exploded = exploded.drop(columns=[('MeasuredOnWeek', 'count')])

# Keep only the first level of the two-level column labels.
exploded.columns = [labels[0] for labels in exploded.columns]

# Exploded list cells are not typed numerically yet, so cast them explicitly.
filtered_bms = exploded.astype({'MeasuredOnWeek': 'int32', 'Value': 'float64'})
filtered_bms.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
103050,8fc74b70904ea9e29c25b3a62a63635f289ee1cd,Right Arm Fat Perc Score,50,-4.0
27210,24eb66719bf38cd330ddcac684d2682f34d7cfda,Visceral Fat Rating,34,5.0
136123,c3ee772ee403bc0256c3df2129a145c4ac0bcc88,Right Arm Muscle Mass,41,3.6
116036,a37b56c49b9275cf57e5faf23fdee080fa6b0cdf,Weight,39,88.6
21020,1c69cf8489e152d9de60bebb0d03cb33dc6dd47e,Systolic Blood Pressure,18,136.0


Compute a time series for each (CloudId, BiometricName) pair

In [8]:
# Collect the weeks and values of every remaining pair into parallel lists.
pair_keys = ['CloudId', 'BiometricName']
bms_df = (
    filtered_bms
    .groupby(pair_keys)
    .agg({col: list for col in ('MeasuredOnWeek', 'Value')})
    .reset_index()
)
bms_df.sample(5)

Unnamed: 0,CloudId,BiometricName,MeasuredOnWeek,Value
23513,edd8448c856164e7e9540ed98421e45ea6e691cd,Basal Metabolic Rate Score,"[21, 23, 41, 48]","[7.0, 7.0, 7.0, 6.0]"
5276,30f52a6043450a9b602ef2b8f2f9d6c58df5d427,Left Arm Muscle Mass Score,"[16, 38, 39, 48]","[-1.0, -1.0, -1.0, -1.0]"
17878,ada1a289ebb2145c5f4c08e3d101bfa8f1d5db36,Visceral Fat Rating,"[20, 22, 38, 47]","[4.0, 5.0, 4.0, 5.0]"
17286,a8b27682a8602d068bcb4ae0bf0c0617f5f126d3,Left Leg Muscle Mass,"[16, 25, 34, 39, 41, 48]","[9.8, 9.5, 9.4, 9.4, 9.6, 9.4]"
18106,b03a953223f6e87e8260641a75c29a105869f126,Standard Fat Perc,"[21, 25, 34, 39, 48]","[18.0, 18.0, 18.0, 18.0, 18.0]"


In [9]:
# Shorter column names for the biometric frame ('Value' keeps its name).
short_names = {'BiometricName': 'Name', 'MeasuredOnWeek': 'Week', 'Value': 'Value'}
bms_df.rename(columns=short_names, inplace=True)

### Static Features

Add gender and age as static covariates

In [10]:
# One (Gender, Age) row per CloudId: the first row found in `biometrics` is kept.
static_cols = ['CloudId', 'Gender', 'Age']
static_cov_df = biometrics[static_cols].drop_duplicates(subset='CloudId')

# Attach the static covariates to each pair and put them right after CloudId.
reordered_cols = ['CloudId', 'Gender', 'Age', 'Name', 'Week', 'Value']
df = (
    bms_df
    .merge(static_cov_df, on='CloudId')
    .reindex(columns=reordered_cols)
)

### Exercise Data

In [11]:
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
exercise_pickle = 'data/exercise_df.pkl'
exercises = pd.read_pickle(exercise_pickle)

In [12]:
# Show the exercise frame as the cell output.
exercises

Unnamed: 0,CloudId,Year,Week,total_calories_week,total_minutes_week,cardio_calories_week,cardio_minutes_week,isotonic_calories_week,isotonic_minutes_week,avg_duration_per_workout,avg_calories_per_workout,avg_metsmin_workout,avg_isotonic_workouts,avg_cardio_workouts
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,20,487.0,3775,218.0,1260.0,269.0,2515.0,3775.000000,487.000000,56.900000,9.0,2.0
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,21,1918.0,12326,1120.0,4800.0,798.0,7526.0,4108.666667,639.333333,60.966667,27.0,6.0
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,22,1093.0,8237,553.0,2945.0,540.0,5292.0,4118.500000,546.500000,55.450000,18.0,4.0
6,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,23,321.0,1902,169.0,900.0,152.0,1002.0,1902.000000,321.000000,38.100000,4.0,2.0
7,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,2021,24,1129.0,8075,585.0,3000.0,544.0,5075.0,4037.500000,564.500000,58.500000,18.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364745,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,38,490.0,3928,266.0,1200.0,224.0,2728.0,3928.000000,490.000000,45.100000,10.0,1.0
364746,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,44,262.0,1500,243.0,1200.0,19.0,300.0,1500.000000,262.000000,11.200000,1.0,1.0
364747,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,45,564.0,4549,280.0,1200.0,284.0,3349.0,4549.000000,564.000000,55.900000,13.0,1.0
364748,fffb154edf22df2daca3c00042c99166f3f55fb7,2021,46,351.0,2208,261.0,1200.0,90.0,1008.0,2208.000000,351.000000,24.100000,4.0,1.0


In [13]:
# Every column below is collected into one list per CloudId. The order here
# determines the column order of `exercises_df`.
# NOTE(review): 'Year' is not part of the aggregation - confirm that week
# values are unique per CloudId.
exercise_list_cols = [
    'Week',
    'total_calories_week',
    'total_minutes_week',
    'cardio_calories_week',
    'cardio_minutes_week',
    'isotonic_calories_week',
    'isotonic_minutes_week',
    'avg_duration_per_workout',
    'avg_calories_per_workout',
    'avg_metsmin_workout',
    'avg_isotonic_workouts',
    'avg_cardio_workouts',
]
exercises_df = (
    exercises
    .groupby('CloudId')
    .agg({col: list for col in exercise_list_cols})
    .reset_index()
)

In [14]:
# Both frames have a 'Week' column, so the merge result contains
# 'Week_x' (biometrics) and 'Week_y' (exercises).
ts_df = pd.merge(df, exercises_df, on='CloudId')

# remove _x and _y from column names
def strip_col_suffix(col):
    """Return `col` without a trailing ``_x`` or ``_y``; other names are returned unchanged."""
    suffix_pattern = re.compile(r'_[xy]$')
    return suffix_pattern.sub('', col)

# Relabel the flat columns with two levels. The split is positional:
# columns 0-2 are the identifiers, 3-5 the biometric columns and the rest the
# exercise columns, so it relies on the column order of the merged frame.
id_columns = [('CloudId', ''), ('Gender', ''), ('Age', '')]
biometric_columns = [('Biometrics', strip_col_suffix(name)) for name in ts_df.columns[3:6]]
exercise_columns = [('Exercise', strip_col_suffix(name)) for name in ts_df.columns[6:]]

ts_df.columns = pd.MultiIndex.from_tuples(id_columns + biometric_columns + exercise_columns)

In [15]:
# Save the merged frame so the time-series section can start from this file.
processed_path = 'processed_dataset.pkl'
ts_df.to_pickle(processed_path)

### Compare time spans

## TimeSeries Creation

Other tasks:
- Exercise features should be encoded as covariates
- Month could also be encoded as covariate
- Consider using other abundant biometrics as covariates

In [16]:
# Reload the frame written by the processing section.
processed_dataset_file = 'processed_dataset.pkl'
ts_df = pd.read_pickle(processed_dataset_file)

In [17]:
# The exercise columns repeat on every biometric row of a CloudId, so one row
# per CloudId is enough for the exercise frame.
exercise_groups = ['CloudId', 'Exercise']
ex_df = ts_df[exercise_groups].drop_duplicates(subset=('CloudId', ''))

# The biometric frame keeps one row per (CloudId, biometric) pair.
biometric_groups = ['CloudId', 'Gender', 'Age', 'Biometrics']
bm_df = ts_df[biometric_groups]

In [18]:
def create_bm_timeseries(row):
    """Build a float32 biometric TimeSeries from one row of the biometric frame.

    Args:
        row: A row with the labels ('Biometrics', 'Week'), ('Biometrics', 'Value'),
            ('Biometrics', 'Name'), ('Gender', '') and ('Age', '').

    Returns:
        A single-component TimeSeries named after the biometric, carrying
        Gender and Age as static covariates, with missing values filled using
        the 'auto' strategy of darts' `fill_missing_values`.
    """
    static_covs = pd.DataFrame({
        'Gender': [row[('Gender', '')]],
        'Age': [row[('Age', '')]],
    })
    # NOTE(review): freq=1 presumably makes the integer week index contiguous,
    # leaving NaN for weeks without a measurement - confirm against the darts docs.
    series = TimeSeries.from_times_and_values(
        times=pd.Index(row[('Biometrics', 'Week')]),
        values=row[('Biometrics', 'Value')],
        columns=[row[('Biometrics', 'Name')]],
        static_covariates=static_covs,
        freq=1,
    )
    return darts_fill_na(series, fill='auto').astype(np.float32)

def create_ex_timeseries(row):
    """Build one multi-component float32 exercise TimeSeries from a row.

    Every entry under the 'Exercise' group of `row`, except 'Week', becomes
    one component. Missing values are filled with 0.

    Args:
        row: A row whose index has two levels and contains ('Exercise', 'Week')
            plus one ('Exercise', <feature>) entry per exercise feature. Each
            entry holds a list aligned with the week list.

    Returns:
        The stacked TimeSeries, or None when the row has no exercise feature
        besides 'Week'.
    """
    times = pd.Index(row[('Exercise', 'Week')])

    # Read the feature names from the row itself rather than from the
    # notebook-level `ex_df`: that frame is modified after this function is
    # defined, and a function that only depends on its argument also works for
    # rows of any other frame with the same layout.
    feature_cols = [col for col in row['Exercise'].index if col != 'Week']

    agg_ts = None
    for col in feature_cols:
        ts = TimeSeries.from_times_and_values(
            times=times,
            values=row[('Exercise', col)],
            columns=[col],
            freq=1,
        )
        ts = darts_fill_na(ts, fill=0.).astype(np.float32)
        # The first feature starts the series; later ones are stacked onto it
        # as additional components.
        agg_ts = ts if agg_ts is None else agg_ts.stack(ts)
    return agg_ts

In [19]:
# One TimeSeries per (CloudId, biometric) row, appended as the last column.
# `insert` raises if the column already exists, so `bm_df` has to be rebuilt
# before this cell is run again.
bm_tms = bm_df.apply(create_bm_timeseries, axis='columns')
bm_df.insert(loc=bm_df.shape[1], column=('Biometrics', 'TimeSeries'), value=bm_tms)

In [20]:
# One stacked exercise TimeSeries per CloudId, appended as the last column.
# `insert` raises if the column already exists, so `ex_df` has to be rebuilt
# before this cell is run again.
ex_tms = ex_df.apply(create_ex_timeseries, axis='columns')
ex_df.insert(loc=ex_df.shape[1], column=('Exercise', 'TimeSeries'), value=ex_tms)

Create dataframe with all necessary timeseries

In [21]:
# Two-level source label -> flat name used in the training frame.
train_columns = {
    ('CloudId', ''): 'CloudId',
    ('Gender', ''): 'Gender',
    ('Age', ''): 'Age',
    ('Biometrics', 'Name'): 'Biometric',
    ('Biometrics', 'TimeSeries'): 'BiometricTimeSeries',
}
train_df = bm_df.loc[:, list(train_columns)]
train_df.columns = list(train_columns.values())

In [22]:
# Keep only the key and the exercise series, with flat column names.
exercise_columns_flat = {
    ('CloudId', ''): 'CloudId',
    ('Exercise', 'TimeSeries'): 'ExercisesTimeSeries',
}
ex_df_stripped = ex_df.loc[:, list(exercise_columns_flat)]
ex_df_stripped.columns = list(exercise_columns_flat.values())

In [23]:
# Give every biometric row the exercise series of its CloudId.
train_df = pd.merge(train_df, ex_df_stripped, on='CloudId')

In [24]:
# Save the final training frame (biometric and exercise series per row).
timeseries_path = 'timeseries_dataset.pkl'
train_df.to_pickle(timeseries_path)