The purpose of this notebook is to extend the alternative method for counting instances of moderate/vigorous physical activity [MVPA] to other participants

1. Segment each day into 5-minute "bouts"
2. Identify each day as having had the accelerometer on enough to believe that it was on for most/all waking hours
3. Counting the number of such days
4. Counting the number of MVPA bouts on these days and identifying them as "active" days
5. Computing the average MVPA bouts per active day

In [23]:
# Start by importing packages we'll need
import pandas as pd
import numpy as np
import os

In [35]:
# Iterate through the subfolders of the folder series_train.parquet/
# and load the data into a pandas dataframe
dfs = []
directory = 'series_train.parquet'

for root, dirs, files in os.walk(directory):
    for dir in dirs:
        print('directory=',dir)
        for subroot, subdirs, subfiles in os.walk(dir):
            print('subroot=',subroot,' subdir=',subdirs,' subfile=',subfiles)
            for filename in subfiles:
                print(filename)
                print(os.path.join(subroot, filename), 'testing')

directory= id=0d01bbf2
directory= id=cefdb7fe
directory= id=58391429
directory= id=2ca2206f
directory= id=19455336
directory= id=ca33a5e7
directory= id=92bb8516
directory= id=2812951b
directory= id=6b6467f4
directory= id=9d6b1410
directory= id=22c72c4e
directory= id=b3b200af
directory= id=ebf30e46
directory= id=2a0b8386
directory= id=71c1b1d2
directory= id=e683d2c9
directory= id=bebff291
directory= id=d4d2f272
directory= id=81d3ab22
directory= id=051680a0
directory= id=1aff6762
directory= id=a0522c83
directory= id=b447e66d
directory= id=90161e10
directory= id=adbd6839
directory= id=3e5d5b58
directory= id=cd68643b
directory= id=af4f064e
directory= id=554e46e9
directory= id=8f21c613
directory= id=3826be66
directory= id=2898269f
directory= id=43f5d3c5
directory= id=76fb4afb
directory= id=fc2c2c08
directory= id=1837456f
directory= id=4bc2bee9
directory= id=a402c6c8
directory= id=035c96dd
directory= id=29587a31
directory= id=ab16a20d
directory= id=fa34f945
directory= id=c9600298
directory= 

In [4]:
# Load the parquet data file
data = pd.read_parquet('series_train.parquet/id=0b4014f0/part-0.parquet')

# Add a new column that converts time_of_day into datetime
data['dt'] = pd.to_datetime(data['time_of_day'])

# Change the day in the dt variable to be equal to the relative_date_PCIAT value
data['dt_mod'] = data['dt'] + pd.to_timedelta(data['relative_date_PCIAT'], unit='D')

# Use dt_mod as the index. This will help with the resampling
data.set_index('dt_mod', inplace=True)

# Create a new data frame by grouping the observations into 5-minute intervals and computing the mean of each interval
data_resampled_5min = data.resample('5min').mean()


# Create a new variable called 'enmogroup' that increases by 1 each time the value of enmo is numerical
data_resampled_5min['enmogroup'] = data_resampled_5min['enmo'].notna().cumsum()

# Create a new data frame that lists the number of rows in each value of enmogroup
enmogroupcount = data_resampled_5min.groupby(by=["enmogroup"]).size().to_frame()

# Rename the column 0 as 'enmogroupsize'
enmogroupcount = enmogroupcount.rename(columns={0: 'enmogroupsize'})

# Merge data_resampled_5min and enmogroupcount on the variable enmogroup, keeping the index of data_resampled_5min
data_resampled_5min = data_resampled_5min.merge(enmogroupcount, how='left', left_on='enmogroup', right_index=True)

# Add a new variable 'smallinterval' when enmogroupsize is less than 7
data_resampled_5min['smallinterval'] = data_resampled_5min['enmogroupsize'] < 8

# When smallinterval is true, fill the NaN values; otherwise, retain the original enmo values (including NaN)
data_resampled_5min['filled_enmo'] = np.where(data_resampled_5min.smallinterval, data_resampled_5min.enmo.ffill(), data_resampled_5min.enmo)

# The code here will create a new data frame that lists the total number of valid bouts for the participant
# and will count the number of bouts with filled_enmo values over a particular threshold

# Start by counting the number of valid bouts in each day as a data frame
boutcount_filled = data_resampled_5min.groupby(data_resampled_5min.index.date).count()['filled_enmo'].to_frame()

# Rename filled_enmo as valid_bouts
boutcount_filled = boutcount_filled.rename(columns={'filled_enmo': 'valid_bouts'})

# Count the number of bouts in each day with filled_enmo at least 0.192
boutcount_MVPA = data_resampled_5min[data_resampled_5min['filled_enmo'] >= 0.192].groupby(data_resampled_5min[data_resampled_5min['filled_enmo'] >= 0.192].index.date).count()['filled_enmo'].to_frame()

# Rename filled_enmo as MVPA_bouts
boutcount_MVPA = boutcount_MVPA.rename(columns={'filled_enmo': 'MVPA_bouts'})

# Merge boutcount_filled and boutcount_MVPA
boutcount = boutcount_filled.merge(boutcount_MVPA, how='left', left_index=True, right_index=True)

# Compute a new variable 'included_day' to be True if valid_bouts is at least 150
boutcount['included_day'] = boutcount['valid_bouts'] >= 150

# Compute the mean of MVPA_bouts for all days where included_day is True
MVPA_mean = boutcount[boutcount['included_day'] == True]['MVPA_bouts'].mean()