The purpose of this notebook is to extend the alternative method for counting instances of moderate/vigorous physical activity [MVPA] to other participants

1. Segment each day into 5-minute "bouts"
2. Identify each day as having had the accelerometer on enough to believe that it was on for most/all waking hours
3. Counting the number of such days
4. Counting the number of MVPA bouts on these days and identifying them as "active" days
5. Computing the average MVPA bouts per active day

In [2]:
# Start by importing packages we'll need
import pandas as pd
import numpy as np
import os

In [14]:
# Get a list of all the folders to make sure none of the names will be problematic

folders = os.listdir('series_train.parquet/')

# convert folders to a data frame
folders_df = pd.DataFrame(folders, columns=['folder'])

# save folders_df as a csv
folders_df.to_csv('folders.csv', index=False)

In [21]:
# Specify cutoffs

#ENMO cutoffs in mg for MVPA
mvpa_cutoff1 = 0.192
mvpa_cutoff2 = 0.110

# Number of 'active' bouts required for a day to count as 'active'
active_bout_cutoff = 150

# Specify the length of the bouts
boutlength = '5min'

# Maximum number of 5-minute bouts that can be imputed as zeroes to account for the accelerometer not collected data when at rest
impute_max = 6

# Minimum number of 5-second intervals (within a 5-minute bout) that need to have data for the bout to be counted
impute_sec_min = 29

# First try this with just one folder
#folders = ['id=0a418b57', 'id=0a431608', 'id=0b7d7aec', 'id=0b7d9da6', 'id=0b50f3fa']

# Get a list of all folders in the series_train.parquet/ folder
folders = os.listdir('series_train.parquet/')

# Remove the item '.DS_Store' from folders
folders = [folder for folder in folders if folder != '.DS_Store']

# Remove the first three characters from each item in folders
folders = [folder[3:] for folder in folders]

# Create a new data frame with columns 'ID', 'ENMO_Avg_Active_Days_MVPA192', 'ENMO_Avg_Active_Days_MVPA110', 'ENMO_Avg_All_Days_MVPA192', 'ENMO_Avg_All_Days_MVPA110'
df = pd.DataFrame(columns=['ID', 'ENMO_Avg_Active_Days_MVPA192', 'ENMO_Avg_Active_Days_MVPA110', 'ENMO_Avg_All_Days_MVPA192', 'ENMO_Avg_All_Days_MVPA110', 'Positive_Anglez_Active_Days','Positive_Anglez_All_Days'])
df['ID'] = folders

# Set the ID column as the index
df.set_index('ID', inplace=True)

# Iterate through the folders
for id in folders:
    # Create the filename to load and load the file
    fileloc = 'series_train.parquet/id='+str(id)+'/part-0.parquet'
    data = pd.read_parquet(fileloc)

    # Remove any rows where the variable non-wear_flag is nonzero
    data = data[data['non-wear_flag'] == 0]

    # Change the time_of_day variable to a datetime and make it into the index
    data['dt'] = pd.to_datetime(data['time_of_day'])
    data['dt_mod'] = data['dt'] + pd.to_timedelta(data['relative_date_PCIAT'], unit='D')
    data.set_index('dt_mod', inplace=True)

    # Create a new data frame that counts the number of valid data points within each 5-minute ('boutlength') interval
    # This will later be used to exclude intervals that had fewer than 30 (out of 60) valid data points
    data['count'] = 1
    number_of_data_points = data.resample(boutlength).agg({'count':'sum'})
    data.drop('count', axis=1, inplace=True)

    # Create 5-minute "bouts" of averaged data and incorporate the number of valid data points within each interval as a new variable 'count'
    data_resampled_5min = data.resample(boutlength).mean()
    data_resampled_5min = data_resampled_5min.merge(number_of_data_points, left_index=True, right_index=True)

    # Some of the accelerometers stopped collecting data if they were stationary (but still on/worn)
    # This next section is an attempt to identify and fill in these seemingly missing values with "0" for the enmo value
    # It does this by identifying the length of each sequence of NaN values and filling them with 0 if thery are at most 30 minutes long
    # This also restricts this process to 5-minute bouts that had data for at least 30 of the 5-second-intervals within the bout
    data_resampled_5min['enmogroup'] = data_resampled_5min['enmo'].notna().cumsum()
    enmogroupcount = data_resampled_5min.groupby(by=["enmogroup"]).size().to_frame()
    enmogroupcount = enmogroupcount.rename(columns={0: 'enmogroupsize'})
    data_resampled_5min = data_resampled_5min.merge(enmogroupcount, how='left', left_on='enmogroup', right_index=True)
    data_resampled_5min['smallinterval'] = (data_resampled_5min['enmogroupsize'] < impute_max+2) & (data_resampled_5min['count']>impute_sec_min)
    data_resampled_5min['filled_enmo'] = np.where(data_resampled_5min.smallinterval, data_resampled_5min.enmo.fillna(0), data_resampled_5min.enmo)

    # Also fill in only anglez values where the count is large enough
    data_resampled_5min['filled_anglez'] = np.where(data_resampled_5min['count']>impute_sec_min, data_resampled_5min.anglez, np.nan)

    # The next code chunk will create a new data frame that lists the total number of valid bouts for the participant
    # and will count the number of bouts with filled_enmo values over a particular threshold
    # and then count the number of bouts with positive anglez values

    # Start by counting the number of valid bouts in each day as a data frame
    boutcount_filled = data_resampled_5min.groupby(data_resampled_5min.index.date).count()['filled_enmo'].to_frame()
    boutcount_filled = boutcount_filled.rename(columns={'filled_enmo': 'valid_bouts'})

    # Count the number of bouts in each day with filled_enmo at least mvpa_cutoff1
    boutcount_MVPA1 = data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff1].groupby(data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff1].index.date).count()['filled_enmo'].to_frame()
    boutcount_MVPA1 = boutcount_MVPA1.rename(columns={'filled_enmo': 'MVPA_bouts_over_cutoff1'})
    boutcount = boutcount_filled.merge(boutcount_MVPA1, how='left', left_index=True, right_index=True)

    # Count the number of bouts in each day with filled_enmo at least mvpa_cutoff2
    boutcount_MVPA2 = data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff2].groupby(data_resampled_5min[data_resampled_5min['filled_enmo'] >= mvpa_cutoff2].index.date).count()['filled_enmo'].to_frame()
    boutcount_MVPA2 = boutcount_MVPA2.rename(columns={'filled_enmo': 'MVPA_bouts_over_cutoff2'})
    boutcount = boutcount.merge(boutcount_MVPA2, how='left', left_index=True, right_index=True)

    # Count the number of bouts in each day with anglez at least 0
    boutcount_anglez = data_resampled_5min[data_resampled_5min['filled_anglez'] > 0].groupby(data_resampled_5min[data_resampled_5min['filled_anglez'] > 0].index.date).count()['filled_anglez'].to_frame()
    boutcount_anglez = boutcount_anglez.rename(columns={'filled_anglez': 'Positive_Anglez_Bouts'})
    boutcount = boutcount.merge(boutcount_anglez, how='left', left_index=True, right_index=True)

    # Compute a new variable 'included_day' to be True if valid_bouts is at least active_bout_cutoff
    boutcount['included_day'] = boutcount['valid_bouts'] >= active_bout_cutoff

    # Compute the mean of MVPA bouts over each cutoff
    MVPA_mean1 = boutcount[boutcount['included_day'] == True]['MVPA_bouts_over_cutoff1'].mean()
    MVPA_mean2 = boutcount[boutcount['included_day'] == True]['MVPA_bouts_over_cutoff2'].mean()
    MVPA_mean3 = boutcount['MVPA_bouts_over_cutoff1'].mean()
    MVPA_mean4 = boutcount['MVPA_bouts_over_cutoff2'].mean()
    Anglez_mean1 = boutcount[boutcount['included_day'] == True]['Positive_Anglez_Bouts'].mean()
    Anglez_mean2 = boutcount['Positive_Anglez_Bouts'].mean()

    # Copy the values into the data frame
    df.at[id, 'ENMO_Avg_Active_Days_MVPA192'] = MVPA_mean1
    df.at[id, 'ENMO_Avg_Active_Days_MVPA110'] = MVPA_mean2
    df.at[id, 'ENMO_Avg_All_Days_MVPA192'] = MVPA_mean3
    df.at[id, 'ENMO_Avg_All_Days_MVPA110'] = MVPA_mean4
    df.at[id, 'Positive_Anglez_Active_Days'] = Anglez_mean1
    df.at[id, 'Positive_Anglez_All_Days'] = Anglez_mean2

    # Replace any NaN values in df with 0
    df.fillna(0, inplace=True)

# Export df as a csv file
df.to_csv('Accelerometer_enmo_anglez_daily_averages.csv')

  df.fillna(0, inplace=True)


In [20]:
df

Unnamed: 0_level_0,ENMO_Avg_Active_Days_MVPA192,ENMO_Avg_Active_Days_MVPA110,ENMO_Avg_All_Days_MVPA192,ENMO_Avg_All_Days_MVPA110,Positive_Anglez_Active_Days,Positive_Anglez_All_Days
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0a418b57,8.294118,21.764706,8.454545,17.444444,19.058824,17.071429
0a431608,6.846154,14.142857,6.428571,13.636364,133.0,124.16
0b7d7aec,0.0,0.0,0.0,0.0,0.0,8.0
0b7d9da6,14.727273,32.391304,12.555556,27.310345,20.521739,19.5
0b50f3fa,18.2,30.5625,17.25,27.5,134.4375,117.285714


In [4]:
#### Below is the original code copied from Accelerometer_exploration_6. It can be ignored.

# Load the parquet data file
data = pd.read_parquet('series_train.parquet/id=0b4014f0/part-0.parquet')

# Add a new column that converts time_of_day into datetime
data['dt'] = pd.to_datetime(data['time_of_day'])

# Change the day in the dt variable to be equal to the relative_date_PCIAT value
data['dt_mod'] = data['dt'] + pd.to_timedelta(data['relative_date_PCIAT'], unit='D')

# Use dt_mod as the index. This will help with the resampling
data.set_index('dt_mod', inplace=True)

# Create a new data frame by grouping the observations into 5-minute intervals and computing the mean of each interval
data_resampled_5min = data.resample('5min').mean()

# Create a new variable called 'enmogroup' that increases by 1 each time the value of enmo is numerical
data_resampled_5min['enmogroup'] = data_resampled_5min['enmo'].notna().cumsum()

# Create a new data frame that lists the number of rows in each value of enmogroup
enmogroupcount = data_resampled_5min.groupby(by=["enmogroup"]).size().to_frame()

# Rename the column 0 as 'enmogroupsize'
enmogroupcount = enmogroupcount.rename(columns={0: 'enmogroupsize'})

# Merge data_resampled_5min and enmogroupcount on the variable enmogroup, keeping the index of data_resampled_5min
data_resampled_5min = data_resampled_5min.merge(enmogroupcount, how='left', left_on='enmogroup', right_index=True)

# Add a new variable 'smallinterval' when enmogroupsize is less than 7
data_resampled_5min['smallinterval'] = data_resampled_5min['enmogroupsize'] < 8

# When smallinterval is true, fill the NaN values; otherwise, retain the original enmo values (including NaN)
data_resampled_5min['filled_enmo'] = np.where(data_resampled_5min.smallinterval, data_resampled_5min.enmo.ffill(), data_resampled_5min.enmo)

# The code here will create a new data frame that lists the total number of valid bouts for the participant
# and will count the number of bouts with filled_enmo values over a particular threshold

# Start by counting the number of valid bouts in each day as a data frame
boutcount_filled = data_resampled_5min.groupby(data_resampled_5min.index.date).count()['filled_enmo'].to_frame()

# Rename filled_enmo as valid_bouts
boutcount_filled = boutcount_filled.rename(columns={'filled_enmo': 'valid_bouts'})

# Count the number of bouts in each day with filled_enmo at least 0.192
boutcount_MVPA = data_resampled_5min[data_resampled_5min['filled_enmo'] >= 0.192].groupby(data_resampled_5min[data_resampled_5min['filled_enmo'] >= 0.192].index.date).count()['filled_enmo'].to_frame()

# Rename filled_enmo as MVPA_bouts
boutcount_MVPA = boutcount_MVPA.rename(columns={'filled_enmo': 'MVPA_bouts'})

# Merge boutcount_filled and boutcount_MVPA
boutcount = boutcount_filled.merge(boutcount_MVPA, how='left', left_index=True, right_index=True)

# Compute a new variable 'included_day' to be True if valid_bouts is at least 150
boutcount['included_day'] = boutcount['valid_bouts'] >= 150

# Compute the mean of MVPA_bouts for all days where included_day is True
MVPA_mean = boutcount[boutcount['included_day'] == True]['MVPA_bouts'].mean()