In [223]:
import pandas as pd
from datetime import datetime, time

# Input folder that the raw CSV tables are read from, and output folder that
# the filtered / summary CSV files are written to.
# NOTE(review): both are absolute paths on one machine — consider making them
# configurable so the notebook can run elsewhere.
folder_path = 'c:/Users/aoife/Documents/Project/DataTables/'
save_path = 'C:/Users/aoife/Documents/Project/filtered_data/'



# Body Mass Index

In [224]:
# Build one row per participant with their mean body mass, mean height and
# the BMI derived from those two means.

df_bm = pd.read_csv(folder_path + 'body-mass.csv', usecols=['participantId', 'timestamp', 'value'], parse_dates=['timestamp'])
df_h = pd.read_csv(folder_path + 'height.csv', usecols=['participantId', 'timestamp', 'value'], parse_dates=['timestamp'])

bmi_dfs = [df_bm, df_h]

for measurements in bmi_dfs:
    if 'value' not in measurements.columns:
        continue

    # Make sure the timestamp column is a datetime
    measurements['timestamp'] = pd.to_datetime(measurements['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')

    # Replace every reading with the participant's overall mean ...
    per_participant = measurements.groupby(['participantId'])['value']
    measurements['value'] = per_participant.transform('mean')

    # ... then keep only the first row of each participant (its timestamp is
    # the one that survives)
    measurements.drop_duplicates(subset=['participantId'], inplace=True)

    print(measurements.head())
    print('----------------\n')

# Order by participant and give the value columns descriptive names
for frame, value_name in ((df_bm, 'bodyMass_kg'), (df_h, 'height_m')):
    frame.sort_values(by=['participantId'], inplace=True)
    frame.rename(columns={'value': value_name}, inplace=True)

# Only participants present in both tables are kept (inner join)
df_bmi = df_bm.merge(df_h, on='participantId')

# startDate is taken from the body-mass timestamp; bmi = mass / height^2
df_bmi = df_bmi.assign(
    startDate=df_bmi['timestamp_x'].dt.date,
    bmi=df_bmi['bodyMass_kg'] / (df_bmi['height_m'] ** 2),
)

# The raw timestamps are no longer needed; guarantee one row per participant
df_bmi = (
    df_bmi
    .drop(columns=['timestamp_x', 'timestamp_y'])
    .drop_duplicates(subset=['participantId'])
)

print(df_bmi.shape)


                          participantId                 timestamp       value
0  11b2bf4d-6020-4a86-81bd-237c5616c649 2016-08-24 23:56:14+00:00   97.976000
1  9cbbb597-30c0-4f48-aebd-7fe58c627cf6 2016-03-16 02:33:20+00:00   63.503000
2  941e9fa4-6ba5-41f5-9bcc-805849d751b8 2017-01-01 14:07:00+00:00   96.666000
3  41c683b0-f6c4-4a6a-8dd2-4b9a3dd30a32 2016-03-30 02:24:59+00:00  112.846333
4  35fbae45-0f21-45ea-8ed7-8d3361a29b86 2016-07-18 17:45:38+00:00   68.690083
----------------

                          participantId                 timestamp   value
0  11b2bf4d-6020-4a86-81bd-237c5616c649 2016-08-24 23:56:14+00:00  1.6256
1  9cbbb597-30c0-4f48-aebd-7fe58c627cf6 2016-03-16 02:33:20+00:00  1.6510
2  941e9fa4-6ba5-41f5-9bcc-805849d751b8 2017-01-01 14:07:00+00:00  1.8288
3  1cba6631-3fb8-4c18-904f-ab56fe82833d 2016-03-07 05:54:00+00:00  1.5494
4  eb2c4ff0-54e2-4c34-b73d-003e3e0427ad 2016-04-02 07:17:47+00:00  1.7018
----------------

(1790, 5)


# Heart Rate

In [225]:
import pandas as pd
from datetime import time
import numpy as np

# Daily heart-rate summary: one row per participant and start date with the
# mean / max / min heart rate in each part of the day.
df_hr = pd.read_csv(folder_path + 'heart-rate.csv', usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'], parse_dates=['timestamp', 'startTime', 'endTime'])

# Convert startTime and endTime to datetime and get date
df_hr['startTime'] = pd.to_datetime(df_hr['startTime'])
df_hr['endTime'] = pd.to_datetime(df_hr['endTime'])
df_hr['startDate'] = df_hr['startTime'].dt.date

# Remove rows with duplicated participantId, startTime, and endTime
df_hr.drop_duplicates(subset=['participantId', 'startTime', 'endTime'], inplace=True)

# Define time boundaries
# TODO look into what times are best for these
morning_end = time(12, 0)
afternoon_end = time(18, 0)
evening_end = time(23, 59)
night_end = time(6, 0)

# Classify each reading by the wall-clock time it started, in one vectorized
# pass instead of a Python-level iterrows() loop. Times are compared as seconds
# since midnight (microsecond resolution, like datetime.time).
start = df_hr['startTime'].dt
start_s = start.hour * 3600 + start.minute * 60 + start.second + start.microsecond / 1e6
night_s, morning_s, afternoon_s, evening_s = (
    t.hour * 3600 + t.minute * 60 + t.second + t.microsecond / 1e6
    for t in (night_end, morning_end, afternoon_end, evening_end)
)

# Windows are closed on the left and open on the right: [06:00, 12:00),
# [12:00, 18:00), [18:00, 23:59). Everything else — including readings that
# start at or after 23:59 and rows with a missing start time — is "night".
is_morning = (start_s >= night_s) & (start_s < morning_s)
is_afternoon = (start_s >= morning_s) & (start_s < afternoon_s)
is_evening = (start_s >= afternoon_s) & (start_s < evening_s)
is_night = ~(is_morning | is_afternoon | is_evening)

# Each period column holds the reading when it belongs to that period, else NaN
df_hr['morning_hr'] = df_hr['value'].where(is_morning)
df_hr['afternoon_hr'] = df_hr['value'].where(is_afternoon)
df_hr['evening_hr'] = df_hr['value'].where(is_evening)
df_hr['night_hr'] = df_hr['value'].where(is_night)

# Row-wise mean over the four period columns; only one of them is set per
# reading, so this is the reading itself
df_hr['temp_daily_avg_hr'] = df_hr[['morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']].mean(axis=1, skipna=True)

# Average of all readings per participant and day, used as the fill value
accurate_daily_avg = df_hr.groupby(['participantId', 'startDate'])['temp_daily_avg_hr'].mean().reset_index().rename(columns={'temp_daily_avg_hr': 'accurate_daily_avg_hr'})

# Merge this daily average back into the per-reading dataframe
df_hr = pd.merge(df_hr, accurate_daily_avg, on=['participantId', 'startDate'], how='left')

# Impute missing values in each period with the daily average.
# NOTE(review): this fills three of the four period columns on every reading,
# so the per-period mean/max/min below are computed over real readings mixed
# with daily-average fill values — confirm that this is intended.
for period in ['morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']:
    df_hr[period] = df_hr[period].fillna(df_hr['accurate_daily_avg_hr'])

# Calculate mean, max, and min heart rate for each period (includes imputed values)
df_hr_aggregated = df_hr.groupby(['participantId', 'startDate']).agg(
    mean_hr_morning=('morning_hr', 'mean'),
    max_hr_morning=('morning_hr', 'max'),
    min_hr_morning=('morning_hr', 'min'),
    mean_hr_afternoon=('afternoon_hr', 'mean'),
    max_hr_afternoon=('afternoon_hr', 'max'),
    min_hr_afternoon=('afternoon_hr', 'min'),
    mean_hr_evening=('evening_hr', 'mean'),
    max_hr_evening=('evening_hr', 'max'),
    min_hr_evening=('evening_hr', 'min'),
    mean_hr_night=('night_hr', 'mean'),
    max_hr_night=('night_hr', 'max'),
    min_hr_night=('night_hr', 'min')
).reset_index()

# Keep one row per participant and day. The four period columns kept here are
# those of the first reading of that day (after imputation).
df_hr_summary = df_hr[['participantId', 'startDate', 'morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']].drop_duplicates(subset=['participantId', 'startDate'])

# Merge the aggregated values
df_hr_final = pd.merge(df_hr_summary, df_hr_aggregated, on=['participantId', 'startDate'], how='left')

# Save to csv just to check all the entries
df_hr_final.to_csv(save_path + 'heart_rate_summary.csv', index=False)

print(df_hr_final.head())


                          participantId   startDate  morning_hr  afternoon_hr  \
0  06bc6ebb-a233-469f-8091-90256f656b1b  2016-03-13    1.173082      1.500000   
1  2214f4fd-1ae0-4804-8663-b01c5f6d142c  2016-03-02    1.260992      1.333000   
2  b2571643-4aec-492f-bc7f-6f23c7fe239a  2018-10-03    1.300000      1.263071   
3  b1406c4e-e6ac-4297-a9e4-335ca5ef04de  2019-01-24    1.203742      1.203742   
4  b1406c4e-e6ac-4297-a9e4-335ca5ef04de  2019-01-23    1.903400      1.903400   

   evening_hr  night_hr  mean_hr_morning  max_hr_morning  min_hr_morning  \
0    1.173082  1.173082         1.159673        1.233000        0.867000   
1    1.260992  1.260992         1.260992        1.260992        1.260992   
2    1.263071  1.263071         1.285036        1.533000        1.133000   
3    1.203742  1.333000         1.203742        1.203742        1.203742   
4    1.917000  1.903400         1.903400        1.903400        1.903400   

   mean_hr_afternoon  max_hr_afternoon  min_hr_afternoon

# Step Count

In [226]:
import pandas as pd
from datetime import time, datetime
import numpy as np

# Daily step-count summary: one row per participant and start date with the
# total steps, total recorded duration and the steps in each part of the day.
df_sc = pd.read_csv(folder_path + 'step-count.csv', usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'], parse_dates=['timestamp', 'startTime', 'endTime'])

df_sc['startTime'] = pd.to_datetime(df_sc['startTime'], utc=True)
df_sc['endTime'] = pd.to_datetime(df_sc['endTime'], utc=True)
df_sc['startDate'] = df_sc['startTime'].dt.date

# Length of each recording in seconds
df_sc['duration'] = (df_sc['endTime'] - df_sc['startTime']).dt.total_seconds()

# Calculate total steps per day for each participant
df_sc['totalSteps'] = df_sc.groupby(['participantId', 'startDate'])['value'].transform('sum')

# Calculate the total duration per day for each participant
df_sc['stepsTotalDuration'] = df_sc.groupby(['participantId', 'startDate'])['duration'].transform('sum')

# Time boundaries
morning_end = time(12, 0)
afternoon_end = time(18, 0)
evening_end = time(23, 59)
night_end = time(6, 0)

# Classify each recording by the wall-clock time it started, in one vectorized
# pass instead of a Python-level iterrows() loop. Times are compared as seconds
# since midnight (microsecond resolution, like datetime.time).
start = df_sc['startTime'].dt
start_s = start.hour * 3600 + start.minute * 60 + start.second + start.microsecond / 1e6
night_s, morning_s, afternoon_s, evening_s = (
    t.hour * 3600 + t.minute * 60 + t.second + t.microsecond / 1e6
    for t in (night_end, morning_end, afternoon_end, evening_end)
)

# Windows are open on the left and closed on the right: (06:00, 12:00],
# (12:00, 18:00], (18:00, 23:59]. Everything else — including rows with a
# missing start time — counts as "night".
is_morning = (start_s > night_s) & (start_s <= morning_s)
is_afternoon = (start_s > morning_s) & (start_s <= afternoon_s)
is_evening = (start_s > afternoon_s) & (start_s <= evening_s)
is_night = ~(is_morning | is_afternoon | is_evening)

# Each period column holds the step value when the recording belongs to that
# period, else NaN
df_sc['morningSteps'] = df_sc['value'].where(is_morning)
df_sc['afternoonSteps'] = df_sc['value'].where(is_afternoon)
df_sc['eveningSteps'] = df_sc['value'].where(is_evening)
df_sc['nightSteps'] = df_sc['value'].where(is_night)

# Aggregate steps into daily totals for each part of the day
df_summary = df_sc.groupby(['participantId', 'startDate']).agg(
    morningStepsTotal=('morningSteps', 'sum'),
    afternoonStepsTotal=('afternoonSteps', 'sum'),
    eveningStepsTotal=('eveningSteps', 'sum'),
    nightStepsTotal=('nightSteps', 'sum'),
    totalSteps=('totalSteps', 'first'),
    stepsTotalDuration=('stepsTotalDuration', 'first')
).reset_index()

# Drop the per-recording columns; the daily values come back from df_summary
df_sc.drop(columns=['morningSteps', 'afternoonSteps', 'eveningSteps', 'nightSteps', 'duration', 'value', 'startTime', 'endTime', 'totalSteps', 'stepsTotalDuration'], inplace=True)
# Merge back together
df_sc = pd.merge(df_sc, df_summary, on=['participantId', 'startDate'], how='left')

# Keep one row per participant and day
df_sc.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

df_sc.to_csv(save_path + 'step_count_summary.csv', index=False)

print(df_sc.head())


                            participantId                 timestamp  \
0    1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:03+00:00   
21   1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:03+00:00   
91   1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:03+00:00   
116  1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:07+00:00   
127  1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:07+00:00   

      startDate  morningStepsTotal  afternoonStepsTotal  eveningStepsTotal  \
0    2016-03-22                0.0               4422.0             5529.0   
21   2016-03-23                0.0               8885.0             5568.0   
91   2016-03-24               20.0               7859.0             5569.0   
116  2016-03-29              118.0               8130.0             5256.0   
127  2016-03-30                0.0               6803.0             4224.0   

     nightStepsTotal  totalSteps  stepsTotalDuration  
0               66.0       10017             1093

# Distance Walking/Running

In [227]:
import pandas as pd
from datetime import time
import numpy as np

# Daily walking/running distance summary: one row per participant and start
# date with the total distance, total duration and distance per part of day.
df_dwr = pd.read_csv(folder_path + 'distance-walking-running.csv', usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'], parse_dates=['timestamp', 'startTime', 'endTime'])

df_dwr['startTime'] = pd.to_datetime(df_dwr['startTime'], utc=True)
df_dwr['endTime'] = pd.to_datetime(df_dwr['endTime'], utc=True)
df_dwr['startDate'] = df_dwr['startTime'].dt.date

# Calculate total distance and duration per participant and day
df_dwr['totalDistance'] = df_dwr.groupby(['participantId', 'startDate'])['value'].transform('sum')
df_dwr['duration'] = (df_dwr['endTime'] - df_dwr['startTime']).dt.total_seconds()
df_dwr['dwrTotalDuration'] = df_dwr.groupby(['participantId', 'startDate'])['duration'].transform('sum')

# Define time boundaries
morning_end = time(12, 0)
afternoon_end = time(18, 0)
evening_end = time(23, 59)
night_end = time(6, 0)

# Classify each recording by the wall-clock time it started, in one vectorized
# pass instead of a Python-level iterrows() loop. Times are compared as seconds
# since midnight (microsecond resolution, like datetime.time).
start = df_dwr['startTime'].dt
start_s = start.hour * 3600 + start.minute * 60 + start.second + start.microsecond / 1e6
night_s, morning_s, afternoon_s, evening_s = (
    t.hour * 3600 + t.minute * 60 + t.second + t.microsecond / 1e6
    for t in (night_end, morning_end, afternoon_end, evening_end)
)

# Windows are open on the left and closed on the right: (06:00, 12:00],
# (12:00, 18:00], (18:00, 23:59]. Everything else — including rows with a
# missing start time — counts as "night".
is_morning = (start_s > night_s) & (start_s <= morning_s)
is_afternoon = (start_s > morning_s) & (start_s <= afternoon_s)
is_evening = (start_s > afternoon_s) & (start_s <= evening_s)
is_night = ~(is_morning | is_afternoon | is_evening)

# Each period column holds the distance when the recording belongs to that
# period, else NaN
df_dwr['morningDist'] = df_dwr['value'].where(is_morning)
df_dwr['afternoonDist'] = df_dwr['value'].where(is_afternoon)
df_dwr['eveningDist'] = df_dwr['value'].where(is_evening)
df_dwr['nightDist'] = df_dwr['value'].where(is_night)

# Aggregate daily distances
df_summary = df_dwr.groupby(['participantId', 'startDate']).agg(
    morningDistance=('morningDist', 'sum'),
    afternoonDistance=('afternoonDist', 'sum'),
    eveningDistance=('eveningDist', 'sum'),
    nightDistance=('nightDist', 'sum')
).reset_index()

# Merge summary back into df_dwr
df_dwr = pd.merge(df_dwr.drop(columns=['morningDist', 'afternoonDist', 'eveningDist', 'nightDist']), df_summary, on=['participantId', 'startDate'], how='left')

# Keep one row per participant and day.
# NOTE(review): unlike the step-count table, the per-recording columns
# (timestamp, startTime, endTime, value, duration) are not dropped, so the
# daily row carries the first recording's values for them — confirm whether
# they should be removed before this table is merged with the others.
df_dwr.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

print(df_dwr.shape)

df_dwr.to_csv(save_path + 'distance_walk_run_summary.csv', index=False)

(11507, 13)


# AM check-in

In [228]:

df_amch = pd.read_csv(folder_path + 'am-checkin.csv', usecols=['participantId', 'timestamp', 'AMCH1', 'AMCH2', 'AMCH2A','AMCH3','AMCH3A','AMCH4','AMCH5'], parse_dates=['timestamp'])

df_amch = (
    df_amch
    # a missing AMCH2A / AMCH3A answer is stored as 0
    .fillna({'AMCH2A': 0, 'AMCH3A': 0})
    .assign(
        # calendar date (UTC) of the check-in
        startDate=lambda checkins: pd.to_datetime(checkins['timestamp'], utc=True).dt.date,
        # AMCH2: True becomes 1 and False becomes 0
        AMCH2=lambda checkins: checkins['AMCH2'].map({True: 1, False: 0}),
    )
    # the timestamp, AMCH1 and AMCH4 are not used further
    .drop(columns=['timestamp', 'AMCH1', 'AMCH4'])
    # discard every check-in that still has a missing value
    .dropna()
)

print(df_amch.shape)


(35908, 7)


# PM check-in

In [229]:

df_pmch = pd.read_csv(folder_path + 'pm-checkin.csv', usecols=['participantId', 'timestamp', 'alcohol', 'caffeine', 'NapCount','PMCH1','PMCH2A','PMCH3'], parse_dates=['timestamp'])

# Calendar date (UTC) of each evening check-in
df_pmch['timestamp'] = pd.to_datetime(df_pmch['timestamp'], utc=True)
df_pmch['startDate'] = df_pmch['timestamp'].dt.date

# A missing answer in these columns is stored as 0
for answer_column in ('PMCH2A', 'NapCount', 'alcohol', 'caffeine'):
    df_pmch[answer_column] = df_pmch[answer_column].fillna(0)

# Medication flag, 0 until a PMCH3 answer says otherwise
df_pmch['medication'] = 0

def check_medication(value):
    """Return True when a PMCH3 answer contains the option code 100.

    ``value`` is converted to text and split on commas, so it may be a single
    code or a comma-separated list of codes. Each entry is compared after
    stripping surrounding whitespace, and numeric spellings of the code such
    as ``100.0`` (how pandas renders a float column) are accepted as well.
    Missing or non-numeric entries never match.
    """
    for token in str(value).split(','):
        token = token.strip()
        if token == '100':
            return True
        try:
            if float(token) == 100:
                return True
        except ValueError:
            # not a number at all (e.g. empty string or free text)
            continue
    return False

# Flag check-ins whose PMCH3 answer includes option code 100. The flag is the
# integer 1 or 0 for every row, so the column has a single numeric dtype
# (it used to mix the string '1' with the integer 0).
df_pmch['medication'] = df_pmch['PMCH3'].apply(check_medication).astype(int)

# timestamp and PMCH3 have served their purpose
df_pmch.drop(columns=['timestamp'], inplace=True)
df_pmch.drop(columns=['PMCH3'], inplace=True)

# discard every check-in that still has a missing value
df_pmch = df_pmch.dropna()

print(df_pmch.shape)



(27246, 8)


# Sleep Quality

In [230]:
df_sq = pd.read_csv(folder_path + 'sleep-quality-checker.csv', usecols=['participantId', 'timestamp', 'value'], parse_dates=['timestamp'])

# Calendar date (UTC) of each sleep-quality answer
df_sq['startDate'] = pd.to_datetime(df_sq['timestamp'], utc=True).dt.date

df_sq = (
    df_sq
    # keep the first answer of each participant per day
    .drop_duplicates(subset=['participantId', 'startDate'])
    # the timestamp is no longer needed
    .drop(columns=['timestamp'])
    .rename(columns={'value': 'ssq_score'})
)

print(df_sq.head())
print(df_sq.shape)


                          participantId  ssq_score   startDate
0  b4ebf7dd-4e30-4f7b-8ee8-5493a19c8c9f          4  2016-09-27
1  c3993552-69cb-45e4-b18a-5e6eecefb825          4  2016-03-07
2  78f60bd3-34f3-489e-a352-f9df564641c3          4  2016-03-05
3  9da1a89a-2145-4cca-b356-7b58aa7be8b0          4  2016-09-27
4  4aad9dbe-dd9e-4832-a198-3bd563457124          4  2016-03-03
(39123, 3)


# Survey Responses

In [231]:
import pandas as pd

# Read in the two survey tables (about-me and sleep-habits), keeping only the
# columns that are used further down.

folder_path = 'c:/Users/aoife/Documents/Project/DataTables/'


# 'daily_activities' used to be listed twice in usecols; each column is now named once
df_about_me = pd.read_csv(folder_path + 'about-me.csv', usecols=['participantId', 'timestamp', 'alcohol', 'basic_expenses', 'caffeine', 'daily_activities', 'daily_smoking', 'education', 'flexible_work_hours', 'gender', 'good_life', 'hispanic', 'income', 'marital', 'race', 'smoking_status', 'menopause', 'recent_births', 'current_pregnant', 'work_schedule'], parse_dates=['timestamp'])

df_sleep_habits = pd.read_csv(folder_path + 'sleep-habits.csv', usecols=['participantId', 'timestamp', 'alarm_dependency', 'driving_sleepy', 'falling_asleep', 'morning_person', 'nap_duration', 'sleep_lost', 'sleep_needed', 'sleep_partner', 'sleep_time_workday', 'sleep_time_weekend', 'wake_up_choices', 'wake_ups', 'weekly_naps', 'what_wakes_you'], parse_dates=['timestamp'])



In [232]:
# Give the about-me alcohol column a more descriptive name
df_about_me = df_about_me.rename(columns={'alcohol': 'alcohol_consumption'})

# Parse both survey timestamps as UTC datetimes and derive the calendar date
for survey in (df_about_me, df_sleep_habits):
    survey['timestamp'] = pd.to_datetime(survey['timestamp'], utc=True)
    survey['date'] = survey['timestamp'].dt.date


In [233]:
# Combine the two survey tables into one

surveys_list = [df_about_me, df_sleep_habits]

# Outer join on participant and survey date; a column name that exists in both
# tables keeps its name for about-me and gets a '_y' suffix for sleep-habits.
# NOTE(review): because the date is part of the key, a participant who
# answered the two surveys on different days ends up on two separate rows —
# confirm whether joining on participantId alone was intended.
about_me, sleep_habits = surveys_list
df_surveys = about_me.merge(sleep_habits, on=['participantId', 'date'], how='outer', suffixes=('', '_y'))


In [234]:
# Keep the first survey row of every participant
df_surveys.drop_duplicates(subset=['participantId'], inplace=True)

# Value written into each column wherever the answer is missing.
# NOTE(review): the meaning of these codes comes from the survey coding
# scheme, which is not visible in this notebook.
survey_defaults = {
    'menopause': 3,
    'recent_births': 4,
    'current_pregnant': 0,
    'driving_sleepy': 6,
    'falling_asleep': 0,
    'morning_person': 3,
    'nap_duration': 6,
    'sleep_lost': 0,
    'what_wakes_you': 13,
}
for column, default in survey_defaults.items():
    df_surveys[column] = df_surveys[column].fillna(default)

# The timestamps and the join date are not needed any more
df_surveys.drop(columns=['timestamp', 'date', 'timestamp_y'], inplace=True)

In [235]:
def join_multiple_race(row):
    """Collapse a multi-valued answer into the single code 6.

    ``row`` is one cell value. When its text form contains a comma (several
    options were selected) 6 is returned; otherwise the value is returned
    unchanged.
    """
    return 6 if ',' in str(row) else row

# Answers listing several options in 'hispanic' or 'race' become the code 6
for column in ('hispanic', 'race'):
    df_surveys[column] = df_surveys[column].apply(join_multiple_race)

def join_multiple_sleep_partner(row):
    """Collapse a multi-valued sleep_partner answer into the single code 6.

    ``row`` is one cell value. When its text form contains a comma (several
    options were selected) 6 is returned; otherwise the value is returned
    unchanged.
    """
    return 6 if ',' in str(row) else row

df_surveys['sleep_partner'] = df_surveys['sleep_partner'].apply(join_multiple_sleep_partner)

# One indicator column per wake-up reason, all starting at 0
for reason_column in (
    'noise_light',
    'stress_thinking',
    'other_person',
    'pain_discomfort',
    'nightmares',
    'bathroom_urges',
    'other_reasons',
):
    df_surveys[reason_column] = 0

def check_wakeup_reason(row, number):
    """Set the wake-up-reason indicator for one survey row to 1.

    ``row`` is a row of the module-level ``df_surveys`` frame (its ``name`` is
    the index label that gets updated) and ``number`` is one wake-up reason
    code. Codes 1-6 select their own column; every other code is recorded in
    ``other_reasons``. The frame is modified in place; nothing is returned.
    """
    column_by_code = {
        1: 'noise_light',
        2: 'stress_thinking',
        3: 'other_person',
        4: 'pain_discomfort',
        5: 'nightmares',
        6: 'bathroom_urges',
    }
    target_column = column_by_code.get(number, 'other_reasons')
    df_surveys.loc[row.name, target_column] = 1


# Turn every 'what_wakes_you' answer (one code, or several separated by
# commas) into the indicator columns
for _, survey_row in df_surveys.iterrows():
    answer = survey_row['what_wakes_you']
    codes = answer.split(',') if ',' in str(answer) else [answer]
    for code in codes:
        check_wakeup_reason(survey_row, int(code))

# The raw answer column is replaced by the indicator columns
df_surveys.drop(columns=['what_wakes_you'], inplace=True)

print(df_surveys.head())

                          participantId  alcohol_consumption  basic_expenses  \
0  0c82c9d1-25ba-4cb2-95df-f79fca0b8464                  1.0             3.0   
1  20a71d11-3d78-4ee0-a172-5dd8f7e33bd2                  0.0             4.0   
2  f5aac809-fd16-4733-83ee-0991eaf7036f                  0.0             3.0   
3  5ceb42a9-cd99-4ba8-93d8-4fc4a5de5a4f                  2.0             4.0   
4  80bfb0f6-601c-47ed-9538-bedb8eb6c69f                  1.0             2.0   

   caffeine  daily_activities  daily_smoking  education  flexible_work_hours  \
0       4.0               1.0            1.0        4.0                  2.0   
1       5.0               1.0            3.0        4.0                  1.0   
2       3.0               1.0            3.0        4.0                  2.0   
3       1.0               1.0            3.0        5.0                  2.0   
4       3.0               4.0            3.0        5.0                  2.0   

   gender  good_life  ... wake_up_choi

# Pulling Everything Together

1. Add each dataframe to a list
2. Gather a list of unique participants
3. Choose a number of unique participants, and merge their data across each dataframe
4. Continue for all participants
5. Remove NaN values
6. Concatenate each new dataframe

In [236]:
# merge function

def merge_dfs(dfs_list, participants):
    """Outer-join several per-day tables for a group of participants.

    Each frame in ``dfs_list`` is first restricted to rows whose
    ``participantId`` is in ``participants``. The restricted frames are then
    merged left to right on ``participantId`` and ``startDate``; a column name
    that already exists in the running result gets a ``_y`` suffix on the
    incoming side. Returns the merged frame, or ``None`` for an empty list.
    """
    selected = (frame[frame['participantId'].isin(participants)] for frame in dfs_list)

    merged_df = next(selected, None)
    if merged_df is None:
        return None

    for current in selected:
        merged_df = merged_df.merge(current, on=['participantId', 'startDate'], how='outer', suffixes=('', '_y'))
    return merged_df


In [237]:

def get_separate_tables(num_participants):
    """Merge the per-day tables in groups of ``num_participants`` participants.

    The participants are the unique ``participantId`` values of ``df_bmi``
    (the first table in the list), sorted. They are processed in consecutive
    groups of ``num_participants``; for each group the BMI, heart-rate,
    step-count, distance and sleep-quality tables are merged with
    ``merge_dfs``. The participant count and the head of every merged frame
    are printed.

    Returns a list with one merged DataFrame per group.

    Raises:
        ValueError: if ``num_participants`` is smaller than 1 (the grouping
            would otherwise never advance).
    """
    if num_participants < 1:
        raise ValueError("num_participants must be a positive integer")

    # Tables to combine; the first one defines which participants are used.
    # NOTE(review): the AM / PM check-in tables are not part of this list —
    # confirm that leaving them out is intended.
    dfs_list = [df_bmi, df_hr_final, df_sc, df_dwr, df_sq]

    # Unique participants in sorted order
    participants = sorted(dfs_list[0]['participantId'].unique())

    # print length of participants
    print(len(participants))

    result_dfs = []

    # Walk over the participants in consecutive groups
    for group_start in range(0, len(participants), num_participants):
        next_group = participants[group_start:group_start + num_participants]

        # Merge the DataFrames for this group of participants
        merge_df = merge_dfs(dfs_list, next_group)

        print(merge_df.head())

        result_dfs.append(merge_df)

    return result_dfs


In [238]:
# Build the merged tables in groups of 20 participants (get_separate_tables
# also prints the participant count and the head of every merged table)

subset_dfs = get_separate_tables(20)

# Print the shape of each merged DataFrame, numbered by its position in the list
for i, df in enumerate(subset_dfs):
    print(f"DataFrame {i}: {df.shape}")


1790
                          participantId  bodyMass_kg  height_m   startDate  \
0  00a55fb5-da33-4e2e-ae61-28f589fcc174       51.256    1.5748  2016-05-12   
1  00d6d2ee-ccea-45c7-9772-b19fd9bef2bf       74.389    1.9050  2018-04-20   
2  00fd4039-9b5e-4bbb-8295-4983a3f58371       59.874    1.6510  2018-08-15   
3  0113e483-0fc8-4892-a4fc-0b2f3820dde3       68.039    1.7018  2019-03-06   
4  013d82d7-b3cc-4007-b00c-4d1d75bab9dd       76.204    1.6002  2018-02-21   

         bmi  morning_hr  afternoon_hr  evening_hr  night_hr  mean_hr_morning  \
0  20.667783    1.400000      1.438426    1.438426  1.438426         1.438137   
1  20.498343         NaN           NaN         NaN       NaN              NaN   
2  21.965653    1.290195      1.290195    1.290195  1.383000         1.266626   
3  23.493130    1.292230      1.000000    1.292230  1.292230         1.292230   
4  29.759747         NaN           NaN         NaN       NaN              NaN   

   ...                   endTime    val

In [239]:
# Body mass, height and BMI are present on only some rows of a participant.
# Copy them from the participant's first complete row onto all of their rows.

bmi_columns = ['bodyMass_kg', 'height_m', 'bmi']

for df in subset_dfs:

    # Rows on which all three values are present
    has_all_values = df[bmi_columns].notnull().all(axis=1)

    for participant in df['participantId'].unique():
        is_participant = df['participantId'] == participant
        complete_rows = df.index[is_participant & has_all_values]

        # Participants without any complete row are left as they are
        if len(complete_rows) == 0:
            continue

        source_row = df.loc[complete_rows[0]]
        for column in bmi_columns:
            df.loc[is_participant, column] = source_row[column]
    print(f"DataFrame : {df.shape}")

DataFrame : (77, 40)
DataFrame : (120, 40)
DataFrame : (190, 40)
DataFrame : (378, 40)
DataFrame : (205, 40)
DataFrame : (65, 40)
DataFrame : (252, 40)
DataFrame : (95, 40)
DataFrame : (135, 40)
DataFrame : (164, 40)
DataFrame : (43, 40)
DataFrame : (117, 40)
DataFrame : (79, 40)
DataFrame : (71, 40)
DataFrame : (268, 40)
DataFrame : (109, 40)
DataFrame : (167, 40)
DataFrame : (97, 40)
DataFrame : (268, 40)
DataFrame : (166, 40)
DataFrame : (192, 40)
DataFrame : (243, 40)
DataFrame : (198, 40)
DataFrame : (866, 40)
DataFrame : (50, 40)
DataFrame : (99, 40)
DataFrame : (261, 40)
DataFrame : (192, 40)
DataFrame : (84, 40)
DataFrame : (349, 40)
DataFrame : (68, 40)
DataFrame : (58, 40)
DataFrame : (157, 40)
DataFrame : (197, 40)
DataFrame : (72, 40)
DataFrame : (681, 40)
DataFrame : (712, 40)
DataFrame : (170, 40)
DataFrame : (309, 40)
DataFrame : (473, 40)
DataFrame : (147, 40)
DataFrame : (156, 40)
DataFrame : (100, 40)
DataFrame : (122, 40)
DataFrame : (257, 40)
DataFrame : (204, 40)
D

In [240]:
# Stack the per-group tables into one table with a fresh 0..n-1 index

concatenated_df = pd.concat(objs=subset_dfs, axis=0, ignore_index=True)

print(concatenated_df.head())
print(concatenated_df.shape)

# Write the stacked table to disk
all_data_path = save_path + 'all_data.csv'
concatenated_df.to_csv(all_data_path, index=False)



                          participantId  bodyMass_kg  height_m   startDate  \
0  00a55fb5-da33-4e2e-ae61-28f589fcc174       51.256    1.5748  2016-05-12   
1  00d6d2ee-ccea-45c7-9772-b19fd9bef2bf       74.389    1.9050  2018-04-20   
2  00fd4039-9b5e-4bbb-8295-4983a3f58371       59.874    1.6510  2018-08-15   
3  0113e483-0fc8-4892-a4fc-0b2f3820dde3       68.039    1.7018  2019-03-06   
4  013d82d7-b3cc-4007-b00c-4d1d75bab9dd       76.204    1.6002  2018-02-21   

         bmi  morning_hr  afternoon_hr  evening_hr  night_hr  mean_hr_morning  \
0  20.667783    1.400000      1.438426    1.438426  1.438426         1.438137   
1  20.498343         NaN           NaN         NaN       NaN              NaN   
2  21.965653    1.290195      1.290195    1.290195  1.383000         1.266626   
3  23.493130    1.292230      1.000000    1.292230  1.292230         1.292230   
4  29.759747         NaN           NaN         NaN       NaN              NaN   

   ...                   endTime    value  t

In [241]:
# Attach each participant's survey answers to their daily activity rows

activity_and_survey_df = concatenated_df.merge(df_surveys, on=['participantId'], how='outer', suffixes=('', '_y'))

# Row count before incomplete rows are removed
print(f"Number of rows in activity_and_survey_df before NaN removed: {len(activity_and_survey_df)}")

# Keep only rows where every column has a value
activity_and_survey_df = activity_and_survey_df.dropna(axis=0, how='any')

print(f"Number of rows in activity_and_survey_df after NaN removed: {len(activity_and_survey_df)}")

print(activity_and_survey_df.head())
print(activity_and_survey_df.shape)

timed_path = save_path + 'timed_df.csv'
activity_and_survey_df.to_csv(timed_path, index=False)


Number of rows in activity_and_survey_df before NaN removed: 23697
Number of rows in activity_and_survey_df after NaN removed: 664
                            participantId  bodyMass_kg  height_m   startDate  \
0    00a55fb5-da33-4e2e-ae61-28f589fcc174       51.256    1.5748  2016-05-12   
9    00fd4039-9b5e-4bbb-8295-4983a3f58371       59.874    1.6510  2018-08-15   
52   02d5125e-684f-4166-a3b7-5df1bcfc1661       59.874    1.7018  2016-03-05   
184  080292d4-a0b0-4dd7-a7dd-191c8ac71664       77.111    1.8034  2017-10-05   
186  080bacc1-4661-4735-acca-7d27ad1a4192       90.718    1.8034  2016-03-02   

           bmi  morning_hr  afternoon_hr  evening_hr  night_hr  \
0    20.667783    1.400000      1.438426    1.438426  1.438426   
9    21.965653    1.290195      1.290195    1.290195  1.383000   
52   20.673844    1.107266      1.107266    1.107266  1.283000   
184  23.710036    1.056040      1.056040    1.056040  1.000000   
186  27.893906    0.900000      1.179194    1.179194  1.17

In [242]:
# Keep only the first row for each participantId / startDate combination
# (modifies activity_and_survey_df in place)

activity_and_survey_df.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

In [243]:
# Rows with NaN values were already removed when activity_and_survey_df was
# built, so no further dropna() is applied here.

# Save the final DataFrame to a CSV file (without the index column)
activity_and_survey_df.to_csv(save_path + 'final_timed_data.csv', index=False)

In [244]:
# Show the first ten rows of the final table as a quick visual check
print(activity_and_survey_df.head(10))

                             participantId  bodyMass_kg  height_m   startDate  \
0     00a55fb5-da33-4e2e-ae61-28f589fcc174       51.256    1.5748  2016-05-12   
9     00fd4039-9b5e-4bbb-8295-4983a3f58371       59.874    1.6510  2018-08-15   
52    02d5125e-684f-4166-a3b7-5df1bcfc1661       59.874    1.7018  2016-03-05   
184   080292d4-a0b0-4dd7-a7dd-191c8ac71664       77.111    1.8034  2017-10-05   
186   080bacc1-4661-4735-acca-7d27ad1a4192       90.718    1.8034  2016-03-02   
410   0b79acd2-ea0b-406f-aa41-e48b98f19bb4       72.575    1.6764  2016-03-02   
900   0f00f803-6e81-4b77-9efc-3be73b5bface       98.652    1.8542  2017-03-05   
904   0f279ffd-a1b5-473f-9cfc-01e5510ea5a4      111.584    1.8034  2018-06-19   
1287  156fe69a-1c98-4337-b310-aee985e94279       60.781    1.6256  2016-04-02   
1518  1aaad641-a3b6-4e47-a010-e8cb01ce3d03       82.010    1.7526  2016-03-12   

            bmi  morning_hr  afternoon_hr  evening_hr  night_hr  \
0     20.667783    1.400000      1.438426