 This data preparation will focus on the features used in Toward a Holistic Computational Representation for Sleep Quality and its Support for Explainability

The data will be prepared for Linear Regression, Random Forest Regressor, and Keras.

In [14]:
# read in data

import pandas as pd
import numpy as np

folder_path = 'c:/Users/aoife/Documents/Project/DataTables/'
save_path = 'C:/Users/aoife/Documents/Project/second-approach/'

df_onboarding = pd.read_csv(folder_path + 'onboarding-demographics.csv', usecols=['participantId', 'age_years', 'gender', 'timestamp'])
df_bm = pd.read_csv(folder_path + 'body-mass.csv', usecols=['participantId', 'value', 'timestamp'])
df_height = pd.read_csv(folder_path + 'height.csv', usecols=['participantId', 'value', 'timestamp'])
df_my_health = pd.read_csv(folder_path + 'my-health.csv', usecols=['participantId', 'stressed', 'cancer', 'diabetes', 'timestamp'])
df_hr = pd.read_csv(folder_path + 'heart-rate.csv', usecols=['participantId', 'value', 'startTime', 'endTime', 'timestamp'])
df_nt = pd.read_csv(folder_path + 'nap-tracker.csv', usecols=['participantId', 'NapDuration', 'timestamp'])
df_about_me = pd.read_csv(folder_path + 'about-me.csv', usecols=['participantId', 'menopause', 'timestamp'])
df_steps = pd.read_csv(folder_path + 'step-count.csv', usecols=['participantId', 'value', 'startTime', 'endTime', 'timestamp'])
df_sq = pd.read_csv(folder_path + 'sleep-quality-checker.csv', usecols=['participantId', 'value', 'timestamp'])


In [15]:
# convert timestamp data to datetime in all dataframes

df_onboarding['timestamp'] = pd.to_datetime(df_onboarding['timestamp'], utc=True)
df_my_health['timestamp'] = pd.to_datetime(df_my_health['timestamp'], utc=True)
df_hr['timestamp'] = pd.to_datetime(df_hr['timestamp'], utc=True)
df_hr['startTime'] = pd.to_datetime(df_hr['startTime'], utc=True)
df_hr['endTime'] = pd.to_datetime(df_hr['endTime'], utc=True)
df_nt['timestamp'] = pd.to_datetime(df_nt['timestamp'], utc=True)
df_about_me['timestamp'] = pd.to_datetime(df_about_me['timestamp'], utc=True)
df_steps['timestamp'] = pd.to_datetime(df_steps['timestamp'], utc=True)
df_steps['startTime'] = pd.to_datetime(df_steps['startTime'], utc=True)
df_steps['endTime'] = pd.to_datetime(df_steps['endTime'], utc=True)
df_sq['timestamp'] = pd.to_datetime(df_sq['timestamp'], utc=True)


In [16]:
df_hr['startDate'] = df_hr['startTime'].dt.date
df_steps['startDate'] = df_steps['startTime'].dt.date

# change all timestamps to date only
df_onboarding['startDate'] = df_onboarding['timestamp'].dt.date
df_my_health['startDate'] = df_my_health['timestamp'].dt.date
df_nt['startDate'] = df_nt['timestamp'].dt.date
df_about_me['startDate'] = df_about_me['timestamp'].dt.date
df_sq['startDate'] = df_sq['timestamp'].dt.date


# preview
print(df_onboarding.head())
print(df_bm.head())
print(df_height.head())
print(df_my_health.head())
print(df_hr.head())
print(df_nt.head())
print(df_about_me.head())
print(df_steps.head())
print(df_sq.head())


                          participantId  gender  age_years  \
0  3d6c0442-9150-4974-8257-3a94461c8790    Male       27.0   
1  a12e2ca0-2cf4-498f-a51d-1644630511ec  Female       19.0   
2  f63a6a72-88c1-4b49-9326-578f33ed8e9a    Male       51.0   
3  00a347ef-8bac-4b44-8820-338fd910d4e0    Male       34.0   
4  f3f37736-a61f-414e-903a-9de0d92d1ded    Male       19.0   

                  timestamp   startDate  
0 2016-07-21 21:44:06+00:00  2016-07-21  
1 2016-07-22 00:29:50+00:00  2016-07-22  
2 2016-03-10 16:34:55+00:00  2016-03-10  
3 2016-03-04 16:15:23+00:00  2016-03-04  
4 2016-04-02 17:50:39+00:00  2016-04-02  
                          participantId             timestamp    value
0  11b2bf4d-6020-4a86-81bd-237c5616c649  2016-08-24T23:56:14Z   97.976
1  9cbbb597-30c0-4f48-aebd-7fe58c627cf6  2016-03-16T02:33:20Z   63.503
2  941e9fa4-6ba5-41f5-9bcc-805849d751b8  2017-01-01T14:07:00Z   96.717
3  41c683b0-f6c4-4a6a-8dd2-4b9a3dd30a32  2016-03-30T02:24:59Z  113.398
4  35fbae45-0f21-45e

In [17]:
import pandas as pd
from datetime import time
import numpy as np

df_hr = pd.read_csv(folder_path + 'heart-rate.csv', usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'], parse_dates=['timestamp', 'startTime', 'endTime'])

# Convert startTime and endTime to datetime and get date
df_hr['startTime'] = pd.to_datetime(df_hr['startTime'])
df_hr['endTime'] = pd.to_datetime(df_hr['endTime'])
df_hr['startDate'] = df_hr['startTime'].dt.date

# Remove rows with duplicated participantId, startTime, and endTime
df_hr.drop_duplicates(subset=['participantId', 'startTime', 'endTime'], inplace=True)

# Initialize time of day columns as NaN
df_hr['morning_hr'] = np.nan
df_hr['afternoon_hr'] = np.nan
df_hr['evening_hr'] = np.nan
df_hr['night_hr'] = np.nan

# Define time boundaries
# TODO look into what times are best for these
morning_end = time(12, 0)
afternoon_end = time(18, 0)
evening_end = time(23, 59)
night_end = time(6, 0)

# Classify each heart rate entry into the correct time of day
for index, row in df_hr.iterrows():
    start_time = row['startTime'].time()
    if night_end <= start_time < morning_end:
        df_hr.at[index, 'morning_hr'] = row['value']
    elif morning_end <= start_time < afternoon_end:
        df_hr.at[index, 'afternoon_hr'] = row['value']
    elif afternoon_end <= start_time < evening_end:
        df_hr.at[index, 'evening_hr'] = row['value']
    else:
        df_hr.at[index, 'night_hr'] = row['value']

# Calculate a temporary daily average heart rate excluding NaN values for accurate imputation
df_hr['temp_daily_avg_hr'] = df_hr[['morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']].mean(axis=1, skipna=True)

# Group by participant and date to compute an accurate daily average for imputation, excluding days with all NaN values
accurate_daily_avg = df_hr.groupby(['participantId', 'startDate'])['temp_daily_avg_hr'].mean().reset_index().rename(columns={'temp_daily_avg_hr': 'accurate_daily_avg_hr'})

# Merge this accurate daily average back into the original dataframe
df_hr = pd.merge(df_hr, accurate_daily_avg, on=['participantId', 'startDate'], how='left')

# Impute missing values in each period with the recalculated accurate daily average
for period in ['morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']:
    df_hr[period] = df_hr[period].fillna(df_hr['accurate_daily_avg_hr'])

# Calculate mean, max, and min heart rate for each period (includes imputed values)
df_hr_aggregated = df_hr.groupby(['participantId', 'startDate']).agg(
    mean_hr_morning=('morning_hr', 'mean'),
    max_hr_morning=('morning_hr', 'max'),
    min_hr_morning=('morning_hr', 'min'),
    mean_hr_afternoon=('afternoon_hr', 'mean'),
    max_hr_afternoon=('afternoon_hr', 'max'),
    min_hr_afternoon=('afternoon_hr', 'min'),
    mean_hr_evening=('evening_hr', 'mean'),
    max_hr_evening=('evening_hr', 'max'),
    min_hr_evening=('evening_hr', 'min'),
    mean_hr_night=('night_hr', 'mean'),
    max_hr_night=('night_hr', 'max'),
    min_hr_night=('night_hr', 'min')
).reset_index()

# Drop the temporary columns
df_hr_summary = df_hr[['participantId', 'startDate', 'morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']].drop_duplicates(subset=['participantId', 'startDate'])

# Merge the aggregated values
df_hr_final = pd.merge(df_hr_summary, df_hr_aggregated, on=['participantId', 'startDate'], how='left')

df_hr_final.drop_duplicates(subset=['participantId', 'startDate'], keep='first', inplace=True)

# savr to csv just to check all the entries
df_hr_final.to_csv(save_path + 'heart_rate_summary.csv', index=False)

print(df_hr_final.head())


                          participantId   startDate  morning_hr  afternoon_hr  \
0  06bc6ebb-a233-469f-8091-90256f656b1b  2016-03-13    1.173082      1.500000   
1  2214f4fd-1ae0-4804-8663-b01c5f6d142c  2016-03-02    1.260992      1.333000   
2  b2571643-4aec-492f-bc7f-6f23c7fe239a  2018-10-03    1.300000      1.263071   
3  b1406c4e-e6ac-4297-a9e4-335ca5ef04de  2019-01-24    1.203742      1.203742   
4  b1406c4e-e6ac-4297-a9e4-335ca5ef04de  2019-01-23    1.903400      1.903400   

   evening_hr  night_hr  mean_hr_morning  max_hr_morning  min_hr_morning  \
0    1.173082  1.173082         1.159673        1.233000        0.867000   
1    1.260992  1.260992         1.260992        1.260992        1.260992   
2    1.263071  1.263071         1.285036        1.533000        1.133000   
3    1.203742  1.333000         1.203742        1.203742        1.203742   
4    1.917000  1.903400         1.903400        1.903400        1.903400   

   mean_hr_afternoon  max_hr_afternoon  min_hr_afternoon

In [18]:
import pandas as pd
from datetime import time, datetime
import numpy as np

df_sc = pd.read_csv(folder_path + 'step-count.csv', usecols=['participantId', 'timestamp', 'startTime', 'endTime', 'value'], parse_dates=['timestamp', 'startTime', 'endTime'])

df_sc['startTime'] = pd.to_datetime(df_sc['startTime'], utc=True)
df_sc['endTime'] = pd.to_datetime(df_sc['endTime'], utc=True)
df_sc['startDate'] = df_sc['startTime'].dt.date

df_sc['duration'] = (df_sc['endTime'] - df_sc['startTime']).dt.total_seconds()


# Calculate total steps per day for each participant
df_sc['totalSteps'] = df_sc.groupby(['participantId', 'startDate'])['value'].transform('sum')

# Calculate the total duration per day for each participant
df_sc['stepsTotalDuration'] = df_sc.groupby(['participantId', 'startDate'])['duration'].transform('sum')
df_sc['steps_per_hour'] = df_sc['totalSteps'] / (df_sc['stepsTotalDuration'] / (60*60))

# Time boundaries
morning_end = time(12, 0)
afternoon_end = time(18, 0)
evening_end = time(23, 59)
night_end = time(6, 0)

# Initialize step counts for different times of day as NaN
df_sc['morningSteps'] = np.nan
df_sc['afternoonSteps'] = np.nan
df_sc['eveningSteps'] = np.nan
df_sc['nightSteps'] = np.nan

# Classify steps into different times of the day
for index, row in df_sc.iterrows():
    start_time = row['startTime'].time()
    if night_end < start_time <= morning_end:
        df_sc.at[index, 'morningSteps'] = row['value']
    elif morning_end < start_time <= afternoon_end:
        df_sc.at[index, 'afternoonSteps'] = row['value']
    elif afternoon_end < start_time <= evening_end:
        df_sc.at[index, 'eveningSteps'] = row['value']
    else:
        df_sc.at[index, 'nightSteps'] = row['value']

# aggregate steps into totals for each time
df_summary = df_sc.groupby(['participantId', 'startDate']).agg(
    morningStepsTotal=('morningSteps', 'sum'),
    afternoonStepsTotal=('afternoonSteps', 'sum'),
    eveningStepsTotal=('eveningSteps', 'sum'),
    nightStepsTotal=('nightSteps', 'sum'),
    totalSteps=('totalSteps', 'first'), 
    stepsTotalDuration=('stepsTotalDuration', 'first')
).reset_index()

# drop unnecessary columns
df_sc.drop(columns=['morningSteps', 'afternoonSteps', 'eveningSteps', 'nightSteps', 'duration', 'value', 'startTime', 'endTime', 'totalSteps', 'stepsTotalDuration'], inplace=True)
# merge back together
df_sc = pd.merge(df_sc, df_summary, on=['participantId', 'startDate'], how='left')

# Drop duplicates
df_sc.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

df_sc.to_csv(save_path + 'step_count_summary.csv', index=False)

print(df_sc.head())


                            participantId                 timestamp  \
0    1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:03+00:00   
21   1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:03+00:00   
91   1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:03+00:00   
116  1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:07+00:00   
127  1f649060-680a-4c80-a551-be38ce46cb94 2016-05-14 03:04:07+00:00   

      startDate  steps_per_hour  morningStepsTotal  afternoonStepsTotal  \
0    2016-03-22     3296.571899                0.0               4422.0   
21   2016-03-23     3838.334779                0.0               8885.0   
91   2016-03-24     4264.419532               20.0               7859.0   
116  2016-03-29     3292.721759              118.0               8130.0   
127  2016-03-30     3490.214189                0.0               6803.0   

     eveningStepsTotal  nightStepsTotal  totalSteps  stepsTotalDuration  
0               5529.0             66.0       10

In [19]:
# calculate bmi based on height and weight

# rename value columns
df_bm.rename(columns={'value': 'weight'}, inplace=True)
df_height.rename(columns={'value': 'height'}, inplace=True)

# merge height and weight data
df_bmi = pd.merge(df_bm, df_height, on=['participantId', 'timestamp'], how='outer')

# calculate bmi
df_bmi['bmi'] = df_bmi['weight'] / ((df_bmi['height'] / 100) ** 2)


In [20]:
# remove startdate

df_about_me.drop(columns=['startDate'], inplace=True)
df_my_health.drop(columns=['startDate'], inplace=True)



In [21]:
# for NapDuration in df_nt, fill NaN with 0
df_nt['NapDuration'] = df_nt['NapDuration'].fillna(0)


In [22]:
print(df_nt.head())
print(df_about_me.head())

                          participantId  NapDuration  \
0  cb4c4df0-6685-4743-a3cf-d29c3d6941c9       4500.0   
1  1f230684-4fa7-48ce-a7c2-6372238fe486      24708.0   
2  02b96d81-c9f6-4a73-a075-75a64e54d005       3300.0   
3  28e7a976-55e2-4ff1-8e95-7278f1ee9ec9          0.0   
4  1dcf514b-8610-40f3-9a78-43ef80b5b7b7       4093.0   

                  timestamp   startDate  
0 2016-07-11 02:37:13+00:00  2016-07-11  
1 2016-09-07 12:20:41+00:00  2016-09-07  
2 2016-03-12 01:47:15+00:00  2016-03-12  
3 2016-09-07 22:48:16+00:00  2016-09-07  
4 2016-09-08 13:21:58+00:00  2016-09-08  
                          participantId  menopause                 timestamp
0  0c82c9d1-25ba-4cb2-95df-f79fca0b8464        3.0 2016-03-04 22:54:18+00:00
1  20a71d11-3d78-4ee0-a172-5dd8f7e33bd2        NaN 2016-03-04 22:59:09+00:00
2  f5aac809-fd16-4733-83ee-0991eaf7036f        NaN 2016-03-04 23:11:14+00:00
3  5ceb42a9-cd99-4ba8-93d8-4fc4a5de5a4f        NaN 2016-03-04 23:22:08+00:00
4  80bfb0f6-601c-47ed-9538

In [23]:
# rename sleep quality value column
df_sq.rename(columns={'value': 'sq_score'}, inplace=True)

In [24]:
# merge datasets

# add to list
from functools import reduce


dataframes = [df_hr_final, df_sc, df_nt]

# merge dataframes
df_merged = pd.DataFrame()

df_merged = reduce(lambda left, right: pd.merge(left, right, on=['participantId', 'startDate'], how='outer'), dataframes)

# drop duplicates
df_merged.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)
# drop timestamp
#df_merged.drop(columns=['timestamp'], inplace=True)
print (df_merged.head())


                          participantId   startDate  morning_hr  afternoon_hr  \
0  06bc6ebb-a233-469f-8091-90256f656b1b  2016-03-13    1.173082      1.500000   
1  2214f4fd-1ae0-4804-8663-b01c5f6d142c  2016-03-02    1.260992      1.333000   
2  b2571643-4aec-492f-bc7f-6f23c7fe239a  2018-10-03    1.300000      1.263071   
3  b1406c4e-e6ac-4297-a9e4-335ca5ef04de  2019-01-24    1.203742      1.203742   
4  b1406c4e-e6ac-4297-a9e4-335ca5ef04de  2019-01-23    1.903400      1.903400   

   evening_hr  night_hr  mean_hr_morning  max_hr_morning  min_hr_morning  \
0    1.173082  1.173082         1.159673        1.233000        0.867000   
1    1.260992  1.260992         1.260992        1.260992        1.260992   
2    1.263071  1.263071         1.285036        1.533000        1.133000   
3    1.203742  1.333000         1.203742        1.203742        1.203742   
4    1.917000  1.903400         1.903400        1.903400        1.903400   

   mean_hr_afternoon  ...               timestamp_x  ste

In [25]:
# merge with other dataframes

dataframes = [ df_onboarding, df_my_health, df_about_me, df_bmi, df_sq]

for df in dataframes:
    if 'timestamp' in df.columns:
        df.drop(columns=['timestamp'], inplace=True)

# merge dataframes
df_surveys = pd.DataFrame()

df_surveys = reduce(lambda left, right: pd.merge(left, right, on=['participantId'], how='outer', suffixes=('', '_y')), dataframes)



In [26]:


dataframes = [df_merged, df_surveys]

for df in dataframes:
    if 'timestamp' in df.columns:
        df.drop(columns=['timestamp'], inplace=True)

# merge dataframes
df_merged = pd.DataFrame()

df_merged = reduce(lambda left, right: pd.merge(left, right, on=['participantId'], how='outer', suffixes=('', '_y')), dataframes)




MemoryError: Unable to allocate 43.5 GiB for an array with shape (5837071630,) and data type int64

In [None]:
# for nap tracker, fill nan with 0

df_merged['NapDuration'].fillna(0, inplace=True)

# for menopause, fill nan with 3
df_merged['menopause'].fillna(3, inplace=True)

In [None]:
print(df_merged.head())

# drop duplicates
df_merged.drop_duplicates(subset=['participantId', 'startDate'], inplace=True)

# save to csv
df_merged.to_csv(save_path + 'merged_data_before_dropna.csv', index=False)


                          participantId   startDate  morning_hr  afternoon_hr  \
0  06bc6ebb-a233-469f-8091-90256f656b1b  2016-03-13    1.173082      1.500000   
1  2214f4fd-1ae0-4804-8663-b01c5f6d142c  2016-03-02    1.260992      1.333000   
2  b2571643-4aec-492f-bc7f-6f23c7fe239a  2018-10-03    1.300000      1.263071   
3  b2571643-4aec-492f-bc7f-6f23c7fe239a  2018-10-03    1.300000      1.263071   
4  b2571643-4aec-492f-bc7f-6f23c7fe239a  2018-10-03    1.300000      1.263071   

   evening_hr  night_hr  mean_hr_morning  max_hr_morning  min_hr_morning  \
0    1.173082  1.173082         1.159673        1.233000        0.867000   
1    1.260992  1.260992         1.260992        1.260992        1.260992   
2    1.263071  1.263071         1.285036        1.533000        1.133000   
3    1.263071  1.263071         1.285036        1.533000        1.133000   
4    1.263071  1.263071         1.285036        1.533000        1.133000   

   mean_hr_afternoon  ...  timestamp_y  gender  age_year

In [None]:
# TODO: before this, remove extra startdates


# remove NaN
df_merged = df_merged.dropna()


In [None]:
# save to csv

df_merged.to_csv(save_path + 'initial_data.csv', index=False)

In [None]:
# print number of participants in merged data
print(len(df_merged['participantId'].unique()))

11
