# Import Data

In [1]:
# import dependencies
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date, datetime, timedelta, time
import re
import numpy as np

In [2]:
# open and read the xml file
# The path is relative to the notebook's working directory.
file_path = '../apple_health_export/export.xml'

# Decode explicitly as UTF-8 rather than with the platform's default encoding,
# which varies between machines and can garble or reject non-ASCII characters.
# NOTE(review): assumes the export is UTF-8 encoded — confirm against the
# file's XML declaration.
with open(file_path, 'r', encoding='utf-8') as f:
    data = f.read()

# parse the whole document with BeautifulSoup's 'xml' parser
bs_data = BeautifulSoup(data, 'xml')

### Get the data we need

In [3]:
# collect every element whose `type` attribute is the heart-rate identifier
hr_data = bs_data.find_all(attrs={'type': 'HKQuantityTypeIdentifierHeartRate'})

In [4]:
# collect every element whose `type` attribute is the environmental
# audio-exposure identifier
sound_data = bs_data.find_all(
    attrs={'type': 'HKQuantityTypeIdentifierEnvironmentalAudioExposure'}
)

## Rework Data

#### Heart Rate Data

In [7]:
# Empty columns for the heart-rate table. hr_dict below stores these same list
# objects (not copies), so anything appended to a list later is also visible
# through the dictionary.
date_creation, time_creation, hr_value, hr_bin, time_bin = [], [], [], [], []

# column name -> backing list
hr_dict = dict(
    date_created=date_creation,
    time_created=time_creation,
    hr_values=hr_value,
    hr_bin=hr_bin,
    time_bin=time_bin,
)

# Pull the creation timestamp and the reading out of every heart-rate record.
# creationDate is split on whitespace: piece 0 is stored as the date and
# piece 1 as the time; any further pieces are ignored.
for hr in hr_data:
    stamp_parts = hr['creationDate'].split()
    date_creation.append(stamp_parts[0])
    time_creation.append(stamp_parts[1])
    hr_value.append(float(hr['value']))

# Label every reading: above 100 is 'high', below 60 is 'low', and everything
# else (60 through 100 inclusive) is 'normal'.
hr_bin.extend(
    'high' if reading > 100 else 'low' if reading < 60 else 'normal'
    for reading in hr_value
)

# bins for each time range
def time_in_range(start_time, end_time, creation_time):
    """Return True when creation_time's clock time is within [start_time, end_time].

    Each argument only needs a ``strftime`` method. All three are rendered as
    'HH:MM:SS' strings and the strings are compared, so both boundaries are
    inclusive and anything other than hours, minutes and whole seconds (a date
    part, microseconds) plays no role in the result.
    """
    clock_format = '%H:%M:%S'
    stamp = creation_time.strftime(clock_format)
    # guard: earlier than the start of the range
    if not (start_time.strftime(clock_format) <= stamp):
        return False
    return stamp <= end_time.strftime(clock_format)

# Daily schedule: one (start, end) pair of clock times per period, numbered in
# the order the periods occur during the day.
# period 1: before the first class
start_time_1, end_time_1 = time(6, 45), time(9, 25)
# period 2: 5th grade
start_time_2, end_time_2 = time(9, 30), time(10, 15)
# period 3: 4th grade
start_time_3, end_time_3 = time(10, 20), time(11, 5)
# period 4: lunch
start_time_4, end_time_4 = time(11, 5), time(11, 35)
# period 5: kinder
start_time_5, end_time_5 = time(11, 35), time(12, 20)
# period 6: 1st grade
start_time_6, end_time_6 = time(12, 25), time(13, 10)
# period 7: 3rd grade
start_time_7, end_time_7 = time(13, 15), time(14, 0)
# period 8: 2nd grade
start_time_8, end_time_8 = time(14, 5), time(14, 50)
# period 9: dismissal
start_time_9, end_time_9 = time(14, 50), time(16, 0)


# (bin number, period start, period end) for every period of the day.
# The order matters: the first matching period wins, so a time that sits on a
# boundary shared by two periods (e.g. 11:05:00, which ends period 3 and
# starts period 4) is assigned to the earlier one.
period_bins = [
    (1, start_time_1, end_time_1),
    (2, start_time_2, end_time_2),
    (3, start_time_3, end_time_3),
    (4, start_time_4, end_time_4),
    (5, start_time_5, end_time_5),
    (6, start_time_6, end_time_6),
    (7, start_time_7, end_time_7),
    (8, start_time_8, end_time_8),
    (9, start_time_9, end_time_9),
]

# Assign each reading the number of the period its creation time falls in.
# Appending to time_bin also fills hr_dict['time_bin'] (same list object).
for creation_time in hr_dict['time_created']:
    # the stored time is an 'HH:MM:SS' string; parse it so it can be compared
    time_dt = datetime.strptime(creation_time, '%H:%M:%S')
    time_bin.append(next(
        (
            bin_number
            for bin_number, period_start, period_end in period_bins
            if time_in_range(period_start, period_end, time_dt)
        ),
        0,  # outside every period (including the gaps between periods)
    ))

In [9]:
# turn the column dictionary into a DataFrame (one row per heart-rate reading)
hr_df = pd.DataFrame(data=hr_dict)
# summary statistics of the numeric columns, shown as the cell output
hr_df.describe()

Unnamed: 0,hr_values,time_bin
count,125156.0,125156.0
mean,86.310397,2.38424
std,24.331602,3.229963
min,39.0,0.0
25%,70.0,0.0
50%,81.0,0.0
75%,96.0,5.0
max,203.0,9.0


# Work Days

In [10]:
def date_in_range(start, end, day):
    """Report whether ``day`` lies in the closed interval [start, end].

    Both ``start`` and ``end`` count as inside the range.
    """
    # guard: day falls before the start of the range
    if not (start <= day):
        return False
    return day <= end

# first and last calendar day of the period to keep (both at midnight)
start = datetime(year=2022, month=3, day=28)
end = datetime(year=2022, month=6, day=9)

In [22]:
# Keep only the readings dated within [start, end] (both ends inclusive).
# The date strings are parsed once for the whole column and compared as a
# vector, instead of parsing and testing every row inside an iterrows() loop.
hr_dates = pd.to_datetime(hr_df['date_created'], format='%Y-%m-%d')
in_window = hr_dates.between(start, end)
window_df = hr_df.loc[in_window]

# NOTE: these names are rebound to NEW lists here; the lists referenced by
# hr_dict keep the full, unfiltered data.
date_creation = window_df['date_created'].tolist()
time_creation = window_df['time_created'].tolist()
hr_value = window_df['hr_values'].tolist()
hr_bin = window_df['hr_bin'].tolist()
time_bin = window_df['time_bin'].tolist()
# day of the week for every kept reading: Monday is 0 ... Sunday is 6
day_week = hr_dates[in_window].dt.weekday.tolist()

school_dict = {
    'date_created': date_creation,
    'time_created': time_creation,
    'hr_values': hr_value,
    'hr_bin': hr_bin,
    'time_bin': time_bin,
    'day_week': day_week
}

school_df = pd.DataFrame(school_dict)

# removing all the weekend days (Saturday is 5, Sunday is 6)
school_df = school_df[school_df['day_week'] < 5]
# side effect: writes the filtered table to work_hr.csv in the working directory
school_df.to_csv('work_hr.csv', index=False)

## Daily DataFrame
- find unique dates
- find avg_hr, min_hr, max_hr, median_hr for each day

In [70]:
unique_dates = []
daily_info = []
avg_hr_list = []
max_hr_list = []
min_hr_list = []
median_hr_list = []
day_of_week_list = []

# Walk the data once, one group of rows per date, instead of rescanning the
# whole frame with iterrows() for every unique date.
# sort=False keeps the dates in order of first appearance in school_df.
for day, day_rows in school_df.groupby('date_created', sort=False):
    unique_dates.append(day)
    hrs = day_rows['hr_values'].to_numpy()

    # use numpy to determine mean, min, max, median
    avg_hr = np.mean(hrs)
    min_hr = np.min(hrs)
    max_hr = np.max(hrs)
    median_hr = np.median(hrs)

    # append each respective list with value
    avg_hr_list.append(avg_hr)
    max_hr_list.append(max_hr)
    min_hr_list.append(min_hr)
    median_hr_list.append(median_hr)

    # day of the week for this date: Monday is 0 ... Sunday is 6
    day_dt = datetime.strptime(day, '%Y-%m-%d')
    day_of_week = day_dt.weekday()
    day_of_week_list.append(day_of_week)

    # one summary record per date; these become the rows of daily_df
    daily_dicts = {
        'day': day,
        'avg_hr': avg_hr,
        'min_hr': min_hr,
        'max_hr': max_hr,
        'median_hr': median_hr,
        'day_week': day_of_week
    }

    daily_info.append(daily_dicts)

# create daily_df
daily_df = pd.DataFrame(daily_info)

In [74]:
# preview the first five rows of the per-day summary
daily_df.head(n=5)

Unnamed: 0,day,avg_hr,min_hr,max_hr,median_hr,week_day
0,2022-03-28,94.961433,61.0,126.0,100.0,0
1,2022-03-29,82.673681,56.0,127.0,81.0,1
2,2022-03-30,75.8487,51.0,111.0,75.0,2
3,2022-03-31,83.492542,59.0,123.0,81.0,3
4,2022-04-01,92.928273,56.0,126.0,95.0,4


### Figuring out which day of the week had the highest values

In [94]:

# separate frames holding only the Monday (day_week 0) and Tuesday (day_week 1) rows
monday_df = pd.DataFrame(data=school_df.loc[school_df['day_week'] == 0])
tuesday_df = pd.DataFrame(data=school_df.loc[school_df['day_week'] == 1])

In [104]:
# mean heart rate over all Monday readings (day_week == 0)
mon_avg = np.mean(school_df.loc[school_df['day_week'] == 0, 'hr_values'])
mon_avg

98.088683247918

In [106]:
# mean heart rate over all Tuesday readings (day_week == 1)
tue_avg = np.mean(school_df.loc[school_df['day_week'] == 1, 'hr_values'])
tue_avg

89.22082773393461

In [107]:
# mean heart rate over all Wednesday readings (day_week == 2)
wed_avg = np.mean(school_df.loc[school_df['day_week'] == 2, 'hr_values'])
wed_avg

79.64808369330453

In [108]:
# mean heart rate over all Thursday readings (day_week == 3)
thu_avg = np.mean(school_df.loc[school_df['day_week'] == 3, 'hr_values'])
thu_avg

75.47185761904761

In [109]:
# mean heart rate over all Friday readings (day_week == 4)
fri_avg = np.mean(school_df.loc[school_df['day_week'] == 4, 'hr_values'])
fri_avg

85.36060530570995