# **Apple Health Data XML -> CSV**
---

### Import Libraries

In [1]:
import xml.etree.ElementTree as ET 
import pandas as pd 
import numpy as np
import datetime as dt 

### Parse XML File into Element Tree Object 

In [2]:
# Read the Apple Health export from disk into an ElementTree
tree = ET.parse('data/export.xml')
root = tree.getroot()

# Walk the tree and keep the attribute dict of each <Record> element
# (one dict per health record)
record_list = [record.attrib for record in root.iter('Record')]

### Create Pandas Dataframe 

In [3]:
# Columns holding timestamps, and the type-name prefixes to strip
date_columns = ['creationDate', 'startDate', 'endDate']
type_prefixes = ['HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier']

# One row per record, one column per XML attribute
record_data = pd.DataFrame(record_list)

# Convert the timestamp columns from strings to datetimes
record_data = record_data.assign(
    **{name: pd.to_datetime(record_data[name]) for name in date_columns}
)

# Coerce `value` to a number (anything unparseable becomes NaN), then
# replace NaN with 1.0: some records do not measure anything and only
# count occurrences, so treating each as "one time" makes aggregation easier
record_data['value'] = pd.to_numeric(record_data['value'], errors='coerce').fillna(1.0)

# Shorten the observation names by removing the HealthKit prefixes
short_type = record_data['type']
for prefix in type_prefixes:
    short_type = short_type.str.replace(prefix, '')
record_data['type'] = short_type

# Preview the last few rows
record_data.tail()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
177481,HeadphoneAudioExposureEvent,Brighton Mini,15.6,,2022-11-22 16:41:27-05:00,2022-11-16 18:17:49-05:00,2022-11-22 16:41:27-05:00,1.0,
177482,HeadphoneAudioExposureEvent,Brighton Mini,15.6,,2022-11-28 14:31:18-05:00,2022-11-22 16:41:27-05:00,2022-11-28 14:31:18-05:00,1.0,
177483,HeadphoneAudioExposureEvent,Brighton Mini,15.6,,2022-11-29 16:22:42-05:00,2022-11-28 14:31:18-05:00,2022-11-29 16:22:42-05:00,1.0,
177484,HeadphoneAudioExposureEvent,Brighton Mini,15.6,,2022-12-02 08:18:16-05:00,2022-11-29 16:22:42-05:00,2022-12-02 08:18:15-05:00,1.0,
177485,HeadphoneAudioExposureEvent,Brighton Mini,15.6,,2022-12-06 09:22:06-05:00,2022-12-02 08:18:15-05:00,2022-12-06 09:22:06-05:00,1.0,


In [4]:
# Sanity-check the cleaned frame: column names, non-null counts and dtypes
record_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177486 entries, 0 to 177485
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype                                 
---  ------         --------------   -----                                 
 0   type           177486 non-null  object                                
 1   sourceName     177486 non-null  object                                
 2   sourceVersion  177486 non-null  object                                
 3   unit           177356 non-null  object                                
 4   creationDate   177486 non-null  datetime64[ns, pytz.FixedOffset(-300)]
 5   startDate      177486 non-null  datetime64[ns, pytz.FixedOffset(-300)]
 6   endDate        177486 non-null  datetime64[ns, pytz.FixedOffset(-300)]
 7   value          177486 non-null  float64                               
 8   device         177354 non-null  object                                
dtypes: datetime64[ns, pytz.FixedOffset(-300)](3), flo

### Pull `StepCount` and `DistanceWalkingRunning` into a DataFrame

In [5]:
# Reshape to one row per record end-timestamp and one column per record type.
# aggfunc='sum' is passed explicitly: pivot_table defaults to 'mean', which
# would average records sharing the same endDate and type instead of adding
# them, contradicting the daily totals computed below.
pivot_df = record_data.pivot_table(index='endDate', columns='type',
                                   values='value', aggfunc='sum')

# Daily totals for StepCount and DistanceWalkingRunning.
# The string 'sum' (not the builtin `sum`) selects pandas' own NaN-skipping
# reduction and avoids the FutureWarning that pandas >= 2.1 emits when a
# builtin callable is passed to .agg().
df = pivot_df.resample('D').agg({'StepCount': 'sum',
                                 'DistanceWalkingRunning': 'sum'})

# Represent each day as a plain 'YYYY-MM-DD' string
df.index = df.index.strftime('%Y-%m-%d')
df

type,StepCount,DistanceWalkingRunning
endDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-09-10,3246.0,1.309123
2021-09-11,10003.0,4.154720
2021-09-12,4812.0,2.097997
2021-09-13,4672.0,1.776384
2021-09-14,5914.0,2.324539
...,...,...
2022-12-03,12476.0,4.832174
2022-12-04,3428.0,1.365090
2022-12-05,16428.0,6.891993
2022-12-06,12660.0,5.101112


### Pandas DF to CSV 

In [6]:
# Save the daily StepCount / DistanceWalkingRunning table as a CSV file
df.to_csv('data/steps_distance.csv')