# Packages

In [1]:
import os
import xml.etree.ElementTree as ET

import pandas as pd

# Practice

We will play around with parsing XML files by using an example file, 'country_data.xml'

In [4]:
# Relative path to the example XML document used throughout this section
file = 'data/country_data.xml'

# Parse the document from disk into an ElementTree object
tree = ET.parse(file)

XML files are organized hierarchically. We can start extracting data by getting the root of the tree.

In [6]:
# Get the root element of the parsed tree; printing it shows its tag
root = tree.getroot()
print(root)

<Element 'data' at 0x110b531a0>


XML is organized in elements, similar to HTML. The root element has a tag: "data".

We can investigate the number of child nodes attached to this root node using `len()`

In [12]:
# len() of an element is the number of its direct child nodes
len(root)

3

There are 3 child nodes. These might in turn have child nodes (grand-child nodes).

We can iterate over these child nodes. This only iterates over the first level.

In [13]:
# Iterating over an element yields its direct children only (first level)
for child in root:
    print(child)

<Element 'country' at 0x110b53650>
<Element 'country' at 0x110ef9d50>
<Element 'country' at 0x110ef9ee0>


The 3 child nodes attached to root are tagged "country". These are the names of the elements, called a "tag". The tag can be extracted directly as an attribute of the node: 

In [14]:
# Print the tag (element name) of each direct child of root
for child in root:
    print(child.tag)

country
country
country


Elements can contain attributes. These are stored in the "attrib" attribute of the node, which returns a dictionary.

In [15]:
# Print the attribute dictionary of each direct child of root
for child in root:
    print(child.attrib)

{'name': 'Liechtenstein'}
{'name': 'Singapore'}
{'name': 'Panama'}


In [16]:
# Confirm the Python type of each child's `attrib` container
for child in root:
    print(type(child.attrib))

<class 'dict'>
<class 'dict'>
<class 'dict'>


Each of the child nodes contains a dictionary of attributes

Individual nodes can be accessed using list indexing. For example, the third child attached to root is:

In [34]:
# Elements support list-style indexing: index 2 is the third child of root
child3 = root[2]
child3

<Element 'country' at 0x10ad74540>

This element has a tag and attributes as described above, but also contains a set of children nodes itself. 

In [35]:
# Number of direct children attached to the third country node
len(child3)

5

This particular child node contains 5 children nodes. 

In [36]:
# Print the tag of every child of the third country node.
# `child3` is used explicitly so the cell does not depend on a loop
# variable left over from an earlier cell.
for grandchild in child3:
    print(grandchild.tag)

rank
year
gdppc
neighbor
neighbor


These grandchildren nodes have different tags. 

In [37]:
# Print the attribute dictionary of every child of the third country node.
# `child3` is used explicitly so the cell does not depend on a loop
# variable left over from an earlier cell.
for grandchild in child3:
    print(grandchild.attrib)

{}
{}
{}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


Some have attributes and others don't. 

Data can also be stored as text between the tags in the XML file. This text can be pulled using the .text attribute. 

In [38]:
# Print the text stored between the tags of every child of the third
# country node. `child3` is used explicitly so the cell does not depend on a
# loop variable left over from an earlier cell.
for grandchild in child3:
    print(grandchild.text)

68
2011
13600
None
None


Some of the grandchildren nodes have associated text, and others don't.

We can use the .iter() method to pull elements with specific tags throughout the tree. For example, we can pull all elements tagged 'neighbor':

In [39]:
# .iter(tag) searches the whole subtree below root, not just direct children
for neighbor in root.iter('neighbor'):
    print(neighbor)

<Element 'neighbor' at 0x10ad74310>
<Element 'neighbor' at 0x10ad74360>
<Element 'neighbor' at 0x10ad744f0>
<Element 'neighbor' at 0x10ad74680>
<Element 'neighbor' at 0x10ad746d0>


We can also use the `.get()` method to pull a certain attribute from an element:

In [82]:
# Take the third child of root and display its attribute dictionary
element = root[2]
element.attrib

{'name': 'Panama'}

In [83]:
# Look up a single attribute by name
element.get('name')

'Panama'

Element attributes can also be returned as a list of tuples:

In [100]:
# Attributes as (name, value) pairs
element.items()

[('name', 'Panama')]

The keys of the attributes can also be returned:

In [101]:
# Attribute names only
element.keys()

['name']

# Apple Health Data

What is the structure of the Apple Health app export?

In [3]:
# Path to the Apple Health export, built portably from its components
file = os.path.join('data', 'apple_health_export', 'export.xml')
# Parse the whole export into memory
health = ET.parse(file)

# Root of the export; its length is the number of direct child elements
health_root = health.getroot()
len(health_root)

377926

There are a ton of entries in this XML file. What are the tags?

In [4]:
# One tag per direct child of the root, then the distinct tags among them
tags_all = [node.tag for node in health_root]
tags_unique = set(tags_all)
tags_unique

{'ExportDate', 'Me', 'Record', 'Workout'}

There are only 4 kinds of tags. How many elements does each of these tags have?

In [5]:
# Count the direct children of the root per tag. `tags_all` already holds one
# tag per child, so counting there avoids walking the entire tree
# (descendants included) once per tag and building a throwaway list each time.
for tag in tags_unique:
    print("{}: {}".format(tag, tags_all.count(tag)))

Workout: 111
Me: 1
Record: 377813
ExportDate: 1


So most of the data is stored in tags "Workout" and "Record"

What does the top of the data look like?

In [6]:
# Elements support slicing: look at the first ten children of the root
for child in health_root[:10]:
    print(child)

<Element 'ExportDate' at 0x10b3e11c0>
<Element 'Me' at 0x10b3e13a0>
<Element 'Record' at 0x10b3e1530>
<Element 'Record' at 0x10b3e1710>
<Element 'Record' at 0x10b3e18a0>
<Element 'Record' at 0x10b3e1a30>
<Element 'Record' at 0x10b3e1bc0>
<Element 'Record' at 0x10b3e1d50>
<Element 'Record' at 0x10b3e1ee0>
<Element 'Record' at 0x10b3e2070>


Let's look at one of these 'Record' elements

In [7]:
# Third child of the root (the first 'Record' in the listing above)
record = health_root[2]
record.attrib

{'type': 'HKQuantityTypeIdentifierHeight',
 'sourceName': 'Health',
 'sourceVersion': '10.0.1',
 'unit': 'ft',
 'creationDate': '2016-12-06 17:49:02 -0400',
 'startDate': '2016-12-06 17:48:00 -0400',
 'endDate': '2016-12-06 17:48:00 -0400',
 'value': '5.33333'}

Does the record have any text?

In [8]:
# repr() makes whitespace-only text visible instead of printing a blank line
print(repr(record.text))

'\n  '


No, only the whitespace between the tags.

There's a bunch of information about the type of record. I think the relevant thing here is the 'type' field of the attributes. 

So what are the different "types" of records that we have?

In [9]:
# Collect the 'type' attribute of every 'Record' child of the root.
# `record_types_all` must be built here: no other cell in the notebook
# defines it, so the cell would otherwise fail on a fresh kernel.
record_types_all = [node.get('type') for node in health_root if node.tag == 'Record']

# Distinct record types (set order is arbitrary) and how many there are
record_types_unique = list(set(record_types_all))
print(len(record_types_unique))

32


There are 32 unique record types. What are they?

In [10]:
# Display the distinct record types
record_types_unique

['HKQuantityTypeIdentifierWalkingStepLength',
 'HKQuantityTypeIdentifierBasalEnergyBurned',
 'HKQuantityTypeIdentifierActiveEnergyBurned',
 'HKQuantityTypeIdentifierHeadphoneAudioExposure',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKCategoryTypeIdentifierToothbrushingEvent',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierStepCount',
 'HKQuantityTypeIdentifierDistanceWalkingRunning',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKCategoryTypeIdentifierMindfulSession',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierWalkingSpeed',
 'HKQuantityTypeIdentifierWalkingDoubleSupportPercentage',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierFlightsClimbed',
 'HKQuantityTypeIdentifierWalkingAsymmetryPercentage',
 'HKQuantityTypeIdentifierAppleWalkingSteadiness',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKDataTypeSl

Notice how these type names also contain information. We have "Quantity", "Category", and "Data" types of records. 

There is a lot of information stored here. It seems like most of the information from Apple Health is stored in the records. 

How do I extract this data? Let's start by looking at height. 

In [11]:
# Attribute dictionaries of every root child whose 'type' marks a height record
height_records = [
    node.attrib
    for node in health_root
    if node.get('type') == 'HKQuantityTypeIdentifierHeight'
]
print(len(height_records))

1


We have a single record for height, which makes sense since I only entered my height once. 
What does this look like?

In [12]:
# Display the height record(s) collected above
height_records

[{'type': 'HKQuantityTypeIdentifierHeight',
  'sourceName': 'Health',
  'sourceVersion': '10.0.1',
  'unit': 'ft',
  'creationDate': '2016-12-06 17:49:02 -0400',
  'startDate': '2016-12-06 17:48:00 -0400',
  'endDate': '2016-12-06 17:48:00 -0400',
  'value': '5.33333'}]

The information we care about is probably the date, the unit, and the value.

What about weight? I think the relevant type is 'HKQuantityTypeIdentifierBodyMass'.

In [13]:
# Attribute dictionaries of every root child whose 'type' marks a body-mass record
weight_records = [
    node.attrib
    for node in health_root
    if node.get('type') == 'HKQuantityTypeIdentifierBodyMass'
]
print(len(weight_records))

2227


We've got a ton of data here. 

In [14]:
# Preview the first five weight records
weight_records[:5]

[{'type': 'HKQuantityTypeIdentifierBodyMass',
  'sourceName': 'Health',
  'sourceVersion': '10.0.1',
  'unit': 'lb',
  'creationDate': '2016-12-06 17:48:20 -0400',
  'startDate': '2016-12-06 17:48:20 -0400',
  'endDate': '2016-12-06 17:48:20 -0400',
  'value': '132.277'},
 {'type': 'HKQuantityTypeIdentifierBodyMass',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '18277',
  'unit': 'lb',
  'creationDate': '2016-12-06 18:48:55 -0400',
  'startDate': '2016-12-05 18:48:00 -0400',
  'endDate': '2016-12-05 18:48:00 -0400',
  'value': '121'},
 {'type': 'HKQuantityTypeIdentifierBodyMass',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '18277',
  'unit': 'lb',
  'creationDate': '2016-12-07 12:27:10 -0400',
  'startDate': '2016-12-06 12:27:00 -0400',
  'endDate': '2016-12-06 12:27:00 -0400',
  'value': '118.4'},
 {'type': 'HKQuantityTypeIdentifierBodyMass',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '18277',
  'unit': 'lb',
  'creationDate': '2016-12-08 12:51:13 -0400',
  'start

At this point I could extract this into a data frame. I can't do it with all records, but I can probably do it with all records of a certain type. 

In [15]:
# Distinct numbers of attributes found across the weight records
{len(entry) for entry in weight_records}

{8}

All of these weight records have the same length. 
I would need to reshape it to get it into data frame format though. Or convert each entry to a data frame and concatenate them. 

In [16]:
# Column names are taken from the first weight record
weight_keys = list(weight_records[0])
weight_keys

['type',
 'sourceName',
 'sourceVersion',
 'unit',
 'creationDate',
 'startDate',
 'endDate',
 'value']

In [17]:
# Pivot the list of row dictionaries into one list of values per column
weight_dict = {
    column: [entry[column] for entry in weight_records]
    for column in weight_keys
}

In [18]:
# Assemble the column-wise dictionary into a DataFrame.
# pandas is imported with the other packages at the top of the notebook.
df_weight = pd.DataFrame(weight_dict)

# Write the weight table to CSV without the row index
outfile = 'apple_health_weight.csv'
df_weight.to_csv(outfile, index = False)

There you go. I've parsed the weight data from Apple Health and extracted it to CSV for further analysis. 

What else do we have in these records?

In [29]:
# Alphabetical listing of the record types (does not modify the list itself)
sorted(record_types_unique)

['HKCategoryTypeIdentifierMindfulSession',
 'HKCategoryTypeIdentifierSleepAnalysis',
 'HKCategoryTypeIdentifierToothbrushingEvent',
 'HKDataTypeSleepDurationGoal',
 'HKQuantityTypeIdentifierActiveEnergyBurned',
 'HKQuantityTypeIdentifierAppleWalkingSteadiness',
 'HKQuantityTypeIdentifierBasalEnergyBurned',
 'HKQuantityTypeIdentifierBodyMass',
 'HKQuantityTypeIdentifierDietaryCalcium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKQuantityTypeIdentifierDietaryCholesterol',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierDietaryFiber',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierDietaryPotassium',
 'HKQuantityTypeIdentifierDietaryProtein',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKQuantityTy

Lots of these contain "Dietary" information.

In [116]:
# Keep only the record types that mention 'Dietary'. Sorting makes the order
# reproducible: `record_types_unique` is built from a set, whose iteration
# order is arbitrary.
dietary_types = sorted(
    record_type for record_type in record_types_unique if 'Dietary' in record_type
)
dietary_types

['HKQuantityTypeIdentifierDietaryCalcium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKQuantityTypeIdentifierDietaryCholesterol',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierDietaryFiber',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierDietaryPotassium',
 'HKQuantityTypeIdentifierDietaryProtein',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKQuantityTypeIdentifierDietaryVitaminC']

We've got all of this nutritional information. 
Is there total calories somewhere? I think it's the "EnergyConsumed" entry.

In [30]:
# Attribute dictionaries of every root child whose 'type' marks consumed energy
caloric_records = [
    node.attrib
    for node in health_root
    if node.get('type') == 'HKQuantityTypeIdentifierDietaryEnergyConsumed'
]
print(len(caloric_records))

3301


In [33]:
# Show the last five caloric records. A negative slice start also behaves
# correctly when fewer than five records exist, whereas `len(...) - 5` would
# go negative and silently drop leading records.
caloric_records[-5:]

[{'type': 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '52161',
  'unit': 'Cal',
  'creationDate': '2024-08-28 19:58:33 -0400',
  'startDate': '2024-08-28 11:36:00 -0400',
  'endDate': '2024-08-28 11:36:00 -0400',
  'value': '620'},
 {'type': 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '52161',
  'unit': 'Cal',
  'creationDate': '2024-08-29 13:09:46 -0400',
  'startDate': '2024-08-29 13:09:00 -0400',
  'endDate': '2024-08-29 13:09:00 -0400',
  'value': '525.02'},
 {'type': 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '52161',
  'unit': 'Cal',
  'creationDate': '2024-08-29 13:09:49 -0400',
  'startDate': '2024-08-29 13:09:00 -0400',
  'endDate': '2024-08-29 13:09:00 -0400',
  'value': '525.6'},
 {'type': 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
  'sourceName': 'MyFitnessPal',
  'sourceVersion': '52161',
  'unit': 'C

Notice how there are multiple entries on the same day. Taking a look at MyFitnessPal, it looks like these entries correspond to specific meal categories. 
Am I missing information here?

In [52]:
# Positions (among the root's children) of every consumed-energy record
caloric_indices = [
    position
    for position, node in enumerate(health_root)
    if node.tag == 'Record'
    and node.get('type') == 'HKQuantityTypeIdentifierDietaryEnergyConsumed'
]

caloric_indices[0]

243840

In [55]:
# Fetch the element itself (not just its attributes) for the first caloric record
caloric_element = health_root[caloric_indices[0]]
caloric_element.attrib

{'type': 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'sourceName': 'MyFitnessPal',
 'sourceVersion': '18277',
 'unit': 'Cal',
 'creationDate': '2016-12-06 18:54:42 -0400',
 'startDate': '2016-12-05 18:54:00 -0400',
 'endDate': '2016-12-05 18:54:00 -0400',
 'value': '585'}

In [56]:
# Number of child elements nested under this caloric record
len(caloric_element)

2

In [57]:
# Show the child elements of the caloric record
for node in caloric_element:
    print(node)

<Element 'MetadataEntry' at 0x135cfc180>
<Element 'MetadataEntry' at 0x135cfc1d0>


So these caloric entries have child nodes with tag 'MetadataEntry'. What do these contain?

In [58]:
# Show the attributes carried by each child of the caloric record
for node in caloric_element:
    print(node.attrib)

{'key': 'meal', 'value': 'Breakfast'}
{'key': 'Meal', 'value': 'Breakfast'}


Perfect. They contain the meal information. I don't know why there are two entries though?

In [61]:
# Check whether the child elements carry any text between their tags
for node in caloric_element:
    print(node.text)

None
None


No associated text.

I can collate all of this into one dictionary.

In [63]:
# First caloric record as collected so far (attributes only)
caloric_records[0]

{'type': 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'sourceName': 'MyFitnessPal',
 'sourceVersion': '18277',
 'unit': 'Cal',
 'creationDate': '2016-12-06 18:54:42 -0400',
 'startDate': '2016-12-05 18:54:00 -0400',
 'endDate': '2016-12-05 18:54:00 -0400',
 'value': '585'}

In [92]:
# Rebuild the caloric records, this time folding the first child's
# 'key'/'value' pair (the meal label seen above) into a copy of each
# record's attribute dictionary.
caloric_records = []
for record_node in health_root.iter('Record'):
    if record_node.get('type') != 'HKQuantityTypeIdentifierDietaryEnergyConsumed':
        continue
    merged = dict(record_node.attrib)
    first_entry = record_node[0]
    merged[first_entry.get('key')] = first_entry.get('value')
    caloric_records.append(merged)

Make sure all records have the same length:

In [96]:
# Distinct numbers of entries found across the rebuilt caloric records
{len(entry) for entry in caloric_records}

{9}

Now we need to reshape this.

In [98]:
# Column names come from the first record
caloric_keys = list(caloric_records[0])

# Pivot the list of row dictionaries into one list of values per column
caloric_dict = {
    column: [entry[column] for entry in caloric_records]
    for column in caloric_keys
}

In [102]:
# Build a DataFrame from the column-wise dictionary and preview the first rows
df_calories = pd.DataFrame(caloric_dict)
df_calories.head()

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,meal
0,HKQuantityTypeIdentifierDietaryEnergyConsumed,MyFitnessPal,18277,Cal,2016-12-06 18:54:42 -0400,2016-12-05 18:54:00 -0400,2016-12-05 18:54:00 -0400,585.0,Breakfast
1,HKQuantityTypeIdentifierDietaryEnergyConsumed,MyFitnessPal,18277,Cal,2016-12-06 18:54:49 -0400,2016-12-05 18:54:00 -0400,2016-12-05 18:54:00 -0400,260.0,Snacks
2,HKQuantityTypeIdentifierDietaryEnergyConsumed,MyFitnessPal,18277,Cal,2016-12-06 18:55:03 -0400,2016-12-05 18:55:00 -0400,2016-12-05 18:55:00 -0400,680.0,Lunch
3,HKQuantityTypeIdentifierDietaryEnergyConsumed,MyFitnessPal,18277,Cal,2016-12-06 18:56:42 -0400,2016-12-06 18:56:00 -0400,2016-12-06 18:56:00 -0400,902.0,Breakfast
4,HKQuantityTypeIdentifierDietaryEnergyConsumed,MyFitnessPal,18277,Cal,2016-12-06 18:57:22 -0400,2016-12-06 18:56:00 -0400,2016-12-06 18:56:00 -0400,690.878,Dinner


In [104]:
# Save the caloric table to CSV in the working directory, without the row index
outfile = 'apple_health_calories.csv'
df_calories.to_csv(outfile, index = False)

Okay that's great. Then I should have caloric entries for each meal. I should be able to join additional information on date. 

What does the other nutritional information look like? Is it also broken down by meal like this?

In [117]:
# Redisplay the dietary record types for reference
dietary_types

['HKQuantityTypeIdentifierDietaryCalcium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKQuantityTypeIdentifierDietaryCholesterol',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierDietaryFiber',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierDietaryPotassium',
 'HKQuantityTypeIdentifierDietaryProtein',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKQuantityTypeIdentifierDietaryVitaminC']

Let's take a look at protein. 

In [107]:
# Scan the root's children in order and stop at the first protein record
for position, node in enumerate(health_root):
    if node.tag != 'Record':
        continue
    if node.get('type') == 'HKQuantityTypeIdentifierDietaryProtein':
        first_protein_index = position
        break

first_protein_index

247141

In [109]:
# Fetch the first protein record and display its attributes
protein_element = health_root[first_protein_index]
protein_element.attrib

{'type': 'HKQuantityTypeIdentifierDietaryProtein',
 'sourceName': 'MyFitnessPal',
 'sourceVersion': '18277',
 'unit': 'g',
 'creationDate': '2016-12-06 18:54:42 -0400',
 'startDate': '2016-12-05 18:54:00 -0400',
 'endDate': '2016-12-05 18:54:00 -0400',
 'value': '22.1'}

Okay the value looks small enough that this might be a meal.

In [110]:
# Number of child elements nested under this protein record
len(protein_element)

2

In [111]:
# Tags of the protein record's child elements
for element in protein_element:
    print(element.tag)

MetadataEntry
MetadataEntry


Getting similar MetadataEntry tags.

In [112]:
# Attributes of the protein record's child elements
for element in protein_element:
    print(element.attrib)

{'key': 'meal', 'value': 'Breakfast'}
{'key': 'Meal', 'value': 'Breakfast'}


Great we also get meal information.

I wonder, is this true for all of these dietary information entries? My guess is that it is for everything that is pulled from MyFitnessPal.

In [118]:
# Redisplay the dietary record types for reference
dietary_types

['HKQuantityTypeIdentifierDietaryCalcium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKQuantityTypeIdentifierDietaryCholesterol',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierDietaryFiber',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierDietaryPotassium',
 'HKQuantityTypeIdentifierDietaryProtein',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKQuantityTypeIdentifierDietaryVitaminC']

In [124]:
# One entry per dietary record type, each initialised to None
dietary_dict = dict.fromkeys(dietary_types)
dietary_dict

{'HKQuantityTypeIdentifierDietaryCalcium': None,
 'HKQuantityTypeIdentifierDietaryCarbohydrates': None,
 'HKQuantityTypeIdentifierDietaryCholesterol': None,
 'HKQuantityTypeIdentifierDietaryEnergyConsumed': None,
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated': None,
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated': None,
 'HKQuantityTypeIdentifierDietaryFatSaturated': None,
 'HKQuantityTypeIdentifierDietaryFatTotal': None,
 'HKQuantityTypeIdentifierDietaryFiber': None,
 'HKQuantityTypeIdentifierDietaryIron': None,
 'HKQuantityTypeIdentifierDietaryPotassium': None,
 'HKQuantityTypeIdentifierDietaryProtein': None,
 'HKQuantityTypeIdentifierDietarySodium': None,
 'HKQuantityTypeIdentifierDietarySugar': None,
 'HKQuantityTypeIdentifierDietaryVitaminC': None}

In [135]:
# For every dietary record type, gather the distinct child-element counts of
# its records in a single pass over the tree. Each record is filed under its
# own 'type' value, so the result no longer depends on a `dietary_type`
# variable that this cell never defines.
child_counts = {record_type: set() for record_type in dietary_dict}
for element in health_root.iter('Record'):
    counts = child_counts.get(element.get('type'))
    if counts is not None:
        counts.add(len(element))

# Store the distinct counts as a sorted list per record type
for record_type, counts in child_counts.items():
    dietary_dict[record_type] = sorted(counts)

dietary_dict

{'HKQuantityTypeIdentifierDietaryCalcium': [2],
 'HKQuantityTypeIdentifierDietaryCarbohydrates': [2],
 'HKQuantityTypeIdentifierDietaryCholesterol': [2],
 'HKQuantityTypeIdentifierDietaryEnergyConsumed': [2],
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated': [2],
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated': [2],
 'HKQuantityTypeIdentifierDietaryFatSaturated': [2],
 'HKQuantityTypeIdentifierDietaryFatTotal': [2],
 'HKQuantityTypeIdentifierDietaryFiber': [2],
 'HKQuantityTypeIdentifierDietaryIron': [2],
 'HKQuantityTypeIdentifierDietaryPotassium': [2],
 'HKQuantityTypeIdentifierDietaryProtein': [2],
 'HKQuantityTypeIdentifierDietarySodium': [2],
 'HKQuantityTypeIdentifierDietarySugar': [2],
 'HKQuantityTypeIdentifierDietaryVitaminC': [2]}

All of these different dietary elements have 2 sub-elements. I'm betting those are probably meal information from MyFitnessPal. 

What's the next move then? Extract all dietary information as a set of CSVs. I basically just need to loop over what I did for the calories. 

In [136]:
# Redisplay the dietary record types for reference
dietary_types

['HKQuantityTypeIdentifierDietaryCalcium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKQuantityTypeIdentifierDietaryCholesterol',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierDietaryFiber',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierDietaryPotassium',
 'HKQuantityTypeIdentifierDietaryProtein',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKQuantityTypeIdentifierDietaryVitaminC']

In [153]:
def collect_records_by_type(root, record_types):
    """Group the attribute dictionaries of 'Record' elements by their type.

    The tree below `root` is walked once. For every 'Record' element whose
    'type' attribute is one of `record_types`, a copy of its attributes is
    stored. When the element has child elements, the first child's
    'key'/'value' attributes are added as one extra entry (for the dietary
    records explored above this is the meal label).

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element whose subtree is searched for 'Record' elements.
    record_types : iterable of str
        Values of the 'type' attribute to collect.

    Returns
    -------
    dict
        Maps each requested type to a list of dictionaries, one per matching
        record in document order. Types without records map to an empty list.
    """
    grouped = {record_type: [] for record_type in record_types}
    for element in root.iter('Record'):
        bucket = grouped.get(element.get('type'))
        if bucket is None:
            # Not one of the requested types
            continue
        element_dict = element.attrib.copy()
        # Guard against records without children instead of raising IndexError
        if len(element) > 0:
            element_dict[element[0].get('key')] = element[0].get('value')
        bucket.append(element_dict)
    return grouped


# One pass over the export instead of one pass per dietary type
dietary_records = collect_records_by_type(health_root, dietary_types)

for dietary_type in dietary_types:
    records = dietary_records[dietary_type]

    # Skip types with no records rather than failing on an empty list
    if not records:
        continue

    dietary_type_clean = dietary_type.replace('HKQuantityTypeIdentifier', '')
    outfile = 'AppleHealth_{}.csv'.format(dietary_type_clean)

    # A list of row dictionaries converts directly into a DataFrame
    pd.DataFrame(records).to_csv(outfile, index = False)

That should do it. Hopefully anyway.

In [154]:
# Alphabetical listing of the record types, to pick the next one to explore
sorted(record_types_unique)

['HKCategoryTypeIdentifierMindfulSession',
 'HKCategoryTypeIdentifierSleepAnalysis',
 'HKCategoryTypeIdentifierToothbrushingEvent',
 'HKDataTypeSleepDurationGoal',
 'HKQuantityTypeIdentifierActiveEnergyBurned',
 'HKQuantityTypeIdentifierAppleWalkingSteadiness',
 'HKQuantityTypeIdentifierBasalEnergyBurned',
 'HKQuantityTypeIdentifierBodyMass',
 'HKQuantityTypeIdentifierDietaryCalcium',
 'HKQuantityTypeIdentifierDietaryCarbohydrates',
 'HKQuantityTypeIdentifierDietaryCholesterol',
 'HKQuantityTypeIdentifierDietaryEnergyConsumed',
 'HKQuantityTypeIdentifierDietaryFatMonounsaturated',
 'HKQuantityTypeIdentifierDietaryFatPolyunsaturated',
 'HKQuantityTypeIdentifierDietaryFatSaturated',
 'HKQuantityTypeIdentifierDietaryFatTotal',
 'HKQuantityTypeIdentifierDietaryFiber',
 'HKQuantityTypeIdentifierDietaryIron',
 'HKQuantityTypeIdentifierDietaryPotassium',
 'HKQuantityTypeIdentifierDietaryProtein',
 'HKQuantityTypeIdentifierDietarySodium',
 'HKQuantityTypeIdentifierDietarySugar',
 'HKQuantityTy

In [157]:
# Positions (among the root's children) of every sleep-analysis record
sleep_indices = [
    position
    for position, node in enumerate(health_root)
    if node.tag == 'Record'
    and node.get('type') == 'HKCategoryTypeIdentifierSleepAnalysis'
]

len(sleep_indices)

2785

In [162]:
# Fetch the last sleep-analysis record and display its attributes
last_sleep_index = sleep_indices[-1]
sleep_element = health_root[last_sleep_index]
sleep_element.attrib

{'type': 'HKCategoryTypeIdentifierSleepAnalysis',
 'sourceName': 'Antoine’s iPhone',
 'sourceVersion': '17.5.1',
 'creationDate': '2024-08-15 08:00:02 -0400',
 'startDate': '2024-08-14 23:00:00 -0400',
 'endDate': '2024-08-15 08:00:02 -0400',
 'value': 'HKCategoryValueSleepAnalysisInBed'}

In [163]:
# Number of child elements nested under this sleep record
len(sleep_element)

1

In [165]:
# Tags of the sleep record's child elements
for element in sleep_element:
    print(element.tag)

MetadataEntry


In [166]:
# Attributes of the sleep record's child elements
for element in sleep_element:
    print(element.attrib)

{'key': 'HKTimeZone', 'value': 'America/Toronto'}


# Workout elements

In [None]:
# .find() returns the first direct child tagged 'Workout'; show its attributes
element_test = health_root.find('Workout')
element_test.attrib

In [18]:
# Read a single attribute ('duration') from the Workout element
element_test.get('duration')

'6.833333333333333'