In [2]:
# Initialize
import pandas as pd
import xml.etree.ElementTree as ET
import csv
from datetime import datetime, timedelta

# Location of the Apple Health export (XML)
xml_file_path = 'Data/Apple health/exportacion.xml'

# Parse the whole export and keep a handle on its root element
tree = ET.parse(xml_file_path)
root = tree.getroot()

# Calendar day of every record whose source is MyFitnessPal
mfp_days = [
    datetime.strptime(rec.attrib['startDate'], '%Y-%m-%d %H:%M:%S %z').date()
    for rec in root.findall('.//Record')
    if rec.attrib.get('sourceName') == 'MyFitnessPal'
]

# First and last logged day; both stay None when no MyFitnessPal record exists
min_date = min(mfp_days, default=None)
max_date = max(mfp_days, default=None)

# Translate the HealthKit dietary identifiers found in the export into short,
# human-readable nutrient names. Every identifier shares the same prefix, so
# only the distinguishing suffix is listed.
_HK_DIETARY_PREFIX = 'HKQuantityTypeIdentifierDietary'

food_mapping = {
    _HK_DIETARY_PREFIX + suffix: label
    for suffix, label in (
        ('Calcium', 'calcium'),
        ('Carbohydrates', 'carbs'),
        ('Cholesterol', 'cholesterol'),
        ('EnergyConsumed', 'calories'),
        ('FatMonounsaturated', 'monounsaturated fat'),
        ('FatPolyunsaturated', 'polyunsaturated fat'),
        ('FatSaturated', 'saturated fat'),
        ('FatTotal', 'fat'),
        ('Fiber', 'fiber'),
        ('Iron', 'iron'),
        ('Potassium', 'potassium'),
        ('Protein', 'protein'),
        ('Sodium', 'sodium'),
        ('Sugar', 'sugar'),
        ('VitaminC', 'vitamin_c'),
    )
}

# Get food data parsing MyFitnessPal records from the xml
data = []

for record in root.findall('.//Record'):
    if record.attrib.get('sourceName') != 'MyFitnessPal':
        continue

    # Parse the timestamp once and derive both the date and the time from it
    started_at = datetime.strptime(record.attrib['startDate'], '%Y-%m-%d %H:%M:%S %z')
    start_date = started_at.date()

    if min_date <= start_date <= max_date:
        data.append({
            'date': start_date,
            'time': started_at.time(),
            # Record types missing from food_mapping are grouped under 'Unknown'
            'type': food_mapping.get(record.attrib.get('type'), 'Unknown'),
            'food': record.attrib.get('value'),
            'unit': record.attrib.get('unit'),
        })

# Create a DataFrame from the data
df = pd.DataFrame(data)

# XML attributes are text. Summing text concatenates it ('1.5' + '2.5' gives
# '1.52.5'), so convert to numbers before aggregating. Values that cannot be
# parsed become NaN.
df['food'] = pd.to_numeric(df['food'], errors='coerce')

# Group by date, time and type, adding up entries logged at the same moment.
# min_count=1 leaves a group as NaN when none of its values could be parsed,
# instead of reporting a misleading 0.
df = df.groupby(['date', 'time', 'type'])['food'].sum(min_count=1).unstack().reset_index()

df.head()

type,date,time,Unknown,calcium,calories,carbs,cholesterol,fat,fiber,iron,monounsaturated fat,polyunsaturated fat,potassium,protein,saturated fat,sodium,sugar,vitamin_c
0,2024-03-16,10:51:00,,11.9403,40.2428,7.28118,0.0,0.0,0.597015,0.214925,0.0,0.0,375.0,0.599415,0.0,14.9254,6.56717,28.6567
1,2024-03-16,16:37:00,,128.044,1095.36,68.8976,304.374,67.5813,3.90055,5.2222,11.3821,5.12652,595.397,56.1978,25.794,1983.91,9.23707,3.776
2,2024-03-16,17:13:00,,0.0,192.0,12.0,0.0,1.65,0.65,0.0,0.0,0.0,0.0,32.0,0.55,300.177,0.65,0.0
3,2024-03-16,23:13:00,,22.4,1104.4,118.2,50.0,51.64,3.2,0.7704,3.112,1.136,306.0,32.388,2.156,360.0,5.0,0.0
4,2024-03-17,13:45:00,,0.0,200.0,25.9,0.0,7.4,0.0,0.0,0.0,0.0,0.2,5.9,0.0,0.1,0.0,0.0


In [5]:
# Handle situations in which a value has two decimal dots. In that case, keep
# only the text up to (not including) the second dot.
# NOTE(review): a value with two decimal dots looks like two numeric strings
# joined together, so truncating discards part of the data. Verify that the
# aggregation producing `df` yields real numbers.
def _keep_first_decimal(value):
    """Return `value` cut before its second '.' when it is a string with more than one dot.

    Anything else (None, NaN, numbers, strings with at most one dot) is
    returned unchanged.
    """
    if isinstance(value, str) and value.count('.') > 1:
        return '.'.join(value.split('.', 2)[:2])
    return value


# Select the nutrient columns by name rather than by position. A positional
# slice silently skips a real nutrient column whenever the optional 'Unknown'
# column is absent from the pivoted table.
nutrient_columns = [col for col in df.columns if col not in ('date', 'time')]

for col in nutrient_columns:
    df[col] = df[col].apply(_keep_first_decimal)



In [None]:

# Persist the nutrition table as CSV, leaving out the row index
food_csv_path = 'Data/Cleaned/Food.csv'
df.to_csv(food_csv_path, index=False)
print("Food CSV file has been created.")

### Weight data

# Translate the HealthKit body-measurement identifiers (matched against the
# VeSync records below) into short, human-readable column names. Every
# identifier shares the same prefix, so only the distinguishing suffix is listed.
_HK_QUANTITY_PREFIX = 'HKQuantityTypeIdentifier'

weight_mapping = {
    _HK_QUANTITY_PREFIX + suffix: label
    for suffix, label in (
        ('BodyFatPercentage', 'fat_percentage'),
        ('LeanBodyMass', 'lean_body_mass'),
        ('BodyMassIndex', 'BMI'),
        ('BodyMass', 'weight'),
    )
}

# Collect the readings whose source is VeSync, restricted to the same date
# window as the food log (min_date..max_date, both inclusive)
data = []

for record in root.findall('.//Record'):
    attrs = record.attrib
    if attrs.get('sourceName') != 'VeSync':
        continue

    start_date = datetime.strptime(attrs['startDate'], '%Y-%m-%d %H:%M:%S %z').date()
    if not (min_date <= start_date <= max_date):
        continue

    data.append({
        'date': start_date,
        # Record types missing from weight_mapping are grouped under 'Unknown'
        'type': weight_mapping.get(attrs.get('type'), 'Unknown'),
        'value': attrs.get('value')
    })

# Build the table: numeric values (unparsable ones become NaN), averaged per
# day and measurement type, then pivoted so each type gets its own column
df = (
    pd.DataFrame(data)
    .assign(value=lambda frame: pd.to_numeric(frame['value'], errors='coerce'))
    .groupby(['date', 'type'])['value']
    .mean()
    .unstack()
    .reset_index()
)

# Drop the leftover 'type' label on the column axis
df.columns.name = None

# Persist the result as CSV, leaving out the row index
df.to_csv('Data/Cleaned/Weight.csv', index=False)
print("Weight CSV file has been created.")