# Data-Collecting

I will read the data from the XML & JSON files and process them accordingly. Then finally I will save them in final .csv files.


# Types of Data

*   Apple Health Data (Step Counts, Flights Climbed, Walking/Running Distance)
*   Period Tracker App (Dates of menstrual, follicular, luteal, and ovulation phases)

In [14]:
import os
import pandas as pd
import json
import csv
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from IPython.display import display
from tabulate import tabulate

## Apple Health

This data holds my daily step count, flights climbed, walking/running distance information.

In [33]:
xml_file_path = "/content/Raw_Data/export.xml"

start_date_filter = datetime.strptime("2021-05-07", "%Y-%m-%d")

all_records = []

tree = ET.parse(xml_file_path)
root = tree.getroot()

for record in root.findall("Record"):
    record_type = record.get("type", "Unknown").replace("HKQuantityTypeIdentifier", "")
    start_date = record.get("startDate", "Unknown")
    value = record.get("value", "Unknown")

    try:
        record_date = datetime.strptime(start_date.split(" ")[0], "%Y-%m-%d")
        if record_date >= start_date_filter and record_type in {
            "StepCount",
            "DistanceWalkingRunning",
            "FlightsClimbed"
        }:
            value = float(value)
            all_records.append({
                "date": record_date.strftime("%Y-%m-%d"),
                "type": record_type,
                "value": value
            })
    except ValueError:
        continue

df = pd.DataFrame(all_records)

df = df.groupby(["date", "type"], as_index=False).agg({"value": "sum"})

all_dates = pd.date_range(start=df["date"].min(), end=df["date"].max())
all_types = ["StepCount", "FlightsClimbed", "DistanceWalkingRunning"]

full_grid = pd.DataFrame([(d.strftime("%Y-%m-%d"), t) for d in all_dates for t in all_types], columns=["date", "type"])

df = pd.merge(full_grid, df, on=["date", "type"], how="left").fillna(0)

output_dir = "/content/Output"
output_file = os.path.join(output_dir, "apple_health_combined_data.csv")

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the data to a CSV file
df.to_csv(output_file, index=False, columns=["date", "type", "value"])
print(f"Final data has been saved to {output_file}")

display(df.head(10))
print("\n...")
display(df.tail(10))

Final data has been saved to /content/Output/apple_health_combined_data.csv


Unnamed: 0,date,type,value
0,2021-05-07,StepCount,2027.0
1,2021-05-07,FlightsClimbed,2.0
2,2021-05-07,DistanceWalkingRunning,1.49692
3,2021-05-08,StepCount,30.0
4,2021-05-08,FlightsClimbed,0.0
5,2021-05-08,DistanceWalkingRunning,0.01674
6,2021-05-09,StepCount,126.0
7,2021-05-09,FlightsClimbed,0.0
8,2021-05-09,DistanceWalkingRunning,0.08299
9,2021-05-10,StepCount,858.0



...


Unnamed: 0,date,type,value
3971,2024-12-20,DistanceWalkingRunning,2.704406
3972,2024-12-21,StepCount,333.0
3973,2024-12-21,FlightsClimbed,2.0
3974,2024-12-21,DistanceWalkingRunning,0.23393
3975,2024-12-22,StepCount,1429.0
3976,2024-12-22,FlightsClimbed,4.0
3977,2024-12-22,DistanceWalkingRunning,0.97289
3978,2024-12-23,StepCount,2284.0
3979,2024-12-23,FlightsClimbed,4.0
3980,2024-12-23,DistanceWalkingRunning,1.63034


### Step Counts & Flights Climbed & Walking/Running Distance

In [35]:
xml_file_path = "/content/Raw_Data/export.xml"

start_date_filter = datetime.strptime("2021-05-07", "%Y-%m-%d")

data_by_type = {
    "HKQuantityTypeIdentifierStepCount": "Step Count",
    "HKQuantityTypeIdentifierFlightsClimbed": "Flights Climbed",
    "HKQuantityTypeIdentifierDistanceWalkingRunning": "Walking/Running Distance"
}

tree = ET.parse(xml_file_path)
root = tree.getroot()

categorized_data = {friendly_name: [] for friendly_name in data_by_type.values()}

for record in root.findall("Record"):
    record_type = record.get("type", "Unknown")
    start_date = record.get("startDate", "Unknown")
    value = record.get("value", "Unknown")

    try:
        record_date = datetime.strptime(start_date.split(" ")[0], "%Y-%m-%d")
        if record_date >= start_date_filter and record_type in data_by_type:
            friendly_name = data_by_type[record_type]
            categorized_data[friendly_name].append({
                "date": record_date.strftime("%Y-%m-%d"),
                "value": float(value)
            })
    except ValueError:
        continue

output_dir = "/content/Output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

all_dates = pd.date_range(start=start_date_filter, end=datetime.now()).strftime("%Y-%m-%d")

file_order = [
    ("Step Count", "stepcount_data.csv"),
    ("Flights Climbed", "flightsclimbed_data.csv"),
    ("Walking/Running Distance", "distancewalkingrunning_data.csv")
]

for friendly_name, file_name in file_order:
    records = categorized_data[friendly_name]
    df = pd.DataFrame(records)

    df = df.groupby("date", as_index=False).agg({"value": "sum"})

    full_df = pd.DataFrame({"date": all_dates})
    full_df = pd.merge(full_df, df, on="date", how="left").fillna(0)

    # Save to CSV
    output_file = os.path.join(output_dir, file_name)
    full_df.to_csv(output_file, index=False)
    print(f"{friendly_name} data has been saved to {output_file}")

    display(df.head(10))
    print("\n...")
    display(df.tail(10))
    print('')

Step Count data has been saved to /content/Output/stepcount_data.csv


Unnamed: 0,date,value
0,2021-05-07,2027.0
1,2021-05-08,30.0
2,2021-05-09,126.0
3,2021-05-10,858.0
4,2021-05-11,5083.0
5,2021-05-12,1112.0
6,2021-05-13,1158.0
7,2021-05-14,1633.0
8,2021-05-15,266.0
9,2021-05-16,1605.0



...


Unnamed: 0,date,value
1311,2024-12-14,1269.0
1312,2024-12-15,1620.0
1313,2024-12-16,6279.0
1314,2024-12-17,2911.0
1315,2024-12-18,689.0
1316,2024-12-19,4908.0
1317,2024-12-20,3622.0
1318,2024-12-21,333.0
1319,2024-12-22,1429.0
1320,2024-12-23,2284.0



Flights Climbed data has been saved to /content/Output/flightsclimbed_data.csv


Unnamed: 0,date,value
0,2021-05-07,2.0
1,2021-05-10,3.0
2,2021-05-11,6.0
3,2021-05-12,9.0
4,2021-05-13,6.0
5,2021-05-14,5.0
6,2021-05-15,4.0
7,2021-05-16,2.0
8,2021-05-17,5.0
9,2021-05-18,2.0



...


Unnamed: 0,date,value
1182,2024-12-14,7.0
1183,2024-12-15,5.0
1184,2024-12-16,10.0
1185,2024-12-17,5.0
1186,2024-12-18,1.0
1187,2024-12-19,9.0
1188,2024-12-20,8.0
1189,2024-12-21,2.0
1190,2024-12-22,4.0
1191,2024-12-23,4.0



Walking/Running Distance data has been saved to /content/Output/distancewalkingrunning_data.csv


Unnamed: 0,date,value
0,2021-05-07,1.49692
1,2021-05-08,0.01674
2,2021-05-09,0.08299
3,2021-05-10,0.68904
4,2021-05-11,3.85689
5,2021-05-12,0.79825
6,2021-05-13,0.7972
7,2021-05-14,1.21137
8,2021-05-15,0.19182
9,2021-05-16,1.15714



...


Unnamed: 0,date,value
1311,2024-12-14,0.856154
1312,2024-12-15,1.09142
1313,2024-12-16,4.50687
1314,2024-12-17,2.18232
1315,2024-12-18,0.51544
1316,2024-12-19,3.49344
1317,2024-12-20,2.704406
1318,2024-12-21,0.23393
1319,2024-12-22,0.97289
1320,2024-12-23,1.63034





In [36]:
flights_file = '/content/Output/flightsclimbed_data.csv'
steps_file = '/content/Output/stepcount_data.csv'
distance_file = '/content/Output/distancewalkingrunning_data.csv'

distance_df = pd.read_csv(distance_file)
flights_df = pd.read_csv(flights_file)
steps_df = pd.read_csv(steps_file)

distance_df.rename(columns={"value": "Distance (km)"}, inplace=True)
flights_df.rename(columns={"value": "Flights Climbed"}, inplace=True)
steps_df.rename(columns={"value": "Step Count"}, inplace=True)

merged_df = steps_df.merge(flights_df, on="date", how="outer").merge(distance_df, on="date", how="outer")

merged_df.fillna(0, inplace=True)

merged_df.sort_values(by="date", inplace=True)

display(merged_df.head(10))
print("\n...")
display(merged_df.tail(10))

Unnamed: 0,date,Step Count,Flights Climbed,Distance (km)
0,2021-05-07,2027.0,2.0,1.49692
1,2021-05-08,30.0,0.0,0.01674
2,2021-05-09,126.0,0.0,0.08299
3,2021-05-10,858.0,3.0,0.68904
4,2021-05-11,5083.0,6.0,3.85689
5,2021-05-12,1112.0,9.0,0.79825
6,2021-05-13,1158.0,6.0,0.7972
7,2021-05-14,1633.0,5.0,1.21137
8,2021-05-15,266.0,4.0,0.19182
9,2021-05-16,1605.0,2.0,1.15714



...


Unnamed: 0,date,Step Count,Flights Climbed,Distance (km)
1319,2024-12-16,6279.0,10.0,4.50687
1320,2024-12-17,2911.0,5.0,2.18232
1321,2024-12-18,689.0,1.0,0.51544
1322,2024-12-19,4908.0,9.0,3.49344
1323,2024-12-20,3622.0,8.0,2.704406
1324,2024-12-21,333.0,2.0,0.23393
1325,2024-12-22,1429.0,4.0,0.97289
1326,2024-12-23,2284.0,4.0,1.63034
1327,2024-12-24,0.0,0.0,0.0
1328,2024-12-25,0.0,0.0,0.0


## Period Tracker App (Clue)

This data holds my dates of menstrual, follicular, luteal, and ovulation phases.

### Period Dates

Extracts and processes the data from the period tracker JSON file, filtering for period entries and saving it as a CSV.

In [37]:
json_file_path = "/content/Raw_Data/measurements.json"

csv_file_path = os.path.join("/content/Output", "all_dates_with_period_flags.csv")

with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

period_dates = [datetime.strptime(item["date"], "%Y-%m-%d") for item in data if item["type"] == "period"]
period_dates = sorted(period_dates)

start_date = period_dates[0]
end_date = period_dates[-1]
all_dates = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

all_dates_with_flags = [
    {
        "date": current_date.strftime("%Y-%m-%d"),
        "is_period": 1 if current_date in period_dates else 0
    }
    for current_date in all_dates
]

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["date", "is_period"])
    for item in all_dates_with_flags:
        writer.writerow([item["date"], item["is_period"]])

print(f"CSV file with all dates and period flags has been successfully saved to {csv_file_path}.")

df = pd.read_csv(csv_file_path)

print("\n")
display(df.head(10))
print("\n...")
display(df.tail(10))

CSV file with all dates and period flags has been successfully saved to /content/Output/all_dates_with_period_flags.csv.




Unnamed: 0,date,is_period
0,2021-05-07,1
1,2021-05-08,1
2,2021-05-09,1
3,2021-05-10,1
4,2021-05-11,0
5,2021-05-12,0
6,2021-05-13,0
7,2021-05-14,0
8,2021-05-15,0
9,2021-05-16,0



...


Unnamed: 0,date,is_period
1300,2024-11-27,0
1301,2024-11-28,0
1302,2024-11-29,0
1303,2024-11-30,0
1304,2024-12-01,0
1305,2024-12-02,1
1306,2024-12-03,1
1307,2024-12-04,1
1308,2024-12-05,1
1309,2024-12-06,1


### Monthly Phases

Computes phase ranges (e.g., follicular, luteal, ovulation, menstruation) based on the period dates from the first script and saves the results.

In [38]:
json_file_path = "/content/Raw_Data/measurements.json"
csv_file_path = os.path.join("/content/Output", "phase_date_ranges.csv")

with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

data.sort(key=lambda x: datetime.strptime(x["date"], "%Y-%m-%d"))

phases = []
for i in range(len(data) - 1):
    start_date = datetime.strptime(data[i]["date"], "%Y-%m-%d")
    end_date = datetime.strptime(data[i + 1]["date"], "%Y-%m-%d")
    duration = (end_date - start_date).days

    menstruation_end = start_date + timedelta(days=5)
    phases.append(["Menstruation", start_date.strftime("%Y-%m-%d"), menstruation_end.strftime("%Y-%m-%d")])

    follicular_start = menstruation_end + timedelta(days=1)
    ovulation_start = follicular_start + timedelta(days=9)
    phases.append(["Follicular", follicular_start.strftime("%Y-%m-%d"), ovulation_start.strftime("%Y-%m-%d")])

    ovulation_end = ovulation_start + timedelta(days=1)
    phases.append(["Ovulation", ovulation_start.strftime("%Y-%m-%d"), ovulation_end.strftime("%Y-%m-%d")])

    luteal_start = ovulation_end + timedelta(days=1)
    luteal_end = end_date
    phases.append(["Luteal", luteal_start.strftime("%Y-%m-%d"), luteal_end.strftime("%Y-%m-%d")])

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Phase", "Start Date", "End Date"])
    for phase in phases:
        writer.writerow(phase)

df = pd.read_csv(csv_file_path)

display(df.head(10))
print("\n...")
display(df.tail(10))

Unnamed: 0,Phase,Start Date,End Date
0,Menstruation,2021-05-07,2021-05-12
1,Follicular,2021-05-13,2021-05-22
2,Ovulation,2021-05-22,2021-05-23
3,Luteal,2021-05-24,2021-05-08
4,Menstruation,2021-05-08,2021-05-13
5,Follicular,2021-05-14,2021-05-23
6,Ovulation,2021-05-23,2021-05-24
7,Luteal,2021-05-25,2021-05-09
8,Menstruation,2021-05-09,2021-05-14
9,Follicular,2021-05-15,2021-05-24



...


Unnamed: 0,Phase,Start Date,End Date
942,Ovulation,2024-12-18,2024-12-19
943,Luteal,2024-12-20,2024-12-04
944,Menstruation,2024-12-04,2024-12-09
945,Follicular,2024-12-10,2024-12-19
946,Ovulation,2024-12-19,2024-12-20
947,Luteal,2024-12-21,2024-12-05
948,Menstruation,2024-12-05,2024-12-10
949,Follicular,2024-12-11,2024-12-20
950,Ovulation,2024-12-20,2024-12-21
951,Luteal,2024-12-22,2024-12-06


In [39]:
csv_file_path = "/content/Output/phase_date_ranges.csv"

df = pd.read_csv(csv_file_path)

menstruation_df = df[df['Phase'] == 'Menstruation'][['Start Date', 'End Date']].reset_index(drop=True)
follicular_df = df[df['Phase'] == 'Follicular'][['Start Date', 'End Date']].reset_index(drop=True)
ovulation_df = df[df['Phase'] == 'Ovulation'][['Start Date', 'End Date']].reset_index(drop=True)
luteal_df = df[df['Phase'] == 'Luteal'][['Start Date', 'End Date']].reset_index(drop=True)

menstruation_df['Menstruation Start-End'] = menstruation_df['Start Date'] + " - " + menstruation_df['End Date']
follicular_df['Follicular Start-End'] = follicular_df['Start Date'] + " - " + follicular_df['End Date']
ovulation_df['Ovulation Start-End'] = ovulation_df['Start Date'] + " - " + ovulation_df['End Date']
luteal_df['Luteal Start-End'] = luteal_df['Start Date'] + " - " + luteal_df['End Date']

menstruation_df = menstruation_df[['Menstruation Start-End']]
follicular_df = follicular_df[['Follicular Start-End']]
ovulation_df = ovulation_df[['Ovulation Start-End']]
luteal_df = luteal_df[['Luteal Start-End']]

combined_df = pd.concat([menstruation_df, follicular_df, ovulation_df, luteal_df], axis=1)

display(combined_df.head(10))

print("\n...")
display(combined_df.tail(10))

Unnamed: 0,Menstruation Start-End,Follicular Start-End,Ovulation Start-End,Luteal Start-End
0,2021-05-07 - 2021-05-12,2021-05-13 - 2021-05-22,2021-05-22 - 2021-05-23,2021-05-24 - 2021-05-08
1,2021-05-08 - 2021-05-13,2021-05-14 - 2021-05-23,2021-05-23 - 2021-05-24,2021-05-25 - 2021-05-09
2,2021-05-09 - 2021-05-14,2021-05-15 - 2021-05-24,2021-05-24 - 2021-05-25,2021-05-26 - 2021-05-10
3,2021-05-10 - 2021-05-15,2021-05-16 - 2021-05-25,2021-05-25 - 2021-05-26,2021-05-27 - 2021-06-03
4,2021-06-03 - 2021-06-08,2021-06-09 - 2021-06-18,2021-06-18 - 2021-06-19,2021-06-20 - 2021-06-04
5,2021-06-04 - 2021-06-09,2021-06-10 - 2021-06-19,2021-06-19 - 2021-06-20,2021-06-21 - 2021-06-05
6,2021-06-05 - 2021-06-10,2021-06-11 - 2021-06-20,2021-06-20 - 2021-06-21,2021-06-22 - 2021-06-06
7,2021-06-06 - 2021-06-11,2021-06-12 - 2021-06-21,2021-06-21 - 2021-06-22,2021-06-23 - 2021-06-07
8,2021-06-07 - 2021-06-12,2021-06-13 - 2021-06-22,2021-06-22 - 2021-06-23,2021-06-24 - 2021-06-29
9,2021-06-29 - 2021-07-04,2021-07-05 - 2021-07-14,2021-07-14 - 2021-07-15,2021-07-16 - 2021-06-30



...


Unnamed: 0,Menstruation Start-End,Follicular Start-End,Ovulation Start-End,Luteal Start-End
228,2024-10-10 - 2024-10-15,2024-10-16 - 2024-10-25,2024-10-25 - 2024-10-26,2024-10-27 - 2024-11-03
229,2024-11-03 - 2024-11-08,2024-11-09 - 2024-11-18,2024-11-18 - 2024-11-19,2024-11-20 - 2024-11-04
230,2024-11-04 - 2024-11-09,2024-11-10 - 2024-11-19,2024-11-19 - 2024-11-20,2024-11-21 - 2024-11-05
231,2024-11-05 - 2024-11-10,2024-11-11 - 2024-11-20,2024-11-20 - 2024-11-21,2024-11-22 - 2024-11-06
232,2024-11-06 - 2024-11-11,2024-11-12 - 2024-11-21,2024-11-21 - 2024-11-22,2024-11-23 - 2024-11-07
233,2024-11-07 - 2024-11-12,2024-11-13 - 2024-11-22,2024-11-22 - 2024-11-23,2024-11-24 - 2024-12-02
234,2024-12-02 - 2024-12-07,2024-12-08 - 2024-12-17,2024-12-17 - 2024-12-18,2024-12-19 - 2024-12-03
235,2024-12-03 - 2024-12-08,2024-12-09 - 2024-12-18,2024-12-18 - 2024-12-19,2024-12-20 - 2024-12-04
236,2024-12-04 - 2024-12-09,2024-12-10 - 2024-12-19,2024-12-19 - 2024-12-20,2024-12-21 - 2024-12-05
237,2024-12-05 - 2024-12-10,2024-12-11 - 2024-12-20,2024-12-20 - 2024-12-21,2024-12-22 - 2024-12-06
