In [1]:
import json
import xml.etree.ElementTree as ET

import pandas as pd


In [2]:
# Parse the Apple Health export and take the document root.
tree = ET.parse("../data/apple_health_export/export.xml")
root = tree.getroot()

# Collect every <Record> element that is a direct child of the root.
records = list(root.iterfind("Record"))

# Each record's XML attribute mapping becomes one row of the DataFrame.
data = [element.attrib for element in records]
health_df = pd.DataFrame(data)


In [3]:
# Preview the first five parsed records.
health_df.head(n=5)

Unnamed: 0,type,sourceName,sourceVersion,unit,creationDate,startDate,endDate,value,device
0,HKQuantityTypeIdentifierDietaryWater,Water Reminder,1.7.28,mL,2021-08-27 23:25:27 +0900,2021-08-27 23:25:27 +0900,2021-08-27 23:25:27 +0900,40.0,
1,HKQuantityTypeIdentifierBodyMassIndex,omron connect,006.009.00001.001,count,2021-12-09 11:42:09 +0900,2021-12-09 11:05:36 +0900,2021-12-09 11:05:36 +0900,21.1,
2,HKQuantityTypeIdentifierBodyMassIndex,omron connect,006.009.00001.001,count,2021-12-09 22:52:33 +0900,2021-12-09 22:47:16 +0900,2021-12-09 22:47:16 +0900,20.2,
3,HKQuantityTypeIdentifierBodyMassIndex,omron connect,006.009.00001.001,count,2021-12-10 07:42:08 +0900,2021-12-10 07:39:57 +0900,2021-12-10 07:39:57 +0900,20.3,
4,HKQuantityTypeIdentifierBodyMassIndex,omron connect,006.009.00001.001,count,2021-12-11 12:54:25 +0900,2021-12-11 01:06:30 +0900,2021-12-11 01:06:30 +0900,20.5,


In [4]:
# Show (row count, column count) of the parsed records.
health_df.shape

(1800451, 9)

In [5]:
# Remove columns that may expose personal information.
health_df = health_df.drop(["sourceVersion", "device"], axis=1)

# Expected layout of ../data/replace_dict.json:
# {
#     "replace_dict": {
#         "pattern to replace": "replacement text"
#     }
# }
#
# Read the mapping explicitly as UTF-8 so that non-ASCII patterns are decoded
# the same way on every platform instead of depending on the locale default.
with open("../data/replace_dict.json", "r", encoding="utf-8") as f:
    replace_dict = json.load(f)

# regex=True makes pandas interpret each key of the mapping as a regular
# expression when rewriting the source names.
health_df["sourceName"] = health_df["sourceName"].replace(replace_dict["replace_dict"], regex=True)

In [6]:
# Drop these sources because they would reveal naps and the goals set in Streaks.
remove_source = ["AutoSleep", "Streaks"]
health_df = health_df.loc[
    lambda frame: ~frame["sourceName"].isin(remove_source)
]

In [7]:
# Restrict the data to records created on or after the start date, then order
# them by creation time and record type and renumber the rows from zero.
# NOTE(review): creationDate is compared against a plain string here, so this
# presumably relies on the "YYYY-MM-DD ..." text sorting chronologically — confirm.
start_date = "2022-01-01"
health_df = (
    health_df.loc[lambda frame: frame["creationDate"] >= start_date]
    .sort_values(by=["creationDate", "type"], ascending=True)
    .reset_index(drop=True)
)

In [8]:
# Preview the first five rows of the current DataFrame.
health_df.head(n=5)

Unnamed: 0,type,sourceName,unit,creationDate,startDate,endDate,value
0,HKQuantityTypeIdentifierActiveEnergyBurned,Apple Watch,kcal,2022-01-01 00:00:03 +0900,2021-12-31 23:53:45 +0900,2021-12-31 23:54:46 +0900,0.172
1,HKQuantityTypeIdentifierActiveEnergyBurned,Apple Watch,kcal,2022-01-01 00:00:03 +0900,2021-12-31 23:54:57 +0900,2021-12-31 23:55:48 +0900,0.239
2,HKQuantityTypeIdentifierActiveEnergyBurned,Apple Watch,kcal,2022-01-01 00:00:03 +0900,2021-12-31 23:55:58 +0900,2021-12-31 23:57:00 +0900,0.188
3,HKQuantityTypeIdentifierActiveEnergyBurned,Apple Watch,kcal,2022-01-01 00:00:03 +0900,2021-12-31 23:57:00 +0900,2021-12-31 23:57:10 +0900,0.067
4,HKQuantityTypeIdentifierHeartRate,Apple Watch,count/min,2022-01-01 00:00:03 +0900,2021-12-31 23:57:35 +0900,2021-12-31 23:57:35 +0900,79.0


In [9]:
# Show (row count, column count) of the current DataFrame.
health_df.shape

(1552977, 7)

In [10]:
# Write the DataFrame to CSV without the row index column.
health_df.to_csv(
    path_or_buf="../data/dataset.csv",
    index=False,
)