In [13]:
import xml.etree.ElementTree as ET
import os
from datetime import datetime

def filter_and_split_step_data(input_file, output_dir, start_date, end_date, lines_per_file):
    """Extract step-count records within a date range and split them into XML part files.

    The input is streamed with ``ET.iterparse``. Every ``Record`` element whose
    ``type`` is ``HKQuantityTypeIdentifierStepCount`` and whose ``startDate``
    falls between ``start_date`` and ``end_date`` (both inclusive, compared on
    the calendar date only) is collected. Each time ``lines_per_file`` records
    have been collected they are written to ``part_<n>.xml`` inside
    ``output_dir``; any remainder goes into a final part file.

    Args:
        input_file: Path (or file object) of the XML document to read.
        output_dir: Directory for the part files; created if missing.
        start_date: First day to keep, formatted ``YYYY-MM-DD``.
        end_date: Last day to keep, formatted ``YYYY-MM-DD``.
        lines_per_file: Maximum number of records per part file.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``YYYY-MM-DD``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Keep the parsed bounds under their own names instead of rebinding the
    # string parameters.
    range_start = datetime.strptime(start_date, "%Y-%m-%d")
    range_end = datetime.strptime(end_date, "%Y-%m-%d")

    xml_header = '<?xml version="1.0" encoding="UTF-8"?>\n<HealthData>\n'
    xml_footer = '</HealthData>'

    filtered_records = []
    file_index = 1
    root = None
    depth = 0

    for event, elem in ET.iterparse(input_file, events=("start", "end")):
        if event == "start":
            # "start" is used only to remember the root and to track nesting;
            # an element's children are not guaranteed to exist yet here.
            if root is None:
                root = elem
            depth += 1
            continue

        depth -= 1

        # On "end" the element is complete, so serializing it cannot drop
        # child elements that had not been parsed yet.
        if elem.tag == "Record" and _is_step_record_in_range(elem, range_start, range_end):
            # Whether the tail is already populated depends on how far the
            # parser has read ahead; drop it so the output is deterministic.
            elem.tail = None
            filtered_records.append(" " + ET.tostring(elem, encoding="unicode") + "\n")

            if len(filtered_records) >= lines_per_file:
                write_to_file(output_dir, file_index, xml_header, xml_footer, filtered_records)
                print(f"File written: part_{file_index}.xml")
                filtered_records = []
                file_index += 1

        if depth == 1:
            # A direct child of the root just closed: detach everything the
            # root has accumulated so memory does not grow with the file size.
            root.clear()

    if filtered_records:
        write_to_file(output_dir, file_index, xml_header, xml_footer, filtered_records)
        print(f"File written: part_{file_index}.xml")


def _is_step_record_in_range(elem, range_start, range_end):
    """Return True if ``elem`` is a step-count record dated within the inclusive range."""
    if elem.get("type", "") != "HKQuantityTypeIdentifierStepCount":
        return False
    try:
        # Only the leading YYYY-MM-DD part of startDate is compared.
        record_date = datetime.strptime(elem.get("startDate", "")[:10], "%Y-%m-%d")
    except ValueError:
        # Missing or malformed dates are skipped rather than aborting the run.
        return False
    return range_start <= record_date <= range_end


def write_to_file(output_dir, index, header, footer, records):
    """Write ``header``, then every string in ``records``, then ``footer`` to ``part_<index>.xml``.

    The file is created (or overwritten) inside ``output_dir`` using UTF-8.
    Records are written exactly as given; no separators are inserted.
    """
    file_name = f"part_{index}.xml"
    with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as out_file:
        out_file.write(header)
        out_file.writelines(records)
        out_file.write(footer)

# Parameters
input_file = "dışa aktarılan.xml"  # name of the XML file to read
output_dir = "filtered_xml_parts"  # folder where the filtered files are saved
start_date = "2022-10-03"  # start date
end_date = "2024-09-23"  # end date
lines_per_file = 5000  # number of records per output file

# Run the filter with the parameters above
filter_and_split_step_data(
    input_file=input_file,
    output_dir=output_dir,
    start_date=start_date,
    end_date=end_date,
    lines_per_file=lines_per_file,
)


File written: part_1.xml
File written: part_2.xml
File written: part_3.xml
File written: part_4.xml
File written: part_5.xml


In [15]:
import os
# Print the current working directory of the running process.
print(os.getcwd())


/Users/cagatayeroglu
