# In [None]:
# ------------------------------------------------------------
# parse_past_performances.py
# ------------------------------------------------------------
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import itertools

# Location of the XML file that this script parses (hard-coded absolute path).
XML_PATH = Path("/mnt/data/SIMD20230101AQU_USA.xml")

# ---------- helpers ---------------------------------------------------------
def text(elem, path, default=None):
    """Return the stripped text of the first sub-element matching *path*.

    Args:
        elem: Element to search from (any object exposing ``find(path)``).
        path: Path expression passed unchanged to ``elem.find``.
        default: Value returned when there is no usable text.

    Returns:
        The matched child's text with surrounding whitespace removed, or
        *default* when no child matches, the child has no text, or its text
        consists only of whitespace.
    """
    child = elem.find(path)
    if child is None or child.text is None:
        return default
    # Whitespace-only text carries no value; treat it like a missing element
    # so callers get *default* rather than an empty string.
    return child.text.strip() or default


def add_seq(rows, seq_key):
    """Number rows 1..n within each ``pp_id`` group and store it under *seq_key*.

    *rows* is sorted in place by ``pp_id`` and then by any existing
    ``"sequence"`` value (rows lacking one sort as 0).  The sort is stable, so
    rows that tie keep their incoming order.  Each row dict is mutated, and
    the same list object is returned.
    """
    def group_of(row):
        return row["pp_id"]

    rows.sort(key=lambda row: (group_of(row), row.get("sequence", 0)))

    # groupby needs its input ordered by the grouping key, which the sort
    # above guarantees.
    for _pp_id, members in itertools.groupby(rows, key=group_of):
        for position, row in zip(itertools.count(1), members):
            row[seq_key] = position
    return rows


# ---------- parse -----------------------------------------------------------
# Each table below lists (output column, path handed to text()) pairs in the
# order the columns are added to a row.

# Looked up relative to a <Horse> element.
_HORSE_FIELDS = (
    ("Horse.RegistrationNumber", "RegistrationNumber"),
    ("Horse.HorseName", "HorseName"),
    ("Horse.YearOfBirth", "YearOfBirth"),
    ("Horse.Sire.HorseName", "Sire/HorseName"),
    ("Horse.Dam.HorseName", "Dam/HorseName"),
    ("Horse.Sex.Value", "Sex/Value"),
)

# Looked up relative to a <PastPerformance> element.
_PP_FIELDS = (
    ("Track.TrackID", "Track/TrackID"),
    ("RaceDate", "RaceDate"),
    ("Grade", "Grade"),
    ("StakesIndicator", "StakesIndicator"),
    ("ConditionsOfRace", "ConditionsOfRace"),
    ("AgeRestriction", "AgeRestriction"),
    ("SexRestriction", "SexRestriction"),
    ("RaceRestrictions", "RaceRestrictions/Text"),
    ("MaximumClaimingPrice", "MaximumClaimingPrice"),
    ("PurseUSA", "PurseUSA"),
    ("DistanceUnit", "Distance/DistanceUnit/Value"),
    ("TrackCondition", "TrackCondition/Value"),
    ("TrackSealedIndicator", "TrackSealedIndicator"),
    ("OffTurfIndicator", "OffTurfIndicator"),
    ("NumberOfStarters", "NumberOfStarters"),
    ("RaceName", "RaceName"),
)

# Looked up relative to a <CompanyLine> element.
_COMPANY_LINE_FIELDS = (
    ("CompanyLine.LengthsAheadAtFinish", "LengthsAheadAtFinish"),
    ("CompanyLine.PositionAtFinish", "PositionAtFinish"),
    ("CompanyLine.OfficialPosition", "OfficialPosition"),
)

# Looked up relative to a <Start> element.
_START_FIELDS = (
    ("Start.WeightCarried", "WeightCarried"),
    ("Start.Medication.Value", "Medication/Value"),
    ("Start.Equipment.Value", "Equipment/Value"),
    ("Start.EarningsUSA", "EarningsUSA"),
    ("Start.EarningsForeign", "EarningsForeign"),
    ("Start.Odds", "Odds"),
    ("Start.Favorite", "Favorite"),
    ("Start.PostPosition", "PostPosition"),
    ("Start.OfficialFinish", "OfficialFinish"),
    ("Start.RaceRating", "RaceRating"),
    ("Start.ClassRating", "ClassRating"),
    ("Start.PaceFigure1", "PaceFigure1"),
    ("Start.PaceFigure2", "PaceFigure2"),
    ("Start.PaceFigure3", "PaceFigure3"),
    ("Start.SpeedFigure", "SpeedFigure"),
    ("Start.ClaimPriceUSA", "ClaimPriceUSA"),
    ("Start.ClaimedFlag", "ClaimedFlag"),
    ("Start.TimeOfHorse", "TimeOfHorse"),
)

# Looked up relative to a <PointOfCall> element.
_POINT_OF_CALL_FIELDS = (
    ("PointOfCall", "PointOfCall"),
    ("Position", "Position"),
    ("LengthsAhead", "LengthsAhead"),
    ("LengthsBehind", "LengthsBehind"),
)

# Looked up relative to a <Fraction> element.
_FRACTION_FIELDS = (
    ("Fraction", "Fraction"),
    ("Time", "Time"),
)


def _extract(elem, fields):
    """Return ``{column: text(elem, path)}`` for each pair in *fields*, in order."""
    return {column: text(elem, path) for column, path in fields}


tree = ET.parse(XML_PATH)
root = tree.getroot()

main_rows = []      # one row per past performance
frac_rows = []      # one row per fraction, linked by pp_id
pcall_rows = []     # one row per point of call, linked by pp_id
pp_id_counter = 0

for starter in root.findall(".//Race/Starters"):
    horse = starter.find("Horse")
    if horse is None:
        # No <Horse> child: nothing from this starter is recorded.
        continue

    # Horse-level columns are repeated on every past-performance row.
    horse_dict = _extract(horse, _HORSE_FIELDS)

    for pp in starter.findall("PastPerformance"):
        # Running id across the whole file; it links the three tables.
        pp_id_counter += 1
        pp_id = pp_id_counter

        main_row = {**horse_dict, "pp_id": pp_id, **_extract(pp, _PP_FIELDS)}

        # Company line: only the first <CompanyLine> child is used, if any.
        company_line = pp.find("CompanyLine")
        if company_line is not None:
            main_row.update(_extract(company_line, _COMPANY_LINE_FIELDS))

        # Start section; its points of call go to their own table.
        start = pp.find("Start")
        if start is not None:
            main_row.update(_extract(start, _START_FIELDS))
            for poc in start.findall("PointOfCall"):
                pcall_rows.append(
                    {"pp_id": pp_id, **_extract(poc, _POINT_OF_CALL_FIELDS)}
                )

        # Fractions are collected whether or not a <Start> is present.
        for frac in pp.findall("Fractions/Fraction"):
            frac_rows.append({"pp_id": pp_id, **_extract(frac, _FRACTION_FIELDS)})

        main_rows.append(main_row)

# ---------- sequencing & DataFrame build ------------------------------------
# Number fractions and points of call 1…n within each pp_id.
frac_rows, pcall_rows = (
    add_seq(rows, "sequence") for rows in (frac_rows, pcall_rows)
)

# One DataFrame per row list, built in the order main, fractions, points of call.
df_main, df_frac, df_pcall = map(
    pd.DataFrame, (main_rows, frac_rows, pcall_rows)
)

# ---------- save ------------------------------------------------------------
# (DataFrame, CSV file name, label shown in the summary) for each output table.
_outputs = (
    (df_main, "past_performances.csv", "past performances"),
    (df_frac, "fractions.csv", "fractions"),
    (df_pcall, "points_of_call.csv", "points-of-call"),
)

# Write every CSV first (without the index column), then print the summary.
for frame, csv_name, _label in _outputs:
    frame.to_csv(csv_name, index=False)

print("✓ Extracted:")
for frame, _csv_name, label in _outputs:
    print("  •", len(frame), label)
