# 🏇 PUREUNITY Corrected Filter — vNext (LTO Locked)

Full‑Parse + REL‑Spread + Guardrail Logic. Runs in **Google Colab** (desktop or mobile) and **VS Code/Jupyter**.

**Quick flow:** Install ▶️ Choose input (Upload or Drive) ▶️ Parse ▶️ Rank ▶️ Table ▶️ (optional) Export CSV.


In [None]:
# ✅ One-time installs per new runtime
!pip install -q pdfplumber pandas

import re
import pdfplumber
import pandas as pd


In [None]:
# 🔧 Choose how you want to provide the PDF
# The input cell compares SELECT_INPUT case-insensitively against 'drive';
# any other value is handled by the upload branch.
SELECT_INPUT = 'upload'   # 'upload' or 'drive'
# If using Google Drive, put your PDF in My Drive and set the path below (example).
# This path is only read when SELECT_INPUT is 'drive'; Drive is mounted at /content/drive.
DRIVE_PDF_PATH = '/content/drive/My Drive/ATR_formcard.pdf'


In [None]:
# 📥 Resolve `pdf_path` according to SELECT_INPUT.
pdf_path = None
if SELECT_INPUT.lower() == 'drive':
    # Drive mode: mount Google Drive (Colab only) and use the configured path.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    pdf_path = DRIVE_PDF_PATH
    print('Using Google Drive file:', pdf_path)
else:
    # Upload mode — also the fallback for any unrecognised SELECT_INPUT value.
    try:
        from google.colab import files
    except ImportError:
        # Not running in Colab (e.g. VS Code / local Jupyter): there is no
        # upload widget, so a local filesystem path is requested instead.
        files = None

    if files is None:
        pdf_path = input('Path to your ATR “Form Printouts” PDF: ').strip()
        if not pdf_path:
            raise RuntimeError('No PDF path was provided.')
        print('Using local file:', pdf_path)
    else:
        print('📄 Upload your ATR “Form Printouts” PDF…')
        uploaded = files.upload()
        if not uploaded:
            # An empty result (e.g. a dismissed upload dialog) would otherwise
            # surface as an opaque IndexError on the first-key lookup.
            raise RuntimeError('No file was uploaded — re-run this cell and choose a PDF.')
        # Only the first uploaded file is used.
        pdf_path = next(iter(uploaded))
        print('Using uploaded file:', pdf_path)


In [None]:

# ------------------------------------------------------------
# 1️⃣  PARSER  — Raw-Line Scan (no dropped runners)
# ------------------------------------------------------------
def parse_races_from_pdf(path: str) -> list:
    """
    Scan an ATR 'Form Printouts' PDF line by line and group runners by race.

    A line containing ``(R<digits>)`` starts a new race. The first ``H:MM`` /
    ``HH:MM`` token on that line becomes the race time ("??:??" if absent),
    and the header line plus the two lines after it (same page only) are
    searched for "<N> RUNNERS" to obtain the declared field size.

    Every later line (until the next header) whose last whitespace-separated
    token looks like a form string is stored as a runner; everything before
    that token is the runner's name.

    Args:
        path: Filesystem path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A list of race dicts in document order, each shaped as
        ``{"time": str, "runners": int, "horses": [{"name": str, "form": str}]}``.
        ``runners`` falls back to ``len(horses)`` when no "<N> RUNNERS" text
        was found near the header.
    """
    header_re = re.compile(r"\(R\d+\)")
    time_re = re.compile(r"\d{1,2}:\d{2}")
    count_re = re.compile(r"(\d+)\s+RUNNERS")
    # NOTE(review): `A-Z` makes this accept ANY all-caps/digit token (the
    # explicit PBFUR letters are redundant). Kept as-is so no runner is
    # dropped; tighten only once the real set of form codes is confirmed.
    form_re = re.compile(r"^[\dA-Z\-PBFUR]+$")

    races, current_race = [], None

    def _finish(race):
        # No declared field size was found: fall back to the parsed count.
        if race["runners"] == 0:
            race["runners"] = len(race["horses"])
        races.append(race)

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # extract_text() may yield nothing for a page; treat that as empty.
            lines = (page.extract_text() or "").splitlines()
            for i, line in enumerate(lines):
                # --- Race header: close the previous race, open a new one
                if header_re.search(line):
                    if current_race is not None:
                        _finish(current_race)

                    time_match = time_re.search(line)
                    current_race = {
                        "time": time_match.group() if time_match else "??:??",
                        "runners": 0,
                        "horses": [],
                    }

                    # Peek at the header line and the next two lines for "XX RUNNERS"
                    # (if several match, the last one wins).
                    for nearby in lines[i:i + 3]:
                        m = count_re.search(nearby.upper())
                        if m:
                            current_race["runners"] = int(m.group(1))
                    continue

                # Lines before the first header, and verdict text, are not runners.
                if current_race is None or "ATR VERDICT" in line:
                    continue

                # A field-size line such as "12 RUNNERS" would otherwise pass the
                # permissive form check below and be stored as a horse named "12"
                # with form "RUNNERS".
                if count_re.search(line.upper()):
                    continue

                # --- Runner line: "<name ...> <form>"
                parts = line.split()
                if len(parts) > 1 and form_re.match(parts[-1]):
                    current_race["horses"].append(
                        {"name": " ".join(parts[:-1]), "form": parts[-1]}
                    )

    # Push the final race (a race may span page boundaries, so this happens
    # only after every page has been read).
    if current_race is not None:
        _finish(current_race)
    return races


# ------------------------------------------------------------
# 2️⃣  REL-SCORING
# ------------------------------------------------------------
def score_rel(form: str) -> int:
    """
    Grade a form string on the REL scale (4 = strongest, 1 = weakest).

    Only the digits of ``form`` are considered; letters and separators are
    discarded, and at most the five most recent digits are kept.

    REL4 = ≥2 wins in the last 5, OR ≥3 top-3 finishes in the last 5
    REL3 = ≥1 win in the last 5, OR ≥2 top-3 finishes in the last 4
    REL2 = ≥1 top-4 finish in the last 4
    REL1 = anything else (including an empty / digit-free form)

    A form digit of ``0`` denotes a finish outside the first nine, so it is
    never counted as a placing.

    Args:
        form: Raw form figures, e.g. ``"12-3P0"``.

    Returns:
        The REL grade as an int in 1..4.
    """
    last5 = [int(ch) for ch in re.sub(r"[^0-9]", "", form)[-5:]]
    last4 = last5[-4:]
    wins = last5.count(1)

    def finished_in_top(n, runs):
        # Lower bound of 1 keeps the "0" form figure out of the placings.
        return sum(1 <= d <= n for d in runs)

    if wins >= 2 or finished_in_top(3, last5) >= 3:
        return 4
    if wins >= 1 or finished_in_top(3, last4) >= 2:
        return 3
    # NOTE(review): REL2 accepts 4th place while REL3/REL4 stop at 3rd —
    # preserved from the original thresholds; confirm this is intended.
    if finished_in_top(4, last4) >= 1:
        return 2
    return 1


# ------------------------------------------------------------
# 3️⃣  MAIN FILTER LOGIC
# ------------------------------------------------------------
def unity_corrected_filter(races):
    """
    Rank the runners of each race and attach a field-size guardrail.

    Runners are ordered by REL grade (high first), then by the number of
    2nd/3rd finishes in their last five runs (high first), then by their
    last-time-out finish (low first), then alphabetically by name. The top
    two names become Gold and Silver.

    Side effect: each horse dict in ``races`` gains the keys ``rel``,
    ``places`` and ``lto``.

    Args:
        races: List of race dicts with keys ``time``, ``runners`` and
            ``horses`` (a list of ``{"name", "form"}`` dicts).

    Returns:
        One dict per race with keys "Time", "Runners", "🥇 Gold",
        "🥈 Silver" and "Guardrail". Gold/Silver are None when the race has
        fewer than one/two parsed horses.
    """
    out = []

    for race in races:
        horses = race["horses"]

        # --- Score REL, placings, and LTO
        for h in horses:
            # Work on digits only so the "last 5" window means the last five
            # completed runs (as in score_rel), not the last five characters
            # including separators and letters.
            runs = re.sub(r"[^0-9]", "", h["form"])
            h["rel"] = score_rel(h["form"])
            h["places"] = sum(ch in "23" for ch in runs[-5:])
            if runs:
                last = int(runs[-1])
                # A form figure of 0 denotes a finish outside the first nine;
                # rank it behind 9th rather than ahead of a win.
                h["lto"] = 10 if last == 0 else last
            else:
                h["lto"] = 99  # no numeric run on record: sorts last

        # --- Sort by REL > placings > LTO > alpha
        def rank_key(h):
            return (-h["rel"], -h["places"], h["lto"], h["name"])

        ranked = sorted(horses, key=rank_key)

        gold = ranked[0]["name"] if ranked else None
        silver = ranked[1]["name"] if len(ranked) > 1 else None

        # --- Guardrail on field size
        if race["runners"] < 7:
            guardrail = "No Bet (<7)"
        elif race["runners"] > 13:
            guardrail = "N (default) / Y (expand)"
        else:
            guardrail = "—"

        out.append({
            "Time": race["time"],
            "Runners": race["runners"],
            "🥇 Gold": gold,
            "🥈 Silver": silver,
            "Guardrail": guardrail,
        })
    return out


In [None]:
# ▶️ Run the filter
from IPython.display import display

# Parse the chosen PDF, rank each race, and tabulate the selections.
races = parse_races_from_pdf(pdf_path)
selections = unity_corrected_filter(races)

df = pd.DataFrame(selections)
display(df)
print('\nMarkdown copy:')
try:
    print(df.to_markdown(index=False))
except ImportError:
    # DataFrame.to_markdown relies on the optional `tabulate` package, which
    # the install cell does not install; fall back to a plain-text table
    # instead of failing after the results have already been computed.
    print('(`tabulate` is not installed — showing plain text instead)')
    print(df.to_string(index=False))


In [None]:
# 💾 Optional: save to CSV (downloads in Colab left pane > Files)
# The path is relative, so the file is written to the current working directory.
# index=False keeps the DataFrame's row index out of the file.
csv_path = 'pureunity_vnext_selections.csv'
df.to_csv(csv_path, index=False)
print('Saved:', csv_path)
