# 🏇 Unity Filter Colab v3 — ATR PDF → Top-2 per Race

**What’s new in v3**
- Step 2: Friendlier upload function ("📂 Please upload today’s ATR racecard PDF...").
- Step 4: Improved regex race splitter (reliably detects `(R1)`, `(R2)`, etc.).
- Runs the Unity Filter wrapper deterministically and prints the Top-2 for all races.

**Steps**
1. Install dependencies
2. Upload ATR PDF (file picker)
3. Extract runner lines
4. Split into races
5. Convert to DataFrames
6. Unity Filter wrapper
7. Run all races → Top-2 grid

In [None]:
# === Step 1: Install dependencies ===
!pip -q install pandas pdfplumber

In [None]:
# === Step 2: Upload ATR PDF (file picker) ===
from google.colab import files

def pick_pdf():
    """Prompt for a racecard PDF through the Colab file picker.

    Returns:
        The name of the first uploaded file, taken from the keys of the
        mapping returned by ``files.upload()``.

    Raises:
        ValueError: If the upload finished without any file (for example the
            dialog was dismissed), so there is no name to return.
    """
    print("📂 Please upload today’s ATR racecard PDF...")
    uploaded = files.upload()
    if not uploaded:
        # Without this guard an empty result would surface as a bare
        # StopIteration from next(), which says nothing useful to the user.
        raise ValueError(
            "No file was uploaded; re-run this cell and choose a PDF."
        )
    # Iterating the mapping yields its keys; only the first file is used.
    pdf_path = next(iter(uploaded))
    print("✅ Uploaded:", pdf_path)
    return pdf_path

# Ask for the upload as soon as the cell runs; later cells read `pdf_path`.
pdf_path = pick_pdf()

In [None]:
# === Step 3: Extract runner lines with pdfplumber ===
import pdfplumber

def extract_runner_lines(path):
    """Collect candidate runner lines and race-header lines from a PDF.

    Every page's text is split into lines. A line is kept, stripped of
    surrounding whitespace, when it either starts with a digit (candidate
    runner line) or starts with a race header marker such as ``(R1)``.
    Header lines are retained so that a later splitting step can still see
    where one race ends and the next begins.

    Args:
        path: Path of the PDF file to open with pdfplumber.

    Returns:
        list[str]: The kept lines, in page and line order.
    """
    # Local import: this cell runs before the cell that imports `re`.
    import re

    header_re = re.compile(r"^\(R\d+\)")
    rows = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # A page without extractable text must not crash the whole run.
            text = page.extract_text() or ""
            for raw_line in text.splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                # Test the stripped line so indented runner lines still count.
                if line[0].isdigit() or header_re.match(line):
                    rows.append(line)
    return rows

# Extract the lines from the uploaded PDF and preview the first ten of them.
rows = extract_runner_lines(pdf_path)
print("Sample lines:", rows[:10])

In [None]:
# === Step 4: Split races by header markers ===
import re

def split_races(rows):
    """Group lines into races, using ``(R<n>)`` header lines as separators.

    A line that begins with a header marker such as ``(R1)`` closes the
    current group and is itself discarded; every other line joins the group
    that is currently open. Groups that end up with no lines are omitted.

    Args:
        rows: Iterable of text lines in document order.

    Returns:
        list[list[str]]: One list of lines per non-empty race group.
    """
    header = re.compile(r"^\(R\d+\)")
    groups = [[]]
    for entry in rows:
        if header.match(entry):
            # Header: open a fresh group; the header text is not stored.
            groups.append([])
        else:
            groups[-1].append(entry)
    # Leading, trailing or back-to-back headers leave empty groups behind.
    return [group for group in groups if group]

# Split the extracted lines into races and print a per-race summary.
# The figure labelled "runners" is the number of lines in each group.
races = split_races(rows)
print("Found", len(races), "races")
for i, r in enumerate(races, 1):
    print(f"Race {i}: {len(r)} runners")

In [None]:
# === Step 5: Convert to DataFrame per race ===
import pandas as pd
import re

# A form token has to begin with one of these characters.
_FORM_START_RE = re.compile(r"[0-9FURP\-]+")
# A letters-only token counts as form only if it uses nothing but these codes.
_FORM_LETTERS_RE = re.compile(r"[FURP]+")
# Fixed schema so that even a race with no parsed rows has the same columns.
_RACE_COLUMNS = ["name", "form", "trainer", "or_rating"]


def _looks_like_form(token):
    """Return True when *token* reads as a form string, not a name word."""
    if not _FORM_START_RE.match(token):
        return False
    if _FORM_LETTERS_RE.fullmatch(token):
        return True
    # Words such as "Frankel" or "Red" start with a form letter but carry no
    # digit or separator; they belong to the horse name, not the form.
    return any(ch.isdigit() or ch in "-/" for ch in token)


def parse_race(lines):
    """Turn the text lines of one race into a DataFrame of runners.

    Each line is split on whitespace and read left to right: the leading
    token, an optional token starting with ``(``, an optional form string,
    then the name words up to the first all-digit token. Lines with fewer
    than two tokens are skipped.

    Args:
        lines: Iterable of text lines belonging to a single race.

    Returns:
        pandas.DataFrame: Columns ``name``, ``form``, ``trainer`` and
        ``or_rating``, one row per parsed line. ``trainer`` is always ``"?"``
        and ``or_rating`` is always ``None``. The columns are present even
        when no line could be parsed.
    """
    data = []
    for line in lines:
        parts = line.split()
        if len(parts) < 2:
            continue
        # parts[0] is the leading number token (presumably the racecard
        # number); it is not stored.
        has_draw = parts[1].startswith("(")
        i = 2 if has_draw else 1
        form = ""
        if i < len(parts) and _looks_like_form(parts[i]):
            form = parts[i]
            i += 1
        # The name runs until the first purely numeric token.
        name_parts = []
        while i < len(parts) and not parts[i].isdigit():
            name_parts.append(parts[i])
            i += 1
        data.append({
            "name": " ".join(name_parts),
            "form": form,
            "trainer": "?",
            "or_rating": None,
        })
    return pd.DataFrame(data, columns=_RACE_COLUMNS)

# Build one DataFrame per race and preview the first one, if any race exists.
race_dfs = [parse_race(r) for r in races]
print(race_dfs[0].head() if race_dfs else "No races parsed")

In [None]:
# === Step 6: Unity Filter Wrapper ===
class UnityFilter:
    """Score runners from their form string and pick the top two of a race."""

    # Points added for each finishing-position character in the form string.
    _PLACE_POINTS = {"1": 3, "2": 2, "3": 1}
    # Form letters (compared case-insensitively) that each cost one point.
    _PENALTY_CODES = frozenset("FURP")

    def __init__(self):
        # Trainer weights keyed by lower-case name. The scoring below does not
        # read them yet.
        self.trainer_scores = {
            "aidan o'brien": 4.0, "a p o'brien": 4.0,
            "joseph patrick o'brien": 3.8, "donnacha o'brien": 3.5,
            "dermot weld": 3.2, "jessica harrington": 2.8,
            "william haggas": 3.5, "john gosden": 3.5,
            "j & t gosden": 3.5, "charlie appleby": 3.2,
            "sir michael stoute": 3.0,
        }

    def score_runner(self, row):
        """Return the form-based score of one runner.

        Args:
            row: Mapping-like object with a ``"form"`` entry, such as a
                DataFrame row or a dict.

        Returns:
            float: +3, +2 and +1 for every ``1``, ``2`` and ``3`` in the form
            string, and -1 for every ``F``, ``U``, ``R`` or ``P`` in either
            case. All other characters are ignored.
        """
        score = 0.0
        for ch in str(row["form"]):
            if ch in self._PLACE_POINTS:
                score += self._PLACE_POINTS[ch]
            elif ch.upper() in self._PENALTY_CODES:
                score -= 1
        return score

    def top2(self, race_df):
        """Return the two highest-scoring runners of a race.

        Args:
            race_df: DataFrame with one row per runner and a ``form`` column.
                The frame is not modified.

        Returns:
            tuple: ``(first, second)`` rows, each carrying an added ``Score``
            entry. ``second`` is ``None`` for a one-runner race, and both are
            ``None`` when the frame is empty. Runners with equal scores keep
            their original relative order.
        """
        if race_df is None or race_df.empty:
            # Return before scoring: there is nothing to rank, and a row-wise
            # apply over an empty frame does not produce a score column.
            return None, None
        df = race_df.copy()
        df["Score"] = df.apply(self.score_runner, axis=1)
        # A stable sort keeps tied runners in input order, so the ranking does
        # not depend on the sort algorithm's internals.
        df = df.sort_values(
            "Score", ascending=False, kind="mergesort"
        ).reset_index(drop=True)
        first = df.iloc[0]
        second = df.iloc[1] if len(df) > 1 else None
        return first, second

# Filter instance used when running all races.
uf = UnityFilter()

In [None]:
# === Step 7: Run all races and show grid ===
# Collect one summary row per race that produced at least one ranked runner.
results = []
for i, df in enumerate(race_dfs, start=1):
    # Races with no parsed runners are skipped; "Race #" still reflects the
    # position in race_dfs, so numbers may have gaps.
    if df.empty:
        continue
    top1, top2 = uf.top2(df)
    if top1 is not None:
        results.append({
            "Race #": i,
            "🥇 1st": top1["name"],
            "Score1": top1["Score"],
            # A one-runner race has no second pick; both cells are left blank.
            "🥈 2nd": top2["name"] if top2 is not None else "",
            "Score2": top2["Score"] if top2 is not None else ""
        })

results_df = pd.DataFrame(results)
# Bare expression on the last line so the notebook renders the grid.
results_df

---
### Notes
- pdfplumber is more reliable for ATR form PDFs than Camelot.
- Trainer/OR columns are placeholders; they can be expanded later.
- Replace `score_runner` with your full Unity logic when ready.