# 🏇 Unity Filter — ATR PDF → Top‑2 per Race (Colab)

**What this notebook does**
1. Installs dependencies (Ghostscript, Camelot, pandas, pdfplumber, tabula-py).
2. Lets you upload an ATR racecard PDF from your device.
3. Parses the PDF into tables.
4. Runs a deterministic Unity filter wrapper.
5. Prints a clean Top‑2 selections grid for each race.

> Tip: If parsing finds 0 tables, switch `PARSING_FLAVOR` to `"lattice"` in the setup cell.

In [None]:
# === Step 1: Install dependencies (run once per session) ===
# System package installed through apt; stdout is discarded to keep the cell quiet.
!apt-get -y install ghostscript > /dev/null
# Python packages; only stdout is redirected, so anything pip writes to stderr still shows.
!pip -q install pandas camelot-py[cv] pdfplumber tabula-py > /dev/null

import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Switch between 'stream' (text-based tables) and 'lattice' (gridlines).
# This value is passed to parse_pdf_tables() as its `flavor` argument in Step 3.
PARSING_FLAVOR = "stream"  # change to "lattice" if 0 tables found

In [None]:
# === Step 2: Upload your ATR PDF ===
from google.colab import files
uploaded = files.upload()

# An empty result (for example a cancelled upload dialog) would otherwise
# surface as a bare StopIteration from next(); fail with a readable message.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and choose an ATR racecard PDF."
    )

# Only the first uploaded file is used; its name doubles as the path to parse.
pdf_path = next(iter(uploaded))
print("Uploaded:", pdf_path)

In [None]:
# === Step 3: Parse the PDF into tables ===
import camelot
import pandas as pd

def parse_pdf_tables(path, flavor="stream"):
    """Extract every table Camelot can find in the PDF at *path*.

    All pages are scanned with the given Camelot *flavor*. The result is a
    list with one DataFrame per detected table. Parsing is best-effort: any
    exception is reported on stdout and an empty list is returned instead.
    """
    try:
        found = camelot.read_pdf(path, pages="all", flavor=flavor)
        print(f"[Camelot] Found {len(found)} tables using flavor='{flavor}'")
        frames = []
        for table in found:
            frames.append(table.df)
    except Exception as err:
        print("Camelot failed:", err)
        return []
    return frames

tables = parse_pdf_tables(pdf_path, flavor=PARSING_FLAVOR)
if tables:
    # Show a preview of the first table, wrapping it in a DataFrame first
    # if it does not already offer .head().
    from IPython.display import display
    first_table = tables[0]
    if not hasattr(first_table, "head"):
        first_table = pd.DataFrame(first_table)
    display(first_table.head())
else:
    print("No tables found. Try setting PARSING_FLAVOR = 'lattice' and re-run the previous cells.")

In [None]:
# === Step 4: Column Mapping Helper ===
# ATR layouts can vary, so columns are mapped heuristically; if a card is mapped wrongly, adjust map_columns below.
import re
import numpy as np
import pandas as pd

def looks_like_form(s):
    """Return True when *s* looks like a compact racing form string.

    A form string here is 1-10 characters consisting solely of digits, the
    letters F/U/R/P (any case) and hyphens, e.g. "321F6" or "12-3P".
    Surrounding whitespace is ignored; any value is accepted and converted
    with ``str`` first.
    """
    s = str(s).strip()
    # fullmatch, not search: a search accepts any short text that merely
    # contains one of these characters (e.g. "Mr Pup"), so almost every
    # text column would score as "form".
    return len(s) <= 10 and re.fullmatch(r"[0-9FURP\-]+", s, re.I) is not None

def coerce_number(x):
    """Convert *x* to a float, returning NaN when it is not numeric.

    The value is stringified and stripped before conversion, so padded
    strings such as " 3.5 " are accepted.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        # Only conversion failures mean "not a number"; a bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        return np.nan

def _share_in_range(series, low, high):
    """Return the fraction of *series* that parses as a number in [low, high]."""
    vals = pd.to_numeric(series, errors="coerce")
    return ((vals >= low) & (vals <= high)).mean()

def map_columns(df):
    """Heuristically map a raw parsed table onto the runner schema.

    The returned DataFrame has the columns ``name``, ``form``, ``trainer``,
    ``age`` and ``or_rating``. Source columns are chosen as follows:

      - name: the column with the longest median string length
      - form: the column with the highest rate of ``looks_like_form`` hits
      - trainer: among the columns left, the longest median string length,
        ties going to the right-most column
      - age: among the columns left, the highest share of values in 2-15
      - or_rating: among the columns not used for trainer/age, the highest
        share of values in 35-125; falls back to the age column when no
        other column is left

    Raises:
        ValueError: if the table has no columns, or no column remains once
            name and form have been assigned.
    """
    df2 = df.copy()
    df2.columns = list(range(len(df2.columns)))  # index columns numerically

    if len(df2.columns) == 0:
        raise ValueError("Cannot map columns: the table has no columns.")

    def median_len(c):
        return df2[c].astype(str).str.len().median()

    # Name column: choose the text-heavy column with longest median length
    name_col = max(df2.columns, key=median_len)

    # Form column: column with highest rate of "looks_like_form"
    form_col = max(df2.columns, key=lambda c: df2[c].apply(looks_like_form).mean())

    remaining = [c for c in df2.columns if c not in (name_col, form_col)]
    if not remaining:
        # Without this guard the max() calls below fail with an opaque
        # "max() arg is an empty sequence".
        raise ValueError(
            "Cannot map columns: the table needs at least one column besides "
            f"name and form (got {len(df2.columns)} column(s))."
        )

    # Trainer column: longest text among the rest, towards the right on ties
    trainer_col = max(remaining, key=lambda c: (median_len(c), c))

    # Age column: numeric-ish small integers (2-15 covers typical ages)
    age_col = max(remaining, key=lambda c: _share_in_range(df2[c], 2, 15))

    # OR rating: numeric-ish between 35 and 125, from columns not yet used
    or_candidates = [c for c in remaining if c not in (trainer_col, age_col)]
    if or_candidates:
        or_col = max(or_candidates, key=lambda c: _share_in_range(df2[c], 35, 125))
    else:
        or_col = age_col

    mapped = pd.DataFrame({
        "name": df2[name_col].astype(str).str.strip(),
        "form": df2[form_col].astype(str).str.strip(),
        "trainer": df2[trainer_col].astype(str).str.strip(),
        "age": pd.to_numeric(df2[age_col], errors="coerce"),
        "or_rating": pd.to_numeric(df2[or_col], errors="coerce"),
    })

    # Basic cleanups: treat empty cells and dash placeholders as missing
    mapped = mapped.replace({"": pd.NA, "—": pd.NA, "-": pd.NA})
    return mapped

# Quick test mapping on first table if available.
# This is only a preview: a malformed first table must not abort a "Run all",
# since Step 6 already tolerates per-table failures. Report and carry on.
if tables:
    try:
        test_mapped = map_columns(tables[0])
    except Exception as e:
        print(f"Preview mapping failed for the first table -> {e}")
    else:
        from IPython.display import display
        display(test_mapped.head())

In [None]:
# === Step 5: Deterministic Unity Filter Wrapper ===
import pandas as pd

class UnityFilter:
    """Minimal deterministic scorer that ranks the runners of one race.

    A runner's score is the sum of three parts: points for each character of
    its form string, a flat bonus for listed trainers, and its OR rating
    divided by 100.
    """

    # Points awarded per form character; F/U/R/P are handled separately.
    _FORM_POINTS = {"1": 3, "2": 2, "3": 1}

    def __init__(self):
        # Minimal, deterministic example. Replace with your full logic if desired.
        # Keys are lower-case trainer names; values are added to the score as-is.
        self.trainer_scores = {
            "aidan o'brien": 4.0,
            "a p o'brien": 4.0,
            "joseph patrick o'brien": 3.8,
            "donnacha o'brien": 3.5,
            "dermot weld": 3.2,
            "jessica harrington": 2.8,
            "william haggas": 3.5,
            "john gosden": 3.5,
            "j & t gosden": 3.5,
            "charlie appleby": 3.2,
            "sir michael stoute": 3.0,
        }

    def score_runner(self, row: pd.Series) -> float:
        """Return the score for one runner.

        Args:
            row: a mapping-like row offering ``.get``; the keys ``form``,
                ``trainer`` and ``or_rating`` are read and each may be
                missing.

        Returns:
            The runner's score as a float. A missing, non-numeric or
            non-finite ``or_rating`` contributes nothing.
        """
        score = 0.0
        # REL-ish scoring from form string (simple and deterministic)
        for ch in str(row.get("form", "")):
            if ch in self._FORM_POINTS:
                score += self._FORM_POINTS[ch]
            elif ch.upper() in "FURP":
                score -= 1

        # Trainer influence (optional)
        t = str(row.get("trainer", "")).lower().strip()
        score += self.trainer_scores.get(t, 0)

        # OR rating influence (optional)
        try:
            rating = float(row.get("or_rating", 0))
        except (TypeError, ValueError):
            rating = 0.0
        # float(NaN) does not raise, and adding NaN would turn the whole
        # score into NaN (sorted last regardless of form). NaN and +/-inf
        # both fail this chained comparison, so only finite ratings count.
        if float("-inf") < rating < float("inf"):
            score += rating / 100.0

        return score

    def score_race(self, race_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *race_df* with a ``Score`` column, best runner first.

        The input frame is not modified and the result has a fresh 0..n-1
        index. A stable sort is used, so runners with equal scores keep
        their original relative order.
        """
        df = race_df.copy()
        df["Score"] = df.apply(self.score_runner, axis=1)
        return df.sort_values(
            "Score", ascending=False, kind="mergesort"
        ).reset_index(drop=True)

    def top2(self, race_df: pd.DataFrame):
        """Return the two highest-scoring runners as a ``(first, second)`` pair of rows.

        Raises:
            IndexError: if *race_df* holds fewer than two runners.
        """
        scored = self.score_race(race_df)
        return scored.iloc[0], scored.iloc[1]

# Scorer instance used by the race loop in Step 6.
uf = UnityFilter()

In [None]:
# === Step 6: Run on all races and print a grid ===
import pandas as pd

# One summary dict per race that yields at least two usable runners.
results = []
for race_no, raw_table in enumerate(tables, start=1):
    try:
        race_df = map_columns(raw_table)
        # basic cleanup: keep only rows whose name is present and non-empty
        has_name = race_df["name"].str.len() > 0
        race_df = race_df[has_name].dropna(subset=["name"])

        if len(race_df) >= 2:
            best, runner_up = uf.top2(race_df)
            results.append({
                "Race #": race_no,
                "🥇 1st": best["name"],
                "Score1": round(float(best["Score"]), 3),
                "🥈 2nd": runner_up["name"],
                "Score2": round(float(runner_up["Score"]), 3),
            })
        else:
            print(f"Race {race_no}: not enough runners after mapping.")
    except Exception as err:
        # A table that cannot be mapped or scored is reported and skipped
        # so the remaining races are still processed.
        print(f"Race {race_no}: parsing/scoring issue -> {err}")

grid = pd.DataFrame(results, columns=["Race #","🥇 1st","Score1","🥈 2nd","Score2"])
if grid.empty:
    print("No race results to display. Check parsing flavor or column mapping.")
else:
    from IPython.display import display
    display(grid)

---
### Notes & Tweaks
- If Camelot finds 0 tables, switch `PARSING_FLAVOR` to `"lattice"` in the setup cell and re-run.
- If column mapping is wrong for a specific card, open the **Column Mapping Helper** cell and adjust the `map_columns` logic.
- The Unity filter shown is a **minimal deterministic example**. You can replace the scoring in `score_runner` with your full Unity logic.