
# PureUnityFilterCorrected — Colab Runner (EW Guardrails, Top 2 Only)

**What this notebook does**  
1. Ask you to **upload an ATR PDF racecard** (the "Form Printouts" PDF).  
2. Parse the PDF into races and runners (based on the v4 parsing approach we validated).  
3. Score every runner with **PureUnityFilterCorrected** (REL + MAP + CSI + TPI).  
4. Enforce **EW guardrails** (only races with 7–15 runners).  
5. Output **exactly 2 selections per qualifying race** (🥇 / 🥈).  
6. Save a summary CSV you can download.

> Tip: If any runner fields can't be parsed (rare), they default conservatively or get flagged in the output so you can correct upstream.


In [None]:

# Detect whether this code is running inside Google Colab by attempting to
# import `google.colab`. IN_COLAB is consulted further down to decide whether
# the upload/download helpers from `google.colab.files` can be used.
# (The pip install that follows is meant for Colab; running locally you can skip it.)
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    # Any failure to import is treated as "not in Colab".
    IN_COLAB = False

# Core deps
!pip -q install pdfplumber==0.11.4 tabulate==0.9.0

import io
import re
import json
import pdfplumber
import pandas as pd
from dataclasses import dataclass, asdict
from typing import List, Dict, Any, Tuple, Optional

if IN_COLAB:
    from google.colab import files  # for upload/download

print("Environment ready. In Colab:", IN_COLAB)


In [None]:

# Obtain the ATR Form Printouts PDF as raw bytes.
# Both names are initialised up front so they exist even if the cell aborts.
pdf_name = None
pdf_bytes = None

if not IN_COLAB:
    # Local runs are not wired up. To read a local file instead, replace the
    # raise below with something like:
    #   pdf_name = "20250922hamallcardsatrformracecard.pdf"
    #   with open(pdf_name, "rb") as f:
    #       pdf_bytes = f.read()
    raise RuntimeError("Not running in Colab. Please run this in Google Colab or set a local path.")

print("📄 Please upload your ATR 'Form Printouts' PDF...")
up = files.upload()
if not up:
    raise RuntimeError("No file uploaded.")

# Only the first uploaded file is used.
pdf_name = next(iter(up))
pdf_bytes = up[pdf_name]
print("Received:", pdf_name)


In [None]:

# -----------------------------
# ATR PDF Parsing (v4-style)
# -----------------------------

# Race header, anchored at the start of a line (MULTILINE), of the form
# "(R<race_no>) <time> <Course>":
#   race_no : the digits following "(R"
#   time    : H:MM or HH:MM
#   course  : a single word starting with a capital letter; for a multi-word
#             course name only the first word is captured
RACE_HEADER_RE = re.compile(
    r"^\(R(?P<race_no>\d+)\)\s+(?P<time>\d{1,2}:\d{2})\s+(?P<course>[A-Z][A-Za-z]+)",
    re.MULTILINE
)

# Horse lines look like:
# "1 (5) 022548 YOUNG FIRE (FR) 14 D"
# Followed by weight/jockey/trainer/OR on same or next lines; we only need: stall, form, name, age, OR, trainer, jockey, CD.
#
# Named groups captured from the head of a runner entry:
#   no    : leading racecard number
#   stall : digits inside the parentheses
#   form  : run of capitals, digits and hyphens
#   name  : upper-case name (letters, spaces and ' - ( ) . & /), matched lazily
#   age   : the first whitespace-separated number that follows the name
# NOTE(review): in the sample line above the number after the name is 14 —
# confirm that this token really is the horse's age in the ATR layout.
HORSE_HEAD_RE = re.compile(
    r"^\s*(?P<no>\d+)\s*\((?P<stall>\d+)\)\s*(?P<form>[A-Z0-9FURPD\-]+)\s+(?P<name>[A-Z][A-Z '\-().&/]+?)\s+(?P<age>\d+)\b",
    re.MULTILINE
)

# OR is typically at the end of a block line, so OR_RE matches a 2-3 digit
# number only when no further word starts after it on the same line.
# (The lookahead previously contained `\\b` inside a raw string, i.e. a literal
# backslash, so it never rejected anything.)
OR_RE = re.compile(r"\b(\d{2,3})\b(?![^\n]*\b\w)")
# Stand-alone course/distance markers: "CD", "C" or "D".
CD_RE = re.compile(r"\b(CD|C|D)\b")

# Minimal dataclasses for structure
@dataclass
class Horse:
    """One runner as extracted from a race block of the PDF text.

    Only `name` is required; the other fields fall back to None / empty
    string when nothing was parsed for them.
    """
    name: str  # horse name as captured from the runner's header line
    stall: Optional[int] = None  # draw: the number in parentheses on the header line
    form: str = ""  # raw form string (digits/letters/hyphens), oldest run first is assumed — TODO confirm
    or_rating: Optional[int] = None  # official rating, when one could be parsed
    age: Optional[int] = None  # number following the name on the header line
    trainer: str = ""  # best-effort trainer text; empty when not recognised
    jockey: str = ""  # best-effort jockey text; empty when not recognised
    cd: str = ""  # course/distance marker text (built from "C", "D", "CD" hits); empty if none

@dataclass
class Race:
    """One race parsed from the PDF: header details plus its runners."""
    race_no: str  # digits from the "(R<n>)" header, kept as a string
    time: str  # off time as printed in the header, e.g. "14:10"
    course: str  # course name from the header
    distance: str  # we'll best-effort from header line or set placeholder
    cls: str       # class level text snippet
    horses: List[Horse]  # runners parsed from this race's block, in document order

def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, pages separated by a blank line.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        The concatenated page texts. A page for which pdfplumber yields no
        text contributes an empty string.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        pages = [p.extract_text() or "" for p in pdf.pages]
    # Join with real newlines: the line-anchored (MULTILINE) regexes used by
    # the parser can only match a header at the top of a page if the page
    # boundary is an actual line break, not the literal characters "\n".
    return "\n\n".join(pages)

def split_races(full_text: str) -> List[Tuple[str, str]]:
    """Return list of (race_header_text, race_block_text).

    Each block starts at a RACE_HEADER_RE match and runs up to the next
    header, or to the end of the text for the last race. Text before the first
    header is discarded. An input without any header yields an empty list.
    """
    headers = list(RACE_HEADER_RE.finditer(full_text))
    starts = [m.start() for m in headers]
    # Every block ends where the next one begins; the last ends with the text.
    ends = starts[1:] + [len(full_text)]
    return [
        (m.group(0), full_text[start:end])
        for m, start, end in zip(headers, starts, ends)
    ]

def parse_race_header(header_line: str, block_text: str) -> Tuple[str, str, str, str, str]:
    """Extract race number, time, course, distance and class for one race.

    Args:
        header_line: Header text as matched by RACE_HEADER_RE.
        block_text: Full text of the race block. Only its first 8 lines are
            scanned for the distance and class lines.

    Returns:
        A 5-tuple ``(race_no, time, course, distance, cls)``. ``course`` is
        upper-cased. ``distance`` is the text after the first comma of the
        distance line, or ``"distance-unknown"``. ``cls`` is the stripped line
        that mentions "Class"/"class", or ``"class-unknown"``.

    Raises:
        ValueError: If ``header_line`` does not match RACE_HEADER_RE.
    """
    m = RACE_HEADER_RE.search(header_line)
    if m is None:
        raise ValueError(f"Not a race header line: {header_line!r}")
    race_no = m.group("race_no")
    time = m.group("time")
    course = m.group("course").upper()

    # Best-effort distance/class extraction from the first lines of the block.
    # Look for a line like "HAMILTON, 1m 68y" and a line with "(Class X)".
    dist_line = ""
    cls_line = ""
    for ln in block_text.strip().splitlines()[:8]:  # look near the top
        # A comma plus any of the unit letters f / y / m marks a candidate
        # distance line (this also covers "km", "yd" and "y ").
        if "," in ln and any(unit in ln for unit in ("f", "y", "m")):
            dist_line = ln.strip()
        if "Class" in ln or "class" in ln:
            cls_line = ln.strip()
        if dist_line and cls_line:
            break

    # Compress distance: take text after the comma, else fall back.
    if "," in dist_line:
        distance = dist_line.split(",", 1)[1].strip()
    else:
        distance = dist_line or "distance-unknown"
    cls = cls_line or "class-unknown"

    return race_no, time, course, distance, cls

# Patterns used by parse_horses, compiled once. They must be raw strings with
# single backslashes: a doubled backslash in a raw string matches a literal
# backslash and would make every one of these searches fail.
_OR_TOKEN_RE = re.compile(r"\b(\d{2,3})\b")
_CD_TOKEN_RE = re.compile(r"\b(?:CD|C|D)\b")
_JOCKEY_LINE_RE = re.compile(r"\b[A-Z][a-z]+\s+[A-Z][a-z]+\s*\(\d\)")
_TRAINER_LINE_RE = re.compile(r"[A-Z][a-z].*\bO'\w+|[A-Z][a-z]+\s+[A-Z][a-z]+")


def parse_horses(block_text: str) -> List[Horse]:
    """Parse every runner found in one race block.

    A runner starts at each HORSE_HEAD_RE match; the text up to the next match
    (or the end of the block) is scanned for the remaining fields using
    best-effort heuristics.

    Args:
        block_text: Text of a single race block.

    Returns:
        One Horse per header match, in document order. Fields that cannot be
        recognised are left as None / empty string.
    """
    horses: List[Horse] = []
    head_matches = list(HORSE_HEAD_RE.finditer(block_text))
    for i, hm in enumerate(head_matches):
        start = hm.start()
        end = head_matches[i + 1].start() if i + 1 < len(head_matches) else len(block_text)
        chunk = block_text[start:end]
        # Text after the header tokens. The number, stall, form and age already
        # consumed by the header regex must not be mistaken for an OR or a
        # C/D marker.
        tail = block_text[hm.end():end]

        stall = int(hm.group("stall"))
        form = hm.group("form")
        name = hm.group("name").strip()
        age = int(hm.group("age"))

        # OR heuristic: the last stand-alone 2-3 digit number of the entry.
        nums = _OR_TOKEN_RE.findall(tail)
        or_rating = int(nums[-1]) if nums else None

        # CD markers: distinct stand-alone "C", "D" or "CD" tokens. They are
        # joined with a space so that separate "C" and "D" hits do not fuse
        # into a spurious "CD" (course-and-distance) marker.
        cd_hits = set(_CD_TOKEN_RE.findall(tail))
        cd = " ".join(sorted(cd_hits))

        # Trainer and jockey: best-effort, left blank if not trivially present.
        # Often appear as lines like "... Jockey Name (claim)" then "Trainer Name".
        trainer = ""
        jockey = ""
        for ln in chunk.splitlines()[1:5]:
            stripped = ln.strip()
            # "Firstname Lastname (n)" with a single-digit claim looks like a jockey.
            if not jockey and _JOCKEY_LINE_RE.search(ln):
                jockey = stripped
            # A capitalised two-word name, or a name containing an O' surname,
            # is accepted as the trainer unless it is the jockey line itself.
            if not trainer and _TRAINER_LINE_RE.search(ln) and stripped != jockey:
                trainer = stripped

        horses.append(Horse(
            name=name,
            stall=stall,
            form=form,
            or_rating=or_rating,
            age=age,
            trainer=trainer,
            jockey=jockey,
            cd=cd,
        ))
    return horses

def parse_pdf_to_races(pdf_bytes: bytes) -> List[Race]:
    """Turn the raw bytes of a racecard PDF into a list of Race objects.

    The PDF text is extracted, cut into per-race blocks, and each block is
    parsed for its header details and runners. Blocks in which no runner could
    be parsed are dropped.
    """
    parsed: List[Race] = []
    for header, body in split_races(extract_text_from_pdf(pdf_bytes)):
        race_no, time, course, distance, cls = parse_race_header(header, body)
        runners = parse_horses(body)
        if not runners:
            # Nothing usable in this block; skip it.
            continue
        parsed.append(Race(race_no, time, course, distance, cls, runners))
    return parsed

print("Parser functions loaded.")


In [None]:

# -----------------------------
# PureUnityFilterCorrected (EW guardrails, Top 2 only)
# -----------------------------
class PureUnityFilterCorrected:
    """Score the runners of a race and pick the top two each-way selections.

    Each runner gets four component scores:
      * REL - recent-form score (``calculate_corrected_rel``)
      * MAP - draw/stall score (``calculate_pure_map``)
      * CSI - trainer + jockey score (``calculate_pure_csi``)
      * TPI - rating / course-distance / age / class score (``calculate_pure_tpi``)

    ``primary = REL + MAP + CSI`` is the main ranking key; TPI breaks ties and
    is added to give ``total``.
    """

    def __init__(self):
        # Lower-cased name fragments -> score. Matching is by substring.
        self.trainer_scores = {
            "aidan o'brien": 4.0, "a p o'brien": 4.0, "joseph patrick o'brien": 3.8,
            "donnacha o'brien": 3.5, "dermot weld": 3.2, "jessica harrington": 2.8,
            "william haggas": 3.5, "john gosden": 3.5, "j & t gosden": 3.5,
            "charlie appleby": 3.2, "sir michael stoute": 3.0, "roger varian": 2.6,
            "andrew balding": 2.4, "ralph beckett": 2.2,
            "tim easterby": 2.8, "richard fahey": 2.6, "david o'meara": 2.2,
            "michael dods": 2.0, "kevin ryan": 1.8, "john quinn": 1.6
        }
        self.jockey_scores = {
            "ryan moore": 3.0, "william buick": 2.8, "frankie dettori": 2.8,
            "james doyle": 2.4, "tom marquand": 2.4, "oisin murphy": 2.6,
            "jim crowley": 2.2, "silvestre de sousa": 2.2, "rossa ryan": 2.0,
            "daniel tudhope": 2.4, "colin keane": 2.4, "seamie heffernan": 2.8,
            "wayne lordan": 2.2
        }

    def parse_form_corrected(self, form_str):
        """Convert a form string into a list of finishing positions.

        Digits map to themselves except ``0``, which counts as 10. The letters
        F, U, R, P and D (any case) count as 15. Every other character, such as
        ``-`` or ``/``, is ignored. An empty or None input gives ``[]``.
        """
        if not form_str:
            return []
        positions = []
        for ch in form_str:
            if ch.isdigit():
                pos = int(ch)
                positions.append(10 if pos == 0 else pos)
            elif ch.upper() in 'FURPD':
                positions.append(15)
        return positions

    def calculate_corrected_rel(self, form, or_rating, age):
        """Return the REL (form) score, always within 1.0-4.0.

        Args:
            form: Positions as produced by ``parse_form_corrected``.
            or_rating: Official rating, or None when unknown.
            age: Horse age, or None when unknown (no age adjustment is applied).

        Returns:
            The score rounded to two decimals; 1.0 when ``form`` is empty.
        """
        if not form:
            return 1.0

        # Weighted points over the last eight runs; later entries weigh more.
        weights = [1.0, 1.2, 1.5, 1.8, 2.2, 2.8, 3.2, 3.5]
        score = 0
        for w, pos in zip(weights, form[-8:]):
            if pos == 1:
                points = 8.0 * w
            elif pos == 2:
                points = 5.5 * w
            elif pos == 3:
                points = 4.0 * w
            elif pos <= 6:
                points = 2.0 * w
            elif pos <= 10:
                points = 0.5 * w
            else:
                points = 0
            score += points

        # Consistency over the last twelve runs.
        extended = form[-12:]
        wins = sum(1 for p in extended if p == 1)
        places = sum(1 for p in extended if p <= 3)
        frames = sum(1 for p in extended if p <= 6)
        consistency = (wins * 3 + places * 2 + frames) / len(extended) if extended else 0

        # Momentum: average position of runs -6..-4 versus the last three runs.
        momentum = 0.0
        if len(form) >= 6:
            early = form[-6:-3]
            recent3 = form[-3:]
            if early and recent3:
                imp = (sum(early) / len(early)) - (sum(recent3) / len(recent3))
                if imp >= 3:
                    momentum = 2
                elif imp >= 1.5:
                    momentum = 1
                elif imp >= 0.5:
                    momentum = 0.5
                elif imp <= -3:
                    momentum = -2
                elif imp <= -1.5:
                    momentum = -1
                elif imp <= -0.5:
                    momentum = -0.5

        base_rel = min(4.0, max(1.0, (score / 25) + consistency + momentum))

        # Age adjustment. An unknown age (None) is left unadjusted rather than
        # raising TypeError on the ``>=`` comparison.
        if age == 3:
            base_rel *= 1.15
        elif age == 4:
            base_rel *= 1.10
        elif age is not None and age >= 8:
            base_rel *= 0.90

        # Rating adjustment; skipped when the rating is unknown.
        if or_rating and or_rating >= 85:
            base_rel *= 0.95
        elif or_rating and or_rating <= 55:
            base_rel *= 1.05

        return round(max(1.0, min(4.0, base_rel)), 2)

    def calculate_pure_map(self, stall, field_size, distance, track):
        """Return the MAP (draw) score, rounded to one decimal.

        Stalls in the lowest 30% of the field score 0.8, those in the highest
        30% score -0.6, and the rest 0.1. The score is scaled by 1.3 for fields
        of 14+ and by 0.7 for fields of 8 or fewer. An unknown stall scores
        0.0. ``distance`` and ``track`` are accepted but not used.
        """
        if stall is None:
            draw_score = 0.0
        else:
            if stall <= field_size * 0.3:
                draw_score = 0.8
            elif stall >= field_size * 0.7:
                draw_score = -0.6
            else:
                draw_score = 0.1
            if field_size >= 14:
                draw_score *= 1.3
            elif field_size <= 8:
                draw_score *= 0.7
        return round(draw_score, 1)

    def calculate_pure_csi(self, trainer, jockey, form):
        """Return the CSI (connections) score, capped at 10.0.

        Adds the score of the first known trainer and of the first known jockey
        whose table key occurs in the lower-cased text. ``form`` is accepted
        but not used.
        """
        csi = 0.0
        trainer_key = (trainer or "").lower()
        for t, s in self.trainer_scores.items():
            if t in trainer_key:
                csi += s
                break
        jockey_key = (jockey or "").lower()
        for j, s in self.jockey_scores.items():
            if j in jockey_key:
                csi += s
                break
        return round(min(csi, 10.0), 1)

    def calculate_pure_tpi(self, or_rating, cd_markers, age, class_level):
        """Return the TPI score, capped at 12.

        Points: 1-5 by rating band (60+ up to 100+); 3 for a "CD" marker, 2 for
        two or more C/D letters, 1 for a single one; 1 for ages 4-6; 2 for a
        class text containing "group", otherwise 1 for "listed".
        """
        tpi = 0
        if or_rating is not None:
            if or_rating >= 100:
                tpi += 5
            elif or_rating >= 90:
                tpi += 4
            elif or_rating >= 80:
                tpi += 3
            elif or_rating >= 70:
                tpi += 2
            elif or_rating >= 60:
                tpi += 1
        if cd_markers:
            c = cd_markers.upper()
            if 'CD' in c:
                tpi += 3
            elif c.count('C') + c.count('D') >= 2:
                tpi += 2
            elif 'C' in c or 'D' in c:
                tpi += 1
        if age and 4 <= age <= 6:
            tpi += 1
        if class_level and 'group' in class_level.lower():
            tpi += 2
        elif class_level and 'listed' in class_level.lower():
            tpi += 1
        return min(tpi, 12)

    def process_race(self, race_dict):
        """Apply the EW guardrails and return the top two runners of a race.

        Args:
            race_dict: Mapping with a ``'horses'`` list of runner dicts (keys
                read: name, stall, form, or, age, trainer, jockey, cd) and
                optional ``'distance'``, ``'course'`` and ``'cls'`` entries.

        Returns:
            For fields outside 7-15 runners:
            ``{'qualifies': False, 'reason': str, 'selections': []}``.
            Otherwise ``{'qualifies': True, 'places': int, 'field_size': int,
            'selections': [top, second]}``, where each selection holds the
            runner's name, stall and component scores.
        """
        horses = race_dict['horses']
        field_size = len(horses)

        # EW guardrails
        if field_size < 7:
            return {
                'qualifies': False,
                'reason': f'No Bet — only {field_size} runners (<7)',
                'selections': []
            }
        if field_size > 15:
            return {
                'qualifies': False,
                'reason': f'No Bet — {field_size} runners (>15)',
                'selections': []
            }

        places = 2 if 7 <= field_size <= 11 else 3

        processed = []
        for h in horses:
            form = self.parse_form_corrected(h.get('form', ''))
            # NOTE: .get() defaults only apply when the key is missing. A key
            # present with value None is passed through, and the calculators
            # treat None as "unknown".
            rel = self.calculate_corrected_rel(form, h.get('or', 60), h.get('age', 5))
            map_score = self.calculate_pure_map(
                h.get('stall'), field_size,
                race_dict.get('distance', ''), race_dict.get('course', '')
            )
            csi = self.calculate_pure_csi(h.get('trainer', ''), h.get('jockey', ''), form)
            tpi = self.calculate_pure_tpi(
                h.get('or', 60), h.get('cd', ''), h.get('age', 5), race_dict.get('cls', '')
            )
            primary = rel + map_score + csi
            total = primary + tpi
            processed.append({
                'name': h['name'], 'stall': h.get('stall'),
                'rel': rel, 'map': map_score, 'csi': csi, 'tpi': tpi,
                'primary': round(primary, 1), 'total': round(total, 1)
            })

        # Highest primary first, then TPI, then total. The sort is stable, so
        # full ties keep racecard order.
        processed.sort(key=lambda x: (-x['primary'], -x['tpi'], -x['total']))

        return {
            'qualifies': True,
            'places': places,
            'field_size': field_size,
            'selections': processed[:2]
        }

print("PureUnityFilterCorrected loaded.")


In [None]:

# -----------------------------
# Run end-to-end
# -----------------------------

# 1) Parse PDF into races
races = parse_pdf_to_races(pdf_bytes)
if not races:
    raise RuntimeError("No races parsed. Please check that this is an ATR 'Form Printouts' PDF.")

# 2) Convert to simple dicts for the scorer.
#    Note the key is 'or' (not 'or_rating'): that is what process_race reads.
race_dicts = []
for r in races:
    race_dicts.append({
        'race_no': r.race_no,
        'time': r.time,
        'course': r.course,
        'distance': r.distance,
        'cls': r.cls,
        'horses': [{
            'name': h.name,
            'stall': h.stall,
            'form': h.form,
            'or': h.or_rating,
            'age': h.age,
            'trainer': h.trainer,
            'jockey': h.jockey,
            'cd': h.cd
        } for h in r.horses]
    })

# 3) Score with PureUnityFilterCorrected; one summary row per qualifying race
puf = PureUnityFilterCorrected()
rows = []

print("=== Selections (EW races only) ===")
for rd in race_dicts:
    out = puf.process_race(rd)
    if out['qualifies']:
        # A qualifying race has at least 7 runners, so there are always two selections.
        s1, s2 = out['selections']
        print(f"{rd['time']} — {rd['course']}  ({out['field_size']} runners, {out['places']} places)")
        print(f"  🥇 {s1['name']}  | Primary={s1['primary']}  Total={s1['total']}")
        print(f"  🥈 {s2['name']}  | Primary={s2['primary']}  Total={s2['total']}")
        rows.append({
            'time': rd['time'], 'course': rd['course'], 'runners': out['field_size'],
            'places': out['places'],
            'gold': s1['name'], 'gold_primary': s1['primary'], 'gold_total': s1['total'],
            'silver': s2['name'], 'silver_primary': s2['primary'], 'silver_total': s2['total']
        })
    else:
        print(f"{rd['time']} — {rd['course']}  :: {out['reason']}")

# 4) Save a CSV summary (written even when no race qualified)
df = pd.DataFrame(rows)
csv_path = "pureunity_selections.csv"
df.to_csv(csv_path, index=False)
# A real newline here; the doubled backslash printed a literal "\n".
print("\nSaved:", csv_path)

# If in Colab, auto-download (only when there is something to download)
if IN_COLAB and len(rows):
    files.download(csv_path)


In [None]:

# Optional audit: show the horses parsed for the first race.
# The table is printed explicitly: a notebook only auto-displays an expression
# that is the last top-level statement of a cell, so a bare DataFrame
# expression inside `try:` would show nothing.
try:
    first = race_dicts[0]
    print(pd.DataFrame(first['horses']).to_string(index=False))
except Exception as e:
    # Best-effort only: never let the audit break the run.
    print("Audit display skipped:", e)
