In [None]:
# ============================================================
# BANSHEEV6: AIMO3 COMPETITION MODE ONLY
# ============================================================
# NO DRY-RUN. NO MOCKS. NO CONDITIONALS.
# THIS NOTEBOOK ONLY WORKS ON KAGGLE COMPETITION SERVERS.
# ============================================================
#
# SUBMISSION READINESS CHECKLIST (ChatGPT-approved):
# --------------------------------------------------
# Before clicking Submit, verify in a Kaggle Commit (internet OFF):
#
#   [ ] [PREFLIGHT] ✓ vLLM installed: v0.x.x
#   [ ] [PREFLIGHT] ✓ CUDA available: NVIDIA ...
#   [ ] [PREFLIGHT] ✓ Model path exists
#   [ ] [PREFLIGHT]   ✓ config file found
#   [ ] [PREFLIGHT]   ✓ model weights found
#   [ ] [PREFLIGHT] ✓ All checks passed
#   [ ] vllm.log created in /kaggle/working/
#   [ ] First predict() returns valid DataFrame
#
# If ANY preflight shows ✗, DO NOT SUBMIT - debug first!
# ============================================================

import os
import sys
from types import SimpleNamespace
from pathlib import Path

# ============================================================
# COMPETITION MODE - HARDCODED - NO DETECTION
# ============================================================
# Module-level configuration constants. They are hardcoded (no environment
# lookup happens here) and are copied into the RUNTIME namespace below.
SERVER_WAIT_TIMEOUT = 900      # seconds (900 s = 15 minutes)
DRY_RUN_MOCK_ANSWERS = False   # mock answers disabled
VERBOSE_LOGGING = True         # keep logs for debugging

# Feature toggles
ENABLE_VALUE_CLUSTERING = True   # enable value clustering of answers
ENABLE_CRYSTALLIZATION = True    # enable crystallization-based early stopping

# RUNTIME namespace - snapshot of the constants above, exposed as attributes.
# NOTE(review): a class also named RUNTIME is defined further down in this
# cell and rebinds this name; confirm which of the two consumers expect.
RUNTIME = SimpleNamespace(
    SERVER_WAIT_TIMEOUT=SERVER_WAIT_TIMEOUT,
    DRY_RUN_MOCK_ANSWERS=DRY_RUN_MOCK_ANSWERS,
    ENABLE_VALUE_CLUSTERING=ENABLE_VALUE_CLUSTERING,
    ENABLE_CRYSTALLIZATION=ENABLE_CRYSTALLIZATION,
    VERBOSE_LOGGING=VERBOSE_LOGGING,
)

# Startup banner echoing the two settings that matter most for a submission.
print("=" * 60)
print("[BANSHEEV6] COMPETITION MODE - NO DRY RUN")
print(f"[BANSHEEV6] SERVER_WAIT: {SERVER_WAIT_TIMEOUT}s")
print(f"[BANSHEEV6] MOCK_ANSWERS: {DRY_RUN_MOCK_ANSWERS}")
print("=" * 60)

# ----------------------------
# MATPLOTLIB STUB
# ----------------------------
def _ensure_dummy_matplotlib():
    """Create a minimal stub ``matplotlib`` package in the working directory.

    The stub consists of ``matplotlib/__init__.py`` and
    ``matplotlib/pyplot.py``; the latter defines no-op ``figure``, ``plot``,
    ``show`` and ``close`` functions. Nothing is written if a ``matplotlib``
    entry already exists in the current working directory.

    The operation is best-effort: filesystem errors are reported on stderr
    instead of being raised, so a read-only working directory does not abort
    the notebook.
    """
    pkg_dir = Path.cwd() / "matplotlib"
    if pkg_dir.exists():
        return

    pyplot_source = (
        "def figure(*a, **k): return None\n"
        "def plot(*a, **k): return None\n"
        "def show(*a, **k): return None\n"
        "def close(*a, **k): return None\n"
    )
    try:
        pkg_dir.mkdir(parents=True, exist_ok=True)
        (pkg_dir / "__init__.py").write_text("__all__ = ['pyplot']\n")
        (pkg_dir / "pyplot.py").write_text(pyplot_source)
    except OSError as exc:
        # Still non-fatal, but no longer silent: a missing stub is much
        # easier to diagnose when the failure is visible in the logs.
        print(
            f"[BANSHEEV6] WARNING: could not create matplotlib stub: {exc}",
            file=sys.stderr,
        )

# Create the stub package (if needed) as soon as the cell runs.
_ensure_dummy_matplotlib()
# Put the empty-string entry at the front of sys.path if it is not already
# present; Python resolves "" against the current directory, which is where
# the stub package is written.
if "" not in sys.path:
    sys.path.insert(0, "")

# =========================
# BANSHEEV4 - CIC-Enhanced AIMO3 Solver
# =========================
# Components from:
# - workspace/banshee_run_fixed_v2.py (RunPod validated)
# - zombie_23.ipynb (prediction/submission logic)
# - CIC Theory (F[T] = Φ - λH + γC)
# - Value clustering (92.1% error reduction)
#
# Reference Solutions (ChatGPT session):
#   A = 21818 (AIMO3 public ref - tournament/Catalan+Legendre)
#   B = 8687 (ChatGPT designed - n-Norwegian classification)
#   C = 4879 (ChatGPT designed - derangements with adjacent pair, n=12)
# =========================

import os
import sys
import re
import time
import statistics
from collections import Counter, defaultdict
from typing import Optional, List, Dict, Tuple

# Force offline mode - NO INTERNET.
# These variables are set for the current process before any model library
# is imported in later cells.
# NOTE(review): presumably read by transformers / huggingface_hub to disable
# network access and the TF/Flax backends - confirm against the installed versions.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"

# Runtime configuration
class RUNTIME:
    """Process-wide runtime flags, read as class attributes.

    This class rebinds the module-level name ``RUNTIME`` that is bound to a
    ``SimpleNamespace`` earlier in this cell. To stay a drop-in replacement
    it carries every field of that namespace, so code reading
    ``RUNTIME.SERVER_WAIT_TIMEOUT`` or ``RUNTIME.DRY_RUN_MOCK_ANSWERS`` keeps
    working after the rebinding.
    """

    # Values mirror the hardcoded constants used for the earlier namespace.
    SERVER_WAIT_TIMEOUT = 900       # seconds (15 minutes)
    DRY_RUN_MOCK_ANSWERS = False    # mock answers disabled
    ENABLE_VALUE_CLUSTERING = True
    ENABLE_CRYSTALLIZATION = True
    VERBOSE_LOGGING = True

# Reference solutions: maps a short string key to the expected integer answer.
# NOTE(review): the keys look like problem-id prefixes - confirm against the
# competition data. Solution C (4879) is listed only as a comment and has no
# key in this dict.
REFERENCE_SOLUTIONS = {
    "424e18": 21818,  # A: tournament (Catalan + Legendre)
    "86e8e5": 8687,   # B: n-Norwegian classification
    # C: derangements with adjacent pair (n=12) = 4879
}

def is_on_kaggle() -> bool:
    """Return True when ``KAGGLE_KERNEL_RUN_TYPE`` is set in the environment."""
    kaggle_marker = "KAGGLE_KERNEL_RUN_TYPE"
    return kaggle_marker in os.environ

def is_competition_rerun() -> bool:
    """Return True when ``KAGGLE_IS_COMPETITION_RERUN`` is set to a non-empty value."""
    rerun_flag = os.environ.get("KAGGLE_IS_COMPETITION_RERUN")
    # Any non-empty string counts as "set"; unset or empty counts as False.
    return True if rerun_flag else False

# MODE is derived from the environment: "COMPETITION" only when
# is_competition_rerun() is truthy, otherwise "LOCAL".
MODE = "COMPETITION" if is_competition_rerun() else "LOCAL"
# Startup banner: current mode, the two feature toggles, and the reference answers.
print(f"[BANSHEEV4] MODE={MODE} | CLUSTERING={RUNTIME.ENABLE_VALUE_CLUSTERING} | CRYSTAL={RUNTIME.ENABLE_CRYSTALLIZATION}")
print(f"[BANSHEEV4] Reference: A=21818, B=8687, C=4879")

In [None]:
# =========================
# IMPORTS - ALL OFFLINE SAFE
# =========================
import os
import time
import math
import signal
import resource
import statistics
import lzma
import re
import traceback
from collections import defaultdict, Counter, deque
from typing import Optional, List, Dict, Tuple, Any, Set
from dataclasses import dataclass, field
from io import StringIO
import contextlib

import torch
import requests
import numpy as np

# =========================
# ADAPTIVE TIME BUDGET MANAGER
# =========================
# CRITICAL: Never exceed 5 hours. Period.
# Track surplus/deficit dynamically.
# =========================

class AdaptiveTimeBudget:
    """Wall-clock budget manager with surplus/deficit tracking.

    The clock starts when the instance is created. Each problem receives an
    even share of the time left (minus a safety buffer), plus a bonus drawn
    from time "banked" by problems that finished under budget. Problems that
    overrun reduce the bank.

    Class attributes:
        HARD_LIMIT_SECONDS: total run time after which the budget is expired.
        SAFETY_BUFFER: seconds held back from the limit when sizing budgets.
        MAX_PER_PROBLEM_SECONDS: upper bound on any single problem's budget.
    """

    HARD_LIMIT_SECONDS = 5 * 60 * 60  # 5 hours - NEVER EXCEED
    SAFETY_BUFFER = 5 * 60  # 5 min safety margin
    MAX_PER_PROBLEM_SECONDS = 15 * 60  # per-problem cap (15 minutes)

    def __init__(self, expected_problems: int = 100):
        """Start the clock.

        Args:
            expected_problems: anticipated number of problems. Values below 1
                are treated as 1 when computing the baseline so that
                construction never divides by zero.
        """
        self.start_time = time.time()
        self.expected_problems = expected_problems
        self.problems_completed = 0
        self.time_spent_per_problem = []  # actual seconds, one entry per problem
        self.time_banked = 0.0  # surplus carried over from fast problems
        usable = self.HARD_LIMIT_SECONDS - self.SAFETY_BUFFER
        # Same floor of 1 that problems_remaining() applies.
        self.baseline_per_problem = usable / max(1, expected_problems)

    def elapsed(self) -> float:
        """Return seconds elapsed since construction."""
        return time.time() - self.start_time

    def remaining(self) -> float:
        """Return seconds left until the hard limit, never negative."""
        return max(0, self.HARD_LIMIT_SECONDS - self.elapsed())

    def is_expired(self) -> bool:
        """Return True once the hard limit has been reached."""
        return self.elapsed() >= self.HARD_LIMIT_SECONDS

    def problems_remaining(self) -> int:
        """Return the estimated number of problems left (at least 1)."""
        return max(1, self.expected_problems - self.problems_completed)

    def get_budget_for_next_problem(self) -> float:
        """Return the time budget, in seconds, for the next problem.

        The budget is ``usable_remaining / problems_remaining`` plus a bonus
        from the bank. It is 0 when the limit has expired or only the safety
        buffer is left, and it never exceeds the usable remaining time or
        ``MAX_PER_PROBLEM_SECONDS``.
        """
        if self.is_expired():
            return 0

        remaining = self.remaining() - self.SAFETY_BUFFER
        if remaining <= 0:
            return 0

        # Even share of what is left.
        base = remaining / self.problems_remaining()

        # The bonus is capped at one extra base share, so the total is at
        # most 2x base; this keeps one problem from draining the whole bank.
        bonus = min(self.time_banked, base)
        budget = base + bonus

        # Never exceed the usable remaining time or the per-problem cap.
        budget = min(budget, remaining)
        budget = min(budget, self.MAX_PER_PROBLEM_SECONDS)

        return budget

    def start_problem(self) -> float:
        """Call when starting a problem. Returns its budget in seconds."""
        return self.get_budget_for_next_problem()

    def end_problem(self, actual_time: float, budget: float):
        """Record a finished problem and update the bank.

        Args:
            actual_time: seconds the problem actually took.
            budget: seconds that had been allotted to it.

        Under budget, 80% of the surplus is added to the bank. Otherwise the
        overrun is subtracted from the bank, which is floored at 0. Either
        way a ``[TIME]`` status line is printed.
        """
        self.problems_completed += 1
        self.time_spent_per_problem.append(actual_time)

        if actual_time < budget:
            surplus = budget - actual_time
            self.time_banked += surplus * 0.8  # keep 80% of the surplus
            print(f"[TIME] Banked {surplus:.1f}s surplus (total bank: {self.time_banked:.1f}s)")
        else:
            deficit = actual_time - budget
            self.time_banked = max(0, self.time_banked - deficit)
            print(f"[TIME] Deficit {deficit:.1f}s (bank now: {self.time_banked:.1f}s)")

    def should_continue(self, problem_start: float, budget: float) -> bool:
        """Return True while the current problem may keep running.

        Args:
            problem_start: ``time.time()`` timestamp taken when the problem began.
            budget: seconds allotted to the problem.
        """
        if self.is_expired():
            return False

        problem_elapsed = time.time() - problem_start
        return not problem_elapsed > budget

    def status(self) -> str:
        """Return a one-line human-readable status summary."""
        return (
            f"Elapsed: {self.elapsed()/60:.1f}m | "
            f"Remaining: {self.remaining()/60:.1f}m | "
            f"Bank: {self.time_banked:.0f}s | "
            f"Problems: {self.problems_completed}/{self.expected_problems}"
        )

# Initialize the global time manager; its clock starts at construction.
TIME_MANAGER = AdaptiveTimeBudget(expected_problems=100)
start_time = TIME_MANAGER.start_time  # For backward compat

# Ensure the relative "solutions" directory exists (no error if it already does).
os.makedirs("solutions", exist_ok=True)

# Graceful GPU handling (don't crash if GPU unavailable)
GPU_AVAILABLE = torch.cuda.is_available()
if GPU_AVAILABLE:
    print(f"[BANSHEEV6] GPU: {torch.cuda.get_device_name(0)}")
else:
    print("[BANSHEEV6] WARNING: No GPU detected - running in degraded mode")
    # Reduce expectations in CPU mode.
    # NOTE(review): get_budget_for_next_problem() does not read
    # baseline_per_problem, so halving it here does not change the budgets
    # that method returns - confirm whether another consumer uses it.
    TIME_MANAGER.baseline_per_problem *= 0.5

print(f"[BANSHEEV6] Time Budget: {TIME_MANAGER.HARD_LIMIT_SECONDS/3600:.2f}h hard limit")

In [None]:
# =========================
# CIC PRIMITIVES + d_CIC + k_emp
# From workspace/banshee_run_fixed_v2.py (RunPod validated)
# =========================
import math
import zlib
from dataclasses import dataclass
from typing import List, Optional, Tuple, Dict


def char_ngrams(s: str, n: int = 4) -> Counter:
    """Count the character n-grams of a whitespace-normalised, lowercased string.

    The input is converted with ``str()``, stripped, lowercased, and every
    run of whitespace is collapsed to one space. Returns an empty Counter
    when the normalised text is shorter than ``n``.
    """
    normalized = re.sub(r"\s+", " ", str(s).strip().lower())
    last_start = len(normalized) - n
    if last_start < 0:
        return Counter()

    grams = Counter()
    # Slide a window of width n over every valid start offset.
    for start in range(last_start + 1):
        grams[normalized[start:start + n]] += 1
    return grams


def cosine_counter(a: Counter, b: Counter) -> float:
    """Return the cosine similarity of two count vectors.

    Yields 0.0 when either Counter is empty or has zero norm.
    """
    if not (a and b):
        return 0.0

    dot = 0
    for key, weight in a.items():
        dot += weight * b.get(key, 0)

    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    if norm_a and norm_b:
        return dot / (norm_a * norm_b)
    return 0.0


def js_divergence_unigram(a_text: str, b_text: str, eps: float = 1e-12) -> float:
    """Return the Jensen-Shannon divergence (in nats) of two word distributions.

    Each text is lowercased and split into ``\\w+`` tokens. Counts are
    smoothed by adding ``eps`` to every vocabulary entry before normalising.
    Returns 0.0 when neither text contains a token.
    """
    word_pattern = r"\w+"
    counts_a = Counter(re.findall(word_pattern, str(a_text).lower()))
    counts_b = Counter(re.findall(word_pattern, str(b_text).lower()))

    vocab = set(counts_a) | set(counts_b)
    if not vocab:
        return 0.0

    # Additive smoothing keeps every probability strictly positive.
    total_a = sum(counts_a.values()) + eps * len(vocab)
    total_b = sum(counts_b.values()) + eps * len(vocab)

    divergence = 0.0
    for word in vocab:
        p = (counts_a.get(word, 0) + eps) / total_a
        q = (counts_b.get(word, 0) + eps) / total_b
        mid = 0.5 * (p + q)
        divergence += 0.5 * (p * math.log(p / mid) + q * math.log(q / mid))
    return divergence


def tail_similarity_pair(a: str, b: str, window_chars: int = 900, ngram_n: int = 4) -> float:
    """Return the char n-gram cosine similarity of the tails of two strings.

    Args:
        a, b: texts to compare; anything that is not a ``str`` yields 0.0.
        window_chars: maximum number of trailing characters taken from each
            text. Zero or negative values select an empty tail.
        ngram_n: n-gram width passed to ``char_ngrams``.

    Returns:
        The cosine similarity of the two tails' n-gram counts, or 0.0 on any
        failure (the function is deliberately best-effort).
    """
    try:
        if not isinstance(a, str) or not isinstance(b, str):
            return 0.0
        window = max(0, window_chars)
        # Slice from an explicit start offset: ``s[-0:]`` would return the
        # whole string rather than an empty tail when the window is 0.
        a_tail = a[len(a) - min(window, len(a)):]
        b_tail = b[len(b) - min(window, len(b)):]
        return cosine_counter(char_ngrams(a_tail, ngram_n), char_ngrams(b_tail, ngram_n))
    except Exception:
        # Best-effort metric: report "no similarity" instead of failing, but
        # let KeyboardInterrupt / SystemExit propagate.
        return 0.0


def tail_similarity(text: str, window_chars: int = 900) -> float:
    """Return ``tail_similarity_pair`` of ``text`` compared against itself."""
    self_similarity = tail_similarity_pair(text, text, window_chars)
    return self_similarity


def repetition_rate(text: str, window_tokens: int = 120) -> float:
    """Return the fraction of repeated tokens in the last ``window_tokens`` tokens.

    Tokens are lowercase word runs or single non-space characters. Returns
    0.0 when the text has fewer than ``window_tokens`` tokens; otherwise
    ``1 - distinct / window_size``.
    """
    tokens = re.findall(r"\w+|\S", str(text).lower())
    if len(tokens) < window_tokens:
        return 0.0

    window = tokens[-window_tokens:]
    distinct_fraction = len(set(window)) / len(window)
    return 1.0 - distinct_fraction


def d_CIC(P_text: str, Q_text: str, answer_P: Optional[int] = None, answer_Q: Optional[int] = None) -> float:
    """Return the weighted CIC distance between two texts.

    The distance is ``0.6 * JS + 0.3 * (1 - tail_similarity) + 0.1 * penalty``
    where ``penalty`` is 1.0 only when both answers are given and differ.
    """
    lexical_gap = js_divergence_unigram(P_text, Q_text)
    tail_gap = 1.0 - tail_similarity_pair(P_text, Q_text)

    both_known = answer_P is not None and answer_Q is not None
    mismatch_penalty = 1.0 if (both_known and answer_P != answer_Q) else 0.0

    return 0.6 * lexical_gap + 0.3 * tail_gap + 0.1 * mismatch_penalty


def answer_entropy(answers: List[Optional[int]]) -> Tuple[float, float]:
    """Return the Shannon entropy of the answer distribution.

    ``None`` entries are ignored. The result is ``(entropy_nats,
    entropy_bits)``, or ``(0.0, 0.0)`` when no non-None answer is present.
    """
    counts = Counter(a for a in answers if a is not None)
    if not counts:
        return 0.0, 0.0

    total = float(sum(counts.values()))
    entropy_nats = 0.0
    for occurrences in counts.values():
        prob = occurrences / total
        # The 1e-12 offset guards the logarithm against a zero argument.
        entropy_nats -= prob * math.log(prob + 1e-12)

    # The offset can push the sum slightly below zero; clamp that noise.
    entropy_nats = max(entropy_nats, 0.0)
    entropy_bits = entropy_nats / math.log(2.0)
    return float(entropy_nats), float(entropy_bits)


def compute_k_emp(d1: float, d2: float, answers: List[Optional[int]]) -> float:
    """Return the contraction ratio ``d2 / d1`` with an entropy-based floor.

    ``d1`` is floored at 1e-9 before dividing. When the answer entropy is
    below 0.5 nats, the ratio is raised to at least 0.7.
    """
    denominator = max(float(d1), 1e-9)
    ratio = float(d2) / denominator

    entropy_nats, _ = answer_entropy(answers)
    # Low answer entropy: do not report a contraction stronger than 0.7.
    if entropy_nats < 0.5:
        ratio = max(ratio, 0.7)

    return ratio


def max_tokens_for_temp(temp: float, min_tokens: int = 256, max_tokens: int = 2048, 
                        temp_min: float = 0.0, temp_max: float = 1.0, power: float = 1.0) -> int:
    """Map a sampling temperature to a token limit; hotter means more tokens.

    Args:
        temp: sampling temperature.
        min_tokens: limit returned at or below ``temp_min``.
        max_tokens: limit returned at or above ``temp_max``.
        temp_min, temp_max: temperature range mapped onto the token range.
        power: exponent applied to the normalised temperature.

    Returns:
        An integer in ``[min_tokens, max_tokens]``. Temperatures outside
        ``[temp_min, temp_max]`` are clamped, so the result never exceeds
        ``max_tokens``.
    """
    span = max(1e-9, (temp_max - temp_min))
    # Clamp on both sides: without the upper clamp a temperature above
    # temp_max would produce more than max_tokens.
    t_norm = min(1.0, max(0.0, (temp - temp_min) / span))
    frac = t_norm ** float(power)
    return int(min_tokens + frac * (max_tokens - min_tokens))


def ncd(a: bytes, b: bytes) -> float:
    """Return the Normalized Compression Distance between two byte strings.

    Computed with zlib at level 6 as
    ``(C(a + b) - min(C(a), C(b))) / max(C(a), C(b))`` where ``C`` is the
    compressed length. Returns 1.0 when either input is empty or when
    compression fails (for example on non-bytes input).
    """
    if not a or not b:
        return 1.0
    try:
        size_a = len(zlib.compress(a, level=6))
        size_b = len(zlib.compress(b, level=6))
        size_ab = len(zlib.compress(a + b, level=6))
    except Exception:
        # Best-effort: treat uncompressible input as maximally distant, but
        # do not swallow KeyboardInterrupt / SystemExit.
        return 1.0
    return (size_ab - min(size_a, size_b)) / max(size_a, size_b, 1)


@dataclass
class CICState:
    """Snapshot of the CIC functional's terms for one set of samples.

    Field order matters: instances may be constructed positionally.
    """
    Phi: float  # information-integration term
    H: float    # entropy term
    C: float    # complexity term
    F: float    # functional value F = Φ - λH + γC
    confidence: float  # confidence score derived from F
    crystallized: bool = False  # defaults to False


def compute_cic_functional(
    samples: List[int],
    traces: Optional[List[str]] = None,
    lambda_H: float = 0.3,
    gamma_C: float = 0.1,
) -> CICState:
    """Compute the CIC functional ``F = Φ - λH + γC`` for a batch of answers.

    Args:
        samples: candidate answers; ``None`` entries are ignored.
        traces: optional reasoning traces; only the first five, truncated to
            1000 bytes each, contribute to the complexity term.
        lambda_H: weight of the entropy term.
        gamma_C: weight of the complexity term.

    Returns:
        A ``CICState``. When there is no usable answer (empty input or only
        ``None`` entries) the state has ``Phi=0``, ``H=1.0``, ``C=0`` and
        confidence 0.0, so a batch without answers can never look confident.
    """
    vals = [s for s in samples if s is not None] if samples else []
    if not vals:
        # F follows the same formula as below with Phi=0, H=1, C=0.
        return CICState(Phi=0, H=1.0, C=0, F=-lambda_H * 1.0, confidence=0.0)

    # Phi: inverse of the answers' relative spread (1.0 for a single answer).
    if len(vals) < 2:
        Phi = 1.0
    else:
        mean = statistics.mean(vals)
        var = statistics.variance(vals)
        Phi = 1.0 / (1.0 + math.sqrt(var) / (abs(mean) + 1))

    # H: entropy (nats) of the answer distribution.
    H, _ = answer_entropy(samples)

    # C: one minus the mean pairwise NCD of the traces, when at least two exist.
    C = 0.0
    if traces and len(traces) >= 2:
        trace_bytes = [t.encode('utf-8', errors='replace')[:1000] for t in traces[:5]]
        ncds = [
            ncd(trace_bytes[i], trace_bytes[j])
            for i in range(len(trace_bytes))
            for j in range(i + 1, len(trace_bytes))
        ]
        C = 1.0 - statistics.mean(ncds) if ncds else 0.0

    F = Phi - lambda_H * H + gamma_C * C

    # Map F onto [0, 1].
    confidence = max(0.0, min(1.0, 0.5 + 0.5 * F))

    return CICState(Phi=Phi, H=H, C=C, F=F, confidence=confidence)


def detect_crystallization(history: List[CICState], min_samples: int = 3) -> bool:
    """Decide whether the most recent states indicate crystallization.

    Looks at the last ``min_samples`` states and returns True when either
    Φ is non-decreasing while H is non-increasing across them, or the newest
    state has confidence above 0.8 and H below 0.5. Returns False when the
    history is shorter than ``min_samples``.
    """
    if len(history) < min_samples:
        return False

    window = history[-min_samples:]
    latest = window[-1]
    steps = list(zip(window, window[1:]))

    # Trend test over consecutive pairs.
    phi_rising = all(earlier.Phi <= later.Phi for earlier, later in steps)
    h_falling = all(earlier.H >= later.H for earlier, later in steps)

    # Absolute test on the newest state.
    confident = latest.confidence > 0.8
    calm = latest.H < 0.5

    if phi_rising and h_falling:
        return True
    return confident and calm

# Cell-load marker: printed once every definition in this cell has executed.
print("[BANSHEEV6] CIC Primitives + d_CIC + k_emp: OK")

In [None]:
# =========================
# VALUE CLUSTERING + LOG-WEIGHTED VOTING
# From ryanaimo/selection/clustering.py + zombie_23
# 92.1% Error Reduction Method
# =========================

def relative_distance(a: int, b: int) -> float:
    """Return ``|a - b| / max(|a|, |b|)`` with special handling of zero.

    Equal values give 0.0. When exactly one value is 0, the result is 1.0 if
    the other exceeds 1000 in magnitude, otherwise ``|a - b| / 1000``.
    """
    if a == b:
        return 0.0

    gap = abs(a - b)
    largest = max(abs(a), abs(b))
    if a == 0 or b == 0:
        # Near zero a pure ratio would always be 1; use a fixed scale instead.
        return 1.0 if largest > 1000 else gap / 1000
    return gap / largest


@dataclass
class Cluster:
    """A group of answer values that were merged by value proximity.

    Field order matters: instances may be constructed positionally.
    """
    members: List[int]  # the answer values in this cluster
    size: int           # number of members
    center: int         # representative value for the cluster
    tightness: float    # cohesion score (value_clustering keeps it in [0, 1])
    score: float        # ranking key (value_clustering sorts descending by it)


def value_clustering(samples: List[int], threshold: float = 0.05) -> Dict:
    """Group answers whose pairwise relative distance is below ``threshold``.

    Merging is transitive (union-find), so chains of close values form one
    cluster. Each cluster gets a median center, a tightness in [0, 1] and a
    score of ``size * sqrt(tightness)``.

    Returns:
        A dict with ``"clusters"`` (sorted by descending score),
        ``"n_clusters"`` and ``"best"`` (the top cluster, or ``None`` when
        ``samples`` is empty).
    """
    count = len(samples)
    if count == 0:
        return {"clusters": [], "n_clusters": 0, "best": None}
    if count == 1:
        only = Cluster(members=samples, size=1, center=samples[0], tightness=1.0, score=1.0)
        return {"clusters": [only], "n_clusters": 1, "best": only}

    # Union-find over sample indices.
    parent = list(range(count))

    def root_of(idx):
        # Locate the root, then point every node on the path straight at it.
        root = idx
        while parent[root] != root:
            root = parent[root]
        while parent[idx] != root:
            next_idx = parent[idx]
            parent[idx] = root
            idx = next_idx
        return root

    def merge(i, j):
        root_i, root_j = root_of(i), root_of(j)
        if root_i != root_j:
            parent[root_i] = root_j

    for i in range(count):
        for j in range(i + 1, count):
            if relative_distance(samples[i], samples[j]) < threshold:
                merge(i, j)

    # Collect members per root, keeping first-seen order.
    groups = {}
    for idx in range(count):
        groups.setdefault(root_of(idx), []).append(samples[idx])

    clusters = []
    for members in groups.values():
        size = len(members)
        center = int(statistics.median(members))

        if size == 1:
            tightness = 1.0
        else:
            spread = statistics.stdev(members)
            center_abs = abs(statistics.mean(members)) if members else 1
            if center_abs > 0:
                tightness = max(0.0, min(1.0, 1.0 - (spread / center_abs)))
            else:
                tightness = 0.0

        score = size * (tightness ** 0.5)
        clusters.append(Cluster(members=members, size=size, center=center, tightness=tightness, score=score))

    # A stable descending sort keeps first-seen order among equal scores.
    clusters.sort(key=lambda c: c.score, reverse=True)
    best = clusters[0] if clusters else None
    return {"clusters": clusters, "n_clusters": len(clusters), "best": best}


def basin_refinement(cluster: Cluster) -> int:
    """Refine a cluster to a single integer answer.

    With one or two members the (truncated) median is returned. Otherwise
    the result is the truncated average of the median and a trimmed mean
    that drops ``max(1, n // 4)`` values from each end.
    """
    values = cluster.members
    count = len(values)
    if count <= 2:
        return int(statistics.median(values))

    middle = statistics.median(values)
    ordered = sorted(values)
    cut = max(1, count // 4)
    # Trim only when something would remain afterwards.
    core = ordered[cut:-cut] if count > 2 * cut else ordered

    return int((middle + statistics.mean(core)) / 2)


def log_weighted_vote(samples: List[int]) -> Tuple[int, float]:
    """Pick the answer with the highest log-weighted vote.

    Each distinct value scores ``log(1.25 + |value|) * count``, so small
    values need more votes to win. Returns ``(value, confidence)`` where
    confidence is the winner's share of the total score, or ``(0, 0.0)``
    for empty input.
    """
    if not samples:
        return 0, 0.0

    scores = {
        value: math.log(1.25 + abs(value)) * hits
        for value, hits in Counter(samples).items()
    }

    winner = max(scores, key=scores.get)
    total = sum(scores.values())
    if total > 0:
        return winner, scores[winner] / total
    return winner, 0.0


def toroidal_vote(samples: List[int], modulus: int = 100000) -> Tuple[int, float]:
    """Return the circular mean of answers taken modulo ``modulus``.

    Each sample is mapped to an angle on the unit circle, the angles are
    averaged as vectors, and the mean angle is mapped back to an integer in
    ``[0, modulus)``. Values on either side of the wrap-around point (such
    as 99999 and 1) therefore average to a value near 0, not near the middle.

    Args:
        samples: candidate answers.
        modulus: size of the circle; must be non-zero.

    Returns:
        ``(mean_value, confidence)`` where confidence is the mean resultant
        length R in [0, 1] (1 means all samples coincide). Empty input
        returns ``(0, 0.0)``.
    """
    if not samples:
        return 0, 0.0

    # Convert to angles on the unit circle.
    angles = [2 * math.pi * (s % modulus) / modulus for s in samples]

    sin_sum = sum(math.sin(a) for a in angles)
    cos_sum = sum(math.cos(a) for a in angles)

    mean_angle = math.atan2(sin_sum, cos_sum)
    if mean_angle < 0:
        mean_angle += 2 * math.pi

    # Round to the nearest integer: truncating would turn tiny float errors
    # into off-by-one results (12344.9999... -> 12344) or wrap a value just
    # below the modulus to modulus - 1 instead of 0.
    mean_value = int(round((mean_angle / (2 * math.pi)) * modulus)) % modulus

    # Mean resultant length: 1.0 when all angles coincide.
    confidence = math.sqrt(sin_sum**2 + cos_sum**2) / len(samples)

    return mean_value, confidence


def ncd_trace_similarity(traces: List[str]) -> float:
    """Return the mean pairwise NCD of the first five traces.

    A low value means the traces compress well together, i.e. are similar.
    Each trace is truncated to 2000 bytes. Returns 0.0 when fewer than two
    traces are given.
    """
    if len(traces) < 2:
        return 0.0

    # Only the first five traces (at most ten pairs) are compared, so only
    # those are encoded.
    sampled = [t.encode('utf-8', errors='replace')[:2000] for t in traces[:5]]
    ncds = [
        ncd(sampled[i], sampled[j])
        for i in range(len(sampled))
        for j in range(i + 1, len(sampled))
    ]

    return statistics.mean(ncds) if ncds else 0.5


def select_answer_cic(
    samples: List[int],
    traces: Optional[List[str]] = None,
    threshold: float = 0.05,
    fallback: int = 0,
) -> Tuple[int, float, Dict]:
    """Choose a final answer from candidate samples.

    Steps:
    1. Cluster the samples by value and refine the best cluster to one answer.
    2. If that cluster is small or loose, record the log-weighted vote as an
       alternative under ``"log_weighted_alt"`` (it does not replace the answer).
    3. Combine the CIC confidence, cluster share and tightness into a
       confidence score, nudged by trace similarity when traces are given.

    Returns:
        ``(answer, confidence, details)``. For empty input this is
        ``(fallback, 0.05, {})``.
    """
    if not samples:
        return fallback, 0.05, {}

    details = value_clustering(samples, threshold)
    top = details["best"]

    if top is None:
        # No cluster available: fall back to the log-weighted vote.
        vote, vote_conf = log_weighted_vote(samples)
        return vote, vote_conf, details

    n_samples = len(samples)

    # A small or loose winning cluster is worth cross-checking.
    if top.size < n_samples * 0.3 or top.tightness < 0.5:
        alt_answer, alt_conf = log_weighted_vote(samples)
        if alt_answer != top.center and alt_conf > 0.4:
            details["log_weighted_alt"] = {"answer": alt_answer, "confidence": alt_conf}

    answer = basin_refinement(top)

    cic = compute_cic_functional(samples, traces)
    size_factor = min(1.0, top.size / n_samples)
    confidence = 0.3 + 0.5 * cic.confidence + 0.2 * size_factor * top.tightness

    if traces and len(traces) >= 2:
        trace_sim = ncd_trace_similarity(traces)
        if trace_sim < 0.3:
            # Low NCD = similar traces = boost confidence (capped at 0.95).
            confidence = min(0.95, confidence + 0.1)
        elif trace_sim > 0.7:
            # High NCD = dissimilar traces = reduce confidence (floored at 0.1).
            confidence = max(0.1, confidence - 0.1)
        details["trace_similarity"] = trace_sim

    details["cic"] = cic
    return answer, confidence, details

# Cell-load marker: printed once every definition in this cell has executed.
print("[BANSHEEV6] Value Clustering + Log-Weighted Voting: OK")

In [None]:
# =========================
# PROBLEM TYPE DETECTION + ANSWER EXTRACTION
# =========================

# Problem type keywords for routing to best prompts.
# Maps a problem-type name to lowercase keywords; detect_problem_type() tests
# each keyword with a plain substring check against the lowercased problem text.
PROBLEM_TYPES = {
    "number_theory": [
        "divisor", "prime", "factor", "gcd", "lcm", "modulo", "mod ", "remainder",
        "congruent", "coprime", "euler", "fermat", "diophantine", "integer solution"
    ],
    "combinatorics": [
        "permutation", "combination", "count", "how many", "ways to", "arrange",
        "choose", "select", "subset", "partition", "distribute", "binomial"
    ],
    "algebra": [
        "polynomial", "equation", "root", "solve for", "find all", "function",
        "inequality", "maximum", "minimum", "optimize", "sum of", "product of"
    ],
    "geometry": [
        "triangle", "circle", "angle", "area", "perimeter", "radius", "diameter",
        "tangent", "perpendicular", "parallel", "polygon", "coordinate", "distance"
    ],
    "sequence": [
        "sequence", "series", "recurrence", "fibonacci", "arithmetic", "geometric",
        "term", "sum of first", "nth term", "pattern"
    ],
    "game_theory": [
        "game", "player", "strategy", "winning", "optimal", "move", "turn",
        "alice", "bob", "first player", "second player"
    ],
}

# Map problem types to best prompt indices (indices into SYSTEM_PROMPTS).
# NOTE(review): the trailing labels are the author's descriptions of the
# referenced prompts and cannot be checked from this cell; the "geometry" row
# lists four indices but only three labels - confirm against SYSTEM_PROMPTS.
PROMPT_ROUTING = {
    "number_theory": [13, 22, 29, 23],  # Modular, NT deep dive, Diophantine, Coloring
    "combinatorics": [12, 18, 21, 11],  # Bijection, Pigeonhole, Probabilistic, Generating func
    "algebra": [11, 24, 26, 10],         # Gen func, Inequalities, Polynomial, Constraint prop
    "geometry": [14, 15, 10, 3],         # Geometry->Algebra, Extremal, Invariant
    "sequence": [17, 20, 11, 25],        # Recurrence, Induction, Gen func, Sequence analysis
    "game_theory": [27, 28, 15, 3],      # Game theory, Coloring, Extremal, Invariant
}


def detect_problem_type(problem_text: str) -> Tuple[str, float]:
    """Guess the problem type from keyword hits.

    Every keyword in ``PROBLEM_TYPES`` is tested as a substring of the
    lowercased text. Returns ``(type, confidence)`` where confidence is the
    winning type's share of all hits, or ``("general", 0.0)`` when nothing
    matches.
    """
    haystack = problem_text.lower()
    hits = {
        ptype: sum(1 for kw in keywords if kw in haystack)
        for ptype, keywords in PROBLEM_TYPES.items()
    }

    if not hits or max(hits.values()) == 0:
        return "general", 0.0

    winner = max(hits, key=hits.get)
    total_hits = sum(hits.values())
    share = hits[winner] / total_hits if total_hits > 0 else 0.0

    return winner, share


def get_routed_prompts(problem_text: str, num_prompts: int = 8) -> List[int]:
    """Return ``SYSTEM_PROMPTS`` indices to use for this problem.

    When the detected type has a routing entry and the detection confidence
    exceeds 0.3, type-specific indices (even slots) are interleaved with the
    general indices 0-7 (odd slots); once both lists are exhausted, slots are
    filled with ``slot % len(SYSTEM_PROMPTS)``. Otherwise the first
    ``num_prompts`` indices are returned.

    NOTE(review): the interleaved result can contain the same index twice
    (e.g. a routing list ending in 3 is followed by general index 3 in the
    next slot) - confirm whether duplicates are intended.
    """
    ptype, conf = detect_problem_type(problem_text)

    routed = ptype in PROMPT_ROUTING and conf > 0.3
    if not routed:
        # General problem - use the first N prompts.
        return list(range(min(num_prompts, len(SYSTEM_PROMPTS))))

    specific = PROMPT_ROUTING[ptype]
    general = [0, 1, 2, 3, 4, 5, 6, 7]  # First 8 are general

    chosen = []
    for slot in range(num_prompts):
        pair_idx = slot // 2
        if slot % 2 == 0 and pair_idx < len(specific):
            chosen.append(specific[pair_idx])
        elif pair_idx < len(general):
            chosen.append(general[pair_idx])
        else:
            chosen.append(slot % len(SYSTEM_PROMPTS))
    return chosen


# =========================
# ENHANCED ANSWER EXTRACTION
# =========================

def extract_boxed_answer(text: str) -> Optional[int]:
    """Extract the integer from the last ``\\boxed{...}`` in ``text``.

    The patterns are tried in order. For the first pattern that matches,
    its last match is returned if it lies in ``[0, 99999]``; otherwise the
    next pattern is tried. Returns ``None`` when nothing usable is found.
    """
    patterns = [
        r'\\boxed\{(\d+)\}',
        r'\\boxed\s*\{(\d+)\}',
        r'boxed\{(\d+)\}',
        # Optional whitespace around the number, e.g. "\boxed{ 42 }". This
        # must be a real \s*; an escaped backslash would match the literal
        # characters "\s" and never fire.
        r'\\boxed\s*\{\s*(\d+)\s*\}',
        r'\$\\boxed\{(\d+)\}\$',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text)
        if matches:
            try:
                val = int(matches[-1])
                if 0 <= val <= 99999:
                    return val
            except ValueError:
                pass
    return None


def extract_answer_is(text: str) -> Optional[int]:
    """Extract from 'answer is X' patterns."""
    patterns = [
        r'(?:the\s+)?(?:final\s+)?answer\s*(?:is|=|:)\s*[\\$]*(\d+)[\\$]*',
        r'(?:therefore|thus|hence|so)\s*[,.]?\s*(?:the\s+)?answer\s*(?:is|=|:)?\s*[\\$]*(\d+)',
        r'=\s*[\\$]*(\d+)[\\$]*\s*$',
        r'answer\s*:\s*[\\$]*(\d+)',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
        if matches:
            try:
                val = int(matches[-1])
                if 0 <= val <= 99999:
                    return val
            except ValueError:
                pass
    return None


def extract_remainder_answer(text: str) -> Optional[int]:
    """Extract an integer from remainder / congruence phrasings.

    The patterns are tried in order, case-insensitively. For each pattern
    that matches, its last match is returned if it lies in ``[0, 99999]``;
    otherwise the next pattern is tried.
    """
    candidate_patterns = (
        r'remainder\s*(?:is|=|:)\s*[\\$]*(\d+)',
        r'≡\s*(\d+)\s*\(?mod',
        r'mod\s*\d+\s*[)=]\s*(\d+)',
    )
    for pattern in candidate_patterns:
        found = re.findall(pattern, text, re.IGNORECASE)
        if not found:
            continue
        try:
            candidate = int(found[-1])
        except ValueError:
            continue
        if 0 <= candidate <= 99999:
            return candidate
    return None


def extract_any_answer(text: str) -> Optional[int]:
    """Extract an answer using the extractors in priority order.

    Order: boxed answer, "answer is X" phrasing, remainder phrasing, and
    finally the last 1-5 digit number within the final 500 characters.
    Returns ``None`` when every strategy fails.
    """
    # Priorities 1-3: the dedicated extractors, most explicit first.
    for extractor in (extract_boxed_answer, extract_answer_is, extract_remainder_answer):
        found = extractor(text)
        if found is not None:
            return found

    # Priority 4: last standalone number near the end of the text.
    trailing_numbers = re.findall(r'\b(\d{1,5})\b', text[-500:])
    if trailing_numbers:
        try:
            candidate = int(trailing_numbers[-1])
        except ValueError:
            return None
        if 0 <= candidate <= 99999:
            return candidate

    return None


def validate_proof_output(text: str) -> Tuple[bool, List[str]]:
    """Run cheap sanity checks on generated text.

    Checks:
    - bracket balance: opening and closing counts differ by more than 5;
    - repetition loop: the last 10 (or 20) words equal the 10 (or 20) words
      just before them, for texts longer than 50 words;
    - more than 10 occurrences of ``...``.

    Returns:
        ``(is_valid, issues)`` where ``issues`` lists the failed checks.
    """
    problems = []

    # Bracket balance (counts only; nesting order is not checked).
    opened = sum(text.count(ch) for ch in "([{")
    closed = sum(text.count(ch) for ch in ")]}")
    if abs(opened - closed) > 5:
        problems.append("unbalanced_brackets")

    # Repetition loop: compare the final window with the one before it.
    tokens = text.lower().split()
    if len(tokens) > 50:
        for span in (10, 20):
            if len(tokens) < 2 * span:
                continue
            if tokens[-span:] == tokens[-2 * span:-span]:
                problems.append(f"repetition_loop_{span}")
                break

    if text.count('...') > 10:
        problems.append("excessive_ellipsis")

    return not problems, problems

# Cell-load marker: printed once every definition in this cell has executed.
print("[BANSHEEV6] Problem Type Detection + Answer Extraction: OK")

In [None]:
# =========================
# DIFFICULTY CLASSIFIER (Tier 2)
# =========================
# Cheap heuristic to route problems to appropriate strategies.
# NO LLM calls - pure text analysis.
# =========================

from enum import Enum
from dataclasses import dataclass
import re


class Difficulty(Enum):
    """Coarse difficulty tier; used as the key into the STRATEGIES table."""
    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


@dataclass
class DifficultyAssessment:
    """Result of difficulty classification (as produced by classify_difficulty)."""
    level: Difficulty  # Tier chosen from the weighted score
    confidence: float  # 0-1
    signals: Dict[str, float]  # Individual signal scores, keyed by signal name
    recommended_samples: int  # Suggested number of LLM samples for this tier
    recommended_max_tokens: int  # Suggested token budget per sample


# === SIGNAL DETECTORS ===

def signal_text_length(text: str) -> float:
    """Map the character length of the problem to a 0-1 hardness score.

    Buckets: <200 -> 0.0, <500 -> 0.3, <1000 -> 0.6, otherwise 1.0.
    """
    n_chars = len(text)
    for upper_bound, score in ((200, 0.0), (500, 0.3), (1000, 0.6)):
        if n_chars < upper_bound:
            return score
    return 1.0  # Long statement = likely hard


def signal_symbol_density(text: str) -> float:
    """Score (0.2 / 0.5 / 0.9) how densely the text is packed with math notation.

    Single math symbols count once, LaTeX commands twice and fraction-like
    constructs three times; the weighted total is expressed per 100 characters.
    """
    n_symbols = len(re.findall(r'[+\-*/^=<>≤≥≠∈∉⊆⊇∩∪∀∃∑∏∫√πθφαβγδε]', text))
    n_commands = len(re.findall(r'\\[a-zA-Z]+', text))
    n_fractions = len(re.findall(r'\\frac|\\dfrac|/\s*\d+', text))

    weighted_total = n_symbols + n_commands * 2 + n_fractions * 3
    # max(..., 1) guards the division for empty input.
    density = weighted_total / max(len(text), 1) * 100

    for ceiling, score in ((1, 0.2), (3, 0.5)):
        if density < ceiling:
            return score
    return 0.9


def signal_hard_keywords(text: str) -> float:
    """Keywords indicating hard problems. Returns 0-1.

    A keyword only counts when it begins at the start of a word, so "prove"
    matches "proved" but not "improve". The abbreviations "iff" and "wlog"
    must additionally end at a word end, so "iff" no longer fires on
    "difference" / "different".

    Returns:
        0.0 when no keyword is present, 0.5 for one or two distinct
        keywords, 1.0 for three or more.
    """
    t = text.lower()

    hard_keywords = [
        "prove", "show that", "for all", "for every", "for any",
        "if and only if", "iff", "necessary and sufficient",
        "infinitely many", "uncountable", "bijection",
        "induction", "contradiction", "contrapositive",
        "without loss of generality", "wlog",
        "construct", "determine all", "find all",
        "characterize", "classify",
    ]
    # Short abbreviations that are common substrings of ordinary words.
    whole_word_only = {"iff", "wlog"}

    def _present(kw: str) -> bool:
        pattern = r'(?<![a-z])' + re.escape(kw)
        if kw in whole_word_only:
            pattern += r'(?![a-z])'
        return re.search(pattern, t) is not None

    hard_count = sum(1 for kw in hard_keywords if _present(kw))

    if hard_count == 0:
        return 0.0
    elif hard_count <= 2:
        return 0.5
    else:
        return 1.0


def signal_easy_keywords(text: str) -> float:
    """Inverted 0-1 score: the more "routine computation" phrases, the lower.

    Two or more distinct phrases -> 0.0, exactly one -> 0.3, none -> 0.6.
    Matching is a plain case-insensitive substring test.
    """
    lowered = text.lower()
    routine_phrases = (
        "compute", "calculate", "evaluate", "simplify",
        "what is", "find the value", "find the sum",
        "find the product", "find the remainder",
        "how many", "count the number",
    )
    hits = sum(phrase in lowered for phrase in routine_phrases)

    if hits >= 2:
        return 0.0  # Several routine phrases = likely easy
    return 0.3 if hits == 1 else 0.6  # None at all = might be harder


def signal_numeric_complexity(text: str) -> float:
    """Complexity from numbers in problem. Returns 0-1.

    Returns 0.5 when the text contains no integers at all. Otherwise the
    score is driven by the largest integer and by how many integers appear:
    0.8 (max > 10000 or more than 10 numbers), 0.5 (max > 100 or more than
    5 numbers), else 0.2.

    Integers with more than 10 significant digits are not parsed (to keep
    int() cheap); they are simply treated as "very large". Previously they
    were dropped, which raised ValueError when every number was that long
    and under-scored problems that mix small and huge numbers.
    """
    numbers = re.findall(r'\b\d+\b', text)

    if not numbers:
        return 0.5  # No numbers = could be abstract/hard

    max_num = 0
    for raw in numbers:
        digits = raw.lstrip("0")
        if len(digits) > 10:
            # Any value with 11+ significant digits exceeds every threshold.
            max_num = max(max_num, 10 ** 10)
        else:
            max_num = max(max_num, int(digits or "0"))
    num_count = len(numbers)

    # Many numbers or large numbers = harder computation
    if max_num > 10000 or num_count > 10:
        return 0.8
    elif max_num > 100 or num_count > 5:
        return 0.5
    else:
        return 0.2


def signal_nested_structure(text: str) -> float:
    """Score (0.2 / 0.5 / 0.9) logical nesting from quantifiers, conditionals
    and enumerated sub-parts found in the lower-cased text.

    Quantifier phrases weigh 1.0 each, conditionals 0.5 and part markers 0.3.
    """
    lowered = text.lower()

    n_quantifiers = len(re.findall(r'\b(for all|for every|for each|there exists|such that)\b', lowered))
    n_conditionals = len(re.findall(r'\b(if|when|whenever|unless|provided)\b', lowered))
    n_parts = len(re.findall(r'\b(part [a-z]|\([a-z]\)|[a-z]\)|i+\.|\d+\.)\b', lowered))

    nesting = n_quantifiers + n_conditionals * 0.5 + n_parts * 0.3

    for ceiling, score in ((2, 0.2), (5, 0.5)):
        if nesting < ceiling:
            return score
    return 0.9


def signal_competition_markers(text: str) -> float:
    """Markers of competition-level difficulty. Returns 0-1.

    Markers are matched as whole alphabetic words (a trailing digit is
    allowed, e.g. "amc10", "imo2019"), so ordinary words that merely contain
    a marker - "claimed" (aime), "testimony" (imo) - no longer trigger it.
    Simple plurals of "olympiad", "competition" and "contest" are accepted.

    Returns:
        1.0 for IMO / USAMO / Putnam, 0.7 for any other competition marker,
        0.3 when no marker is present.
    """
    t = text.lower()

    # IMO/USAMO/Putnam = very hard
    elite = r'(?<![a-z])(?:imo|usamo|putnam)(?![a-z])'
    general = (
        r'(?<![a-z])'
        r'(?:olympiads?|aime|amc|mathcounts|competitions?|contests?)'
        r'(?![a-z])'
    )

    if re.search(elite, t):
        return 1.0
    elif re.search(general, t):
        return 0.7
    else:
        return 0.3


# === MAIN CLASSIFIER ===

def classify_difficulty(problem_text: str) -> DifficultyAssessment:
    """
    Route a problem to EASY / MEDIUM / HARD from cheap text heuristics.

    Seven signal detectors are evaluated on the raw text and combined into
    one weighted score in [0, 1] (0 = easy, 1 = hard). Scores below 0.35 are
    EASY, below 0.65 MEDIUM, everything else HARD.

    Returns a DifficultyAssessment carrying:
    - level: the chosen tier
    - confidence: 1 - (mean squared deviation of the signals from the
      weighted score), floored at 0.3
    - signals: the per-signal scores
    - recommended_samples / recommended_max_tokens: tier budgets
    """
    # (signal name, detector, weight) - order fixes both the dict order and
    # the summation order of the weighted score.
    detectors = (
        ("length", signal_text_length, 0.10),
        ("symbols", signal_symbol_density, 0.15),
        ("hard_kw", signal_hard_keywords, 0.25),
        ("easy_kw", signal_easy_keywords, 0.20),
        ("numeric", signal_numeric_complexity, 0.10),
        ("nested", signal_nested_structure, 0.10),
        ("competition", signal_competition_markers, 0.10),
    )

    signals = {name: detect(problem_text) for name, detect, _ in detectors}
    score = sum(signals[name] * weight for name, _, weight in detectors)

    # Tier thresholds with their (samples, max_tokens) budgets.
    if score < 0.35:
        level, n_samples, token_budget = Difficulty.EASY, 2, 1024
    elif score < 0.65:
        level, n_samples, token_budget = Difficulty.MEDIUM, 4, 1536
    else:
        # HARD: fewer but more careful samples
        level, n_samples, token_budget = Difficulty.HARD, 3, 2048

    # Signals that scatter widely around the combined score lower confidence.
    spread = sum((value - score) ** 2 for value in signals.values()) / len(signals)
    confidence = max(0.3, 1.0 - spread)

    return DifficultyAssessment(
        level=level,
        confidence=confidence,
        signals=signals,
        recommended_samples=n_samples,
        recommended_max_tokens=token_budget,
    )


# === ROUTING STRATEGIES ===

@dataclass  
class SolveStrategy:
    """Strategy for solving based on difficulty (one instance per Difficulty tier)."""
    max_samples: int  # Upper bound on samples drawn for a problem
    max_tokens_per_sample: int  # Token budget for each sample
    temperature_start: float  # Sampling temperature at the start
    temperature_end: float  # Sampling temperature at the end (lower than start in every tier below)
    early_exit_threshold: float  # Confidence needed for early exit
    use_verification: bool  # Whether the verification layer is applied
    use_clustering: bool  # Whether answer clustering is applied


# Per-tier solve configuration, looked up by get_strategy().
# NOTE(review): the MEDIUM token budget here (1600) differs from the 1536
# that classify_difficulty() puts in recommended_max_tokens - confirm which
# value is intended.
STRATEGIES = {
    Difficulty.EASY: SolveStrategy(
        max_samples=2,
        max_tokens_per_sample=1024,  # EASY
        temperature_start=0.7,
        temperature_end=0.3,
        early_exit_threshold=0.7,  # Exit quickly on easy problems
        use_verification=True,
        use_clustering=False,  # Not needed for easy
    ),
    Difficulty.MEDIUM: SolveStrategy(
        max_samples=4,
        max_tokens_per_sample=1600,  # MEDIUM: H100 headroom
        temperature_start=1.0,
        temperature_end=0.4,
        early_exit_threshold=0.8,
        use_verification=True,
        use_clustering=True,
    ),
    Difficulty.HARD: SolveStrategy(
        max_samples=3,
        max_tokens_per_sample=2048,  # HARD
        temperature_start=1.2,
        temperature_end=0.5,
        early_exit_threshold=0.9,  # Need high confidence to exit early
        use_verification=True,
        use_clustering=True,
    ),
}


def get_strategy(difficulty: DifficultyAssessment) -> SolveStrategy:
    """Get the solve strategy for a difficulty level.

    Args:
        difficulty: Assessment whose ``level`` selects the STRATEGIES entry.

    Returns:
        The shared SolveStrategy instance stored in STRATEGIES (not a copy).
    """
    return STRATEGIES[difficulty.level]


# Status lines: confirm the classifier cell ran and show each tier's sample cap.
print("[BANSHEEV6] DifficultyClassifier: EASY/MEDIUM/HARD routing enabled")
print(f"[BANSHEEV6] Strategy samples: EASY={STRATEGIES[Difficulty.EASY].max_samples}, MEDIUM={STRATEGIES[Difficulty.MEDIUM].max_samples}, HARD={STRATEGIES[Difficulty.HARD].max_samples}")

In [None]:
# =========================
# VERIFICATION LAYER (GUARDIAN-ALIGNED)
# =========================
# UNIFIED SPEC COMPLIANCE:
#
# INVARIANT 1: Only PARSEABILITY gates acceptance
#   - extract_any_answer() succeeds → candidate is valid
#   - Range 0-99999 is the ONLY hard constraint
#
# INVARIANT 2: REJECT is illegal in competition mode
#   - HARD_CHECKS can flag, not reject
#   - Flag = confidence penalty, not gate
#
# INVARIANT 3: Repairs require explicit justification
#   - Only when problem statement implies transformation
#   - Never silent, always logged
#
# INVARIANT 4: PROMETHEUS is sovereign
#   - GUARDIAN validates format, not correctness
#   - Selection logic (clustering, CIC) is upstream
#
# CLASSIFICATION:
#   DECISION-SAFE: Can influence retry/accept decision
#   ANNOTATION-ONLY: Metadata for logging, never gates
# =========================

from dataclasses import dataclass
from typing import List, Optional, Tuple, Dict


@dataclass
class VerificationResult:
    """Outcome of verifying one candidate answer.

    ``score`` is a count of passed checks; ``passed_checks`` and
    ``failed_checks`` hold check names. ``flags`` is annotation-only
    metadata and is normalised to a fresh empty list when omitted or None.
    """
    score: int
    passed_checks: List[str]
    failed_checks: List[str]
    flags: Optional[List[str]] = None  # ANNOTATION-ONLY metadata

    def __post_init__(self):
        # A None default (instead of a list literal) avoids sharing one list
        # between instances.
        self.flags = [] if self.flags is None else self.flags


# Global verification tracking (best-of storage).
# Missing question ids get an empty dict on first access (defaultdict).
# NOTE(review): nothing in this cell writes to it; the inner dict's key/value
# schema is defined by callers elsewhere - confirm before relying on it.
question_id_to_verification = defaultdict(dict)


# =========================
# DECISION-SAFE CHECKS
# These can return False ONLY when problem text PROVES contradiction
# =========================

def check_modular_strict(text: str, ans: int) -> Optional[bool]:
    """
    DECISION-SAFE: Reject if problem explicitly asks for remainder AND ans >= mod.

    Justification: If problem says "find remainder when X divided by M",
    the answer MUST be < M. This is mathematical fact, not heuristic.

    The modulus is only trusted when it is a complete plain integer. A number
    that is merely the prefix of a larger expression - "10^5", "10^9+7",
    "1,000", "2*k", "7!" - is not a modulus we can read, and treating the
    prefix as one would "prove" contradictions that do not exist.

    Returns:
        None  - not a remainder problem, or no readable modulus (not applicable)
        False - a readable modulus M was found and ans >= M
        True  - a readable modulus M was found and ans < M
    """
    t = text.lower()

    # Must explicitly ask for remainder as the answer
    is_remainder_problem = bool(
        re.search(r'(?:find|compute|what is).*remainder.*(?:divided by|mod)', t) or
        re.search(r'(?:find|compute).*mod\s*\d+\s*[.\?]?\s*$', t)
    )

    if not is_remainder_problem:
        return None

    # Prefer "mod N" / "modulo N"; fall back to "divided by N". The last
    # occurrence wins.
    matches = list(re.finditer(r'(?:mod|modulo)\s*(\d+)', t))
    if not matches:
        matches = list(re.finditer(r'divided by\s*(\d+)', t))
    if not matches:
        return None  # Nothing was checked, so do not report a pass

    chosen = matches[-1]
    # Thousands separator, exponent, product or factorial right after the
    # digits means the digits are only part of the real modulus.
    if re.compile(r',\d|\s*[\^*!]').match(t, chosen.end()):
        return None

    mod = int(chosen.group(1))
    if mod <= 0:
        return None
    if ans >= mod:
        return False  # PROVEN: remainder must be < modulus

    return True


def check_explicit_bounds(text: str, ans: int) -> Optional[bool]:
    """
    DECISION-SAFE: fail the answer when it breaks a bound the problem states
    about the "answer" / "result" itself.

    Recognised phrasings (case-insensitive): "... between A and B",
    "... at most N", "... at least N".

    Returns:
        False when any stated bound is violated, otherwise None (this check
        never reports an explicit pass).
    """
    lowered = text.lower()

    # "answer is between A and B"
    range_match = re.search(r'(?:answer|result).*(?:between|from)\s*(\d+)\s*(?:and|to)\s*(\d+)', lowered)
    if range_match:
        lower_bound = int(range_match.group(1))
        upper_bound = int(range_match.group(2))
        if ans < lower_bound or ans > upper_bound:
            return False  # PROVEN: outside stated range

    # "at most N"
    ceiling_match = re.search(r'(?:answer|result).*at most\s*(\d+)', lowered)
    if ceiling_match and int(ceiling_match.group(1)) < ans:
        return False

    # "at least N"
    floor_match = re.search(r'(?:answer|result).*at least\s*(\d+)', lowered)
    if floor_match and int(floor_match.group(1)) > ans:
        return False

    return None


def check_divisibility(text: str, ans: int) -> Optional[bool]:
    """
    DECISION-SAFE: fail the answer when the problem states a divisibility or
    parity property of the "answer" / "result" that the candidate lacks.

    Example: "answer is divisible by 7" with ans % 7 != 0 is a contradiction.

    Returns:
        False on a violated property, otherwise None (never an explicit pass).
    """
    lowered = text.lower()

    divisor_match = re.search(r'(?:answer|result).*(?:divisible by|multiple of)\s*(\d+)', lowered)
    if divisor_match:
        divisor = int(divisor_match.group(1))
        # divisor 0 is skipped to avoid a modulo-by-zero
        if divisor > 0 and ans % divisor != 0:
            return False  # PROVEN: violates divisibility

    must_be_even = re.search(r'(?:answer|result).*(?:is|must be)\s*even', lowered)
    if must_be_even and ans % 2 != 0:
        return False

    must_be_odd = re.search(r'(?:answer|result).*(?:is|must be)\s*odd', lowered)
    if must_be_odd and ans % 2 != 1:
        return False

    return None


def check_congruence(text: str, ans: int) -> Optional[bool]:
    """
    DECISION-SAFE: Reject if an explicit congruence on the answer is violated.

    Justification: "the answer is ≡ 3 (mod 7)" + ans % 7 != 3 = contradiction.

    Like the other decision-safe checks, the congruence must be stated about
    the "answer" / "result". A congruence on some other quantity (for example
    "how many x satisfy x ≡ 3 (mod 7)?") says nothing about the final answer
    and must not be applied to it. A modulus followed by an exponent
    ("mod 10^9") is ignored because its value cannot be read from the digits.

    Returns:
        False on a violated congruence, otherwise None (never an explicit pass).
    """
    m = re.search(
        r'(?:answer|result).*[≡=]\s*(\d+)\s*\(?mod\s*(\d+)(?!\d|\s*\^)',
        text.lower(),
    )
    if m:
        r, mod = int(m.group(1)), int(m.group(2))
        if mod > 0 and ans % mod != r % mod:
            return False  # PROVEN: violates congruence

    return None


# All decision-safe checks.
# verify_answer() calls each as check(question_text, ans) in this order:
# False is recorded as a failed check, True as a passed check, None is ignored.
DECISION_SAFE_CHECKS = [
    check_modular_strict,
    check_explicit_bounds,
    check_divisibility,
    check_congruence,
]


# =========================
# ANNOTATION-ONLY CHECKS
# These NEVER return False - only provide metadata
# =========================

def annotate_counting(text: str, ans: int) -> Optional[str]:
    """ANNOTATION-ONLY: tag a zero answer to a "how many" problem.

    Returns "counting_zero" or None.
    """
    is_counting_problem = "how many" in text.lower()
    return "counting_zero" if is_counting_problem and ans == 0 else None


def annotate_small_answer(text: str, ans: int) -> Optional[str]:
    """ANNOTATION-ONLY: tag answers of 2 or less as "small_answer_<ans>".

    The problem text is not consulted.
    """
    return f"small_answer_{ans}" if ans <= 2 else None


def annotate_magic_number(text: str, ans: int) -> Optional[str]:
    """ANNOTATION-ONLY: tag answers equal to one of a fixed set of
    "magic" values as "magic_number_<ans>".

    The problem text is not consulted.
    """
    magic_values = (42, 69, 100, 1000, 12345)
    if ans not in magic_values:
        return None
    return f"magic_number_{ans}"


def annotate_round_number(text: str, ans: int) -> Optional[str]:
    """ANNOTATION-ONLY: tag positive multiples of 1000 ("round_thousands")
    or, failing that, of 100 ("round_hundreds").

    The problem text is not consulted.
    """
    if ans <= 0:
        return None
    # Check the coarser granularity first so 2000 is "thousands", not "hundreds".
    for step, label in ((1000, "round_thousands"), (100, "round_hundreds")):
        if ans % step == 0:
            return label
    return None


# Annotation-only checks, run in this order by verify_answer(); each returns
# a flag string or None, and flags never affect the score.
ANNOTATION_CHECKS = [
    annotate_counting,
    annotate_small_answer,
    annotate_magic_number,
    annotate_round_number,
]


# =========================
# MAIN VERIFY FUNCTION
# =========================

def verify_answer(
    question_text: str,
    ans: int,
) -> VerificationResult:
    """
    Run the GUARDIAN checks on a single candidate answer.

    The 0-99999 range is the only hard gate: an out-of-range answer returns
    immediately with score 0 and a "range_violation" failure. Otherwise each
    decision-safe check is run (False -> failed, True -> passed, None ->
    ignored), then each annotation check contributes an optional flag.

    A check that raises is skipped silently, so one faulty check cannot
    break verification.

    Returns:
        VerificationResult whose score is the number of passed checks
        (including "range"); flags never influence the score.
    """
    # HARD GATE: Range (competition format requirement)
    if not (0 <= ans <= 99999):
        return VerificationResult(
            score=0,
            passed_checks=[],
            failed_checks=["range_violation"],
            flags=["HARD_GATE: answer outside 0-99999"]
        )

    passed_names = ["range"]
    failed_names = []
    notes = []

    # Decision-safe checks: tri-state outcome.
    for decision_check in DECISION_SAFE_CHECKS:
        try:
            outcome = decision_check(question_text, ans)
            if outcome is True:
                passed_names.append(decision_check.__name__)
            elif outcome is False:
                failed_names.append(decision_check.__name__)
            # None = not applicable, no action
        except Exception:
            pass

    # Annotation-only checks: metadata, no gating.
    for annotator in ANNOTATION_CHECKS:
        try:
            note = annotator(question_text, ans)
            if note:
                notes.append(note)
        except Exception:
            pass

    return VerificationResult(
        score=len(passed_names),  # annotations don't count
        passed_checks=passed_names,
        failed_checks=failed_names,
        flags=notes
    )


# =========================
# SELECTION (PROMETHEUS-SOVEREIGN)
# =========================

def select_answer_with_verification(
    candidates: List[int],
    problem_text: str,
    fallback: int
) -> Tuple[int, float, dict]:
    """
    Pick the final answer: GUARDIAN validates, PROMETHEUS (vote count) selects.

    Steps:
    1. No candidates -> ``(fallback, 0.05, {"reason": "no_candidates"})``.
    2. Verify every distinct candidate and drop those with a failed
       decision-safe check.
    3. If nothing survives, return the most common candidate anyway with
       confidence 0.1 and a warning.
    4. Otherwise return the survivor with the highest vote count (verification
       score breaks ties) and a confidence capped at 0.95.

    Returns:
        ``(answer, confidence, info)`` where ``info`` explains the decision.
    """
    if not candidates:
        return fallback, 0.05, {"reason": "no_candidates"}

    tally = Counter(candidates)

    # (answer, verification score, vote count, flags) for every candidate
    # without a proven contradiction.
    survivors = []
    verification_by_answer = {}
    for answer in set(candidates):
        outcome = verify_answer(problem_text, answer)
        verification_by_answer[answer] = outcome
        if outcome.failed_checks:
            continue
        survivors.append((answer, outcome.score, tally[answer], outcome.flags))

    if not survivors:
        # All had proven contradictions - use highest-count anyway with warning
        most_voted = tally.most_common(1)[0][0]
        return most_voted, 0.1, {"reason": "all_failed_using_most_common", "warning": "GUARDIAN override"}

    # Votes first (PROMETHEUS sovereignty), verification score as tiebreak.
    survivors.sort(key=lambda entry: (-entry[2], -entry[1]))
    top_answer, top_score, top_votes, top_flags = survivors[0]

    confidence = min(0.95, 0.3 + 0.1 * top_score + 0.05 * top_votes)
    details = {
        "reason": "verified",
        "score": top_score,
        "count": top_votes,
        "flags": top_flags
    }
    return top_answer, confidence, details


# =========================
# SAFE CRYSTALLIZATION
# =========================

def detect_safe_crystallization(
    samples: List[int],
    verifications: Dict[int, VerificationResult],
    cic_history: List,
    min_samples: int = 4,
) -> bool:
    """Decide whether sampling may stop early on the current leading answer.

    True only when all of these hold:
    - at least ``min_samples`` samples have been collected,
    - the most frequent answer has a verification entry with no failed checks,
    - that answer occurred at least twice.

    ``cic_history`` is accepted for interface compatibility but not read here.
    """
    if len(samples) < min_samples:
        return False

    leader, votes = Counter(samples).most_common(1)[0]

    # Must have verification and no proven contradictions
    leader_check = verifications.get(leader)
    verified_clean = bool(leader_check) and not leader_check.failed_checks

    # Must have dominance
    return bool(verified_clean and votes >= 2)


# =========================
# REGENERATION POLICY
# =========================

def regeneration_policy(
    question_id: str,
    question_text: str,
    ans: int,
    verification: VerificationResult,
    attempt_idx: int,
) -> str:
    """
    Decide what to do with a verified candidate.

    Returns "retry" when the verification recorded any failed (proven
    contradiction) check, otherwise "accept". "reject" is never returned:
    it is ILLEGAL in competition mode.

    Only ``verification`` is consulted; the other parameters are part of the
    call signature but unused here.
    """
    return "retry" if verification.failed_checks else "accept"


# Status lines: confirm the verification cell ran and report how many checks
# of each class are registered.
print("[BANSHEEV6] GUARDIAN-ALIGNED Verification Layer")
print(f"[BANSHEEV6] DECISION_SAFE_CHECKS: {len(DECISION_SAFE_CHECKS)}")
print(f"[BANSHEEV6] ANNOTATION_CHECKS: {len(ANNOTATION_CHECKS)}")
print("[BANSHEEV6] INVARIANT: REJECT is illegal, only RETRY on proven contradiction")

In [None]:
# =========================
# SECURE CODE EXECUTION
# =========================
import multiprocessing as mp

def execute_python_code(code: str, timeout: int = 10, memory_limit_mb: int = 512) -> Tuple[bool, str]:
    """Run a code snippet in a child process with hard limits. NEVER hangs.

    The snippet is exec()'d with a reduced set of builtins and with ``math``,
    ``itertools``, ``fractions``, ``decimal`` and ``collections`` pre-bound.
    ``__import__`` is not exposed, so ``import`` statements inside the snippet
    fail; it must use the pre-bound module names.

    NOTE(review): this is resource containment (time / address space), not a
    security boundary - exec() with ``type`` exposed can be escaped. Do not
    rely on it for hostile input.

    Args:
        code: Python source to execute.
        timeout: Wall-clock seconds to wait for a result.
        memory_limit_mb: Address-space cap applied in the child where the
            platform supports ``resource.setrlimit``.

    Returns:
        ``(True, output)`` with captured stdout+stderr (or "OK" when empty),
        ``(False, "<ExcType>: <message>")`` when the snippet raised,
        ``(False, "Timeout after <timeout>s")`` on timeout, or
        ``(False, "No result (queue timeout)")`` when the child exited
        without reporting anything.
    """
    import time
    from queue import Empty

    def run_code(code: str, result_queue: mp.Queue):
        # Child-process entry point. `resource` is imported here because it is
        # POSIX-only and was previously referenced without a visible import.
        try:
            import resource
            limit_bytes = memory_limit_mb * 1024 * 1024
            resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
        except (ImportError, ValueError, OSError):
            pass  # Best effort: run without a memory cap

        import sys
        from io import StringIO

        old_stdout, old_stderr = sys.stdout, sys.stderr
        sys.stdout = captured_out = StringIO()
        sys.stderr = captured_err = StringIO()

        try:
            safe_globals = {
                '__builtins__': {
                    'print': print, 'range': range, 'len': len, 'sum': sum,
                    'min': min, 'max': max, 'abs': abs, 'int': int, 'float': float,
                    'str': str, 'list': list, 'dict': dict, 'set': set, 'tuple': tuple,
                    'sorted': sorted, 'enumerate': enumerate, 'zip': zip,
                    'pow': pow, 'round': round, 'divmod': divmod,
                    'True': True, 'False': False, 'None': None,
                    'bool': bool, 'type': type, 'isinstance': isinstance,
                },
            }

            import math as safe_math
            import itertools as safe_itertools
            import fractions as safe_fractions
            import decimal as safe_decimal
            import collections as safe_collections

            safe_globals['math'] = safe_math
            safe_globals['itertools'] = safe_itertools
            safe_globals['fractions'] = safe_fractions
            safe_globals['decimal'] = safe_decimal
            safe_globals['collections'] = safe_collections

            exec(code, safe_globals)
            output = captured_out.getvalue() + captured_err.getvalue()
            result_queue.put((True, output.strip() or "OK"))

        except MemoryError:
            result_queue.put((False, f"MemoryError: Exceeded {memory_limit_mb}MB"))
        except Exception as e:
            result_queue.put((False, f"{type(e).__name__}: {str(e)[:100]}"))
        finally:
            sys.stdout, sys.stderr = old_stdout, old_stderr

    # The target is a closure and cannot be pickled, so it needs the "fork"
    # start method; request it explicitly where the platform offers it.
    ctx = mp.get_context("fork") if "fork" in mp.get_all_start_methods() else mp
    result_queue = ctx.Queue()
    proc = ctx.Process(target=run_code, args=(code, result_queue))
    proc.start()

    # Drain the queue BEFORE joining. A child that has put a large result
    # cannot exit until the parent reads it, so join-then-get misreports big
    # outputs as timeouts.
    deadline = time.monotonic() + timeout
    result = None
    timed_out = False
    while result is None:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            timed_out = True
            break
        try:
            result = result_queue.get(timeout=min(0.05, remaining))
        except Empty:
            if not proc.is_alive():
                # Child is gone: allow one last read, then give up.
                try:
                    result = result_queue.get(timeout=1)
                except Empty:
                    break

    # Reap the child; escalate terminate -> kill if it will not exit.
    proc.join(0 if timed_out else 1)
    if proc.is_alive():
        proc.terminate()
        proc.join(2)
        if proc.is_alive():
            proc.kill()
            proc.join(1)

    if timed_out:
        return (False, f"Timeout after {timeout}s")
    if result is None:
        return (False, "No result (queue timeout)")
    return result


def extract_python_blocks(text: str) -> List[str]:
    """Return the stripped, non-empty bodies of fenced code blocks in *text*.

    Fences tagged ``python`` (any case) are preferred; untagged/any fences
    are used only when no python-tagged fence exists at all.
    """
    tagged = re.findall(r"```python\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    found = tagged if tagged else re.findall(r"```\s*(.*?)```", text, re.DOTALL)

    cleaned = []
    for body in found:
        stripped = body.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned

# Status line emitted once the code-execution helpers above are defined.
print("[BANSHEEV6] Code Execution: OK")

In [None]:
# =========================
# TIME MANAGEMENT HELPERS
# (Uses AdaptiveTimeBudget from cell-2)
# =========================

def get_time_budget_per_problem() -> float:
    """Get current budget from TIME_MANAGER.

    Thin wrapper around ``TIME_MANAGER.get_budget_for_next_problem()``;
    TIME_MANAGER is a module-level object defined in another cell.
    NOTE(review): the unit is presumably seconds - confirm against
    AdaptiveTimeBudget.
    """
    return TIME_MANAGER.get_budget_for_next_problem()


def should_continue_generation(question_id: str, start_t: float, completed_ids: set, budget: float) -> bool:
    """Tell the sampling loop whether to keep generating for this question.

    Returns False as soon as the question is already in ``completed_ids``
    (TIME_MANAGER is not consulted in that case); otherwise defers to
    ``TIME_MANAGER.should_continue(start_t, budget)``.
    """
    already_done = question_id in completed_ids
    # Short-circuit keeps TIME_MANAGER untouched for completed questions.
    return (not already_done) and bool(TIME_MANAGER.should_continue(start_t, budget))

# Status line emitted once the time-management helpers above are defined.
print("[BANSHEEV6] Time Management Helpers: OK")

In [None]:
# =============================================================================
# VLLM SERVER - LAZY STARTUP WITH PREFLIGHT VALIDATION
# =============================================================================
# ChatGPT Review Compliance:
#   1. Preflight checks run at import (lightweight, won't block serve())
#   2. Model path validated before subprocess launch
#   3. vLLM import validated before assuming it exists
#   4. Clear logging for debugging failed submissions
# =============================================================================
import subprocess
import os

# === CONFIGURATION ===
MODEL_PATH = "/kaggle/input/gpt-oss-120b/transformers/default/1"  # Directory checked by preflight and passed to vLLM as --model
VLLM_HOST = "0.0.0.0"  # Bind address passed to vLLM as --host (all interfaces)
VLLM_PORT = 8000  # Port passed to vLLM as --port

# === PREFLIGHT VALIDATION ===
# Both flags are overwritten by run_preflight().
_PREFLIGHT_PASSED = False  # True only when every preflight check succeeded
_VLLM_AVAILABLE = False  # True when `import vllm` succeeded

def run_preflight():
    """
    Lightweight preflight checks at import time.
    MUST NOT block or do heavy work - just validate environment.

    Checks, in order: vLLM importable, CUDA available (via torch), and
    MODEL_PATH is a readable directory containing a config-like file and at
    least one weight file. Every finding is printed with a [PREFLIGHT] prefix.

    Side effects:
        Sets the module globals ``_VLLM_AVAILABLE`` (vLLM import succeeded)
        and ``_PREFLIGHT_PASSED`` (no check recorded an error).

    Returns:
        bool: the final value of ``_PREFLIGHT_PASSED``. Failures never raise;
        they are only reported.
    """
    global _PREFLIGHT_PASSED, _VLLM_AVAILABLE
    
    print("=" * 60)
    print("[PREFLIGHT] Running environment validation...")
    print("=" * 60)
    
    # Short error codes collected across all checks; empty means success.
    errors = []
    
    # Check 1: vLLM installed?
    try:
        import vllm
        print(f"[PREFLIGHT] ✓ vLLM installed: v{vllm.__version__}")
        _VLLM_AVAILABLE = True
    except ImportError as e:
        print(f"[PREFLIGHT] ✗ vLLM NOT INSTALLED: {e}")
        errors.append("vllm_missing")
    
    # Check 2: CUDA available?
    try:
        import torch
        if torch.cuda.is_available():
            gpu_name = torch.cuda.get_device_name(0)
            gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
            print(f"[PREFLIGHT] ✓ CUDA available: {gpu_name} ({gpu_mem:.1f}GB)")
        else:
            print("[PREFLIGHT] ✗ CUDA NOT AVAILABLE - vLLM will fail!")
            errors.append("no_cuda")
    except Exception as e:
        # Also covers torch itself being missing or failing to import.
        print(f"[PREFLIGHT] ✗ CUDA check failed: {e}")
        errors.append("cuda_error")
    
    # Check 3: Model path exists AND is valid?
    if os.path.exists(MODEL_PATH) and os.path.isdir(MODEL_PATH):
        try:
            contents = os.listdir(MODEL_PATH)
            print(f"[PREFLIGHT] ✓ Model path exists: {MODEL_PATH}")
            print(f"[PREFLIGHT]   Files: {contents[:8]}{'...' if len(contents) > 8 else ''}")
            
            # ChatGPT: "check readability, not just existence"
            # config.json is REQUIRED for vLLM to load the model
            # NOTE(review): this accepts ANY name containing "config"
            # (e.g. generation_config.json), not config.json specifically.
            if any("config" in f.lower() for f in contents):
                print(f"[PREFLIGHT]   ✓ config file found")
            else:
                print(f"[PREFLIGHT]   ✗ NO config file - vLLM will fail!")
                errors.append("no_config")
            
            # Check for model weights
            # Only the top level of MODEL_PATH is inspected (no recursion).
            has_weights = any(f.endswith(('.bin', '.safetensors', '.pt')) for f in contents)
            if has_weights:
                print(f"[PREFLIGHT]   ✓ model weights found")
            else:
                print(f"[PREFLIGHT]   ✗ NO model weights - vLLM will fail!")
                errors.append("no_weights")
                
        except PermissionError:
            print(f"[PREFLIGHT] ✗ Cannot read model path (permission denied)")
            errors.append("model_unreadable")
        except Exception as e:
            print(f"[PREFLIGHT] ✗ Cannot list model path: {e}")
            errors.append("model_unreadable")
    else:
        print(f"[PREFLIGHT] ✗ Model path NOT FOUND: {MODEL_PATH}")
        # Debug: show what IS available
        for check_path in ["/kaggle/input", "/kaggle"]:
            if os.path.exists(check_path):
                print(f"[PREFLIGHT]   {check_path} contains: {os.listdir(check_path)[:10]}")
        errors.append("model_missing")
    
    # Summary
    if errors:
        print(f"[PREFLIGHT] ⚠ WARNINGS: {errors}")
        print("[PREFLIGHT] Server may fail - fallback solver will be used")
        _PREFLIGHT_PASSED = False
    else:
        print("[PREFLIGHT] ✓ All checks passed")
        _PREFLIGHT_PASSED = True
    
    print("=" * 60)
    return _PREFLIGHT_PASSED

# Run preflight NOW (at import, before serve())
run_preflight()

# === VLLM PROCESS MANAGEMENT ===
vllm_process = None  # subprocess.Popen handle, assigned by start_vllm_server()
_VLLM_STARTED = False  # True once a start attempt has been made

# Force offline mode
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"

# vLLM backend config
# These are set in this process's environment; the vLLM subprocess is started
# without an explicit env argument.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Argument vector for the OpenAI-compatible vLLM API server subprocess.
VLLM_COMMAND = [
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_PATH,
    "--served-model-name", "vllm-model",
    "--tensor-parallel-size", "1",
    "--max-num-seqs", "16",
    "--gpu-memory-utilization", "0.90",
    "--host", VLLM_HOST,
    "--port", str(VLLM_PORT),
    "--dtype", "auto",
    "--max-model-len", "65536",
    "--trust-remote-code",
]


def start_vllm_server():
    """
    Start vLLM server lazily (called on first predict).
    Returns immediately - server starts in background.

    Only one launch attempt is made per "started" cycle: once
    ``_VLLM_STARTED`` is set, later calls report the outcome of that attempt
    instead of relaunching. Previously every later call returned True even
    when the attempt had been skipped or had failed, so callers went on to
    probe a server that was never launched.

    Side effects:
        Sets ``_VLLM_STARTED``; sets ``vllm_process`` to the Popen handle on
        success and to None on any failure. Server output is written to
        /kaggle/working/vllm.log (truncated on each launch).

    Returns:
        bool: True when a server process has been launched (it may still be
        loading or may have exited since), False when launch was skipped or
        failed.
    """
    global vllm_process, _VLLM_STARTED
    
    if _VLLM_STARTED:
        # vllm_process is None exactly when the recorded attempt did not
        # produce a process.
        return vllm_process is not None
    
    _VLLM_STARTED = True
    
    # Don't even try if preflight failed critically
    if not _VLLM_AVAILABLE:
        vllm_process = None
        print("[VLLM] Skipping - vLLM not available")
        return False
    
    if not os.path.exists(MODEL_PATH):
        vllm_process = None
        print(f"[VLLM] Skipping - model path missing: {MODEL_PATH}")
        return False
    
    print("[VLLM] Starting server (lazy startup)...")
    print(f"[VLLM] Command: {' '.join(VLLM_COMMAND[:6])}...")
    
    log_path = "/kaggle/working/vllm.log"
    try:
        # The child keeps its own copy of the log descriptor, so closing ours
        # when the `with` block ends is safe.
        with open(log_path, "w") as logfile:
            vllm_process = subprocess.Popen(
                VLLM_COMMAND,
                stdout=logfile,
                stderr=subprocess.STDOUT,
                start_new_session=True
            )
    except Exception as e:
        # Launch boundary: report any failure and let the caller fall back.
        vllm_process = None
        print(f"[VLLM] Failed to start: {e}")
        return False
    
    print(f"[VLLM] Server starting (PID: {vllm_process.pid}, log: {log_path})")
    return True

# Status line: preflight has run (above) and the server will start on demand.
print("[BANSHEEV6] Preflight complete. vLLM configured for lazy start.")


In [None]:
# =============================================================================
# OPENAI CLIENT + SERVER HEALTH MANAGEMENT
# =============================================================================
# ChatGPT Review Compliance:
#   - Probe calls are FAST (short timeouts, won't block relay)
#   - 8-minute initial budget, then quick checks
#   - Never fully gives up (server might come up late)
#   - Strict per-call timeouts to avoid blocking
# =============================================================================
from openai import OpenAI
import httpx

# === CLIENT CONFIGURATION ===
# The client talks to the local vLLM server over loopback.
VLLM_BASE_URL = f"http://127.0.0.1:{VLLM_PORT}/v1"

# Also exported through the environment; the key is a placeholder.
os.environ["OPENAI_API_BASE"] = VLLM_BASE_URL
os.environ["OPENAI_API_KEY"] = "sk-dummy"

# Client with STRICT timeouts (ChatGPT: "keep probe calls very fast")
client = OpenAI(
    base_url=VLLM_BASE_URL,
    api_key="sk-dummy",
    timeout=httpx.Timeout(10.0, connect=5.0),  # 10s per read/write/pool operation (not a total), 5s connect
    max_retries=0,  # Don't retry - we handle retries ourselves
)

# === SERVER STATE ===
# Module-level state declared `global` by ensure_server().
_CLIENT_READY = False  # Set True once a health check has succeeded
_SERVER_RESTARTED = False
_INITIAL_PROBE_DONE = False
_SERVER_PROBE_START = None  # time.time() of the first probe, None until then

# === TIMING CONSTANTS (ChatGPT-aligned) ===
# All values are in seconds.
INITIAL_PROBE_BUDGET = 480    # 8 minutes total initial effort
RESTART_AT = 240              # Restart at 4-minute mark
QUICK_PROBE_TIMEOUT = 3       # Quick check after initial period
PROBE_INTERVAL = 3            # Check every 3s
HEALTH_CHECK_TIMEOUT = 2      # Individual health check timeout


def check_server_health() -> bool:
    """
    Fast health check with strict timeout.
    ChatGPT: "keep probe calls very fast (short timeouts)"

    First tries ``GET /health`` on the local vLLM port with a
    HEALTH_CHECK_TIMEOUT-second timeout; any response settles the result
    (True only for HTTP 200). If that request cannot be made at all
    (``requests`` missing, connection refused, timeout), falls back to
    listing models through the OpenAI client, which uses that client's own,
    longer timeout.

    Only ``Exception`` is caught, so Ctrl-C / SystemExit still interrupt a
    polling loop instead of being swallowed as "server not ready".

    Returns:
        bool: True when the server answered a probe, False otherwise.
    """
    try:
        # Use requests directly for tighter timeout control
        import requests
        resp = requests.get(
            f"http://127.0.0.1:{VLLM_PORT}/health",
            timeout=HEALTH_CHECK_TIMEOUT
        )
        return resp.status_code == 200
    except Exception:
        pass  # Fall through to the client-based probe
    
    # Fallback: try models endpoint
    try:
        models = client.models.list()
        return bool(models)
    except Exception:
        return False


def restart_vllm_server():
    """Kill and restart vLLM server (called once at 4-min mark).

    A still-running server process is asked to terminate and given 10s; if
    that fails it is killed and then reaped, so the old process is gone
    before the new one is launched. ``_VLLM_STARTED`` is then cleared so that
    start_vllm_server() performs a fresh launch.

    The outcome is reported from start_vllm_server()'s return value: the
    previous version printed "Restarted" with the old PID even when the
    relaunch had failed.
    """
    global vllm_process, _VLLM_STARTED
    
    print("[SERVER] Restarting vLLM...")
    
    if vllm_process and vllm_process.poll() is None:
        try:
            vllm_process.terminate()
            vllm_process.wait(timeout=10)
        except Exception:
            # Graceful stop failed or timed out: force it, best effort.
            try:
                vllm_process.kill()
                vllm_process.wait(timeout=5)  # reap so the process is really gone
            except Exception:
                pass
    
    _VLLM_STARTED = False
    relaunched = start_vllm_server()
    
    if relaunched and vllm_process:
        print(f"[SERVER] Restarted (PID: {vllm_process.pid})")
    else:
        print("[SERVER] Restart failed - server not running")


def ensure_server() -> bool:
    """
    Ensure vLLM server is ready. NEVER FULLY GIVES UP.
    
    Strategy (ChatGPT-aligned):
      Phase 1 (0-8 min): Serious probing with restart at 4min
      Phase 2 (8+ min): Quick 3s checks each problem
    
    Blocking behaviour: a Phase 1 call polls for at most min(30, remaining
    budget) seconds before returning; a Phase 2 call polls until
    QUICK_PROBE_TIMEOUT seconds have passed (a health check that is already
    in flight can run past that deadline). Progress lives in module-level
    flags, so repeated calls continue where the previous one stopped.
    
    Returns:
        True  - Server ready, use LLM
        False - Not ready yet, use fallback (but check again next problem).
                Also returned when start_vllm_server() reports failure or
                the competition clock has expired during Phase 1.
    """
    global _CLIENT_READY, _SERVER_RESTARTED, _INITIAL_PROBE_DONE, _SERVER_PROBE_START
    
    # Fast path: already confirmed working
    if _CLIENT_READY:
        return True
    
    # Start server if not started
    if not start_vllm_server():
        return False  # Can't even start (missing deps/model)
    
    # Initialize probe timer
    # (only on the first call that gets this far; later calls reuse it)
    if _SERVER_PROBE_START is None:
        _SERVER_PROBE_START = time.time()
    
    total_elapsed = time.time() - _SERVER_PROBE_START
    
    # =========================================================================
    # PHASE 1: INITIAL 8-MINUTE PROBE
    # =========================================================================
    if not _INITIAL_PROBE_DONE:
        remaining = INITIAL_PROBE_BUDGET - total_elapsed
        
        if remaining > 0:
            # Poll in windows of at most 30s so a single call cannot consume
            # the whole probe budget.
            probe_time = min(30, remaining)
            print(f"[CLIENT] Probing... {total_elapsed:.0f}s/{INITIAL_PROBE_BUDGET}s")
            deadline = time.time() + probe_time
            
            while time.time() < deadline:
                if TIME_MANAGER.is_expired():
                    print("[CLIENT] Competition time expired")
                    return False
                
                if check_server_health():
                    startup_time = time.time() - _SERVER_PROBE_START
                    print(f"[CLIENT] ✓ Server ready after {startup_time:.0f}s")
                    _CLIENT_READY = True
                    return True
                
                time.sleep(PROBE_INTERVAL)
            
            # Restart at 4-minute mark
            # This is only evaluated after a window ends without success, so
            # the restart fires at the first window boundary at or after
            # RESTART_AT, and at most once (_SERVER_RESTARTED).
            total_elapsed = time.time() - _SERVER_PROBE_START
            if not _SERVER_RESTARTED and total_elapsed >= RESTART_AT:
                _SERVER_RESTARTED = True
                print(f"[CLIENT] Not ready at {total_elapsed:.0f}s - restarting")
                restart_vllm_server()
            
            # Still have budget? Return, will probe again next call
            if total_elapsed < INITIAL_PROBE_BUDGET:
                return False
        
        # Budget exhausted: switch permanently to Phase 2 and fall through to
        # one quick check within this same call.
        print("[CLIENT] Initial 8-min probe done → quick check mode")
        _INITIAL_PROBE_DONE = True
    
    # =========================================================================
    # PHASE 2: QUICK CHECK (never give up)
    # =========================================================================
    # NOTE(review): unlike Phase 1, this loop does not consult
    # TIME_MANAGER.is_expired().
    deadline = time.time() + QUICK_PROBE_TIMEOUT
    while time.time() < deadline:
        if check_server_health():
            print("[CLIENT] ✓ Server came up late!")
            _CLIENT_READY = True
            return True
        time.sleep(1)
    
    return False


def fallback_solver(problem_text: str) -> int:
    """
    Smarter heuristic fallback when LLM unavailable.
    ChatGPT: "add a second-tier non-LLM heuristic"

    Extracts every run of digits that is at most 5 characters long and picks
    a guess from keyword patterns, checked in this order: "mod N",
    "remainder ... by N", counting, extrema, product, sum, and finally the
    last number in the text.

    Args:
        problem_text: Raw problem statement.

    Returns:
        An integer guess, always within 0..99999. Returns 42 when the text
        contains no usable number.
    """
    t = problem_text.lower()
    numbers = [int(x) for x in re.findall(r'\d+', problem_text) if len(x) <= 5]

    if not numbers:
        return 42

    # Pattern 1: Modular arithmetic "mod N"
    mod_match = re.search(r'mod(?:ulo)?\s*(\d+)', t)
    if mod_match:
        m = int(mod_match.group(1))
        if m > 0:
            # The modulus is not length-limited (e.g. 1000000007), so the
            # remainder can exceed 99999; fold it back into the answer range.
            return (sum(numbers) % m) % 100000

    # Pattern 2: "remainder when divided by N"
    rem_match = re.search(r'remainder.*(?:divided|by)\s*(\d+)', t)
    if rem_match:
        d = int(rem_match.group(1))
        if d > 0:
            # Same range guard as for Pattern 1.
            return (sum(numbers) % d) % 100000

    # Pattern 3: Counting problems
    if any(kw in t for kw in ['how many', 'count', 'number of ways', 'number of']):
        # Every entry of `numbers` has at most 5 digits, so it is always
        # below 100000; the former `else 1` branch could never run.
        # NOTE(review): this branch reduces mod 10000 while the others use
        # 100000 - confirm whether that is intentional.
        return numbers[-1] % 10000

    # Pattern 4: Extrema
    if any(kw in t for kw in ['largest', 'maximum', 'greatest', 'max']):
        return max(numbers) % 100000
    if any(kw in t for kw in ['smallest', 'minimum', 'least', 'min']):
        return min(numbers) % 100000

    # Pattern 5: Product
    if 'product' in t:
        p = 1
        for n in numbers[:4]:  # only the first four numbers are multiplied
            p = (p * n) % 100000
        return p

    # Pattern 6: Sum
    if 'sum' in t:
        return sum(numbers) % 100000

    # Pattern 7: Last number (often answer-related)
    return numbers[-1] % 100000


print(f"[BANSHEEV6] Client: strict {HEALTH_CHECK_TIMEOUT}s health checks, 8-min probe budget")


In [None]:
# =========================
# PROMPT ENGINEERING
# 32 System Prompts from Zombie (23/50)
# =========================

# Checklist of problem-solving heuristics as one prompt-ready text block.
# The last line restates the required output format (single boxed integer
# in 0..99999). `\\boxed` is an escaped backslash, so the text contains a
# literal `\boxed{}`.
COGNITIVE_MACROS = """
COGNITIVE TOOLKIT:
- Try small cases to conjecture, then prove.
- Search for invariants / monovariants.
- Work backwards from desired form.
- Check extremes / boundary cases.
- Exploit symmetry / relabeling.
- Pigeonhole principle.
- Parity / modular arithmetic.
- Bounding (upper/lower) and squeeze.
- Complementary counting.
- Construct bijection or involution.
- Translate to algebra/graph/geometry.
- Sanity-check units, integrality, constraints.
OUTPUT: Finish with exactly one integer in \\boxed{}, 0 <= answer <= 99999.
"""

# All 32 battle-tested prompts from zombie_23.ipynb (scored 23/50)
# Each list entry is ONE prompt: the two adjacent string literals under each
# numbered comment are joined by Python's implicit literal concatenation
# (note the trailing space at the end of each first line). Every prompt ends
# by asking for the final answer in \boxed{} within 0..99999.
SYSTEM_PROMPTS = [
    # 1. Elite mathematician with code execution
    "CRITICAL: You have Python code execution. Write ```python blocks for verification. "
    "You are an elite mathematics researcher. Return only final integer in \\boxed{}, 0 <= answer <= 99999.",
    
    # 2. IMO competitor rigor
    "You are an IMO competitor. Rigorously define variables, explore multiple strategies, "
    "perform full case analysis, justify nontrivial steps. Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 3. Adversarial self-refutation
    "Solve with full rigor. After candidate solution, actively attempt refutation by "
    "searching for counterexamples, stress-testing edge cases. Return in \\boxed{}. 0 <= answer <= 99999.",

    # 4. Invariant-first under time pressure
    "Under IMO time pressure: identify key invariant/symmetry/extremal principle early, "
    "avoid brute force unless justified. Return only final integer in \\boxed{}. 0 <= answer <= 99999.",
    
    # 5. Multiple solution approaches
    "Attempt at least two fundamentally different solution approaches. Proceed with more rigorous, "
    "use other as verification. Return only verified final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 6. First principles restart on inconsistency
    "If step relies on unproven assumption or inconsistency, restart from first principles. "
    "Return only final verified integer in \\boxed{}. 0 <= answer <= 99999.",

    # 7. Tool-integrated reasoning with early stopping
    "Tool-integrated reasoning, apply early-stopping when confident. Return verified final answer "
    "in \\boxed{}. 0 <= answer <= 99999.",

    # 8. Adversarial academic review
    "Conduct adversarial academic review of proposed solution, then provide constructive critique. "
    "Return only verified final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 9. Small answer skepticism
    "WARNING: Small answers (0, 1, 2, 3) are often wrong on competition problems. "
    "If you get a small answer, re-verify with extra rigor. Return in \\boxed{}. 0 <= answer <= 99999.",

    # 10. Constraint propagation
    "Apply constraint propagation: what does the problem FORCE to be true? What CANNOT happen? "
    "Build the answer from forced conclusions. Return in \\boxed{}. 0 <= answer <= 99999.",

    # 11. Backwards reasoning
    "Work backwards: What form must the answer have? What properties? Use this to constrain search. "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 12. Generating function / algebraic identity
    "Consider generating functions, algebraic identities, or polynomial methods. "
    "Competition math often has elegant closed forms. Return in \\boxed{}. 0 <= answer <= 99999.",

    # 13. Combinatorial bijection
    "Seek a bijection or double-counting argument. Competition problems often have elegant "
    "combinatorial structures. Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 14. Modular arithmetic focus
    "Focus on modular arithmetic patterns. Check divisibility, remainders, Chinese Remainder Theorem. "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 15. Geometry to algebra translation
    "If geometry: translate to coordinates or trigonometry. If algebra: consider geometric interpretation. "
    "Return in \\boxed{}. 0 <= answer <= 99999.",

    # 16. Extremal principle
    "Apply extremal principle: consider maximum/minimum elements, argue about their properties. "
    "Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 17. Graph theory modeling
    "Model the problem as a graph if applicable. Vertices, edges, degrees, paths, cycles. "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 18. Recurrence relation
    "Look for recurrence relations. Define f(n), find f(n) in terms of earlier values. "
    "Return in \\boxed{}. 0 <= answer <= 99999.",

    # 19. Pigeonhole principle
    "Apply pigeonhole principle: if too many pigeons, some hole has multiple. What are the pigeons? Holes? "
    "Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 20. Greedy algorithm validation
    "Consider greedy approach. Prove it works or find counterexample. "
    "Return only verified final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 21. Induction strategy
    "Prove by induction if pattern emerges. State base case, inductive step clearly. "
    "Return in \\boxed{}. 0 <= answer <= 99999.",

    # 22. Probabilistic method intuition
    "Use probabilistic intuition: if random approach gives expected value X, constructive solution exists. "
    "Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 23. Number theory deep dive
    "For number theory: check prime factorization, Euler's theorem, quadratic residues. "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 24. Functional equation approach
    "For functional equations: try f(0), f(1), injectivity, surjectivity, substitutions. "
    "Return in \\boxed{}. 0 <= answer <= 99999.",

    # 25. Inequality techniques
    "For inequalities: AM-GM, Cauchy-Schwarz, Jensen, rearrangement. When does equality hold? "
    "Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 26. Sequence analysis
    "Analyze sequences: arithmetic, geometric, periodic, eventually periodic, bounded? "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 27. Polynomial roots and coefficients
    "For polynomials: Vieta's formulas, root behavior, coefficient patterns. "
    "Return in \\boxed{}. 0 <= answer <= 99999.",

    # 28. Game theory / strategy
    "If game theory: who has winning strategy? What invariant does winner maintain? "
    "Return only final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 29. Coloring arguments
    "Consider coloring arguments: 2-color, checkerboard, mod-k coloring. What's forced? "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",

    # 30. Diophantine equation techniques
    "For Diophantine equations: factorization, descent, modular constraints. "
    "Return in \\boxed{}. 0 <= answer <= 99999.",

    # 31. Confidence scoring
    "Rate your confidence 0-100 after solving. If below 85, try alternative approach. "
    "Return only verified final answer in \\boxed{}. 0 <= answer <= 99999.",

    # 32. Triangulation algorithm
    "Use triangulation: first determine answer's order of magnitude, then narrow range, then pinpoint. "
    "Return only final integer in \\boxed{}. 0 <= answer <= 99999.",
]

# Follow-up prompts for reflexion
# Canned user-turn messages for a second round with the model:
#   NO_ANSWER    - the reply contained no boxed answer
#   SMALL_ANSWER - the reply was a small number (text mentions 0-10)
#   VERIFY       - generic re-check request
#   GUESS        - ask for a best guess when time is short
# NOTE(review): generate_solution() in this file builds its own follow-up
# strings inline rather than using these constants - confirm which set is
# meant to be authoritative.
FOLLOWUP_NO_ANSWER = "You did not provide a boxed answer. Please place your final integer answer in \\boxed{}."
FOLLOWUP_SMALL_ANSWER = "You answered with a small number. Small answers (0-10) are often wrong on IMO problems. Are you absolutely certain? Re-verify."
FOLLOWUP_VERIFY = "Please verify your answer one more time. Check arithmetic, edge cases, and logical steps."
FOLLOWUP_GUESS = "Time is running out. Make your best educated guess and put it in \\boxed{}."


def sanity_check_answer(question_text: str, ans: int) -> bool:
    """Return False when `ans` contradicts an explicit constraint in the text.

    Checks, in order:
      1. range: the answer must lie in 0..99999;
      2. parity: the phrases "answer is even" / "answer is odd";
      3. divisibility: the first "divisible by N" / "multiple of N" phrase
         (N has 1-5 digits; N == 0 is ignored).

    Matching is case-insensitive. Returns True when nothing is violated.
    """
    if ans < 0 or 99999 < ans:
        return False

    lowered = question_text.lower()

    # (phrase pattern, remainder mod 2 that the phrase demands)
    parity_rules = (
        (r'\banswer\s+is\s+even\b', 0),
        (r'\banswer\s+is\s+odd\b', 1),
    )
    for pattern, wanted in parity_rules:
        if ans % 2 != wanted and re.search(pattern, lowered):
            return False

    hit = re.search(r'\b(?:divisible by|multiple of)\s+(\d{1,5})\b', lowered)
    if hit is None:
        return True

    divisor = int(hit.group(1))
    # A zero divisor cannot be tested, so it never rejects the answer.
    return divisor == 0 or ans % divisor == 0

print(f"[BANSHEEV6] Prompts: {len(SYSTEM_PROMPTS)} system prompts loaded")

In [None]:
# =========================
# GENERATION HELPERS
# =========================

def annealed_temperature(step: int, total: int, t_start: float = 1.4, t_end: float = 0.2) -> float:
    """Geometrically interpolate the sampling temperature.

    Yields `t_start` at step 0 and `t_end` at step `total - 1`; with a
    single step (or fewer) it returns `t_end` directly.
    """
    if total <= 1:
        return t_end
    progress = step / (total - 1)
    ratio = t_end / t_start
    return t_start * (ratio ** progress)

def annealed_max_tokens(step: int, total: int, start: int = 2048, end: int = 640) -> int:
    """H100: slightly higher token floor for better reasoning.

    Linear schedule from `start` (step 0) to `end` (step `total - 1`),
    truncated to an int. Returns `end` when there is at most one step.
    """
    if total <= 1:
        return end
    fraction = step / (total - 1)
    span = end - start
    return int(start + fraction * span)

def annealed_top_p(step: int, total: int, start: float = 0.95, end: float = 0.75) -> float:
    """Linearly move top-p from `start` (step 0) to `end` (step `total - 1`).

    Returns `end` when there is at most one step.
    """
    if total <= 1:
        return end
    fraction = step / (total - 1)
    delta = end - start
    return start + fraction * delta

def repetition_rate(text: str, window: int = 120) -> float:
    """Fraction of duplicated tokens among the last `window` tokens.

    Tokens are lowercase word runs or single non-space characters. Returns
    0.0 when the text has fewer than `window` tokens; otherwise
    1 - (distinct tokens / tokens in the tail).
    """
    tokens = re.findall(r'\w+|\S', text.lower())
    if len(tokens) < window:
        return 0.0
    recent = tokens[-window:]
    distinct = len(set(recent))
    return 1.0 - (distinct / max(1, len(recent)))

def tail_similarity(text: str, window: int = 900) -> float:
    """Check if text is repeating itself.

    Compares the last `window` characters with the `window` characters just
    before them, using the Jaccard overlap of the two lowercase character
    *sets* (order and frequency are ignored). Returns 0.0 when the text is
    shorter than two windows or either slice is empty.
    """
    if len(text) < 2 * window:
        return 0.0

    earlier_chars = set(text[-2*window:-window].lower())
    latest_chars = set(text[-window:].lower())
    if not earlier_chars or not latest_chars:
        return 0.0

    shared = earlier_chars & latest_chars
    combined = earlier_chars | latest_chars
    return len(shared) / len(combined)

print("[BANSHEEV6] Generation Helpers: OK")

In [None]:
# =========================
# MAIN SOLVER CORE
# =========================

import threading
# Lock held by generate_solution() while it appends to the per-question
# containers below.
STATE_LOCK = threading.Lock()

# Global state
# All containers are keyed by question id.
completed_question_ids = set()                     # questions that need no further generations
question_id_to_counter = defaultdict(Counter)      # answer -> number of samples that produced it
question_id_to_samples = defaultdict(list)         # valid answers in arrival order
question_id_to_traces = defaultdict(list)          # raw response text of each generation
question_id_to_cic_history = defaultdict(list)     # NOTE(review): not touched in this cell - confirm usage


def is_valid_answer(ans) -> bool:
    """Check if answer is valid.

    An answer is valid when it is not None and `int(ans)` lies in
    0..99999. Anything that cannot be converted with `int()` is treated as
    invalid rather than raising.
    """
    try:
        return ans is not None and 0 <= int(ans) <= 99999
    except Exception:
        # `Exception` instead of a bare except: conversion errors still
        # mean "invalid", but SystemExit / KeyboardInterrupt propagate.
        return False


def vote_answer(question_id: str, force_answer: bool = False) -> Optional[int]:
    """Log-weighted voting.

    Each distinct answer is scored as log(1.25 + |answer|) * count, so at
    equal counts an answer of larger magnitude outranks a smaller one.

    Returns the top-scoring answer only when `force_answer` is True.
    Without it (or when the question has no recorded answers) the result is
    None.
    """
    tallies = question_id_to_counter[question_id]
    if not tallies:
        return None

    weighted = {
        value: math.log(1.25 + abs(value)) * count
        for value, count in tallies.items()
    }

    # Ascending by (score, raw count, value); the winner is the last entry.
    ranking = sorted(
        (score, tallies[value], value) for value, score in weighted.items()
    )

    if not (force_answer and ranking):
        return None

    if RUNTIME.VERBOSE_LOGGING:
        print(f"[VOTE] {sum(tallies.values())} attempts")
        # Last three entries, best first.
        for _score, count, value in ranking[:-4:-1]:
            print(f"  {value}: count={count}")

    return ranking[-1][-1]


def generate_solution(
    question_text: str,
    question_id: str,
    solution_index: int,
    system_prompt: str,
    budget: float,
    total_generations: int = 8,
) -> Optional[int]:
    """
    Generate solution with:
    - NON-STREAMING (safer)
    - Budget-tied timeout
    - Uses extract_any_answer from Cell 5

    Runs at most two chat turns. After the first turn a follow-up is sent
    when no valid answer was extracted, or when the answer is <= 10 (asks
    the model to double-check). The accumulated text of all turns is then
    parsed once more for the final answer.

    Args:
        question_text: Problem statement sent as the user message.
        question_id: Key for the shared per-question state and the
            ``solutions/<question_id>/`` trace directory.
        solution_index: Index of this sample; drives the annealing schedule
            and the trace file name.
        system_prompt: System message for this sample.
        budget: Time allowance in seconds for this generation.
        total_generations: Number of samples the annealing schedule spans.

    Returns:
        The extracted answer when it is valid (also recorded in the shared
        counter and sample list), otherwise None.
    """
    if question_id in completed_question_ids:
        return None
    if TIME_MANAGER.is_expired():
        return None

    gen_start = time.time()
    os.makedirs(f"solutions/{question_id}", exist_ok=True)

    # Use annealed parameters from Cell 12
    temperature = annealed_temperature(solution_index, total_generations)
    max_tokens = annealed_max_tokens(solution_index, total_generations)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question_text},
    ]

    text_response_to_save = ""
    max_iterations = 2

    for iteration in range(max_iterations):
        if question_id in completed_question_ids:
            break
        if TIME_MANAGER.is_expired():
            break
        if not TIME_MANAGER.should_continue(gen_start, budget):
            break

        # Budget-tied timeout, recomputed for every request so the second
        # turn only gets what is left of the budget (5s floor, 90s ceiling).
        remaining_budget = max(10, budget - (time.time() - gen_start))
        request_timeout = min(90, remaining_budget - 5)

        try:
            # NON-STREAMING
            resp = client.chat.completions.create(
                model="vllm-model",
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                extra_body=dict(min_p=0.02, stop_token_ids=stop_token_ids),
                timeout=request_timeout,
            )

            text_response = resp.choices[0].message.content or ""
            messages.append({"role": "assistant", "content": text_response})
            text_response_to_save += text_response

        except Exception as e:
            if RUNTIME.VERBOSE_LOGGING:
                print(f"[GEN] Error: {e}")
            break

        # Use extract_any_answer from Cell 5
        ans = extract_any_answer(text_response_to_save)

        if is_valid_answer(ans):
            if ans <= 10 and iteration == 0:
                # Small answers get one "are you sure" round before acceptance.
                user_follow_up = "Are you sure? Double-check your answer."
                messages.append({"role": "user", "content": user_follow_up})
                text_response_to_save += "\n===\n" + user_follow_up + "\n===\n"
            else:
                break
        else:
            if iteration == 0:
                user_follow_up = "Place your final answer in \\boxed{}."
                messages.append({"role": "user", "content": user_follow_up})
                text_response_to_save += "\n===\n" + user_follow_up + "\n===\n"

    # Final extraction
    ans = extract_any_answer(text_response_to_save)

    # Save trace (thread-safe)
    if text_response_to_save:
        with STATE_LOCK:
            question_id_to_traces[question_id].append(text_response_to_save)

        # Writing the trace file is best-effort and must never fail a sample.
        try:
            suffix = f"-{ans}" if is_valid_answer(ans) else ""
            trace_path = f"solutions/{question_id}/{solution_index:04d}{suffix}.txt"
            # Explicit UTF-8: model output may contain non-ASCII math symbols.
            with open(trace_path, "w", encoding="utf-8") as f:
                f.write(text_response_to_save)
        except Exception as e:
            if RUNTIME.VERBOSE_LOGGING:
                print(f"[GEN] Trace save failed: {e}")

    if is_valid_answer(ans):
        with STATE_LOCK:
            question_id_to_counter[question_id][ans] += 1
            question_id_to_samples[question_id].append(ans)
        # Called without force_answer, so the return value is always None here.
        vote_answer(question_id)
        return ans

    return None


print("[BANSHEEV6] Main solver core: NON-STREAMING, uses Cell 5 & 12 functions")

In [None]:
# =========================
# TWO-PHASE REASONING (Tier 3)
# =========================
# Phase A: Normal reasoning to get candidate answer
# Phase B: Short arithmetic recomputation to catch slips
#
# Catches: last-line arithmetic errors, sign mistakes, modulo slips
# Cost: ~256 extra tokens per verified sample
# =========================

# str.format templates with a single {answer} field. Doubled braces are
# format escapes: `\\boxed{{}}` renders as `\boxed{}` and
# `\\boxed{{{answer}}}` as `\boxed{<answer>}`.
#   [0] step-by-step recomputation of the final arithmetic
#   [1] general sanity check
#   [2] modular-arithmetic check
# run_phase_b() picks [2] for problems mentioning "mod"/"remainder" and [0]
# otherwise; it does not select [1].
PHASE_B_PROMPTS = [
    """The candidate answer is {answer}. 

Before accepting this, carefully recompute the final arithmetic step by step:
1. What was the final expression/calculation?
2. Compute each step explicitly
3. Verify the result

If the arithmetic confirms {answer}, output \\boxed{{{answer}}}.
If you find an error, output the corrected answer in \\boxed{{}}.""",

    """A solution claims the answer is {answer}.

Quick sanity check:
- Does this value make sense given the problem constraints?
- Verify the final calculation that produced {answer}
- Check for any sign errors or off-by-one mistakes

Final verified answer in \\boxed{{}}.""",

    """The proposed answer is {answer}.

If this involves modular arithmetic:
- Verify: result mod M = {answer}?
- Check: is {answer} in the valid range [0, M-1]?

Recompute the final step and confirm in \\boxed{{}}.""",
]


def should_run_phase_b(answer: int, difficulty: DifficultyAssessment, elapsed: float, budget: float) -> bool:
    """Decide whether to run Phase B verification.

    Never runs with less than 15s of budget left or for EASY problems;
    always runs for HARD ones. For any other level it runs only when the
    answer looks prone to an arithmetic slip: it ends in 0, exceeds 10000,
    lies within 5 of a multiple of 100, or is one of 0, 1, 2.
    """
    if budget - elapsed < 15:
        return False

    if difficulty.level == Difficulty.EASY:
        return False
    if difficulty.level == Difficulty.HARD:
        return True

    # MEDIUM: run if answer has arithmetic-slip risk
    ends_in_zero = answer % 10 == 0
    is_large = answer > 10000
    near_hundred = abs(answer - round(answer, -2)) < 5
    is_tiny = answer in [0, 1, 2]
    return ends_in_zero or is_large or near_hundred or is_tiny


def run_phase_b(question_text: str, candidate_answer: int, question_id: str, timeout: float = 20.0) -> Optional[int]:
    """Phase B: Arithmetic verification. Short prompt to recompute final arithmetic.

    Sends the first 1000 characters of the problem together with a
    verification prompt built around `candidate_answer`.

    Returns:
        None if the competition clock has expired; the model's answer when
        one in 0..99999 was extracted; otherwise `candidate_answer`
        (also on any request error).
    """
    if TIME_MANAGER.is_expired():
        return None

    lowered = question_text.lower()
    is_modular = "mod" in lowered or "remainder" in lowered
    template = PHASE_B_PROMPTS[2 if is_modular else 0]
    phase_b_prompt = template.format(answer=candidate_answer)

    system_msg = {"role": "system", "content": "You are verifying a mathematical calculation. Be brief and precise."}
    user_msg = {"role": "user", "content": f"Problem: {question_text[:1000]}\n\n{phase_b_prompt}"}

    try:
        resp = client.chat.completions.create(
            model="vllm-model",
            messages=[system_msg, user_msg],
            temperature=0.3,
            max_tokens=256,
            extra_body=dict(min_p=0.02, stop_token_ids=stop_token_ids),
            timeout=timeout,
        )

        reply = resp.choices[0].message.content or ""
        verified = extract_any_answer(reply)

        # Nothing usable extracted: keep the candidate.
        if verified is None or not (0 <= verified <= 99999):
            print(f"[PHASE-B] No extraction, keeping {candidate_answer}")
            return candidate_answer

        if verified == candidate_answer:
            print(f"[PHASE-B] Confirmed: {verified}")
        else:
            print(f"[PHASE-B] Correction: {candidate_answer} -> {verified}")
        return verified

    except Exception as e:
        print(f"[PHASE-B] Error: {e}, keeping {candidate_answer}")
        return candidate_answer


print("[BANSHEEV6] Two-Phase Reasoning: Phase A (reason) + Phase B (verify arithmetic)")

In [None]:
# =========================
# SELF-CONSISTENCY REFINEMENT (Tier 4)
# =========================
# When samples cluster but don't perfectly agree:
# Re-prompt within the dominant basin to break ties.
#
# Key insight: Nudging toward the attractor increases accuracy
# without adding full sampling cost.
# =========================

# str.format template with fields {basin_center} and {candidates}.
# `\\boxed{{}}` is a format escape and renders as `\boxed{}`.
REFINEMENT_PROMPT = """Several independent solution attempts for this problem suggest the answer is approximately {basin_center}.

The candidate values were: {candidates}

Please carefully recompute to determine the exact answer:
1. Verify the approach that led to values around {basin_center}
2. Check for any arithmetic errors in the final steps
3. Confirm the precise integer answer

Put your final verified answer in \\boxed{{}}."""


def should_refine_with_basin(
    samples: List[int],
    difficulty: DifficultyAssessment,
    elapsed: float,
    budget: float,
) -> Tuple[bool, Optional[int], List[int]]:
    """
    Decide whether a basin-refinement call is worthwhile.

    Returns: (should_refine, basin_center, basin_members)

    Refinement is skipped - (False, None, []) - when any of these holds:
    - fewer than 3 samples
    - less than 20s of budget left
    - difficulty is EASY
    - the samples are unanimous
    - the most common answer has under 50% of the samples and value
      clustering finds no cluster of size >= 2
    - the most common answer already has 80% or more of the samples

    With a >= 50% majority the basin is that exact answer; below 50% the
    basin is the best cluster from value_clustering.
    """
    skip = (False, None, [])

    if len(samples) < 3 or budget - elapsed < 20 or difficulty.level == Difficulty.EASY:
        return skip

    total = len(samples)
    top_answer, top_count = Counter(samples).most_common(1)[0]

    # Unanimous samples leave nothing to resolve.
    if top_count == total:
        return skip

    if top_count < total * 0.5:
        # No exact-match majority: look for a near-value cluster instead.
        best = value_clustering(samples, threshold=0.05)["best"]
        if best is None or best.size < 2:
            return skip
        basin_center, basin_members = best.center, best.members
    else:
        basin_center = top_answer
        basin_members = [s for s in samples if s == top_answer]

    # Agreement of 80%+ is treated as settled.
    if top_count >= total * 0.8:
        return skip

    return True, basin_center, basin_members


def run_basin_refinement(
    question_text: str,
    basin_center: int,
    basin_members: List[int],
    question_id: str,
    timeout: float = 25.0,
) -> Optional[int]:
    """
    Ask the model once more, anchored on the dominant basin.

    The prompt carries the basin center, up to five distinct basin members
    (sorted ascending) and the first 1500 characters of the problem.

    Returns:
        None if the competition clock has expired; the model's answer when
        one in 0..99999 was extracted; otherwise `basin_center` (also on
        any request error).
    """
    if TIME_MANAGER.is_expired():
        return None

    # Format candidates for prompt
    distinct_members = sorted(set(basin_members))[:5]
    candidates_str = ", ".join(str(m) for m in distinct_members)
    prompt = REFINEMENT_PROMPT.format(basin_center=basin_center, candidates=candidates_str)

    system_msg = {"role": "system", "content": "You are refining a mathematical solution. Be precise and verify arithmetic carefully."}
    user_msg = {"role": "user", "content": f"Problem: {question_text[:1500]}\n\n{prompt}"}

    try:
        resp = client.chat.completions.create(
            model="vllm-model",
            messages=[system_msg, user_msg],
            temperature=0.4,  # Slightly higher than Phase B for exploration
            max_tokens=512,   # More room for verification
            extra_body=dict(min_p=0.02, stop_token_ids=stop_token_ids),
            timeout=timeout,
        )

        reply = resp.choices[0].message.content or ""
        refined = extract_any_answer(reply)

        if refined is None or not (0 <= refined <= 99999):
            print(f"[REFINE] No valid answer, using basin center: {basin_center}")
            return basin_center

        # Only the log line depends on whether the result stayed in the
        # basin; the refined value is returned either way.
        if abs(refined - basin_center) < basin_center * 0.1 or refined in basin_members:
            print(f"[REFINE] Basin confirmed: {refined}")
        else:
            print(f"[REFINE] Correction: {basin_center} -> {refined}")
        return refined

    except Exception as e:
        print(f"[REFINE] Error: {e}, using basin center: {basin_center}")
        return basin_center


print("[BANSHEEV6] Self-Consistency Refinement: Basin-aware re-prompting enabled")

In [None]:
# =========================
# CONTROL POLICY (Tier 5)
# =========================
# The missing 15-20 points: WHEN to spend compute.
#
# Core insight: Spend compute only when it buys probability mass.
# Two-stage loop:
#   Stage A: Cheap consensus (2-4 samples, short)
#   Stage B: Targeted repair/escalation (1-2 extra calls when flagged)
# =========================

from dataclasses import dataclass
from enum import Enum


class ControlDecision(Enum):
    """Outcome of the control policy for the current sample set."""
    # control_decision() also returns ACCEPT when the per-difficulty sample
    # cap is reached, and as its default when no other rule fires.
    ACCEPT = "accept"           # High confidence, stop sampling
    ESCALATE = "escalate"       # Low confidence, spend more compute
    # Returned by control_decision() when time_left < MIN_TIME_FOR_SAMPLE.
    FALLBACK = "fallback"       # Give up, use best guess


@dataclass
class ControlState:
    """State for control policy decisions.

    A snapshot built by compute_control_state() and consumed by
    control_decision().
    """
    # Share of samples equal to the single most common answer (exact match).
    cluster_support: float      # Fraction of samples in best cluster
    # Taken from the best value cluster's `tightness` when value clustering
    # is enabled; otherwise 1.0 if all samples agree, else cluster_support.
    cluster_tightness: float    # Normalized tightness (0-1)
    # True when any verification result has a non-empty `failed_checks`.
    decision_reject: bool       # Hard reject (mod/bounds impossible)
    # Sum of len(flags) over all verification results.
    warnings_count: int         # Number of soft warnings
    samples_so_far: int         # How many samples collected
    # budget - elapsed, in the same unit as those inputs.
    time_left: float            # Remaining time budget
    difficulty: Difficulty      # Problem difficulty


# === CONTROL THRESHOLDS ===
# These are the knobs that move you from 25-35 to 40-50+
# All of them are read by control_decision().

# Accept immediately if:
# (both must hold, and no verification may have hard-failed)
ACCEPT_HIGH_SUPPORT = 0.67      # cluster_support threshold
ACCEPT_HIGH_TIGHTNESS = 0.70   # cluster_tightness threshold

# Second accept rule: both must hold together with zero warnings.
ACCEPT_MED_SUPPORT = 0.50      # Alternative: lower support but...
ACCEPT_MED_TIGHTNESS = 0.85    # ...higher tightness and no warnings

# Escalate if:
# (either condition holds, or there is at least one warning)
ESCALATE_LOW_SUPPORT = 0.50    # Below this = need more samples
ESCALATE_LOW_TIGHTNESS = 0.55  # Below this = disagreement

# Budget caps by difficulty
# Reaching the cap makes control_decision() return ACCEPT; difficulties
# missing from this table get a cap of 4 there.
BUDGET_CAPS = {
    Difficulty.EASY: 2,    # Max samples for easy
    Difficulty.MEDIUM: 5,  # H100: +1 for medium problems
    Difficulty.HARD: 7,    # H100: +1 for hard problems (if near-miss signals)
}

# Minimum time to justify another sample (seconds)
# Below this, control_decision() returns FALLBACK.
MIN_TIME_FOR_SAMPLE = 8.0


def compute_control_state(
    samples: List[int],
    verifications: Dict[int, VerificationResult],
    difficulty: DifficultyAssessment,
    elapsed: float,
    budget: float,
) -> ControlState:
    """Summarise the samples gathered so far into a ControlState.

    With no samples every confidence field is zero/False. Otherwise:
    - support is the share of samples equal to the most common answer;
    - tightness comes from the best value cluster when value clustering is
      enabled and there are at least 2 samples (0.0 if no cluster is
      found), else 1.0 for identical samples and `support` as a proxy;
    - decision_reject is True if any verification has failed checks;
    - warnings_count is the total number of flags over all verifications.
    """
    time_left = budget - elapsed

    if not samples:
        return ControlState(
            cluster_support=0.0,
            cluster_tightness=0.0,
            decision_reject=False,
            warnings_count=0,
            samples_so_far=0,
            time_left=time_left,
            difficulty=difficulty.level,
        )

    # Exact-match support of the modal answer.
    _top_answer, top_count = Counter(samples).most_common(1)[0]
    support = top_count / len(samples)

    if RUNTIME.ENABLE_VALUE_CLUSTERING and len(samples) >= 2:
        best = value_clustering(samples, threshold=0.05)["best"]
        tightness = best.tightness if best is not None else 0.0
    elif len(set(samples)) == 1:
        tightness = 1.0
    else:
        tightness = support  # Proxy

    # A single failed verification is enough for a hard reject.
    has_hard_reject = any(vr.failed_checks for vr in verifications.values())
    total_warnings = sum(len(vr.flags) for vr in verifications.values())

    return ControlState(
        cluster_support=support,
        cluster_tightness=tightness,
        decision_reject=has_hard_reject,
        warnings_count=total_warnings,
        samples_so_far=len(samples),
        time_left=time_left,
        difficulty=difficulty.level,
    )


def control_decision(state: ControlState) -> ControlDecision:
    """Map a ControlState onto ACCEPT, ESCALATE or FALLBACK.

    Checks run in priority order: time pressure, sample cap for the
    difficulty, the two acceptance rules, then the escalation rule.
    Anything left over is accepted.
    """
    # Not enough time for one more sample: give up on sampling.
    if state.time_left < MIN_TIME_FOR_SAMPLE:
        return ControlDecision.FALLBACK

    # Sample cap for this difficulty reached: go with the best guess.
    sample_cap = BUDGET_CAPS.get(state.difficulty, 4)
    if state.samples_so_far >= sample_cap:
        return ControlDecision.ACCEPT

    support = state.cluster_support
    tightness = state.cluster_tightness

    # High-confidence accept: strong, tight cluster and no hard reject.
    if (
        support >= ACCEPT_HIGH_SUPPORT
        and tightness >= ACCEPT_HIGH_TIGHTNESS
        and not state.decision_reject
    ):
        return ControlDecision.ACCEPT

    # Medium-confidence accept: decent, tight cluster and zero warnings.
    if (
        support >= ACCEPT_MED_SUPPORT
        and tightness >= ACCEPT_MED_TIGHTNESS
        and state.warnings_count == 0
    ):
        return ControlDecision.ACCEPT

    # Low confidence on any axis asks for another sample while the cap allows.
    low_confidence = (
        support < ESCALATE_LOW_SUPPORT
        or tightness < ESCALATE_LOW_TIGHTNESS
        or state.warnings_count >= 1
    )
    if low_confidence and state.samples_so_far < sample_cap:
        return ControlDecision.ESCALATE

    # Nothing argued against the current answer.
    return ControlDecision.ACCEPT


# Cell banner: echo the accept/escalate thresholds that control_decision() compares against.
print("[BANSHEEV6] ControlPolicy: ACCEPT/ESCALATE/FALLBACK thresholds active")
print(f"[BANSHEEV6] Accept: support>={ACCEPT_HIGH_SUPPORT}, tight>={ACCEPT_HIGH_TIGHTNESS}")
print(f"[BANSHEEV6] Escalate: support<{ESCALATE_LOW_SUPPORT} OR tight<{ESCALATE_LOW_TIGHTNESS}")

In [None]:
# =========================
# ARCHETYPE ROUTER (Tier 5)
# =========================
# 10-15 problem archetypes with tailored policies.
# Fast regex/keyword detection - no LLM needed.
# =========================

from enum import Enum
from dataclasses import dataclass
from typing import Tuple


class Archetype(Enum):
    """Problem categories used to pick a sampling policy.

    Each member's value is a short string label (printed in the logs); the
    trailing comments are the author's notes on how that category behaves.
    """
    MOD_ARITHMETIC = "mod"           # High yield
    COUNTING = "counting"            # Medium-hard, fragile
    OPTIMIZATION = "optimization"    # Trick-laden
    GEOMETRY = "geometry"            # Prone to arithmetic slips
    NUMBER_THEORY = "number_theory"  # Can be hard, many medium
    ALGEBRA = "algebra"              # Equations, polynomials
    INEQUALITY = "inequality"        # Time sink
    SEQUENCE = "sequence"            # Medium, often solvable
    PROBABILITY = "probability"      # Fragile with fractions
    SIMPLE_COMPUTE = "simple"        # Easy wins
    PROOF = "proof"                  # Often impossible under time
    GENERAL = "general"              # Default


@dataclass
class ArchetypePolicy:
    """Sampling policy attached to one Archetype (see ARCHETYPE_POLICIES)."""
    # Number of loop iterations solve() runs before it starts consulting the control policy.
    min_samples: int
    # Upper bound on generations in solve() (further capped there by the time budget).
    max_samples: int
    phase_b_prompt: str  # Specialized Phase B prompt
    escalate_on_split: bool  # Extra sample if disagreement


# === ARCHETYPE DETECTION ===

# Regex patterns per archetype. detect_archetype() runs re.search for every
# pattern against the lower-cased problem text and scores one point per
# matching pattern. SIMPLE_COMPUTE and GENERAL have no entry here: they are
# assigned by detect_archetype() directly.
ARCHETYPE_PATTERNS = {
    Archetype.MOD_ARITHMETIC: [
        r"\bremainder\b", r"\bmod\b", r"\bmodulo\b", 
        r"divided by.*remainder", r"last.*digits?\b",
        r"\bmod\s*\d+", r"congruent",
    ],
    Archetype.COUNTING: [
        r"number of ways", r"how many", r"\barrangements?\b",
        r"\bpermutations?\b", r"\bcombinations?\b", 
        r"count.*(?:ways|numbers|integers)", r"in how many",
    ],
    Archetype.OPTIMIZATION: [
        r"\bminimum\b", r"\bmaximum\b", r"\bleast\b", 
        r"\bgreatest\b", r"\boptimize\b", r"\bsmallest\b",
        r"\blargest\b", r"minimize", r"maximize",
    ],
    Archetype.GEOMETRY: [
        r"\bcircle\b", r"\btriangle\b", r"\bradius\b",
        r"\barea\b", r"\bperimeter\b", r"\bangle\b",
        r"\bdistance\b", r"\bpolygon\b", r"\bsquare\b",
        r"\brectangle\b", r"inscribed", r"circumscribed",
    ],
    Archetype.NUMBER_THEORY: [
        r"integer solutions?", r"\bdivisible\b", r"\bgcd\b",
        r"\blcm\b", r"\bprime\b", r"\bfactors?\b",
        r"coprime", r"euler", r"fermat",
    ],
    Archetype.ALGEBRA: [
        r"solve for", r"\bequation\b", r"\bsystem\b",
        r"\broots?\b", r"\bpolynomial\b", r"\bquadratic\b",
        r"find.*(?:value|x|y)", r"\bcoefficients?\b",
    ],
    Archetype.INEQUALITY: [
        r"for all real", r"\binequality\b", r"function satisfies",
        r"\bfor all\b.*>", r"\bprove.*inequality",
    ],
    Archetype.SEQUENCE: [
        r"\bsequence\b", r"\brecurrence\b", r"a_n\b", r"a_\{n",
        r"f\(n\+1\)", r"\bterm\b.*sequence", r"\bfibonacci\b",
    ],
    Archetype.PROBABILITY: [
        r"\bprobability\b", r"expected value", r"\brandom\b",
        r"\bchance\b", r"\blikelihood\b", r"fair.*(?:die|coin)",
    ],
    Archetype.PROOF: [
        r"\bprove\b", r"\bshow that\b", r"\bdemonstrate\b",
        r"\bestablish\b",
    ],
}


# === ARCHETYPE POLICIES ===

# One ArchetypePolicy per Archetype member. get_archetype_policy() looks an
# archetype up here and falls back to the GENERAL entry when it is missing.
ARCHETYPE_POLICIES = {
    Archetype.MOD_ARITHMETIC: ArchetypePolicy(
        min_samples=2, max_samples=3,
        phase_b_prompt="Recompute the final modular arithmetic. Verify remainder is in [0, M-1].",
        escalate_on_split=True,
    ),
    Archetype.COUNTING: ArchetypePolicy(
        min_samples=3, max_samples=5,  # H100
        phase_b_prompt="Count again using an alternative method (bijection, generating function, or direct).",
        escalate_on_split=True,
    ),
    Archetype.OPTIMIZATION: ArchetypePolicy(
        min_samples=3, max_samples=5,  # H100
        phase_b_prompt="Check boundary cases and equality conditions for the optimum.",
        escalate_on_split=True,
    ),
    Archetype.GEOMETRY: ArchetypePolicy(
        min_samples=2, max_samples=4,
        phase_b_prompt="Recompute the final numeric value from the derived formula.",
        escalate_on_split=True,
    ),
    Archetype.NUMBER_THEORY: ArchetypePolicy(
        min_samples=3, max_samples=6,  # H100
        phase_b_prompt="Validate by plugging the answer back into the original constraints.",
        escalate_on_split=True,
    ),
    Archetype.ALGEBRA: ArchetypePolicy(
        min_samples=2, max_samples=3,
        phase_b_prompt="Substitute the answer to verify it satisfies the equation.",
        escalate_on_split=False,
    ),
    Archetype.INEQUALITY: ArchetypePolicy(
        min_samples=3, max_samples=4,
        phase_b_prompt="Verify with the equality case.",
        escalate_on_split=False,  # Time sink, don't over-invest
    ),
    Archetype.SEQUENCE: ArchetypePolicy(
        min_samples=3, max_samples=4,
        phase_b_prompt="Compute small n values to confirm the pattern matches.",
        escalate_on_split=True,
    ),
    Archetype.PROBABILITY: ArchetypePolicy(
        min_samples=3, max_samples=4,
        phase_b_prompt="Sanity check: probability must be in [0,1], then verify scaling.",
        escalate_on_split=True,
    ),
    Archetype.SIMPLE_COMPUTE: ArchetypePolicy(
        min_samples=1, max_samples=2,
        phase_b_prompt="Double-check the arithmetic.",
        escalate_on_split=False,  # Don't overspend on easy
    ),
    Archetype.PROOF: ArchetypePolicy(
        min_samples=2, max_samples=4,
        phase_b_prompt="If a numeric answer is expected, verify it.",
        escalate_on_split=False,  # Hard to escalate effectively
    ),
    Archetype.GENERAL: ArchetypePolicy(
        min_samples=2, max_samples=4,
        phase_b_prompt="Verify the final calculation step by step.",
        escalate_on_split=True,
    ),
}


def detect_archetype(problem_text: str) -> Tuple[Archetype, float]:
    """Classify a problem by keyword/regex hits.

    Short texts (under 150 characters) without proof-style wording are
    labelled SIMPLE_COMPUTE outright. Otherwise every archetype is scored
    by how many of its patterns match, and the highest scorer wins (the
    earliest entry in ARCHETYPE_PATTERNS wins ties).

    Returns:
        (archetype, confidence) where confidence is the winner's share of
        all pattern hits, 0.8 for the short-text shortcut, or 0.3 when
        nothing matched and GENERAL is returned.
    """
    lowered = problem_text.lower()

    # Shortcut for short problems that do not read like a proof request.
    proof_markers = ("prove", "show that", "for all")
    reads_like_proof = any(marker in lowered for marker in proof_markers)
    if len(problem_text) < 150 and not reads_like_proof:
        return Archetype.SIMPLE_COMPUTE, 0.8

    # One point per matching pattern, per archetype.
    hit_counts = {
        kind: sum(1 for regex in regexes if re.search(regex, lowered))
        for kind, regexes in ARCHETYPE_PATTERNS.items()
    }

    if max(hit_counts.values(), default=0) == 0:
        return Archetype.GENERAL, 0.3

    winner = max(hit_counts, key=hit_counts.get)
    all_hits = sum(hit_counts.values())
    if all_hits > 0:
        return winner, hit_counts[winner] / all_hits
    return winner, 0.3


def get_archetype_policy(archetype: Archetype) -> ArchetypePolicy:
    """Return the policy registered for *archetype*, or the GENERAL policy if none is."""
    default_policy = ARCHETYPE_POLICIES[Archetype.GENERAL]
    return ARCHETYPE_POLICIES.get(archetype, default_policy)


# Cell banner. The count is taken from the enum so the message always matches
# the number of Archetype members actually defined.
print(f"[BANSHEEV6] ArchetypeRouter: {len(Archetype)} problem types with tailored policies")
print(f"[BANSHEEV6] Archetypes: {[a.value for a in Archetype]}")

In [None]:
# =========================
# A/B SYMBOLIC AUTO-SOLVER
# =========================
# Deterministic solvers for easy A/B patterns
# Runs BEFORE LLM pipeline - instant points with zero variance
# =========================

# Utility: safe evaluation of pure arithmetic expressions
import ast
from fractions import Fraction

# Characters an expression may contain ("^" is rewritten to "**" before parsing).
_ALLOWED_ARITH = set("0123456789+-*/()% ^")

# Upper bound on the estimated bit-size of a single exponentiation result.
# Keeps inputs such as "9^9^9" from stalling the notebook.
_MAX_POW_BITS = 1_000_000


def _arith_pow(base, exponent):
    """Compute base ** exponent, exactly when the exponent is an integer.

    Raises OverflowError when the exact result is estimated to exceed
    _MAX_POW_BITS bits. Non-integer exponents fall back to float arithmetic.
    """
    if isinstance(exponent, Fraction) and exponent.denominator == 1:
        exponent = exponent.numerator
    if isinstance(exponent, int) and isinstance(base, (int, Fraction)):
        # ints expose numerator/denominator too, so this covers both types.
        width = max(base.numerator.bit_length(), base.denominator.bit_length())
        # width <= 1 means base is 0 or +/-1, whose powers never grow.
        if width > 1 and width * abs(exponent) > _MAX_POW_BITS:
            raise OverflowError("exponentiation result too large")
        if exponent < 0:
            return Fraction(base) ** exponent
        return base ** exponent
    return float(base) ** float(exponent)


def _arith_value(node):
    """Evaluate a parsed node made only of integers and + - * / // % ** operators.

    Raises ValueError for any other syntax (calls, tuples, names, ...).
    """
    if isinstance(node, ast.Expression):
        return _arith_value(node.body)
    if isinstance(node, ast.Constant) and type(node.value) is int:
        return node.value
    if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
        inner = _arith_value(node.operand)
        return -inner if isinstance(node.op, ast.USub) else inner
    if isinstance(node, ast.BinOp):
        left = _arith_value(node.left)
        right = _arith_value(node.right)
        op = node.op
        if isinstance(op, ast.Add):
            return left + right
        if isinstance(op, ast.Sub):
            return left - right
        if isinstance(op, ast.Mult):
            return left * right
        if isinstance(op, ast.Div):
            # Keep integer division exact instead of rounding through floats.
            if isinstance(left, int) and isinstance(right, int):
                return Fraction(left, right)
            return left / right
        if isinstance(op, ast.FloorDiv):
            return left // right
        if isinstance(op, ast.Mod):
            return left % right
        if isinstance(op, ast.Pow):
            return _arith_pow(left, right)
    raise ValueError("unsupported expression")


def _safe_arith_eval(expr: str) -> Optional[int]:
    """Evaluate a simple integer arithmetic expression without calling eval().

    Args:
        expr: Text made of digits, spaces, parentheses and + - * / % ^
            ("^" means exponentiation).

    Returns:
        The value as an int when the expression evaluates to an integer;
        None for empty input, disallowed characters, syntax errors, division
        by zero, oversized powers, or a non-integer result. Results reached
        through float arithmetic (fractional exponents) count as integers
        when they are within 1e-9 of one.
    """
    if not expr:
        return None
    expr = expr.strip().replace("^", "**")
    if not set(expr) <= _ALLOWED_ARITH:
        return None
    try:
        value = _arith_value(ast.parse(expr, mode="eval"))
        if isinstance(value, Fraction):
            return value.numerator if value.denominator == 1 else None
        if isinstance(value, int):
            return value
        if isinstance(value, float) and abs(value - round(value)) < 1e-9:
            return int(round(value))
    except (ArithmeticError, ValueError, TypeError, SyntaxError,
            RecursionError, MemoryError):
        return None
    return None


# =========================
# PATTERN SOLVERS
# =========================

def solve_direct_arithmetic(text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Solve prompts that are nothing more than an arithmetic expression.

    Recognised shapes:
    - "Compute 2+2"
    - "Evaluate ( ... )"
    - "Find the value of <expr>"
    - the bare expression on its own

    The expression has to run to the end of the text (an optional final
    "." or "?" is allowed). A prompt that merely starts with a number, such
    as "Compute 17 mod 5" or "Evaluate 3x + 5", is therefore left alone
    instead of being answered with its numeric prefix.

    Returns:
        (answer, meta) when the expression evaluates to an integer in
        [0, 99999], otherwise None.
    """
    t = text.strip()
    expr_chars = r"([0-9()+\-*/%^\s]+)"
    tail = r"\s*[.?]?\s*$"

    # Only ":" is accepted as a separator after the keyword; a "-" here is a
    # unary minus and belongs to the expression.
    m = re.search(
        r"\b(?:compute|evaluate|find\s+the\s+value\s+of)\s*:?\s*" + expr_chars + tail,
        t,
        re.I,
    )
    if m is None:
        # Sometimes the whole prompt IS the expression.
        m = re.search(r"^\s*" + expr_chars + tail, t)
    if m is None:
        return None

    expr = m.group(1).strip()
    if not expr or len(expr) > 80:
        return None

    ans = _safe_arith_eval(expr)
    if ans is None:
        return None

    if 0 <= ans <= 99999:
        return ans, {"method": "direct_arithmetic", "expr": expr}
    return None


def solve_mod_remainder(text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Solve easy remainder problems with a plain numeric operand and modulus.

    Recognised shapes:
    - "... remainder when <expr> is divided by <m>"
    - "<expr> mod <m>" / "<expr> modulo <m>"

    <expr> must consist only of digits, parentheses and + - * / % ^, and
    <m> must be a plain integer of at most 7 digits. A lone "a^b" operand
    is reduced with modular exponentiation; anything else is evaluated with
    _safe_arith_eval and then reduced.

    The solver declines (returns None) whenever the operand or the modulus
    is only part of a larger expression, e.g. "x^2 + 3^4 mod 5" or
    "mod 10^9+7", rather than answering from a fragment.

    Returns:
        (answer, meta) with answer in [0, 99999], otherwise None.
    """
    t = text.lower()
    arith = r"[0-9()+\-*/%^\s]"
    # Plain integer that is not the prefix of a longer number or expression.
    modulus = r"(\d{1,7})(?!\d|\s*[\^*/+\-(!])"

    m = re.search(r"remainder\s+when\s+(.+?)\s+is\s+divided\s+by\s+" + modulus, t)
    if m:
        operand = m.group(1)
    else:
        m = re.search(r"(" + arith + r"+)\bmod(?:ulo)?\s*" + modulus, t)
        if not m:
            return None
        operand = m.group(1)
        # The captured run is maximal. If it starts mid-token (no leading
        # whitespace) it is glued to something like "x^2" or "f(3)".
        if m.start(1) > 0 and not operand[0].isspace():
            return None

    mod = int(m.group(2))
    if mod <= 0:
        return None

    operand = operand.strip()
    if (
        not operand
        or len(operand) > 80
        or operand[0] not in "0123456789("  # a leading operator means a truncated expression
        or not re.fullmatch(arith + "+", operand)
    ):
        return None

    # Pure a^b: use modular exponentiation so huge exponents stay cheap.
    power = re.fullmatch(r"(\d+)\s*\^\s*(\d+)", operand)
    if power:
        a = int(power.group(1))
        b = int(power.group(2))
        ans = pow(a, b, mod)
        if 0 <= ans <= 99999:
            return ans, {"method": "pow_mod", "a": a, "b": b, "mod": mod}
        return None

    val = _safe_arith_eval(operand)
    if val is None:
        return None
    ans = val % mod
    if 0 <= ans <= 99999:
        return ans, {"method": "arith_mod", "expr": operand, "mod": mod}
    return None


def solve_gcd_lcm(text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Solve a direct GCD/LCM of two literal integers.

    Recognised shapes (case-insensitive):
    - "gcd(123, 456)" / "lcm(12, 30)"
    - "gcd of 12 and 30" / "lcm of 12 and 30"
    - "greatest common divisor (or factor) of 12 and 30"
    - "least common multiple of 12 and 30"

    The solver answers only when the text contains exactly one such
    expression and no other digits or gcd/lcm mentions. Problems that merely
    mention gcd/lcm next to other numbers (e.g. "gcd(a,b) = 5 and
    a+b = 100") are left alone.

    Returns:
        (answer, meta) with answer in [0, 99999], otherwise None.
    """
    t = text.lower()
    pattern = (
        r"\b(gcd|lcm|greatest\s+common\s+(?:divisor|factor)|least\s+common\s+multiple)"
        r"(?:\s*\(\s*(\d{1,9})\s*,\s*(\d{1,9})\s*\)"
        r"|\s+of\s+(\d{1,9})\s+and\s+(\d{1,9})(?!\d))"
    )
    hits = list(re.finditer(pattern, t))
    if len(hits) != 1:
        return None
    hit = hits[0]

    # Anything numeric (or another gcd/lcm) outside the match means the
    # question is about more than this one value.
    leftover = t[:hit.start()] + " " + t[hit.end():]
    if re.search(r"\d|\b(?:gcd|lcm)\b", leftover):
        return None

    a = int(hit.group(2) or hit.group(4))
    b = int(hit.group(3) or hit.group(5))

    if hit.group(1).startswith(("gcd", "greatest")):
        method = "gcd"
        ans = math.gcd(a, b)
    else:
        method = "lcm"
        # lcm with a zero operand is 0; this also avoids dividing by gcd(0, 0) == 0.
        ans = 0 if 0 in (a, b) else a // math.gcd(a, b) * b

    if 0 <= ans <= 99999:
        return ans, {"method": method, "a": a, "b": b}
    return None


def solve_small_diophantine_count(text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Count ordered integer pairs (x, y) in the box 0 <= x, y <= N.

    Recognised constraints (the box is mandatory):
    - "x+y=k"
    - "x*y=k"
    - both together, in which case pairs must satisfy both equations

    The bound may be written with "<=", "≤", "\\le" or "\\leq". Numbers must
    be plain integers: a bound or right-hand side that continues as a longer
    number or expression (e.g. "<= 10^3") makes the solver decline, as does
    a box whose lower bound is not exactly 0.

    Returns:
        (count, meta) with count in [0, 99999], otherwise None.
    """
    t = text.lower()
    le = r"(?:<=|≤|\\leq?\b)"
    # Reject numbers that are only a prefix of something longer.
    plain_end = r"(?!\d|\s*[\^*/+\-!(])"

    # (?<!\d) stops "10 <= x" from being read as "0 <= x".
    box = re.search(
        r"(?<!\d)0\s*" + le + r"\s*x\s*,?\s*y\s*" + le + r"\s*(\d{1,5})" + plain_end,
        t,
    )
    if not box:
        return None
    N = int(box.group(1))

    sum_eq = re.search(r"\bx\s*\+\s*y\s*=\s*(\d{1,6})" + plain_end, t)
    prod_eq = re.search(r"\bx\s*\*\s*y\s*=\s*(\d{1,6})" + plain_end, t)

    if sum_eq:
        k = int(sum_eq.group(1))
        # x values for which y = k - x also lies in [0, N].
        xs = range(max(0, k - N), min(N, k) + 1)
        if prod_eq:
            kp = int(prod_eq.group(1))
            cnt = sum(1 for x in xs if x * (k - x) == kp)
            meta = {"method": "count_xy_sum_prod", "k_sum": k, "k_prod": kp, "N": N}
        else:
            cnt = len(xs)
            meta = {"method": "count_xy_sum", "k": k, "N": N}
    elif prod_eq:
        k = int(prod_eq.group(1))
        # x >= 1: y is forced to k / x and must be an integer no larger than N.
        cnt = sum(1 for x in range(1, N + 1) if k % x == 0 and k // x <= N)
        if k == 0:
            # x == 0 pairs with every y in [0, N].
            cnt += N + 1
        meta = {"method": "count_xy_prod", "k": k, "N": N}
    else:
        return None

    if 0 <= cnt <= 99999:
        return cnt, meta
    return None


def solve_sum_formula(text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Closed-form sums of the first n integers, squares or cubes.

    Recognised shapes:
    - "sum of (the) first n (positive) integers"
    - "sum of (the) first n (perfect) squares" / "... cubes"
    - "1+2+...+n" (also with "3+", "…", "\\dots", "\\ldots", "\\cdots")

    The phrase must end the sentence (be followed by ".", "?", "!" or the
    end of the text). Qualified variants such as "... integers divisible by
    3" or "1+2+...+100 is divided by 7" are left alone, and the word "first"
    is required so "a sum of 2 squares" is not mistaken for 1^2 + 2^2.

    Returns:
        (answer, meta) with answer in [0, 99999], otherwise None.
    """
    t = text.lower()
    sentence_end = r"(?=\s*(?:[.?!]|$))"

    named = re.search(
        r"sum\s+of\s+(?:the\s+)?first\s+(\d{1,6})\s+"
        r"(?:positive\s+)?(?:perfect\s+)?(integers|squares|cubes)" + sentence_end,
        t,
    )
    if named:
        n = int(named.group(1))
        kind = named.group(2)
        if kind == "integers":
            ans, method = n * (n + 1) // 2, "sum_n"
        elif kind == "squares":
            ans, method = n * (n + 1) * (2 * n + 1) // 6, "sum_squares"
        else:
            ans, method = (n * (n + 1) // 2) ** 2, "sum_cubes"
        if 0 <= ans <= 99999:
            return ans, {"method": method, "n": n}

    # 1+2+...+n pattern; the lookbehind keeps "11+2+..." from matching.
    dotted = re.search(
        r"(?<![\d^])1\s*\+\s*2\s*\+\s*(?:3\s*\+\s*)?"
        r"(?:\.\.\.|…|\\[lc]?dots)\s*\+?\s*(\d{1,6})" + sentence_end,
        t,
    )
    if dotted:
        n = int(dotted.group(1))
        ans = n * (n + 1) // 2
        if 0 <= ans <= 99999:
            return ans, {"method": "sum_n", "n": n}

    return None


def solve_factorial_binomial(text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Solve a bare factorial or binomial coefficient.

    Recognised shapes:
    - "n!"
    - "C(n,k)"
    - "n choose k"

    The expression must either be the whole text or directly follow one of
    compute / evaluate / calculate / find / what is / value of, and it must
    run to the end of the text (an optional final "." or "?" is allowed).
    Text that merely contains such a token - "trailing zeros of 100!", a
    point named "C(4,3)" - is left alone.

    Returns:
        (answer, meta) with answer in [0, 99999], otherwise None. Since
        9! = 362880, only factorials up to 8! can be returned.
    """
    t = text.lower().strip()
    lead = r"(?:^|\b(?:compute|evaluate|calculate|find|what\s+is|value\s+of)\s*:?)\s*"
    tail = r"\s*[.?]?\s*$"

    # Factorial
    fact = re.search(lead + r"(\d{1,2})\s*!" + tail, t)
    if fact:
        n = int(fact.group(1))
        ans = math.factorial(n)
        if 0 <= ans <= 99999:
            return ans, {"method": "factorial", "n": n}

    # Binomial: C(n,k) or "n choose k"
    binomial_shapes = (
        r"c\s*\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)",
        r"(\d{1,3})\s+choose\s+(\d{1,3})",
    )
    for shape in binomial_shapes:
        hit = re.search(lead + shape + tail, t)
        if not hit:
            continue
        n = int(hit.group(1))
        k = int(hit.group(2))
        if k <= n:
            ans = math.comb(n, k)
            if 0 <= ans <= 99999:
                return ans, {"method": "binomial", "n": n, "k": k}

    return None


# =========================
# DISPATCHER
# =========================

# Pattern solvers in the order try_autosolve_AB() tries them; the first one
# that returns an in-range integer answer wins, so order matters.
AUTO_SOLVERS = [
    solve_direct_arithmetic,
    solve_mod_remainder,
    solve_gcd_lcm,
    solve_small_diophantine_count,
    solve_sum_formula,
    solve_factorial_binomial,
]


def try_autosolve_AB(question_text: str) -> Optional[Tuple[int, Dict[str, Any]]]:
    """Run the pattern solvers in order and return the first usable result.

    A result is usable when it is an ``(answer, meta)`` pair whose answer is
    an int in [0, 99999]. A solver that raises is skipped. Returns None when
    no solver produces a usable result.
    """
    for solver in AUTO_SOLVERS:
        try:
            result = solver(question_text)
            if result is None:
                continue
            value, details = result
            if isinstance(value, int) and 0 <= value <= 99999:
                return value, details
        except Exception:
            # Best effort: a failing solver must never block the others.
            continue
    return None


# Cell banner: confirm the auto-solver cell ran and report how many solvers are registered.
print("[BANSHEEV6] A/B Auto-Solver: OK")
print(f"[BANSHEEV6] AUTO_SOLVERS: {len(AUTO_SOLVERS)} pattern solvers loaded")

In [None]:
# =========================
# ADAPTIVE SAMPLE CONTROLLER (Tier 1)
# =========================
# Early-exit when confidence is high.
# Key insight: AIMO problems are bimodal -
# many solved in 1-2 tries, few need multiple.
# =========================

from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class SampleState:
    """Real-time state of sampling for a problem.

    Holds the answers sampled so far, the best verification seen for each
    distinct answer, and the reasoning traces. It decides whether sampling
    can stop early and how many further samples are worth requesting.
    """
    samples: List[int]
    verifications: Dict[int, VerificationResult]
    traces: List[str]

    # Thresholds (tunable)
    # Minimum CIC confidence for the "high_cic_confidence" exit.
    EARLY_EXIT_CONFIDENCE: float = 0.85
    # Minimum number of agreeing samples for the "strong_cluster" exit.
    MIN_CLUSTER_SIZE: int = 2
    # Whether unanimous agreement (2+ samples) may trigger an exit.
    PERFECT_AGREEMENT_EXIT: bool = True

    def add_sample(self, answer: int, vr: VerificationResult, trace: str = ""):
        """Record a new sample, its verification and (if non-empty) its trace."""
        self.samples.append(answer)
        # Keep best verification per answer
        if answer not in self.verifications or vr.score > self.verifications[answer].score:
            self.verifications[answer] = vr
        if trace:
            self.traces.append(trace)

    def should_early_exit(self) -> Tuple[bool, str, Optional[int]]:
        """
        Check if we should stop sampling early.

        The top answer counts as "verified" when its stored verification has
        score >= 1 and no failed checks.

        Returns: (should_exit, reason, confident_answer)
        """
        if len(self.samples) < 2:
            return False, "need_more_samples", None

        counter = Counter(self.samples)
        top_answer, top_count = counter.most_common(1)[0]
        total = len(self.samples)

        # Get verification for top answer
        vr = self.verifications.get(top_answer)
        verified = vr is not None and vr.score >= 1 and not vr.failed_checks

        # === CONDITION 1: Perfect agreement (2+ identical answers, verified) ===
        if (self.PERFECT_AGREEMENT_EXIT and verified
                and top_count >= 2 and top_count == total):
            return True, "perfect_agreement", top_answer

        # === CONDITION 2: Strong cluster (>= 2/3 agreement + verified) ===
        # Integer comparison so that exactly 2 of 3 qualifies; a float test
        # against 0.67 would reject 2/3 (= 0.666...).
        two_thirds_agree = 3 * top_count >= 2 * total
        if top_count >= self.MIN_CLUSTER_SIZE and two_thirds_agree and verified:
            return True, "strong_cluster", top_answer

        # === CONDITION 3: High CIC confidence ===
        if len(self.samples) >= 3:
            cic = compute_cic_functional(self.samples, self.traces)
            if cic.confidence >= self.EARLY_EXIT_CONFIDENCE and verified:
                return True, "high_cic_confidence", top_answer

        # === CONDITION 4: Cluster tightness (value clustering) ===
        if len(self.samples) >= 3 and RUNTIME.ENABLE_VALUE_CLUSTERING:
            result = value_clustering(self.samples, threshold=0.05)
            if result["best"] is not None:
                best = result["best"]
                # Tight cluster with good size
                if best.size >= 2 and best.tightness >= 0.9:
                    cluster_answer = basin_refinement(best)
                    cluster_vr = self.verifications.get(cluster_answer)
                    # NOTE(review): unlike `verified` above, this does not
                    # look at failed_checks - confirm that is intended.
                    if cluster_vr and cluster_vr.score >= 1:
                        return True, "tight_cluster", cluster_answer

        return False, "continue", None

    def get_recommended_samples(self, budget_seconds: float) -> int:
        """
        Recommend how many more samples to try.

        Adaptive based on current state:
        - High agreement -> fewer samples needed
        - No agreement -> more samples needed
        - Time pressure -> cap samples
        """
        if len(self.samples) == 0:
            # No samples yet: size the first batch by the available budget.
            if budget_seconds < 60:
                return 2
            elif budget_seconds < 180:
                return 4
            else:
                return 6

        # Check current agreement
        counter = Counter(self.samples)
        top_count = counter.most_common(1)[0][1]
        total = len(self.samples)
        agreement = top_count / total

        # High agreement -> fewer additional samples
        if agreement >= 0.8 and total >= 2:
            return 1  # Just one more to confirm
        elif agreement >= 0.5 and total >= 3:
            return 2  # Two more to break tie
        else:
            # Low agreement - need more but cap it (between 2 and 4)
            return max(2, min(4, int(budget_seconds / 30)))


def create_sample_state() -> SampleState:
    """Build an empty SampleState: no samples, verifications or traces yet."""
    fresh_state = SampleState(samples=[], verifications={}, traces=[])
    return fresh_state


# Cell banner: print the default exit thresholds (the dataclass field defaults).
print("[BANSHEEV6] AdaptiveSampleController: EARLY-EXIT enabled")
print(f"[BANSHEEV6] Exit thresholds: confidence>={SampleState.EARLY_EXIT_CONFIDENCE}, cluster>={SampleState.MIN_CLUSTER_SIZE}")

In [None]:
# =========================
# SOLVE - FULL COMPETITION ARCHITECTURE
# =========================
# Tier 1: Adaptive multi-sampling with early-exit
# Tier 2: Difficulty-aware routing
# Tier 3: Two-phase reasoning (Phase B arithmetic)
# Tier 4: Self-consistency refinement (basin re-prompting)
# Tier 5: Control policy + Archetype routing (SELECTIVE AGGRESSOR)
# =========================

def should_skip_fast(*, elapsed, budget, samples, verifs, cic_history, regen_attempts):
    """Tell whether a problem looks hopeless enough to stop spending time on it.

    True only when at least half of the budget is used, at least four
    samples exist, and no verification has score >= 1 without a
    "brute_reject" failed check. ``cic_history`` and ``regen_attempts`` are
    accepted but not consulted.
    """
    # Too early to judge.
    if elapsed < 0.3 * budget:
        return False

    # Any acceptable verification keeps the problem alive.
    for vr in verifs.values():
        if vr.score >= 1 and "brute_reject" not in vr.failed_checks:
            return False

    enough_samples = len(samples) >= 4
    past_half_budget = not (elapsed < 0.5 * budget)
    return enough_samples and past_half_budget


def solve(question_text: str, question_id: str) -> int:
    """
    Main solve: Full 5-tier stack.

    The selective aggressor: spend compute only when it buys probability mass.

    Flow:
      0. Try the symbolic auto-solvers; return immediately on a hit.
      1. Fall back to fallback_solver() if the server is not ready, the
         global time is expired, or the per-problem budget is <= 10 s.
      2. Classify difficulty and archetype, reset the per-question state.
      3. Sample up to ``max_generations`` answers, checking the control
         policy, the early-exit rules and the simple-compute shortcut
         before each generation.
      4. Pick the final answer (early-exit answer, basin refinement,
         clustered selection or majority vote) and clamp it to [0, 99999].

    Args:
        question_text: The problem statement.
        question_id: Identifier used as key for the per-question global state.

    Returns:
        The final integer answer in [0, 99999].
    """

    # === PHASE 0: SYMBOLIC AUTO-SOLVE ===
    auto = try_autosolve_AB(question_text)
    if auto is not None:
        ans, meta = auto
        print(f"[AUTO] Solved: {ans} via {meta.get('method')}")
        completed_question_ids.add(question_id)
        return ans

    server_ready = ensure_server()
    if not server_ready:
        return fallback_solver(question_text)

    if TIME_MANAGER.is_expired():
        return fallback_solver(question_text)

    # === TIER 2: DIFFICULTY CLASSIFICATION ===
    difficulty = classify_difficulty(question_text)
    strategy = get_strategy(difficulty)

    # === TIER 5: ARCHETYPE DETECTION ===
    archetype, arch_conf = detect_archetype(question_text)
    arch_policy = get_archetype_policy(archetype)

    print(f"\n[DIFFICULTY] {difficulty.level.value.upper()} (conf={difficulty.confidence:.2f})")
    print(f"[ARCHETYPE] {archetype.value} (conf={arch_conf:.2f})")
    print(f"[POLICY] samples={arch_policy.min_samples}-{arch_policy.max_samples}, escalate_on_split={arch_policy.escalate_on_split}")

    # Reset state
    question_id_to_counter[question_id] = Counter()
    question_id_to_samples[question_id] = []
    question_id_to_traces[question_id] = []
    question_id_to_cic_history[question_id] = []
    question_id_to_verification[question_id] = {}
    completed_question_ids.discard(question_id)

    budget = TIME_MANAGER.start_problem()
    solve_start = time.time()

    sample_state = create_sample_state()
    sample_state.EARLY_EXIT_CONFIDENCE = strategy.early_exit_threshold

    # Use archetype policy for sample limits (at least 2, one per 25 s of budget)
    max_generations = min(arch_policy.max_samples, max(2, int(budget / 25)))

    print(f"\n[SOLVE] {question_id}")
    print(f"[TIME] {TIME_MANAGER.status()}")
    print(f"[BUDGET] {budget:.0f}s, up to {max_generations} samples")

    if budget <= 10:
        TIME_MANAGER.end_problem(0, budget)
        return fallback_solver(question_text)

    prompt_indices = get_routed_prompts(question_text, max_generations)
    regen_attempts = 0
    early_exit_answer = None
    early_exit_reason = None
    phase_b_corrections = 0

    for i in range(max_generations):
        if not TIME_MANAGER.should_continue(solve_start, budget):
            break
        if question_id in completed_question_ids:
            break

        elapsed = time.time() - solve_start

        # === TIER 5: CONTROL POLICY DECISION ===
        if i >= arch_policy.min_samples:
            ctrl_state = compute_control_state(
                question_id_to_samples[question_id],
                question_id_to_verification[question_id],
                difficulty,
                elapsed,
                budget,
            )
            decision = control_decision(ctrl_state)

            if decision == ControlDecision.ACCEPT:
                samples = question_id_to_samples[question_id]
                # With no samples there is nothing to accept: keep sampling.
                if samples:
                    counter = Counter(samples)
                    early_exit_answer = counter.most_common(1)[0][0]
                    early_exit_reason = f"control_accept (sup={ctrl_state.cluster_support:.2f}, tight={ctrl_state.cluster_tightness:.2f})"
                    print(f"[CONTROL] ACCEPT: {early_exit_answer}")
                    TIME_MANAGER.time_banked += 0.4 * (budget - elapsed)
                    break
            elif decision == ControlDecision.FALLBACK:
                print("[CONTROL] FALLBACK triggered")
                break
            # ESCALATE: continue sampling

        # === TIER 1: EARLY-EXIT CHECK (backup) ===
        if i >= 2:
            should_exit, reason, confident_answer = sample_state.should_early_exit()
            if should_exit and confident_answer is not None:
                print(f"[EARLY-EXIT] {reason}: {confident_answer}")
                early_exit_answer = confident_answer
                early_exit_reason = reason
                TIME_MANAGER.time_banked += 0.5 * (budget - elapsed)
                break

        # === SIMPLE COMPUTE FAST PATH ===
        if archetype == Archetype.SIMPLE_COMPUTE and i >= 1:
            samples = question_id_to_samples[question_id]
            if len(samples) >= 2:
                counter = Counter(samples)
                top_ans, top_count = counter.most_common(1)[0]
                if top_count >= 2:
                    print(f"[SIMPLE-EXIT] 2 agreeing: {top_ans}")
                    early_exit_answer = top_ans
                    # Record a reason so the selection log does not print "None".
                    early_exit_reason = "simple_compute_agreement"
                    TIME_MANAGER.time_banked += 0.6 * (budget - elapsed)
                    break

        prompt_idx = prompt_indices[i] if i < len(prompt_indices) else i % len(SYSTEM_PROMPTS)
        sys_prompt = SYSTEM_PROMPTS[prompt_idx]

        try:
            answer = generate_solution(question_text, question_id, i, sys_prompt, budget, max_generations)

            if answer is not None:
                gen_elapsed = time.time() - solve_start

                # === TIER 3: PHASE B ===
                if should_run_phase_b(answer, difficulty, gen_elapsed, budget):
                    original = answer
                    # NOTE(review): arch_policy.phase_b_prompt is not passed
                    # to run_phase_b here, so the archetype-specific prompt
                    # is currently unused by this call.
                    answer = run_phase_b(question_text, answer, question_id, min(20, budget - gen_elapsed - 5))
                    if answer != original:
                        phase_b_corrections += 1
                        # Replace the first occurrence of the pre-correction
                        # answer in the sample list and fix up the counter.
                        if original in question_id_to_samples[question_id]:
                            idx = question_id_to_samples[question_id].index(original)
                            question_id_to_samples[question_id][idx] = answer
                            question_id_to_counter[question_id][original] -= 1
                            question_id_to_counter[question_id][answer] += 1

                vr = verify_answer(question_text, answer)
                trace = question_id_to_traces[question_id][-1] if question_id_to_traces[question_id] else ""
                sample_state.add_sample(answer, vr, trace)

                # Keep the best-scoring verification per answer.
                prev = question_id_to_verification[question_id].get(answer)
                if prev is None or vr.score > prev.score:
                    question_id_to_verification[question_id][answer] = vr

                decision = regeneration_policy(question_id, question_text, answer, vr, i)
                print(f"[VERIFY] ans={answer}, score={vr.score}")

                if decision == "reject":
                    regen_attempts += 1
                    # Archetype escalation on reject
                    if arch_policy.escalate_on_split and i < max_generations - 1:
                        print("[ESCALATE] Rejected answer, trying again")
                    continue

                if decision == "regenerate":
                    regen_attempts += 1

                # Crystallization for clustering archetypes
                if strategy.use_clustering and len(question_id_to_samples[question_id]) >= 3:
                    if detect_safe_crystallization(question_id_to_samples[question_id],
                                                  question_id_to_verification[question_id],
                                                  question_id_to_cic_history[question_id]):
                        completed_question_ids.add(question_id)
                        break

        except Exception as e:
            # A failed generation must not abort the problem; try the next one.
            print(f"[SOLVE] Gen {i} error: {e}")
            regen_attempts += 1
            continue

    # === ANSWER SELECTION ===
    elapsed_before_select = time.time() - solve_start

    if early_exit_answer is not None:
        final_answer = early_exit_answer
        print(f"[SELECT] Early-exit: {final_answer} ({early_exit_reason})")
    else:
        samples = question_id_to_samples[question_id]
        print(f"[SOLVE] {len(samples)} samples, {phase_b_corrections} Phase-B corrections")

        if not samples:
            final_answer = fallback_solver(question_text)
        else:
            # === TIER 4: SELF-CONSISTENCY REFINEMENT ===
            should_refine, basin_center, basin_members = should_refine_with_basin(
                samples, difficulty, elapsed_before_select, budget
            )

            if should_refine and basin_center is not None:
                print(f"[REFINE] Basin refinement around {basin_center}")
                remaining = budget - elapsed_before_select
                refined = run_basin_refinement(
                    question_text, basin_center, basin_members, question_id,
                    timeout=min(25, remaining - 5)
                )
                final_answer = refined if refined is not None else basin_center
            else:
                # Standard selection
                try:
                    if strategy.use_clustering:
                        final_answer, confidence, metadata = select_answer_with_verification(
                            candidates=samples, problem_text=question_text,
                            fallback=fallback_solver(question_text))
                        print(f"[SELECT] Clustered: {final_answer} (conf={confidence:.2f})")
                    else:
                        counter = Counter(samples)
                        final_answer = counter.most_common(1)[0][0]
                        print(f"[SELECT] Majority: {final_answer}")
                except Exception as e:
                    # Selection is best effort: fall back to the first sample.
                    print(f"[SELECT] Selection error: {e}")
                    final_answer = samples[0] if samples else fallback_solver(question_text)

    # Golden clamp
    try:
        final_answer = int(final_answer)
    except Exception:
        # Answer could not be converted to int (e.g. None or a non-numeric value).
        final_answer = fallback_solver(question_text)

    final_answer = max(0, min(99999, final_answer))

    actual_time = time.time() - solve_start
    TIME_MANAGER.end_problem(actual_time, budget)

    print(f"[SOLVE] Final: {final_answer} ({actual_time:.1f}s, {difficulty.level.value}, {archetype.value})")
    completed_question_ids.add(question_id)
    return final_answer


# Cell banner: confirms that solve() has been defined.
print("[BANSHEEV6] solve(): FULL 5-TIER STACK (Selective Aggressor)")

In [None]:
# =========================
# PREDICT - NO UNCONDITIONAL ZEROS
# =========================
import kaggle_evaluation.aimo_3_inference_server
import pandas as pd
import polars as pl

# Load the reference answers (when the file is present) so predict() can log
# per-problem correctness. This is a debugging aid only: any failure here is
# reported and otherwise ignored, leaving id_to_answer empty.
id_to_answer = {}
ref_path = "/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv"
try:
    if os.path.exists(ref_path):
        # Read ids as text: predict() looks answers up by str(id), so letting
        # pandas infer a numeric dtype would make every lookup miss.
        df = pd.read_csv(ref_path, dtype={"id": str})
        id_to_answer = dict(zip(df["id"], df["answer"]))
        # Write an answer-free copy named "reference.csv" into the current
        # working directory.
        df.drop(columns="answer").to_csv("reference.csv", index=False)
except Exception as exc:
    # Best-effort: score tracking is optional, but say why it is disabled
    # instead of failing silently.
    print(f"[PREDICT] Could not load reference answers: {exc}")

# Running counters updated by predict().
_problems_processed = 0
_correct_count = 0
_total_count = 0


def golden_clamp(answer: int) -> int:
    """Coerce ``answer`` with ``int()`` and pin it to the range 0..99999.

    Values below 0 become 0 and values above 99999 become 99999. Whatever
    ``int()`` raises for an unconvertible input propagates to the caller.
    """
    value = int(answer)
    if value < 0:
        return 0
    if value > 99999:
        return 99999
    return value


def _fallback_prediction(question_text: str) -> int:
    """Return a clamped fallback answer without ever raising.

    Uses ``fallback_solver`` when there is problem text. Falls back to the
    constant 42 when the text is empty or when ``fallback_solver`` itself
    fails, so an error inside the fallback cannot escape ``predict()``.
    """
    if not question_text:
        return golden_clamp(42)
    try:
        return golden_clamp(fallback_solver(question_text))
    except Exception as exc:
        print(f"[PREDICT] Fallback solver error: {exc} - using 42")
        return golden_clamp(42)


def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame:
    """Answer one problem for the Kaggle inference server.

    Reads the first element of ``id_`` and ``problem``, runs ``solve()`` and
    returns a one-row frame. Every early exit and every error path produces a
    fallback answer instead of a hard-coded 0.

    Args:
        id_: Series whose first element is the problem id.
        problem: Series whose first element is the problem statement.

    Returns:
        A ``pl.DataFrame`` with a single row and the columns ``id`` and
        ``answer``; the answer has been passed through ``golden_clamp``.
    """
    global _problems_processed, _correct_count, _total_count

    def answer_frame(answer: int) -> pl.DataFrame:
        # Single place that builds the one-row response.
        return pl.DataFrame({"id": [question_id], "answer": [answer]})

    try:
        question_id = str(id_.item(0))
    except Exception:
        question_id = "unknown"

    try:
        question_text = str(problem.item(0))
    except Exception:
        question_text = ""

    _problems_processed += 1

    # Only the first three problems and every 50th one get the verbose
    # prints, to keep stdout volume down.
    verbose = (_problems_processed <= 3) or (_problems_processed % 50 == 0)

    if verbose:
        print(f"\n{'='*60}")
        print(f"[PREDICT] Problem #{_problems_processed}: {question_id}")
        print(f"[TIME] {TIME_MANAGER.status()}")

    # Ensure server is ready (lazy start)
    if not ensure_server():
        print("[PREDICT] Server not ready - using fallback")
        return answer_frame(_fallback_prediction(question_text))

    # Time expired - use fallback (not 0)
    if TIME_MANAGER.is_expired():
        print("[PREDICT] TIME EXPIRED - using fallback")
        return answer_frame(_fallback_prediction(question_text))

    # Empty problem - nothing to solve, answer with the constant default
    if not question_text.strip():
        print("[PREDICT] Empty problem - using fallback")
        return answer_frame(golden_clamp(42))

    # SOLVE
    try:
        raw_prediction = solve(question_text, question_id=question_id)
        prediction = golden_clamp(int(raw_prediction))
    except Exception as e:
        print(f"[PREDICT] Solve error: {e} - using fallback")
        prediction = _fallback_prediction(question_text)

    # Score tracking: only ids present in the reference are counted, so the
    # denominator reflects problems that can actually be scored.
    if question_id in id_to_answer:
        try:
            true_answer = int(id_to_answer[question_id])
        except (TypeError, ValueError, OverflowError):
            # Unusable reference value (e.g. NaN) - skip scoring this one.
            true_answer = None
        if true_answer is not None:
            _total_count += 1
            if prediction == true_answer:
                _correct_count += 1
                print(f"[SCORE] CORRECT: {_correct_count}/{_total_count}")
            else:
                print(f"[SCORE] WRONG: pred={prediction}, true={true_answer}")

    if verbose:
        print(f"[PREDICT] Answer: {prediction}")
    return answer_frame(prediction)

print("[BANSHEEV6] predict(): SINGLE golden_clamp, lazy server start")

In [None]:
# =========================
# RUN SERVER - serve() FIRST
# =========================
# CRITICAL: serve() MUST be called BEFORE any blocking operations.
# vLLM startup is LAZY - happens on first predict() call.
# This ensures Kaggle's 15-minute startup limit is satisfied.
# =========================
# Startup banner: echo the fixed RUNTIME settings in one write.
print("\n".join([
    "\n" + "=" * 60,
    "BANSHEEV6: COMPETITION MODE",
    f"SERVER_WAIT: {RUNTIME.SERVER_WAIT_TIMEOUT}s",
    f"MOCK_ANSWERS: {RUNTIME.DRY_RUN_MOCK_ANSWERS}",
    f"CLUSTERING: {RUNTIME.ENABLE_VALUE_CLUSTERING}",
    f"CRYSTALLIZATION: {RUNTIME.ENABLE_CRYSTALLIZATION}",
    "=" * 60,
]))

# Wrap predict() in the competition's inference server object.
inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)

# Hand control to the server before any blocking work; predict() brings the
# model server up on demand via ensure_server().
if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # Validation run - drive predict() through the local gateway using the
    # "reference.csv" file in the working directory.
    print("[SERVER] Validation - run_local_gateway() (vLLM starts on first predict)")
    inference_server.run_local_gateway(("reference.csv",))
else:
    # Competition rerun - serve requests from the real gateway.
    print("[SERVER] Competition rerun - serve() (vLLM starts on first predict)")
    inference_server.serve()

In [None]:
# =========================
# CLEANUP
# =========================
import atexit

def cleanup():
    """Stop the vLLM process at interpreter exit if it is still running.

    Sends ``terminate()`` and waits up to 10 seconds; if that wait fails the
    process is killed and then waited on again so it is reaped rather than
    left behind. Does nothing when ``vllm_process`` is undefined, falsy, or
    has already exited. Never raises, since it runs as an ``atexit`` hook.
    """
    try:
        proc = vllm_process
    except NameError:
        # The global was never created (server cell not run) - nothing to stop.
        return
    if not proc or proc.poll() is not None:
        return

    print("[CLEANUP] Stopping vLLM...")
    proc.terminate()
    try:
        proc.wait(timeout=10)
    except Exception:
        # Graceful shutdown did not finish in time - force it.
        proc.kill()
        try:
            # Reap the killed process so it does not linger as a zombie.
            proc.wait(timeout=5)
        except Exception as exc:
            print(f"[CLEANUP] vLLM did not exit after kill: {exc}")
    print("[CLEANUP] Done")

atexit.register(cleanup)
print("[BANSHEEV6] Cleanup registered")

In [None]:
# =========================
# BANSHEEV6 READY + SELF-TEST
# =========================

def submission_self_test():
    """Print a readiness checklist and report whether every check passed.

    Run this before submitting. The checks are: preflight flag, vLLM flag,
    model path on disk, CUDA availability, ``predict`` being defined, and
    ``golden_clamp`` clamping 123456 to 99999. A check whose prerequisite
    name is missing counts as failed instead of raising.

    Returns:
        True when all checks passed, False otherwise.
    """
    print("=" * 60)
    print("SUBMISSION SELF-TEST")
    print("=" * 60)

    # Names are looked up in the module namespace. A bare dir() inside a
    # function lists only that function's locals, so it cannot see predict.
    env = globals()
    checks = []

    # Check 1: Preflight passed?
    checks.append(("Preflight passed", bool(env.get("_PREFLIGHT_PASSED", False))))

    # Check 2: vLLM available?
    checks.append(("vLLM available", bool(env.get("_VLLM_AVAILABLE", False))))

    # Check 3: Model path?
    model_path = env.get("MODEL_PATH")
    checks.append(
        ("Model path exists", model_path is not None and os.path.exists(model_path))
    )

    # Check 4: CUDA?
    try:
        import torch
        cuda_ok = bool(torch.cuda.is_available())
    except Exception:
        # torch missing or broken - treat as "no CUDA".
        cuda_ok = False
    checks.append(("CUDA available", cuda_ok))

    # Check 5: predict function exists?
    checks.append(("predict() defined", callable(env.get("predict"))))

    # Check 6: golden_clamp works?
    try:
        clamp_ok = golden_clamp(123456) == 99999
    except Exception:
        clamp_ok = False
    checks.append(("golden_clamp works", clamp_ok))

    # Results
    for name, passed in checks:
        status = "✓" if passed else "✗"
        print(f"  [{status}] {name}")
    all_pass = all(passed for _, passed in checks)

    print("=" * 60)
    if all_pass:
        print("✓ ALL CHECKS PASSED - Ready for submission!")
    else:
        print("✗ SOME CHECKS FAILED - Debug before submitting!")
    print("=" * 60)

    return all_pass

print("[BANSHEEV6] All cells loaded. Competition mode only.")
print("[BANSHEEV6] Run submission_self_test() to verify readiness.")
