# RYANAIMO - Ω-AIMO3 Dual-Model Pipeline

## Target: 47/50 → $1.59M

## DUAL-MODEL STRATEGY

| Model | Strength | Prompts | Quantization |
|-------|----------|---------|--------------|
| **DeepSeek-R1-32B** | Deep reasoning, proofs, symbolic math | algebraic, backwards, verification | AWQ 4-bit |
| **Qwen-Coder-32B** | Code execution, enumeration, brute force | computational, casework | AWQ 4-bit |

Each model covers the other's weakness:
- **DeepSeek**: Native `<think>` reasoning, algebraic manipulation, proof construction
- **Qwen-Coder**: Python execution, numerical verification, systematic enumeration

## Pipeline:
1. CLASSIFY - Detect problem type (reasoning vs computational)
2. DIVERGE - Route each prompt type to its specialist model and sample up to 30 solution paths
3. EXECUTE - Sandboxed Python + SymPy verification
4. CONVERGE - Value clustering (88% error reduction) + log-weighted voting
5. VERIFY - Cross-model verification on low-confidence answers

## Required Datasets (create on RunPod, upload to Kaggle):
1. **`ryanaimo-vllm-wheels`** - vLLM + all dependencies
2. **`deepseek-r1-32b-awq`** - Reasoning model (AWQ 4-bit)
3. **`qwen-coder-32b-awq`** - Code execution model (AWQ 4-bit)

## RunPod Setup:
```bash
# See RUNPOD_COMMANDS.md for full instructions
# 1. Setup Kaggle CLI
# 2. Create wheel dataset
# 3. Quantize models (AWQ)
# 4. Upload directly to Kaggle
```

In [None]:
# =============================================================================
# CELL 1: ENVIRONMENT + OFFLINE WHEEL INSTALLATION
# =============================================================================
# Requires dataset: ryanaimo-vllm-wheels (created via runpod_create_wheels.sh)
# =============================================================================
print("CELL1 START", flush=True)

import os
import sys
import subprocess
import glob as globmod

# === OFFLINE MODE - NO NETWORK ACCESS ===
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

print("ENV SET", flush=True)

# === WHEEL DATASETS (in priority order) ===
WHEEL_DATASETS = [
    "/kaggle/input/ryanaimo-vllm-wheels",      # Our complete dataset
    "/kaggle/input/vllm-085-wheels",           # Fallback
    "/kaggle/input/aimo3-offline-wheels",      # Legacy
]

def find_wheel_dir():
    """Return the first wheel dataset directory that actually contains wheels.

    Walks ``WHEEL_DATASETS`` in priority order. A candidate that exists but
    has no top-level ``*.whl`` files is reported and skipped, so that a
    lower-priority dataset with usable wheels can still be selected (pip's
    ``--find-links`` only looks at the top level of the directory).

    Returns:
        The chosen directory path, or ``None`` when no candidate has wheels.
    """
    for candidate in WHEEL_DATASETS:
        if not os.path.isdir(candidate):
            continue
        whl_count = len(globmod.glob(os.path.join(candidate, "*.whl")))
        print(f"FOUND {candidate}: {whl_count} wheels", flush=True)
        if whl_count:
            return candidate
    return None

def install_from_wheels(wheel_dir):
    """Install vLLM and its companion packages from a local wheel directory.

    First asks pip to resolve ``vllm``, ``transformers``, ``accelerate``,
    ``polars`` and ``sympy`` purely from ``wheel_dir``. If that fails (or
    times out), every wheel in the directory is installed one by one as a
    best-effort fallback, still without touching a package index and without
    dependency resolution.

    Args:
        wheel_dir: Directory containing the ``*.whl`` files.

    Returns:
        True when the bulk install succeeded, or when the per-wheel fallback
        installed every wheel in the directory; False otherwise.
    """
    print(f"INSTALLING FROM {wheel_dir}", flush=True)

    # List what's available
    wheels = sorted(f for f in os.listdir(wheel_dir) if f.endswith('.whl'))
    print(f"  Available wheels: {len(wheels)}", flush=True)

    # Show key packages
    for pkg in ('vllm', 'msgspec', 'transformers', 'accelerate', 'polars', 'sympy'):
        match = next((w for w in wheels if pkg in w.lower()), None)
        if match:
            print(f"    {pkg}: {match[:50]}...", flush=True)
        else:
            print(f"    {pkg}: NOT FOUND", flush=True)

    pip_install = [sys.executable, "-m", "pip", "install"]
    # The notebook runs offline: never let pip consult a remote index.
    offline_flags = ["--no-index", f"--find-links={wheel_dir}", "--quiet"]

    # Bulk install: pip resolves dependencies among the local wheels.
    try:
        result = subprocess.run(
            pip_install + offline_flags
            + ["vllm", "transformers", "accelerate", "polars", "sympy"],
            capture_output=True, text=True, timeout=600,
        )
    except subprocess.TimeoutExpired:
        print("INSTALL ERROR:", flush=True)
        print("pip did not finish within 600s", flush=True)
    else:
        if result.returncode == 0:
            print("INSTALL OK", flush=True)
            return True
        print("INSTALL ERROR:", flush=True)
        print(result.stderr[-2000:], flush=True)

    # Fallback: install each wheel on its own. --no-deps keeps one
    # unresolvable dependency from blocking every other wheel.
    print("TRYING INDIVIDUAL WHEEL INSTALL...", flush=True)
    failed = []
    for whl in wheels:
        try:
            single = subprocess.run(
                pip_install + offline_flags
                + ["--no-deps", os.path.join(wheel_dir, whl)],
                capture_output=True, text=True, timeout=60,
            )
            installed = single.returncode == 0
        except subprocess.TimeoutExpired:
            installed = False
        if not installed:
            failed.append(whl)

    if failed:
        print(f"  FAILED WHEELS: {len(failed)}/{len(wheels)}", flush=True)

    return bool(wheels) and not failed

# === MAIN INSTALLATION LOGIC ===
# Locate a wheel dataset directory (None when no candidate is found).
wheel_dir = find_wheel_dir()

if wheel_dir:
    # The boolean result is not inspected here; the import checks below
    # decide whether the environment is actually usable.
    install_from_wheels(wheel_dir)
else:
    # Nothing to install from: list what IS attached to help diagnose a
    # missing or misnamed dataset.
    print("NO WHEEL DATASET FOUND!", flush=True)
    print("Available inputs:", flush=True)
    if os.path.exists("/kaggle/input"):
        for item in sorted(os.listdir("/kaggle/input")):
            print(f"  - {item}", flush=True)
    print("\nATTACH: ryanaimo-vllm-wheels dataset", flush=True)

# === VERIFY IMPORTS ===
print("\nVERIFYING IMPORTS...", flush=True)
# vLLM is the only hard requirement: a failed import is re-raised and stops
# the cell. This also binds LLM and SamplingParams at module level for the
# later cells.
try:
    from vllm import LLM, SamplingParams
    import vllm
    print(f"  vllm: {vllm.__version__}", flush=True)
except ImportError as e:
    print(f"  vllm: FAILED - {e}", flush=True)
    raise

# The remaining packages are only reported here; a failure is logged and
# execution continues.
try:
    import transformers
    print(f"  transformers: {transformers.__version__}", flush=True)
except ImportError as e:
    print(f"  transformers: FAILED - {e}", flush=True)

# NOTE(review): a later cell does `import polars as pl` unguarded, so a
# failure logged here will still surface there.
try:
    import polars
    print(f"  polars: {polars.__version__}", flush=True)
except ImportError as e:
    print(f"  polars: FAILED - {e}", flush=True)

try:
    import sympy
    print(f"  sympy: {sympy.__version__}", flush=True)
except ImportError as e:
    print(f"  sympy: FAILED - {e}", flush=True)

try:
    import torch
    print(f"  torch: {torch.__version__} (CUDA: {torch.cuda.is_available()})", flush=True)
except ImportError as e:
    print(f"  torch: FAILED - {e}", flush=True)

print("\nCELL1 DONE", flush=True)

In [None]:
# CELL 2: IMPORTS
print("CELL2 START", flush=True)
import time
import math
import re
import gc
import tempfile
import subprocess
import statistics
from typing import Optional, List, Dict, Tuple, Any
from collections import Counter, defaultdict

print("numpy", flush=True)
import numpy as np

print("torch", flush=True)
import torch

print("polars", flush=True)
import polars as pl

# Wall-clock reference taken when this cell runs; CUTOFF_TIME derives from it.
START_TIME = time.time()
# Total run budget in seconds: 4 h 50 min = 17,400 s.
TOTAL_BUDGET = (4 * 60 + 50) * 60
# Absolute timestamp after which generation loops stop starting new work.
CUTOFF_TIME = START_TIME + TOTAL_BUDGET
# Number of problems used to split the remaining time per problem.
PROBLEMS_EXPECTED = 50
# Valid answer range (inclusive).
# NOTE(review): the extraction/selection helpers visible in this file
# hard-code 0 and 99999 instead of reading these constants.
ANSWER_MIN, ANSWER_MAX = 0, 99999

# Report the device, its total memory and the time budget.
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}", flush=True)
if torch.cuda.is_available():
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB", flush=True)
print(f"Budget: {TOTAL_BUDGET//3600}h {(TOTAL_BUDGET%3600)//60}m", flush=True)

In [None]:
# =============================================================================
# CELL 3: DUAL-MODEL LOADING (AWQ QUANTIZED)
# =============================================================================
# Strategy: Load BOTH models if memory allows, otherwise best single model
# - DeepSeek-R1: Reasoning (algebraic, backwards, verification)
# - Qwen-Coder: Code execution (computational, casework)
# =============================================================================
print("CELL3 START", flush=True)
import glob

# === MODEL PATHS (AWQ quantized preferred) ===

# DeepSeek-R1 paths (REASONING SPECIALIST)
DEEPSEEK_PATHS = [
    # AWQ quantized (our custom)
    "/kaggle/input/deepseek-r1-32b-awq",
    "/kaggle/input/ryancardwell/deepseek-r1-32b-awq",
    # Original weights
    "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-32b/1",
    "/kaggle/input/deepseek-r1-distill-qwen-32b",
    "/kaggle/input/deepseek-r1",
]

# Qwen-Coder paths (CODE EXECUTION SPECIALIST)
QWEN_CODER_PATHS = [
    # AWQ quantized (our custom)
    "/kaggle/input/qwen-coder-32b-awq",
    "/kaggle/input/ryancardwell/qwen-coder-32b-awq",
    # Original weights
    "/kaggle/input/qwen-coder-32b",
    "/kaggle/input/Qwen/Qwen2.5-Coder-32B-Instruct",
]

# Qwen-Math fallback paths
QWEN_MATH_PATHS = [
    "/kaggle/input/qwen-72b-math-int4",
    "/kaggle/input/qwen-72b-math-nf4",
    "/kaggle/input/qwen2.5-math-72b-instruct",
]

def find_model(paths):
    """Return the directory of the first available model among ``paths``.

    A candidate qualifies when it contains ``config.json`` directly, or
    somewhere beneath it. For nested matches the shallowest ``config.json``
    is chosen (ties broken alphabetically): ``glob`` returns results in
    filesystem order, so taking the first hit would make the selected
    sub-directory vary between environments.

    Args:
        paths: Candidate directories, in priority order.

    Returns:
        The directory holding ``config.json``, or ``None`` if no candidate
        yields one.
    """
    for root in paths:
        if not os.path.exists(root):
            continue
        if os.path.exists(os.path.join(root, "config.json")):
            return root
        nested = glob.glob(os.path.join(root, "**", "config.json"), recursive=True)
        if nested:
            shallowest = min(nested, key=lambda cfg: (cfg.count(os.sep), cfg))
            return os.path.dirname(shallowest)
    return None

def is_awq_model(path):
    """Tell whether the checkpoint at ``path`` is AWQ quantized.

    Three signals are checked, cheapest first:
      1. the path itself contains "awq" (case-insensitive);
      2. a ``quant_config.json`` file sits next to the weights;
      3. ``config.json`` declares ``quantization_config.quant_method`` as
         "awq", which identifies an AWQ checkpoint even when the directory
         name does not mention it.

    Args:
        path: Model directory, or ``None``.

    Returns:
        True if any signal matches, False otherwise (including ``None``).
    """
    import json

    if path is None:
        return False
    if "awq" in path.lower():
        return True
    if os.path.exists(os.path.join(path, "quant_config.json")):
        return True

    # An unreadable or malformed config.json is treated as "not AWQ".
    try:
        with open(os.path.join(path, "config.json"), encoding="utf-8") as fh:
            config = json.load(fh)
    except (OSError, ValueError):
        return False

    quant = config.get("quantization_config") if isinstance(config, dict) else None
    if not isinstance(quant, dict):
        return False
    return str(quant.get("quant_method", "")).lower() == "awq"

def get_vram_gb():
    """Report the total memory of CUDA device 0 in GB (0 when CUDA is absent)."""
    if not torch.cuda.is_available():
        return 0
    device_props = torch.cuda.get_device_properties(0)
    return device_props.total_memory / 1e9

# === FIND AVAILABLE MODELS ===
print("SEARCHING MODELS...", flush=True)

DEEPSEEK_PATH = find_model(DEEPSEEK_PATHS)
QWEN_CODER_PATH = find_model(QWEN_CODER_PATHS)
QWEN_MATH_PATH = find_model(QWEN_MATH_PATHS)

print(f"  DeepSeek: {DEEPSEEK_PATH}", flush=True)
print(f"  Qwen-Coder: {QWEN_CODER_PATH}", flush=True)
print(f"  Qwen-Math: {QWEN_MATH_PATH}", flush=True)

# === DETERMINE LOADING STRATEGY ===
VRAM_GB = get_vram_gb()
print(f"  VRAM: {VRAM_GB:.1f}GB", flush=True)

# AWQ 4-bit: ~16GB for 32B model, ~8GB for 32B with aggressive settings
# Two AWQ models need ~24-32GB
# Single AWQ model fits in 16GB T4

MODELS = {}  # {"reasoning": LLM, "coding": LLM}
MODEL_CONFIGS = {}  # {"reasoning": {"path": ..., "is_deepseek": ...}, ...}

def load_model(path, name, gpu_util=0.45, max_len=8192):
    """Build a vLLM ``LLM`` engine for the checkpoint stored at ``path``.

    Args:
        path: Model directory handed to vLLM as ``model``.
        name: Human-readable label used only in the progress messages.
        gpu_util: Value passed as ``gpu_memory_utilization``.
        max_len: Value passed as ``max_model_len``.

    Returns:
        The constructed ``LLM`` instance.
    """
    print(f"LOADING {name} from {path}...", flush=True)
    print(f"  gpu_util={gpu_util}, max_len={max_len}", flush=True)

    engine_args = dict(
        model=path,
        tensor_parallel_size=1,
        gpu_memory_utilization=gpu_util,
        trust_remote_code=True,
        max_model_len=max_len,
        enforce_eager=True,
        seed=42,
        tokenizer_mode="auto",
    )

    # AWQ checkpoints are loaded with the quantization method stated explicitly.
    if is_awq_model(path):
        engine_args["quantization"] = "awq"
        print(f"  Using AWQ quantization", flush=True)

    engine = LLM(**engine_args)
    print(f"  {name} LOADED!", flush=True)
    return engine

# === LOADING LOGIC ===
# Choose a loading strategy from what was found on disk and the GPU size.
# Every branch fills both MODELS["reasoning"] and MODELS["coding"] (and the
# matching MODEL_CONFIGS entries) so downstream routing never hits a missing
# key; in single-model branches both keys alias the same engine and config.
if DEEPSEEK_PATH and QWEN_CODER_PATH and VRAM_GB >= 28:
    # DUAL MODEL MODE - Load both (need ~28GB+ VRAM)
    # NOTE(review): each engine is given gpu_util=0.45, i.e. about 12.6GB at
    # the 28GB threshold, while the sizing comment above estimates ~16GB for
    # one 4-bit 32B model — confirm that both engines really fit.
    print("\n=== DUAL MODEL MODE ===", flush=True)

    MODELS["reasoning"] = load_model(DEEPSEEK_PATH, "DeepSeek-R1", gpu_util=0.45, max_len=8192)
    MODEL_CONFIGS["reasoning"] = {"path": DEEPSEEK_PATH, "is_deepseek": True}

    MODELS["coding"] = load_model(QWEN_CODER_PATH, "Qwen-Coder", gpu_util=0.45, max_len=8192)
    MODEL_CONFIGS["coding"] = {"path": QWEN_CODER_PATH, "is_deepseek": False}

    DUAL_MODEL_MODE = True

elif DEEPSEEK_PATH:
    # SINGLE MODEL MODE - DeepSeek (best overall)
    # Also taken when both models exist but VRAM is below the dual threshold.
    print("\n=== SINGLE MODEL MODE (DeepSeek) ===", flush=True)

    # AWQ checkpoints get a slightly larger memory share and a longer context.
    is_awq = is_awq_model(DEEPSEEK_PATH)
    gpu_util = 0.92 if is_awq else 0.90
    max_len = 16384 if is_awq else 12288

    MODELS["reasoning"] = load_model(DEEPSEEK_PATH, "DeepSeek-R1", gpu_util=gpu_util, max_len=max_len)
    MODEL_CONFIGS["reasoning"] = {"path": DEEPSEEK_PATH, "is_deepseek": True}
    MODELS["coding"] = MODELS["reasoning"]  # Use same model for both
    MODEL_CONFIGS["coding"] = MODEL_CONFIGS["reasoning"]

    DUAL_MODEL_MODE = False

elif QWEN_CODER_PATH:
    # SINGLE MODEL MODE - Qwen-Coder
    print("\n=== SINGLE MODEL MODE (Qwen-Coder) ===", flush=True)

    is_awq = is_awq_model(QWEN_CODER_PATH)
    gpu_util = 0.92 if is_awq else 0.90
    max_len = 16384 if is_awq else 8192

    MODELS["coding"] = load_model(QWEN_CODER_PATH, "Qwen-Coder", gpu_util=gpu_util, max_len=max_len)
    MODEL_CONFIGS["coding"] = {"path": QWEN_CODER_PATH, "is_deepseek": False}
    MODELS["reasoning"] = MODELS["coding"]
    MODEL_CONFIGS["reasoning"] = MODEL_CONFIGS["coding"]

    DUAL_MODEL_MODE = False

elif QWEN_MATH_PATH:
    # FALLBACK - Qwen-Math
    # is_deepseek=False, so the non-DeepSeek prompt set and chat format apply.
    print("\n=== FALLBACK MODE (Qwen-Math) ===", flush=True)

    MODELS["reasoning"] = load_model(QWEN_MATH_PATH, "Qwen-Math", gpu_util=0.95, max_len=8192)
    MODEL_CONFIGS["reasoning"] = {"path": QWEN_MATH_PATH, "is_deepseek": False}
    MODELS["coding"] = MODELS["reasoning"]
    MODEL_CONFIGS["coding"] = MODEL_CONFIGS["reasoning"]

    DUAL_MODEL_MODE = False

else:
    # No model dataset attached: list the inputs for diagnosis, then abort.
    print("\nNO MODEL FOUND! Available inputs:", flush=True)
    if os.path.exists("/kaggle/input"):
        for item in sorted(os.listdir("/kaggle/input")):
            print(f"  - {item}", flush=True)
    raise FileNotFoundError("Attach a model dataset!")

# === SUMMARY ===
print(f"\n{'='*50}", flush=True)
print(f"MODELS LOADED:", flush=True)
print(f"  Dual mode: {DUAL_MODEL_MODE}", flush=True)
print(f"  Reasoning: {MODEL_CONFIGS['reasoning']['path']}", flush=True)
print(f"  Coding: {MODEL_CONFIGS['coding']['path']}", flush=True)
print(f"{'='*50}", flush=True)

In [None]:
# =============================================================================
# CELL 4: TACTICAL PROMPTS (Model-Specialized)
# =============================================================================
# Routing:
#   - algebraic, backwards, verification → REASONING model (DeepSeek)
#   - computational, casework → CODING model (Qwen-Coder)
# =============================================================================

# PROMPT ROUTING: which model handles which prompts
PROMPT_ROUTING = {
    'algebraic': 'reasoning',      # Pure math, symbolic manipulation
    'backwards': 'reasoning',      # Working from goal to given
    'verification': 'reasoning',   # Proof-based verification
    'computational': 'coding',     # Python code execution
    'casework': 'coding',          # Systematic enumeration
}

# DeepSeek prompts (reasoning specialist)
PROMPTS_DEEPSEEK = {
    'algebraic': """Solve this mathematics olympiad problem using algebraic manipulation.
Think deeply in <think> tags. Define all variables carefully. Check all cases.
Verify your answer by substitution. Return the final integer in \\boxed{}.
Answer must be 0-99999.""",

    'backwards': """Solve this mathematics olympiad problem by working backwards.
In <think> tags, analyze what form the answer must take.
What constraints does the problem impose? Derive from goal to given.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",

    'verification': """Solve this mathematics olympiad problem with rigorous verification.
First solve in <think> tags. Then VERIFY: substitute answer into all constraints.
If verification fails, try a different approach. Only accept verified answers.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",

    # DeepSeek can also code (fallback if no Qwen-Coder)
    'computational': """Solve this mathematics olympiad problem by writing Python code.
In <think> tags, reason about the approach. Then write clean Python with sympy.
Execute mentally and verify. Print the final answer.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",

    'casework': """Solve this mathematics olympiad problem by systematic case analysis.
In <think> tags, enumerate every possible case exhaustively.
For each case, compute the contribution. Sum all cases.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",
}

# Qwen-Coder prompts (code execution specialist)
PROMPTS_QWEN_CODER = {
    'computational': """Write Python code to solve this mathematics olympiad problem.
Use sympy for symbolic math, numpy for numerical computation.
Print intermediate results for verification. Print the final answer clearly.
The answer must be an integer between 0 and 99999. Put it in \\boxed{}.""",

    'casework': """Write Python code to enumerate all cases for this olympiad problem.
Use itertools for systematic enumeration. Check every possibility.
Count or sum contributions from each valid case.
Print the final answer. It must be an integer 0-99999 in \\boxed{}.""",

    # Qwen-Coder can also reason (fallback)
    'algebraic': """Solve this mathematics olympiad problem step-by-step.
First reason about the approach. Then write Python code to verify.
Show your algebraic work, then confirm with code.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",

    'backwards': """Solve this problem by working backwards. Write Python to verify.
What form must the answer take? Work from the goal to the given.
Verify your reasoning with code.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",

    'verification': """Solve this problem, then verify with Python code.
First derive the answer algebraically. Then write code that checks every constraint.
Only accept the answer if code verification passes.
Return the final integer answer in \\boxed{}. Answer must be 0-99999.""",
}

# Function to get prompt for a given type and model
def get_prompt(prompt_type, model_key):
    """Look up the instruction text for ``prompt_type`` on the given model.

    The prompt table is selected by the ``is_deepseek`` flag stored in
    ``MODEL_CONFIGS[model_key]``. Unknown prompt types fall back to the
    table's default entry ('algebraic' for DeepSeek, 'computational'
    otherwise).
    """
    uses_deepseek = MODEL_CONFIGS[model_key].get("is_deepseek", False)
    if uses_deepseek:
        table, default_key = PROMPTS_DEEPSEEK, 'algebraic'
    else:
        table, default_key = PROMPTS_QWEN_CODER, 'computational'
    return table.get(prompt_type, table[default_key])

# Temperature schedules
TEMPERATURES = [1.0, 0.85, 0.7]

print(f"Prompts configured:", flush=True)
print(f"  Reasoning prompts: algebraic, backwards, verification", flush=True)
print(f"  Coding prompts: computational, casework", flush=True)
print(f"  Temperatures: {TEMPERATURES}", flush=True)

In [None]:
# =============================================================================
# CELL 5: CODE EXECUTION ENGINE (TIR)
# =============================================================================

# Prelude prepended to every generated program before it is executed.
# numpy and sympy are optional: a missing package must not break programs
# that never use it. The standard-library star-imports come AFTER sympy on
# purpose, because sympy's star-import exports its own symbolic `product`,
# which would otherwise shadow itertools.product.
STDLIB = '''
import sys; sys.setrecursionlimit(20000)
try:
    import numpy as np
except Exception: pass
try:
    from sympy import *
    from sympy.ntheory import factorint, divisors, totient, isprime, primefactors
    from sympy.combinatorics import Permutation, PermutationGroup
except Exception: pass
import math
from itertools import *
from collections import *
from functools import lru_cache, reduce
from fractions import Fraction

def C(n, k):
    if k < 0 or k > n: return 0
    return math.factorial(n) // (math.factorial(k) * math.factorial(n - k))

def P(n, k):
    if k < 0 or k > n: return 0
    return math.factorial(n) // math.factorial(n - k)
'''


def _parse_last_int(stdout: str) -> Optional[int]:
    """Return the last number in ``stdout`` if it denotes an integer value.

    Decimal tokens are read as a whole ("12.0" -> 12) instead of being split
    at the dot; a non-integral last number ("3.5") yields ``None`` rather
    than a bogus answer built from its fractional digits.
    """
    tokens = re.findall(r'-?\d+(?:\.\d+)?', stdout)
    if not tokens:
        return None
    whole, _, frac = tokens[-1].partition('.')
    if frac.strip('0'):
        return None
    whole = whole.lstrip('0') if not whole.startswith('-') else whole
    # Cap the digit count so absurdly long runs are rejected without parsing.
    if len(whole.lstrip('-')) > 18:
        return None
    return int(whole or '0')


def execute_code(code: str, timeout: int = 30) -> Tuple[Optional[int], str]:
    """Execute Python code in a subprocess and extract an integer answer.

    The program is prefixed with ``STDLIB``. When it contains no ``print(``
    call, a small epilogue is appended that prints the first conventionally
    named result variable (``answer``, ``result``, ...) convertible to int.

    Args:
        code: Source of the program to run; empty or None-like returns early.
        timeout: Seconds before the subprocess is abandoned.

    Returns:
        ``(value, code)`` where ``value`` is the last number printed, provided
        the process exited with status 0 and the number is an integer within
        0..99999; otherwise ``(None, code)``. Empty input gives ``(None, "")``.
    """
    if not code:
        return None, ""

    # Add print snooping if no print
    has_print = 'print(' in code
    snoop = '''
_vars = dict(globals())
for _v in ['answer', 'result', 'ans', 'res', 'final', 'output', 'solution', 'total', 'count', 'n', 'ret']:
    if _v in _vars and _vars[_v] is not None:
        try: print(int(_vars[_v])); break
        except: pass
'''

    full_code = STDLIB + "\n" + code + ("" if has_print else "\n" + snoop)

    tmp_path = None
    try:
        # UTF-8 matches Python's default source encoding, so non-ASCII
        # characters in generated code survive whatever the locale is.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.py', delete=False, encoding='utf-8'
        ) as f:
            tmp_path = f.name
            f.write(full_code)

        # sys.executable is the interpreter running this notebook, i.e. the
        # one that has the installed packages; a bare 'python' may resolve
        # to a different interpreter or to nothing at all.
        result = subprocess.run(
            [sys.executable, tmp_path],
            capture_output=True, text=True, timeout=timeout,
        )

        if result.returncode == 0:
            val = _parse_last_int(result.stdout)
            if val is not None and 0 <= val <= 99999:
                return val, code
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # Best effort: a program that hangs, cannot be launched or cannot be
        # written out simply yields no answer.
        pass
    finally:
        # Always cleanup temp file
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    return None, code

print("TIR engine ready"); sys.stdout.flush()

In [None]:
# =============================================================================
# CELL 6: ANSWER EXTRACTION
# =============================================================================

def extract_boxed(text: str) -> Optional[int]:
    """Extract the final integer answer from a model response.

    The LAST ``\\boxed{N}`` in the text wins, whether or not it carries a
    backslash or inner whitespace. (Trying a strict pattern before a lenient
    one would let an earlier ``\\boxed{5}`` override a later ``\\boxed{ 7 }``.)
    If there is no boxed integer, or the last one is out of range, a few
    "answer is N"-style phrasings are searched in the last 500 characters.

    Args:
        text: Raw model output.

    Returns:
        The answer as an int within 0..99999, or ``None`` if nothing usable
        is found.
    """
    def in_range(digits: str) -> Optional[int]:
        # More than five significant digits cannot be <= 99999; rejecting on
        # length avoids converting pathological digit runs.
        significant = digits.lstrip('0') or '0'
        if len(significant) > 5:
            return None
        return int(significant)

    boxed = re.findall(r'\\?boxed\s*\{\s*(\d+)\s*\}', text)
    if boxed:
        val = in_range(boxed[-1])
        if val is not None:
            return val

    # Fallback patterns
    tail = text[-500:]
    fallback_patterns = (
        r'answer\s*(?:is|=|:)\s*(\d+)',
        r'=\s*(\d+)\s*$',
        r'final answer[:\s]+(\d+)',
    )
    for pattern in fallback_patterns:
        matches = re.findall(pattern, tail, re.IGNORECASE)
        if matches:
            val = in_range(matches[-1])
            if val is not None:
                return val

    return None

def extract_python_code(text: str) -> Optional[str]:
    """Pull the last fenced code block out of a markdown response.

    Fences tagged ``python`` are preferred, then ``py``, then untagged ones;
    within the first category that has any match, the last block is returned
    with surrounding whitespace stripped. Returns ``None`` without a match.
    """
    fence_patterns = (
        r'```python\s*\n(.*?)```',
        r'```py\s*\n(.*?)```',
        r'```\s*\n(.*?)```',
    )
    flags = re.DOTALL | re.IGNORECASE
    found_per_pattern = (re.findall(p, text, flags) for p in fence_patterns)
    blocks = next((found for found in found_per_pattern if found), None)
    if blocks is None:
        return None
    return blocks[-1].strip()

print("Answer extraction ready"); sys.stdout.flush()

In [None]:
# =============================================================================
# CELL 7: VALUE CLUSTERING (88% error reduction)
# =============================================================================

def relative_distance(a: int, b: int) -> float:
    """Relative gap between two values: |a-b| / max(|a|, |b|).

    Equal values are at distance 0.0; if exactly one of them is zero the
    distance is defined as 1.0.
    """
    if a == b:
        return 0.0
    if 0 in (a, b):
        return 1.0
    larger_magnitude = max(abs(a), abs(b))
    return abs(a - b) / larger_magnitude

def value_clustering(answers: List[int], threshold: float = 0.05) -> Tuple[int, float]:
    """Group answers that are relatively close and summarise the best group.

    Two answers are linked when ``relative_distance`` between them is below
    ``threshold``; linked answers form clusters transitively. Each cluster is
    scored by ``size * sqrt(tightness)`` and the highest-scoring one (first
    on ties) is reduced to a centre value.

    Returns:
        ``(center, confidence)`` where ``confidence`` is the share of answers
        that fall in the winning cluster. ``(0, 0.0)`` for empty input and
        ``(answer, 0.5)`` for a single answer.
    """
    total = len(answers)
    if total == 0:
        return 0, 0.0
    if total == 1:
        return answers[0], 0.5

    # Disjoint-set forest over answer indices.
    root_of = list(range(total))

    def find(idx):
        top = idx
        while root_of[top] != top:
            top = root_of[top]
        # Compress the path walked above.
        while root_of[idx] != top:
            root_of[idx], idx = top, root_of[idx]
        return top

    for i, left in enumerate(answers):
        for j in range(i + 1, total):
            if relative_distance(left, answers[j]) < threshold:
                ri, rj = find(i), find(j)
                if ri != rj:
                    root_of[ri] = rj

    # Clusters are kept in order of their first member so ties resolve the
    # same way every time.
    groups = {}
    for idx, value in enumerate(answers):
        groups.setdefault(find(idx), []).append(value)

    def cluster_score(members):
        if len(members) > 1:
            spread = statistics.stdev(members)
            centre = statistics.mean(members)
            tightness = 1 / (1 + spread / (centre + 1))
        else:
            tightness = 0.5
        return len(members) * math.sqrt(tightness)

    winner = max(groups.values(), key=cluster_score)

    # Basin refinement: blend the median with a (possibly trimmed) mean.
    median_val = int(statistics.median(winner))
    mean_val = median_val
    if len(winner) > 2:
        ordered = sorted(winner)
        if len(ordered) > 4:
            ordered = ordered[1:-1]
        mean_val = int(statistics.mean(ordered))

    return (median_val + mean_val) // 2, len(winner) / total

def mad_filter(answers: List[int], threshold: float = 3.0) -> List[int]:
    """Drop outliers by median absolute deviation (MAD).

    An answer is kept when its distance from the median, divided by the MAD,
    is below ``threshold``. The input is returned unchanged when it has fewer
    than three items, when the MAD is zero, or when filtering would leave
    nothing.
    """
    if len(answers) < 3:
        return answers

    centre = statistics.median(answers)
    distances = [abs(value - centre) for value in answers]
    mad = statistics.median(distances)
    if mad == 0:
        return answers

    kept = [value for value, dist in zip(answers, distances) if dist / mad < threshold]
    return kept or answers

print("Value clustering ready"); sys.stdout.flush()

In [None]:
# =============================================================================
# CELL 8: LOG-WEIGHTED VOTING + CIC
# =============================================================================

def log_weighted_vote(counter: Counter, force_answer: bool = False) -> Tuple[int, float]:
    """Pick the winning answer by log-weighted vote.

    Each value's vote count is multiplied by ``log(1.25 + |value|)``, which
    damps small, trivial answers. Confidence is 0.7 when the winner is the
    only candidate, leads the runner-up by more than 1, or clears an absolute
    score threshold; otherwise it is 0.3.

    Args:
        counter: Mapping of answer -> number of votes.
        force_answer: With an empty counter, return 12453 instead of 0; with
            a non-empty one, also print the top five candidates.

    Returns:
        ``(best_value, confidence)``.
    """
    if not counter:
        fallback = 12453 if force_answer else 0
        return (fallback, 0.1)

    # (score, votes, value) triples, in the counter's own order.
    scored = [
        (math.log(1.25 + abs(value)) * votes, votes, value)
        for value, votes in counter.items()
    ]
    total_score = sum(score for score, _, _ in scored)
    score_list = sorted(scored, key=lambda entry: -entry[0])

    top_score, _, top_value = score_list[0]
    cutoff = total_score / (2 + math.log(1 + total_score))

    sole_candidate = len(score_list) == 1
    clear_lead = (not sole_candidate) and top_score - score_list[1][0] > 1
    is_confident = sole_candidate or clear_lead or top_score > max(3, cutoff)

    if force_answer:
        print(f"    Log-vote: {[(v, f'{s:.1f}', c) for s, c, v in score_list[:5]]}")
        sys.stdout.flush()

    return (top_value, 0.7 if is_confident else 0.3)

def cic_confidence(answers: List[int], lambda_: float = 0.3, gamma: float = 0.1) -> float:
    """Confidence score combining cluster share, entropy and mode share.

    Computes ``F = phi - lambda_ * H_norm + gamma * c`` where ``phi`` is the
    confidence reported by ``value_clustering``, ``H_norm`` is the entropy of
    the answer distribution normalised by ``log(len(answers))``, and ``c`` is
    the share of the most common answer. ``F`` is clipped to [0, 1] and
    mapped onto [0.5, 1.0].

    Returns:
        0.0 for no answers, 0.5 for a single answer, else the mapped score.
    """
    total = len(answers)
    if total == 0:
        return 0.0
    if total == 1:
        return 0.5

    phi = value_clustering(answers)[1]

    tally = Counter(answers)
    shares = [votes / total for votes in tally.values()]
    entropy = -sum(share * math.log(share + 1e-10) for share in shares)
    entropy_norm = entropy / (math.log(total) + 1e-10)

    mode_share = tally.most_common(1)[0][1] / total

    functional = phi - lambda_ * entropy_norm + gamma * mode_share
    return 0.5 + 0.5 * np.clip(functional, 0, 1)

print("Log-weighted voting + CIC ready"); sys.stdout.flush()

In [None]:
# =============================================================================
# CELL 9: DIVERGENT GENERATION (DUAL-MODEL ROUTING)
# =============================================================================
# Routes prompts to specialist models:
#   - algebraic, backwards, verification → REASONING model
#   - computational, casework → CODING model
# =============================================================================

# Stop tokens by model type
STOP_TOKENS_DEEPSEEK = ["<｜end▁of▁sentence｜>", "<|endoftext|>"]
STOP_TOKENS_QWEN = ["```output", "```\nOutput", "<|im_end|>", "<|endoftext|>"]

# Reflexion prompts
REFLEXION_SMALL = "Are you sure that is the answer? Double-check your work."
REFLEXION_QUICK = "Have you verified your answer using an alternative method?"
REFLEXION_NOBOX = "Place your final answer in \\boxed{}. Answer must be 0-99999."

def format_prompt(messages: List[Dict], is_deepseek: bool) -> str:
    """Render a chat transcript into the raw prompt string for one model family.

    DeepSeek style: a begin-of-sentence marker, then each message prefixed by
    a User tag (for ``system`` and ``user`` roles) or an Assistant tag, then a
    trailing Assistant tag, all concatenated without separators.
    Otherwise: ``<|im_start|>role ... <|im_end|>`` blocks joined by newlines,
    ending with an opened assistant block.
    """
    if not is_deepseek:
        turns = [
            f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>"
            for m in messages
        ]
        turns.append("<|im_start|>assistant\n")
        return "\n".join(turns)

    rendered = "<｜begin▁of▁sentence｜>"
    for m in messages:
        if m["role"] in ["system", "user"]:
            speaker = "<｜User｜>"
        else:
            speaker = "<｜Assistant｜>"
        rendered += f"{speaker}{m['content']}"
    return rendered + "<｜Assistant｜>"

def generate_single(question: str, prompt_type: str, temp: float) -> Optional[int]:
    """Produce one candidate answer for ``question``.

    The prompt type selects the model via ``PROMPT_ROUTING``. Up to two
    generation rounds are run: after the first, a follow-up ("reflexion")
    message is queued when no boxed answer was found, when the boxed answer
    is <= 10, or when the first reply is shorter than 1000 characters; the
    temperature is scaled by 0.7 per round. The answer is then read from the
    concatenated replies, with code execution as a last resort for the
    'computational' and 'casework' prompt types.

    Returns:
        The extracted integer, or ``None`` if the deadline has passed or no
        answer could be obtained.
    """
    if time.time() >= CUTOFF_TIME:
        return None

    # Route to correct model
    model_key = PROMPT_ROUTING.get(prompt_type, 'reasoning')
    engine = MODELS[model_key]
    deepseek = MODEL_CONFIGS[model_key].get("is_deepseek", False)

    conversation = [
        {"role": "system", "content": get_prompt(prompt_type, model_key)},
        {"role": "user", "content": question},
    ]

    stop_tokens = STOP_TOKENS_DEEPSEEK if deepseek else STOP_TOKENS_QWEN
    token_budget = 8192 if deepseek else 6144
    replies = []

    for round_idx in range(2):
        if time.time() >= CUTOFF_TIME:
            break

        rendered = format_prompt(conversation, deepseek)
        sampling = SamplingParams(
            temperature=temp * (0.7 ** round_idx),
            top_p=0.95,
            max_tokens=token_budget,
            stop=stop_tokens,
        )

        try:
            reply = engine.generate([rendered], sampling_params=sampling)[0].outputs[0].text
        except Exception as e:
            print(f"      Gen error ({prompt_type}): {e}", flush=True)
            break

        conversation.append({"role": "assistant", "content": reply})
        replies.append(reply)

        # Decide whether this reply warrants a follow-up round.
        boxed = extract_boxed(reply)
        if boxed is None:
            follow_up = REFLEXION_NOBOX
        elif boxed <= 10:
            follow_up = REFLEXION_SMALL
        elif round_idx == 0 and len(reply) < 1000:
            follow_up = REFLEXION_QUICK
        else:
            break
        conversation.append({"role": "user", "content": follow_up})

    # Final extraction over everything the model said.
    full_response = "".join(replies)
    answer = extract_boxed(full_response)

    # DeepSeek: check after </think>
    if answer is None and deepseek and "</think>" in full_response:
        answer = extract_boxed(full_response.split("</think>")[-1])

    # Code execution for computational prompts
    if answer is None and prompt_type in ['computational', 'casework']:
        code = extract_python_code(full_response)
        if code:
            answer, _ = execute_code(code)

    return answer

def divergent_sampling(question: str, max_samples: int = 30) -> List[int]:
    """
    Collect candidate answers across prompt types and temperatures.

    The plan holds two attempts for every (temperature, prompt type) pair,
    temperatures outermost, truncated to ``max_samples``. Attempts run one
    after another; sampling stops at the global deadline, or as soon as at
    least 8 answers exist and the log-weighted vote reports confidence >= 0.7.
    """
    plan = [
        (prompt_type, temp)
        for temp in TEMPERATURES
        for prompt_type in PROMPT_ROUTING.keys()
        for _ in range(2)  # 2 samples per prompt/temp combo
    ][:max_samples]

    # Show routing info
    if DUAL_MODEL_MODE:
        print(f"    DUAL MODE: Routing to specialist models", flush=True)
    print(f"    Generating {len(plan)} paths...", flush=True)

    collected = []
    for done, (prompt_type, temp) in enumerate(plan, start=1):
        if time.time() >= CUTOFF_TIME:
            break

        candidate = generate_single(question, prompt_type, temp)
        if candidate is not None:
            collected.append(candidate)

        if done % 5 == 0:
            print(f"      {done}/{len(plan)} done, {len(collected)} valid", flush=True)

        # Early exit on high confidence
        if len(collected) < 8:
            continue
        _, conf = log_weighted_vote(Counter(collected))
        if conf >= 0.7:
            print(f"    Early exit: conf={conf:.2f}", flush=True)
            return collected

    return collected

print(f"Divergent sampling ready (dual-model routing: {DUAL_MODEL_MODE})", flush=True)

In [None]:
# =============================================================================
# CELL 10: CONVERGENT SELECTION
# =============================================================================

def convergent_selection(answers: List[int]) -> Tuple[int, float]:
    """Pick one final answer and a confidence from the sampled candidates.

    Args:
        answers: Candidate answers collected during sampling.

    Returns:
        ``(answer, confidence)``. With no candidates the fixed fallback
        ``(12453, 0.1)`` is returned; a lone candidate is returned as-is with
        confidence 0.5. Otherwise the candidates go through ``mad_filter``,
        and the survivors are scored by ``value_clustering``,
        ``log_weighted_vote`` and ``cic_confidence``. The cluster center wins
        only when its confidence exceeds 0.8 and it disagrees with the vote;
        in every other case the voted answer is used with a 0.5/0.3/0.2 blend
        of the vote, cluster and CIC confidences. The chosen answer is
        clamped to the range 0..99999.
    """
    if not answers:
        return 12453, 0.1
    if len(answers) == 1:
        return answers[0], 0.5

    # Outlier removal
    survivors = mad_filter(answers)
    print(f"    MAD: {len(answers)} → {len(survivors)}", flush=True)

    # Three independent views of the surviving candidates
    center, center_conf = value_clustering(survivors)
    print(f"    Cluster: {center}, conf={center_conf:.2f}", flush=True)

    voted, vote_conf = log_weighted_vote(Counter(survivors), force_answer=True)
    print(f"    Log-vote: {voted}, conf={vote_conf:.2f}", flush=True)

    spread_conf = cic_confidence(survivors)
    print(f"    CIC: {spread_conf:.2f}", flush=True)

    # A confident cluster overrides the vote only when the two disagree
    cluster_overrides = center_conf > 0.8 and center != voted
    if cluster_overrides:
        chosen, chosen_conf = center, center_conf
        print(f"    → Using cluster center", flush=True)
    else:
        chosen = voted
        chosen_conf = 0.5 * vote_conf + 0.3 * center_conf + 0.2 * spread_conf

    return max(0, min(99999, chosen)), chosen_conf

# Cell-load marker
print("Convergent selection ready", flush=True)

In [None]:
# =============================================================================
# CELL 11: MAIN SOLVER
# =============================================================================

# Number of solve() calls so far; solve() increments it on entry and uses it
# to split the remaining time budget across the problems still expected.
PROBLEM_COUNT = 0
# IDs of problems for which predict() has obtained an answer; its size is
# reported in the end-of-run summary.
SOLVED_IDS = set()

def solve(question: str, question_id: str) -> int:
    """Run the Ω-AIMO3 pipeline on one problem and return its answer.

    The remaining wall-clock budget (CUTOFF_TIME minus now) is divided evenly
    over the problems still expected, counting this one, and that per-problem
    budget selects how many samples to draw: 30 above 300s, 20 above 180s,
    10 above 60s, otherwise 5.

    Args:
        question: Problem statement.
        question_id: Problem identifier. Accepted for the caller's benefit;
            this function does not read it.

    Returns:
        The answer chosen by ``convergent_selection``. If sampling produced no
        answers at all, the first integer in the question text that lies
        strictly between 0 and 100000 is returned, or 12453 if there is none.

    Side effects:
        Increments the module-level PROBLEM_COUNT and prints progress lines.
    """
    global PROBLEM_COUNT
    PROBLEM_COUNT += 1

    time_remaining = CUTOFF_TIME - time.time()
    # "+ 1" because PROBLEM_COUNT already includes the current problem
    problems_remaining = max(1, PROBLEMS_EXPECTED - PROBLEM_COUNT + 1)
    time_per_problem = time_remaining / problems_remaining

    # Adaptive samples
    if time_per_problem > 300:
        max_samples = 30
    elif time_per_problem > 180:
        max_samples = 20
    elif time_per_problem > 60:
        max_samples = 10
    else:
        max_samples = 5

    print(f"  Budget: {time_per_problem:.0f}s, samples: {max_samples}", flush=True)

    # try/finally so memory is released on every exit path: previously the
    # no-answer fallback (and any exception) skipped the cleanup below.
    try:
        # DIVERGE
        answers = divergent_sampling(question, max_samples)
        print(f"    Got {len(answers)} answers: {Counter(answers).most_common(5)}", flush=True)

        if not answers:
            # Last-resort guess: first in-range integer quoted in the problem
            in_text = (int(tok) for tok in re.findall(r'\b\d+\b', question))
            fallback = next((n for n in in_text if 0 < n < 100000), 12453)
            print(f"  FALLBACK: {fallback} (no valid answers)", flush=True)
            return fallback

        # CONVERGE
        final_answer, confidence = convergent_selection(answers)
        print(f"  FINAL: {final_answer} (conf: {confidence:.2f})", flush=True)
        return final_answer
    finally:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Cell-load marker
print("Solver ready", flush=True)

In [None]:
# =============================================================================
# CELL 12: KAGGLE API
# =============================================================================

def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame:
    """AIMO3 API entry point: answer the problem carried by this call.

    Args:
        id_: Series whose first element is the problem identifier.
        problem: Series whose first element is the problem statement.

    Returns:
        DataFrame with an "id" column taken from ``id_`` and an "answer"
        column holding the value returned by ``solve``.

    Side effects:
        Prints a header and the answer, and records the identifier in
        SOLVED_IDS once ``solve`` has returned.
    """
    question_id, question = id_.item(0), problem.item(0)
    minutes_left = (CUTOFF_TIME - time.time()) / 60

    # Header for this problem; the flush on the last line covers all three
    print(f"\n{'='*60}")
    print(f"Problem {PROBLEM_COUNT + 1} | {question_id} | {minutes_left:.1f}m remaining")
    print(f"Q: {question[:80]}...", flush=True)

    answer = solve(question, question_id)
    SOLVED_IDS.add(question_id)

    print(f"ANSWER: {answer}")
    print(f"{'='*60}", flush=True)

    return pl.DataFrame({"id": id_, "answer": answer})

# Cell-load marker
print("API ready", flush=True)

In [None]:
# =============================================================================
# CELL 13: RUN
# =============================================================================

# Startup banner: echoes the mode, model paths and time budget in effect
print("="*60, flush=True)
print("RYANAIMO - Ω-AIMO3 Dual-Model Pipeline", flush=True)
print("="*60, flush=True)
print(f"Mode: {'DUAL MODEL' if DUAL_MODEL_MODE else 'SINGLE MODEL'}", flush=True)
print(f"Reasoning: {MODEL_CONFIGS['reasoning']['path']}", flush=True)
print(f"Coding: {MODEL_CONFIGS['coding']['path']}", flush=True)
print(f"Pipeline: 5 prompts × 3 temps × 2 samples = 30 max", flush=True)
print(f"Selection: Value clustering + Log-weighted voting", flush=True)
print(f"Budget: {TOTAL_BUDGET//3600}h {(TOTAL_BUDGET%3600)//60}m", flush=True)
print("="*60, flush=True)

# Probe for the evaluation package; without it only a single hard-coded
# sample problem is run through predict().
try:
    import kaggle_evaluation.aimo_3_inference_server
except ImportError:
    HAS_KAGGLE_EVAL = False
    print("WARNING: kaggle_evaluation not available", flush=True)
else:
    HAS_KAGGLE_EVAL = True

if not HAS_KAGGLE_EVAL:
    print("\n>>> VALIDATION MODE <<<", flush=True)
    sample_id = pl.Series(["test_001"])
    sample_problem = pl.Series(["Find the remainder when 2^100 is divided by 127."])
    result = predict(sample_id, sample_problem)
    print(f"\nResult: {result}", flush=True)
else:
    server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)

    if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        print("\n>>> LOCAL MODE <<<", flush=True)
        # Candidate test files, in priority order; the first one that exists is used
        test_paths = [
            '/kaggle/input/ai-mathematical-olympiad-progress-prize-3/test.csv',
            '/kaggle/input/aimo-validation-aime/aime_problems.csv',
            'test.csv',
        ]
        test_file = next((p for p in test_paths if os.path.exists(p)), None)

        if test_file:
            print(f"Using: {test_file}", flush=True)
            server.run_local_gateway((test_file,))
        else:
            print("No test file found", flush=True)
    else:
        print("\n>>> COMPETITION MODE <<<", flush=True)
        server.serve()

# End-of-run summary
print(f"\nCompleted in {(time.time() - START_TIME)/60:.1f}m", flush=True)
print(f"Solved: {len(SOLVED_IDS)} problems", flush=True)