# RYANAIMO - AIMO3 (Three-Model Strategy)

## Pass 1: DeepSeek-R1-Distill-Llama-70B
- Primary reasoning model with `<think>` tags
- 94.5% MATH-500, 70.0% AIME 2024
- Solves all 50 problems first

## Pass 2: Qwen-72B-Math (IMO fine-tuned)
- Re-tackles problems that Pass 1 left unsolved or answered with confidence < 70%
- Specifically fine-tuned on olympiad math

## Pass 3: DeepSeek-Coder-V2-Lite (Verifier)
- Code verification on disagreements
- Tiebreaker when DeepSeek and Qwen disagree

**Kaggle Inputs:**
1. `deepseek-r1` - DeepSeek-R1-Distill-Llama-70B
2. `qwen-72b-math-nf4` - Your fine-tuned Qwen model (optional)
3. `deepseek-coder-v2-lite-nf4` - Coder verifier (optional)
4. `vllm-wheels-py311` - Python 3.11 + torch 2.6.0 compatible wheels

In [None]:
# =============================================================================
# CELL 1: ENVIRONMENT SETUP
# =============================================================================
print("CELL1 START", flush=True)

import os
import sys
import subprocess
import glob as globmod

# Process-wide environment flags, set before torch / vLLM are imported below.
os.environ.update({
    "HF_HUB_OFFLINE": "1",
    "TRANSFORMERS_OFFLINE": "1",
    "HF_DATASETS_OFFLINE": "1",
    "VLLM_LOGGING_LEVEL": "WARNING",
    "TOKENIZERS_PARALLELISM": "false",
})

# Check Kaggle's torch BEFORE any installs
# This records which torch build the image ships with; the pip steps further
# down use --no-deps so that this build is the one vLLM ends up running on.
import torch
print(f"Kaggle torch: {torch.__version__}", flush=True)
print(f"CUDA: {torch.cuda.is_available()}", flush=True)
if torch.cuda.is_available():
    # Only device 0 is reported here.
    print(f"GPU: {torch.cuda.get_device_name(0)}", flush=True)

# Locate the offline wheel directory. Candidates are probed in priority order
# and the first one that actually contains *.whl files wins.
WHEEL_CANDIDATES = [
    "/kaggle/input/vllm-wheels-py311",
    "/kaggle/input/deepseek-offline-wheels",
    "/kaggle/input/aimo3-vllm-wheels",
    "/kaggle/input/vllm-cp311-wheels",
]

wheel_dir = None
for candidate in WHEEL_CANDIDATES:
    if not os.path.exists(candidate):
        continue
    whl_files = globmod.glob(f"{candidate}/*.whl")
    if not whl_files:
        continue
    wheel_dir = candidate
    print(f"Found {len(whl_files)} wheels in {candidate}", flush=True)
    break

if wheel_dir is None:
    # Nothing usable: list what IS mounted so the error message is actionable.
    inputs = []
    if os.path.exists("/kaggle/input"):
        inputs = os.listdir("/kaggle/input")
    raise RuntimeError(f"No wheel dir found! Available inputs: {inputs}")

print(f"Installing from {wheel_dir}...", flush=True)

# Install vLLM without touching torch (keep Kaggle's): --no-deps keeps pip
# from resolving vLLM's own requirements.
vllm_install = subprocess.run([
    sys.executable, "-m", "pip", "install",
    "--no-index", f"--find-links={wheel_dir}",
    "--no-deps", "vllm"
], timeout=120)
if vllm_install.returncode != 0:
    # Surface the failure here; the import test below still decides whether
    # the notebook can continue.
    print(f"WARNING: pip install vllm exited with code {vllm_install.returncode}", flush=True)

# Install dependencies (skip torch, numpy - use Kaggle's)
deps = ["transformers", "accelerate", "safetensors", "tokenizers",
        "sentencepiece", "huggingface_hub", "pydantic", "msgspec",
        "cloudpickle", "einops", "filelock", "regex", "tqdm",
        "packaging", "typing_extensions", "jinja2", "triton",
        "xgrammar", "compressed_tensors", "outlines_core", "lark"]

# Each dependency is installed best-effort: a missing wheel or a slow install
# must not abort the cell, but it is reported instead of being discarded.
failed_deps = []
for pkg in deps:
    try:
        proc = subprocess.run([
            sys.executable, "-m", "pip", "install",
            "--no-index", f"--find-links={wheel_dir}",
            "--no-deps", "--quiet", pkg
        ], timeout=60, capture_output=True)
    except subprocess.TimeoutExpired:
        failed_deps.append(pkg)
        continue
    if proc.returncode != 0:
        failed_deps.append(pkg)

if failed_deps:
    print(f"WARNING: not installed from wheels: {', '.join(failed_deps)}", flush=True)

print("Install done", flush=True)

# Test import: fails loudly here if the vLLM install above did not work.
from vllm import LLM, SamplingParams
print("vLLM imported OK", flush=True)
print("CELL1 DONE", flush=True)

In [None]:
# =============================================================================
# CELL 2: IMPORTS + CONSTANTS
# =============================================================================
import time
import math
import re
import gc
import statistics
from typing import Optional, List, Dict, Tuple
from collections import Counter, defaultdict
import numpy as np
import polars as pl

# Wall-clock budget: everything time-gated in later cells compares against
# CUTOFF_TIME, which is fixed once when this cell runs.
PROBLEMS_EXPECTED = 50
TOTAL_BUDGET = 4 * 3600 + 50 * 60  # 4h50m, in seconds
START_TIME = time.time()
CUTOFF_TIME = START_TIME + TOTAL_BUDGET

print(f"Budget: {TOTAL_BUDGET // 3600}h {TOTAL_BUDGET % 3600 // 60}m", flush=True)

In [None]:
# =============================================================================
# CELL 3: MODEL PATHS + LOAD/UNLOAD FUNCTIONS
# =============================================================================
print("CELL3: SETTING UP MODELS", flush=True)

# Candidate mount points per model; find_model() probes each list in order.

# Model 1: DeepSeek-R1-70B (primary reasoning)
DEEPSEEK_PATHS = ["/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-llama-70b/1"]

# Model 2: Qwen-72B-Math (your fine-tuned model)
QWEN_PATHS = ["/kaggle/input/qwen-72b-math-nf4"]

# Model 3: DeepSeek-Coder-V2-Lite (verifier)
CODER_PATHS = ["/kaggle/input/deepseek-coder-v2-lite-nf4/deepseek-coder-v2-lite-nf4"]

def find_model(paths):
    """Return the first directory under *paths* that holds a ``config.json``.

    Each candidate is checked directly first; if it has no ``config.json`` of
    its own, its subtree is searched recursively and the directory of the
    first hit is returned. Returns ``None`` when no candidate qualifies.
    """
    for root in paths:
        if not os.path.exists(root):
            continue
        if os.path.exists(os.path.join(root, "config.json")):
            return root
        nested = globmod.glob(f"{root}/**/config.json", recursive=True)
        if nested:
            return os.path.dirname(nested[0])
    return None

# Resolve each candidate list to a concrete directory (None when absent).
DEEPSEEK_PATH, QWEN_PATH, CODER_PATH = (
    find_model(DEEPSEEK_PATHS),
    find_model(QWEN_PATHS),
    find_model(CODER_PATHS),
)

print(
    f"DeepSeek: {DEEPSEEK_PATH}",
    f"Qwen: {QWEN_PATH}",
    f"Coder: {CODER_PATH}",
    sep="\n",
    flush=True,
)

# Exactly one model is tracked as loaded at a time; the load_*/unload helpers
# below rebind these two globals together.
CURRENT_MODEL = CURRENT_MODEL_NAME = None

def unload_model():
    """Drop the currently tracked model, if any, and ask CUDA to free its cache.

    Does nothing when no model is loaded. Otherwise clears both tracking
    globals, runs the garbage collector, empties the torch CUDA cache and
    sleeps two seconds.
    """
    global CURRENT_MODEL, CURRENT_MODEL_NAME
    if CURRENT_MODEL is None:
        return

    print(f"  Unloading {CURRENT_MODEL_NAME}...", flush=True)
    del CURRENT_MODEL
    CURRENT_MODEL, CURRENT_MODEL_NAME = None, None
    # NOTE(review): whether this fully releases memory held by a vLLM engine
    # cannot be verified from this file - confirm on the target GPU.
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(2)  # grace period for CUDA to hand the memory back

def load_deepseek():
    """Make DeepSeek the active model, replacing whatever was loaded before.

    Returns the vLLM ``LLM`` engine and records it in ``CURRENT_MODEL`` under
    the name ``"deepseek"``.

    Raises:
        RuntimeError: if no DeepSeek directory was resolved.
    """
    global CURRENT_MODEL, CURRENT_MODEL_NAME
    unload_model()

    if DEEPSEEK_PATH is None:
        raise RuntimeError("DeepSeek model not found!")

    print(f"  Loading DeepSeek from {DEEPSEEK_PATH}...", flush=True)
    engine_kwargs = dict(
        model=DEEPSEEK_PATH,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.90,
        trust_remote_code=True,
        max_model_len=8192,
        enforce_eager=True,
        dtype="bfloat16",
    )
    CURRENT_MODEL = LLM(**engine_kwargs)
    CURRENT_MODEL_NAME = "deepseek"
    print("  DeepSeek loaded!", flush=True)
    return CURRENT_MODEL

def load_qwen():
    """Make Qwen the active model, replacing whatever was loaded before.

    The previous model is unloaded even when Qwen turns out to be missing; in
    that case a message is printed and ``None`` is returned. Otherwise the
    vLLM ``LLM`` engine is returned and tracked under the name ``"qwen"``.
    """
    global CURRENT_MODEL, CURRENT_MODEL_NAME
    unload_model()

    if QWEN_PATH is None:
        print("  Qwen model not found, skipping", flush=True)
        return None

    print(f"  Loading Qwen from {QWEN_PATH}...", flush=True)
    engine_kwargs = dict(
        model=QWEN_PATH,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.90,
        trust_remote_code=True,
        max_model_len=8192,
        enforce_eager=True,
        dtype="bfloat16",
    )
    CURRENT_MODEL = LLM(**engine_kwargs)
    CURRENT_MODEL_NAME = "qwen"
    print("  Qwen loaded!", flush=True)
    return CURRENT_MODEL

def load_coder():
    """Make the Coder verifier the active model via transformers (4-bit NF4).

    Unlike the two vLLM loaders, ``CURRENT_MODEL`` becomes a
    ``(model, tokenizer)`` tuple, tracked under the name ``"coder"``. The
    previous model is unloaded first; if no Coder directory was resolved a
    message is printed and ``None`` is returned.
    """
    global CURRENT_MODEL, CURRENT_MODEL_NAME
    unload_model()

    if CODER_PATH is None:
        print("  Coder model not found, skipping", flush=True)
        return None

    print(f"  Loading Coder from {CODER_PATH}...", flush=True)
    # Imported lazily: only needed when Pass 3 actually runs.
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    coder_tokenizer = AutoTokenizer.from_pretrained(CODER_PATH, trust_remote_code=True)
    coder_model = AutoModelForCausalLM.from_pretrained(
        CODER_PATH,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
    )
    CURRENT_MODEL = (coder_model, coder_tokenizer)
    CURRENT_MODEL_NAME = "coder"
    print("  Coder loaded!", flush=True)
    return CURRENT_MODEL

print("Model functions ready", flush=True)

In [None]:
# =============================================================================
# CELL 4: PROMPTS (DeepSeek-R1 format)
# =============================================================================

# DeepSeek-R1 uses <think> tags for reasoning
# Instruction texts keyed by solving strategy. generate_deepseek() looks one up
# by key and format_deepseek_prompt() places it ahead of the problem statement
# in the user turn. The strings are sent to the model verbatim.
# NOTE(review): 'verification' and 'casework' omit the "Answer: 0-99999." hint
# that the other three prompts carry - confirm whether that is intentional.
PROMPTS = {
    'algebraic': """Solve this mathematics olympiad problem using algebraic manipulation.
Think step-by-step in <think> tags. Define variables. Check all cases.
Verify by substitution. Final integer answer in \\boxed{}. Answer: 0-99999.""",

    'backwards': """Solve by working backwards from what the answer must look like.
In <think> tags, analyze constraints. What form must the answer take?
Derive from goal to given. Final integer in \\boxed{}. Answer: 0-99999.""",

    'verification': """Solve with rigorous verification.
In <think> tags, solve then VERIFY: substitute back into all constraints.
If verification fails, try different approach. Final integer in \\boxed{}.""",

    'computational': """Solve by writing Python code.
In <think> tags, plan approach. Write clean Python with sympy.
Print intermediate results. Final integer in \\boxed{}. Answer: 0-99999.""",

    'casework': """Solve by systematic case analysis.
In <think> tags, enumerate ALL cases exhaustively.
Compute each case's contribution. Sum all. Final integer in \\boxed{}.""",
}

# Sampling temperatures; sample_deepseek() crosses these with every prompt key.
TEMPERATURES = [1.0, 0.85, 0.7]
# Strings passed as `stop` to vLLM's SamplingParams in generate_deepseek().
STOP_TOKENS = ["<｜end▁of▁sentence｜>", "<|endoftext|>", "</s>"]

print(f"Prompts: {list(PROMPTS.keys())}", flush=True)
print(f"Temperatures: {TEMPERATURES}", flush=True)

In [None]:
# =============================================================================
# CELL 5: ANSWER EXTRACTION
# =============================================================================

def _parse_answer(digits: str) -> Optional[int]:
    """Convert a captured digit run to an int in 0..99999, else ``None``."""
    significant = digits.lstrip("0")
    # More than five significant digits is out of range; rejecting on length
    # also avoids int() raising on absurdly long digit runs.
    if len(significant) > 5:
        return None
    value = int(significant or "0")
    return value if 0 <= value <= 99999 else None


def extract_boxed(text: str) -> Optional[int]:
    """Extract the final integer answer from model output.

    The LAST ``\\boxed{N}`` occurrence in *text* is used, whether or not it
    has the leading backslash or whitespace around the braces/digits. (The
    previous implementation tried the strict spelling first, so an early
    ``\\boxed{5}`` would beat a later ``\\boxed{ 7 }``.)

    If there is no boxed integer, or the last one is outside 0..99999, the
    last "answer is/=/: N" phrase within the final 500 characters is tried.

    Returns:
        The extracted integer, or ``None`` when nothing in range is found.
    """
    boxed = re.findall(r'\\?boxed\s*\{\s*(\d+)\s*\}', text)
    if boxed:
        val = _parse_answer(boxed[-1])
        if val is not None:
            return val

    # Fallback
    fallback = re.findall(r'answer\s*(?:is|=|:)\s*(\d+)', text[-500:], re.I)
    if fallback:
        val = _parse_answer(fallback[-1])
        if val is not None:
            return val
    return None

print("Extraction ready", flush=True)

In [None]:
# =============================================================================
# CELL 6: VALUE CLUSTERING + VOTING
# =============================================================================

def value_clustering(answers: List[int], threshold: float = 0.05) -> Tuple[int, float]:
    """Group answers that are relatively close and pick from the biggest group.

    Each answer joins the first existing cluster whose key ``c`` satisfies
    ``|ans - c| / max(|c|, 1) < threshold``; otherwise it starts a new cluster
    keyed by itself. The representative of the largest cluster is its low
    median, so the returned value is always one that was actually sampled
    (a plain median of an even-sized cluster would average two answers and
    could return an integer that no sample produced).

    Args:
        answers: sampled integer answers.
        threshold: maximum relative distance to a cluster key.

    Returns:
        ``(answer, confidence)`` where confidence is the largest cluster's
        share of all answers. ``(0, 0.0)`` for empty input and
        ``(answers[0], 0.5)`` for a single answer.
    """
    if not answers:
        return 0, 0.0
    if len(answers) == 1:
        return answers[0], 0.5

    clusters = defaultdict(list)
    for ans in answers:
        for center in clusters:
            if abs(ans - center) / max(abs(center), 1) < threshold:
                clusters[center].append(ans)
                break
        else:
            # No existing cluster is close enough: start a new one.
            clusters[ans].append(ans)

    # Ties between equally large clusters go to the one created first.
    best_cluster = max(clusters.values(), key=len)
    center = statistics.median_low(best_cluster)
    confidence = len(best_cluster) / len(answers)

    return center, confidence

def log_weighted_vote(answers: List[int]) -> Tuple[int, float]:
    """Vote with each distinct value weighted by ``log(1.25 + |value|) * count``.

    The log factor gives larger values more weight per vote, which penalizes
    trivially small answers. Returns ``(winner, winner_weight / total_weight)``.
    Empty input yields the fallback ``(12453, 0.1)``.
    """
    if not answers:
        return 12453, 0.1

    scores = {
        value: math.log(1.25 + abs(value)) * votes
        for value, votes in Counter(answers).items()
    }
    winner = max(scores, key=scores.get)
    score_sum = sum(scores.values())
    if score_sum > 0:
        share = scores[winner] / score_sum
    else:
        share = 0.5
    return winner, share

def select_answer(answers: List[int]) -> Tuple[int, float]:
    """Pick one answer from a list of samples.

    Clustering wins when its largest cluster covers more than 70% of the
    samples; otherwise the log-weighted vote decides. Empty input returns the
    fallback ``(12453, 0.1)`` and a single sample returns ``(sample, 0.5)``.
    """
    if not answers:
        return 12453, 0.1
    if len(answers) == 1:
        return answers[0], 0.5

    clustered = value_clustering(answers)
    voted = log_weighted_vote(answers)

    # Index 1 of each pair is the confidence.
    return clustered if clustered[1] > 0.7 else voted

print("Selection ready", flush=True)

In [None]:
# =============================================================================
# CELL 7: GENERATION FUNCTIONS (ALL THREE MODELS)
# =============================================================================

def format_deepseek_prompt(question: str, system: str) -> str:
    """Build the raw DeepSeek-R1 prompt string.

    The instruction text and the problem share the user turn, and the
    assistant turn is pre-opened with a ``<think>`` tag.
    """
    user_turn = f"{system}\n\n{question}"
    return "<｜begin▁of▁sentence｜><｜User｜>" + user_turn + "<｜Assistant｜><think>\n"

def format_qwen_prompt(question: str) -> str:
    """Build the raw ``<|im_start|>``-style chat prompt for Qwen-Math.

    Contains a fixed system message, the problem plus a boxed-answer
    instruction as the user turn, and an opened assistant turn.
    """
    lines = [
        "<|im_start|>system",
        "You are a mathematics olympiad expert. Solve problems step-by-step with rigorous reasoning.",
        "<|im_end|>",
        "<|im_start|>user",
        f"{question}",
        "",
        "Provide your final answer as an integer in \\boxed{}.",
        "<|im_end|>",
        "<|im_start|>assistant",
        "",  # keeps the trailing newline after the assistant header
    ]
    return "\n".join(lines)

def format_coder_prompt(question: str, answers_to_verify: List[int]) -> str:
    """Build the plain-text verification prompt for the Coder model.

    Lists the candidate answers comma-separated and asks for Python code that
    checks them, followed by the chosen answer in ``\\boxed{}``.
    """
    candidates = ", ".join(str(a) for a in answers_to_verify)
    return (
        "You are verifying math competition answers with Python code.\n"
        "\n"
        f"Problem: {question}\n"
        "\n"
        f"Candidate answers to verify: {candidates}\n"
        "\n"
        "Write Python code to check which answer(s) satisfy the problem constraints.\n"
        "Then state which answer is correct in \\boxed{}.\n"
    )

def generate_deepseek(question: str, prompt_type: str, temp: float) -> Optional[int]:
    """Draw one DeepSeek sample and return its extracted answer.

    Returns ``None`` when DeepSeek is not the active model, when the time
    budget is spent, when generation raises, or when no answer can be
    extracted. ``prompt_type`` must be a key of ``PROMPTS``.
    """
    deepseek_active = CURRENT_MODEL is not None and CURRENT_MODEL_NAME == "deepseek"
    if not deepseek_active or time.time() >= CUTOFF_TIME:
        return None

    prompt = format_deepseek_prompt(question, PROMPTS[prompt_type])
    sampling = SamplingParams(
        temperature=temp,
        top_p=0.95,
        max_tokens=6144,
        stop=STOP_TOKENS,
    )

    try:
        completion = CURRENT_MODEL.generate([prompt], sampling_params=sampling)[0].outputs[0].text
        # Only look after the last closing think tag; without one the split
        # leaves the whole completion in place.
        return extract_boxed(completion.split("</think>")[-1])
    except Exception as e:
        print(f"    DeepSeek error: {e}", flush=True)
        return None

def generate_qwen(question: str, temp: float) -> Optional[int]:
    """Draw one Qwen sample and return its extracted answer.

    Returns ``None`` when Qwen is not the active model, when the time budget
    is spent, when generation raises, or when no answer can be extracted.
    """
    qwen_active = CURRENT_MODEL is not None and CURRENT_MODEL_NAME == "qwen"
    if not qwen_active or time.time() >= CUTOFF_TIME:
        return None

    sampling = SamplingParams(
        temperature=temp,
        top_p=0.95,
        max_tokens=6144,
        stop=["<|im_end|>", "<|endoftext|>"],
    )

    try:
        batch = CURRENT_MODEL.generate([format_qwen_prompt(question)], sampling_params=sampling)
        return extract_boxed(batch[0].outputs[0].text)
    except Exception as e:
        print(f"    Qwen error: {e}", flush=True)
        return None

def generate_coder_verify(question: str, answers: List[int]) -> Optional[int]:
    """Ask the Coder model to pick among candidate answers.

    Args:
        question: the problem statement.
        answers: candidate answers to put in the verification prompt.

    Returns:
        The integer extracted from the model's own output, or ``None`` when
        the Coder is not the active model, the time budget is spent,
        generation raises, or nothing can be extracted. The value is not
        checked against ``answers``.
    """
    if CURRENT_MODEL is None or CURRENT_MODEL_NAME != "coder":
        return None
    if time.time() >= CUTOFF_TIME:
        return None

    # For the coder, CURRENT_MODEL is the (model, tokenizer) pair set by load_coder().
    model, tokenizer = CURRENT_MODEL
    prompt = format_coder_prompt(question, answers)

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=4096,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
        # generate() returns the prompt tokens followed by the continuation.
        # Decode only the continuation so that numbers in the echoed problem
        # statement cannot be mistaken for the verifier's verdict.
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        return extract_boxed(response)
    except Exception as e:
        print(f"    Coder error: {e}", flush=True)
        return None

def sample_deepseek(question: str, max_samples: int = 15) -> Tuple[List[int], float]:
    """Collect DeepSeek answers over the temperature x prompt grid.

    The grid is walked temperature-major and cut to ``max_samples`` entries,
    so a ``max_samples`` larger than ``len(TEMPERATURES) * len(PROMPTS)`` has
    no further effect. Sampling stops early on timeout, or once at least five
    answers are in and one value has four or more votes.

    Returns:
        ``(answers, confidence)`` with the confidence from ``select_answer``;
        ``([], 0.0)`` when no sample yielded an answer.
    """
    schedule = [(ptype, temp) for temp in TEMPERATURES for ptype in PROMPTS.keys()][:max_samples]

    collected = []
    for ptype, temp in schedule:
        if time.time() >= CUTOFF_TIME:
            break
        candidate = generate_deepseek(question, ptype, temp)
        if candidate is not None:
            collected.append(candidate)

        # Early consensus
        if len(collected) >= 5 and Counter(collected).most_common(1)[0][1] >= 4:
            break

    if not collected:
        return [], 0.0
    _, conf = select_answer(collected)
    return collected, conf

def sample_qwen(question: str, num_samples: int = 5) -> Tuple[List[int], float]:
    """Collect up to ``num_samples`` Qwen answers, one per temperature.

    Only five temperatures are defined, so at most five samples are drawn.
    Returns ``(answers, confidence)`` with the confidence from
    ``select_answer``, or ``([], 0.0)`` when nothing was extracted.
    """
    schedule = [0.7, 0.85, 1.0, 0.9, 0.8][:num_samples]

    collected = []
    for temp in schedule:
        if time.time() >= CUTOFF_TIME:
            break
        candidate = generate_qwen(question, temp)
        if candidate is not None:
            collected.append(candidate)

    if not collected:
        return [], 0.0
    _, conf = select_answer(collected)
    return collected, conf

print("Generation functions ready (DeepSeek, Qwen, Coder)", flush=True)

In [None]:
# =============================================================================
# CELL 8: THREE-PASS SOLVER
# =============================================================================

# Number of problems handed to solve_single() so far.
PROBLEM_COUNT: int = 0
# Per-problem record shared by all passes:
# {qid: {"question": str, "deepseek": (ans, conf, [answers]), "qwen": ..., "coder": ..., "final": int}}
RESULTS: Dict[str, dict] = {}

def solve_pass1_deepseek(qid: str, question: str, max_samples: int) -> Tuple[int, float, List[int]]:
    """Pass 1: sample DeepSeek and reduce the samples to one answer.

    ``qid`` is accepted for symmetry with the other passes but not used.
    Returns ``(answer, confidence, all_answers)``; when no sample produced an
    answer the result is ``(None, 0.0, [])``.
    """
    answers, _ = sample_deepseek(question, max_samples)
    if not answers:
        return None, 0.0, []
    chosen, conf = select_answer(answers)
    return chosen, conf, answers

def solve_pass2_qwen(qid: str, question: str, num_samples: int = 5) -> Tuple[int, float, List[int]]:
    """Pass 2: sample Qwen and reduce the samples to one answer.

    ``qid`` is accepted for symmetry with the other passes but not used.
    Returns ``(answer, confidence, all_answers)``; when no sample produced an
    answer the result is ``(None, 0.0, [])``.
    """
    answers, _ = sample_qwen(question, num_samples)
    if not answers:
        return None, 0.0, []
    chosen, conf = select_answer(answers)
    return chosen, conf, answers

def solve_pass3_verify(qid: str, question: str, candidates: List[int]) -> Optional[int]:
    """Pass 3: settle a disagreement between candidate answers.

    No candidates gives ``None``; candidates that all agree are returned
    as-is without calling the model. Otherwise the distinct candidates are
    handed to the Coder verifier and its verdict (possibly ``None``) is
    returned. ``qid`` is not used.
    """
    if not candidates:
        return None
    distinct = set(candidates)
    if len(distinct) == 1:
        return candidates[0]
    return generate_coder_verify(question, list(distinct))

def combine_results(qid: str) -> int:
    """Merge the per-pass results stored for ``qid`` into one final answer.

    Precedence, first match wins:
      1. a Coder verdict, if present;
      2. the shared answer when DeepSeek and Qwen agree;
      3. DeepSeek's answer when Qwen has none (12453 if DeepSeek has none either);
      4. DeepSeek's answer when its confidence exceeds 0.7;
      5. Qwen's answer when its confidence exceeds 0.7;
      6. ``select_answer`` over the pooled samples of both models;
      7. whichever single answer exists, else the fallback 12453.
    """
    record = RESULTS.get(qid, {})
    missing = (None, 0, [])

    ds_ans, ds_conf, ds_all = record.get("deepseek", missing)
    qw_ans, qw_conf, qw_all = record.get("qwen", missing)
    coder_ans = record.get("coder")

    if coder_ans is not None:
        return coder_ans

    both_answered = ds_ans is not None and qw_ans is not None
    if both_answered and ds_ans == qw_ans:
        return ds_ans

    if qw_ans is None:
        return 12453 if ds_ans is None else ds_ans

    if ds_conf > 0.7:
        return ds_ans
    if qw_conf > 0.7:
        return qw_ans

    pooled = ds_all + qw_all
    if pooled:
        return select_answer(pooled)[0]

    for candidate in (ds_ans, qw_ans):
        if candidate is not None:
            return candidate
    return 12453

def solve_single(qid: str, question: str) -> int:
    """Solve one problem with DeepSeek (Pass 1) and record the result.

    The sample count is chosen from the time left divided by the number of
    problems still expected. The outcome is stored in ``RESULTS[qid]`` with
    empty Qwen/Coder slots for the later passes to fill.

    Args:
        qid: problem identifier, used as the ``RESULTS`` key.
        question: problem statement.

    Returns:
        DeepSeek's answer, or the fallback 12453 when no answer was produced.
        A legitimate answer of 0 is returned as 0 (it was previously replaced
        by the fallback because 0 is falsy).
    """
    global PROBLEM_COUNT
    PROBLEM_COUNT += 1

    time_left = CUTOFF_TIME - time.time()
    # +1 because PROBLEM_COUNT already includes the current problem.
    problems_left = max(1, PROBLEMS_EXPECTED - PROBLEM_COUNT + 1)
    time_per = time_left / problems_left

    # Adaptive samples
    # NOTE(review): sample_deepseek() truncates a 3-temperature x 5-prompt
    # grid, so any value above 15 behaves like 15.
    if time_per > 300:
        max_samples = 20
    elif time_per > 180:
        max_samples = 15
    elif time_per > 60:
        max_samples = 10
    else:
        max_samples = 5

    print(f"  Time/problem: {time_per:.0f}s, samples: {max_samples}", flush=True)

    # Pass 1: DeepSeek
    ans, conf, answers = solve_pass1_deepseek(qid, question, max_samples)

    # Compare against None explicitly: 0 is a valid answer.
    final = ans if ans is not None else 12453

    # Store result
    RESULTS[qid] = {
        "question": question,
        "deepseek": (ans, conf, answers),
        "qwen": (None, 0, []),
        "coder": None,
        "final": final,
    }

    print(f"  DeepSeek: {ans} (conf: {conf:.2f}, n={len(answers)})", flush=True)

    gc.collect()
    torch.cuda.empty_cache()

    return final

def run_pass2():
    """Pass 2: re-solve with Qwen every problem DeepSeek was unsure about.

    A problem qualifies when DeepSeek's confidence is below 0.70 or it has no
    answer. Skips entirely when Qwen is unavailable or nothing qualifies.
    Stops two minutes before the cutoff, then unloads the model.
    """
    if QWEN_PATH is None:
        print("Qwen not available, skipping Pass 2", flush=True)
        return

    # Find problems needing Pass 2
    needs_pass2 = [
        qid for qid, rec in RESULTS.items()
        if rec["deepseek"][1] < 0.70 or rec["deepseek"][0] is None
    ]
    if not needs_pass2:
        print("All problems high confidence, skipping Pass 2", flush=True)
        return

    total = len(needs_pass2)
    rule = '=' * 60
    print(f"\n{rule}", flush=True)
    print(f"PASS 2: Qwen on {total} low-confidence problems", flush=True)
    print(f"{rule}", flush=True)

    load_qwen()

    for position, qid in enumerate(needs_pass2, start=1):
        if time.time() >= CUTOFF_TIME - 120:  # Reserve 2min for finalization
            print("Time limit approaching, stopping Pass 2", flush=True)
            break

        print(f"\nPass2 [{position}/{total}] {qid}", flush=True)

        outcome = solve_pass2_qwen(qid, RESULTS[qid]["question"], num_samples=5)
        RESULTS[qid]["qwen"] = outcome

        print(f"  Qwen: {outcome[0]} (conf: {outcome[1]:.2f})", flush=True)

    unload_model()

def run_pass3():
    """Pass 3: let the Coder model arbitrate DeepSeek/Qwen disagreements.

    Only problems where both models answered and the answers differ are
    considered. Skips entirely when the Coder is unavailable or there is
    nothing to arbitrate. Stops one minute before the cutoff, then unloads
    the model.
    """
    if CODER_PATH is None:
        print("Coder not available, skipping Pass 3", flush=True)
        return

    # Find disagreements
    disagreements = [
        qid for qid, rec in RESULTS.items()
        if rec["deepseek"][0] is not None
        and rec["qwen"][0] is not None
        and rec["deepseek"][0] != rec["qwen"][0]
    ]
    if not disagreements:
        print("No disagreements, skipping Pass 3", flush=True)
        return

    total = len(disagreements)
    rule = '=' * 60
    print(f"\n{rule}", flush=True)
    print(f"PASS 3: Coder verification on {total} disagreements", flush=True)
    print(f"{rule}", flush=True)

    load_coder()

    for position, qid in enumerate(disagreements, start=1):
        if time.time() >= CUTOFF_TIME - 60:
            print("Time limit approaching, stopping Pass 3", flush=True)
            break

        record = RESULTS[qid]
        ds_ans = record["deepseek"][0]
        qw_ans = record["qwen"][0]

        print(f"\nPass3 [{position}/{total}] {qid}: DS={ds_ans} vs QW={qw_ans}", flush=True)

        verified = solve_pass3_verify(qid, record["question"], [ds_ans, qw_ans])
        RESULTS[qid]["coder"] = verified

        print(f"  Coder verified: {verified}", flush=True)

    unload_model()

def finalize_results():
    """Store ``combine_results`` as each problem's ``"final"`` and log a summary line."""
    rule = '=' * 60
    print(f"\n{rule}", flush=True)
    print("FINALIZING RESULTS", flush=True)
    print(f"{rule}", flush=True)

    for qid, record in RESULTS.items():
        record["final"] = combine_results(qid)
        ds_ans, ds_conf, _ = record["deepseek"]
        qw_ans, qw_conf, _ = record["qwen"]
        print(
            f"{qid}: DS={ds_ans}({ds_conf:.2f}) QW={qw_ans}({qw_conf:.2f}) "
            f"C={record['coder']} -> {record['final']}",
            flush=True,
        )

print("Three-pass solver ready", flush=True)

In [None]:
# =============================================================================
# CELL 9: KAGGLE API (Three-Pass)
# =============================================================================

# Guard so run_multi_pass() does its work only once.
PASS1_COMPLETE: bool = False
# Every (qid, question) pair seen by predict(), in arrival order.
ALL_PROBLEMS: List[Tuple[str, str]] = []

def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame:
    """Per-problem callback: answer one problem with the Pass 1 solver.

    Reads the first element of each series, remembers the problem in
    ``ALL_PROBLEMS``, and returns a frame with columns ``id`` and ``answer``
    holding the DeepSeek (Pass 1) answer.
    """
    qid, question = id_.item(0), problem.item(0)
    minutes_left = (CUTOFF_TIME - time.time()) / 60
    rule = '=' * 60

    print(f"\n{rule}", flush=True)
    print(f"Problem {PROBLEM_COUNT + 1} | {qid} | {minutes_left:.1f}m left", flush=True)
    print(f"Q: {question[:100]}...", flush=True)

    # Store problem for later passes
    ALL_PROBLEMS.append((qid, question))

    # Pass 1: Solve with DeepSeek
    answer = solve_single(qid, question)

    print(f"ANSWER (Pass1): {answer}", flush=True)
    print(f"{rule}", flush=True)

    return pl.DataFrame({"id": id_, "answer": answer})

def run_multi_pass():
    """Run Pass 2, Pass 3 and finalization once, then print the final answers.

    Later calls return immediately. The final answers are only written to
    ``RESULTS`` and printed; this function does not return them or change
    what ``predict`` already returned.
    """
    global PASS1_COMPLETE

    if PASS1_COMPLETE:
        return
    PASS1_COMPLETE = True

    hashes = '#' * 60
    print(f"\n{hashes}", flush=True)
    print("PASS 1 COMPLETE - Starting multi-pass refinement", flush=True)
    print(f"{hashes}", flush=True)

    # Qwen on low confidence, then Coder on disagreements, then merge.
    for stage in (run_pass2, run_pass3, finalize_results):
        stage()

    print(f"\n{'=' * 60}", flush=True)
    print("FINAL ANSWERS:", flush=True)
    for qid, record in RESULTS.items():
        print(f"  {qid}: {record['final']}", flush=True)

print("Kaggle API ready (three-pass)", flush=True)

In [None]:
# =============================================================================
# CELL 10: RUN (Three-Pass Strategy)
# =============================================================================

print("="*60, flush=True)
print("RYANAIMO - AIMO3 (Three-Model Strategy)", flush=True)
print("="*60, flush=True)
print(f"Pass 1: DeepSeek-R1-70B @ {DEEPSEEK_PATH}", flush=True)
print(f"Pass 2: Qwen-72B-Math @ {QWEN_PATH}", flush=True)
print(f"Pass 3: DeepSeek-Coder @ {CODER_PATH}", flush=True)
print("="*60, flush=True)
print("Strategy:", flush=True)
print("  1. DeepSeek solves all 50 problems (5 prompts x 3 temps)", flush=True)
print("  2. Qwen re-tackles conf < 70%", flush=True)
print("  3. Coder verifies disagreements", flush=True)
print("="*60, flush=True)

# Load primary model (DeepSeek) for Pass 1
print("\nLoading DeepSeek for Pass 1...", flush=True)
load_deepseek()


def _run_manual_test():
    """Push one hard-coded sample problem through predict() and the later passes."""
    sample = pl.DataFrame({
        "id": ["test"],
        "problem": ["Find the remainder when 2^100 is divided by 127."]
    })
    result = predict(sample["id"], sample["problem"])
    print(result)
    run_multi_pass()


# Only the import itself is guarded. Wrapping the whole serving flow in the
# try would turn any ImportError raised later (for example by the lazy
# transformers import in load_coder) into a spurious "validation mode" run.
try:
    import kaggle_evaluation.aimo_3_inference_server
except ImportError:
    kaggle_available = False
else:
    kaggle_available = True

if not kaggle_available:
    print(">>> VALIDATION MODE (no kaggle_evaluation) <<<", flush=True)
    _run_manual_test()
else:
    server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)

    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        print(">>> COMPETITION MODE <<<", flush=True)
        server.serve()
        # After all problems served, run Pass 2 and 3
        run_multi_pass()
    else:
        print(">>> LOCAL TEST <<<", flush=True)
        test_file = '/kaggle/input/ai-mathematical-olympiad-progress-prize-3/test.csv'
        if os.path.exists(test_file):
            server.run_local_gateway((test_file,))
            run_multi_pass()
        else:
            # Manual test with single problem
            print("No test file, running manual test...", flush=True)
            _run_manual_test()

print(f"\nTotal time: {(time.time() - START_TIME)/60:.1f}m", flush=True)
print("Done!", flush=True)