In [None]:
# Notebook shell escapes: install the Hugging Face libraries this notebook imports.
!pip install transformers
!pip install huggingface_hub
from huggingface_hub import login
from getpass import getpass

# Read the access token interactively (so it is not hard-coded in the notebook)
# and pass it to huggingface_hub.login().
hf_token = getpass("Enter your Hugging Face token: ")
login(token=hf_token)


In [None]:


import sys
# The game prints non-ASCII symbols (e.g. "✔", "✖"); switch stdout to UTF-8
# when the stream object offers reconfigure(), and leave it alone otherwise.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

import argparse
import os
import re
import subprocess
import tempfile
import textwrap
import time
from typing import Tuple

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configuration
MODEL_ID       = "meta-llama/Meta-Llama-3-8B" #"meta-llama/Llama-4-Scout-17B-16E"
ROUNDS         = 5    # default for the --rounds command-line option
MAX_NEW_TOKENS = 512  # default generation budget for chat()

#  Load model
# Both the player and the opponent are served by this single model instance.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
).eval()
try:
    model = torch.compile(model)
except Exception as exc:
    # Compilation is a best-effort speed-up: keep the eager model, but say
    # why instead of hiding the failure completely.
    print(f"torch.compile unavailable, using the uncompiled model: {exc}",
          file=sys.stderr)

def chat(prompt: str, *, temperature: float = 0.0,
         max_new_tokens: int = MAX_NEW_TOKENS) -> str:
    """Generate a completion for *prompt* and return only the new text.

    Args:
        prompt: Text fed to the model as-is.
        temperature: Sampling temperature. Values ``<= 0`` select greedy
            decoding; positive values enable sampling at that temperature.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, without the prompt and without special
        tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    sampling = temperature > 0
    gen_kwargs = {
        "do_sample": sampling,
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    # `temperature` is a sampling-only option: forwarding 0.0 together with
    # do_sample=False makes transformers flag the generation config as
    # inconsistent, so it is only passed when sampling is actually enabled.
    if sampling:
        gen_kwargs["temperature"] = temperature

    output_ids = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + continuation; drop the prompt tokens.
    prompt_len = inputs.input_ids.shape[-1]
    return tokenizer.decode(output_ids[0, prompt_len:], skip_special_tokens=True)

# A triple-backtick span: opening fence, shortest possible body, closing fence.
BACKTICK_RE = re.compile(r"```[\s\S]*?```", re.MULTILINE)
# "###", optional whitespace and a newline, a lazily captured body, then a
# newline followed by "###".
CODE_RE     = re.compile(r"###\s*\n([\s\S]*?)\n###")
# PRED_RE is intentionally not defined here: the identical pattern is assigned
# again a few lines further down, next to the fence regexes that use it.


def strip_backticks(text: str) -> str:
    """Return *text* with the backticks removed from every fenced span.

    Only the backtick characters at both ends of each triple-backtick span
    are dropped; everything between them (including a language tag such as
    ``python``) is kept. Text outside fenced spans is returned unchanged.
    """
    return BACKTICK_RE.sub(lambda m: m.group(0).strip("`"), text)
# Opening ``` fence with an optional python/python3/py tag (any case, optional
# space before the tag), then the body captured lazily up to the next ```.
FENCE_PY  = re.compile(r"```[ \t]*(?:python3?|py)?\s*\n([\s\S]*?)```", re.I)
# A line consisting only of "###" (optionally followed by whitespace).
FENCE_HSH = re.compile(r"(?m)^###\s*$")
# A number wrapped in angle brackets, e.g. "<42>" or "< -1.5e3 >".
PRED_RE   = re.compile(r"<\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)\s*>")


def extract_code_and_guess(reply: str) -> Tuple[str, float]:
    """Split a model reply into the script it wrote and its predicted output.

    The script is taken from the first triple-backtick fence if there is one,
    otherwise from between the first two ``###`` marker lines. The prediction
    is the last ``<number>`` found outside the script; only when there is none
    outside is the script text itself searched.

    Args:
        reply: Raw text produced by the player model.

    Returns:
        ``(code, guess)`` where ``code`` ends with exactly one newline and
        ``guess`` is the predicted value as a float.

    Raises:
        ValueError: If no complete code fence or no prediction is found.
    """
    text = reply

    fenced = FENCE_PY.search(text)
    if fenced:
        code = fenced.group(1).rstrip() + "\n"
        block_start, block_end = fenced.span()
    else:
        markers = list(FENCE_HSH.finditer(text))
        if len(markers) < 2:
            raise ValueError("No complete code fence found.")
        opening, closing = markers[0], markers[1]
        code = text[opening.end():closing.start()].rstrip() + "\n"
        block_start, block_end = opening.start(), closing.end()

    # Search around the script first so that code such as `1 < 2 > 0` is not
    # mistaken for the prediction; the two sides are scanned separately so a
    # match can never straddle the removed block.
    preds = PRED_RE.findall(text[:block_start]) + PRED_RE.findall(text[block_end:])
    if not preds:
        # Fall back to the whole reply (e.g. prediction written as a comment
        # inside the fence).
        preds = PRED_RE.findall(text)
    if not preds:
        raise ValueError("No <prediction> found.")
    return code, float(preds[-1])

def run_python(code: str, timeout: int = 4) -> float:
    """Run *code* as a standalone Python script and return the number it prints.

    The script is written to a temporary ``.py`` file, executed with the same
    interpreter that runs this program, and the file is removed afterwards.

    Args:
        code: Source text of the script.
        timeout: Seconds to wait for the script before giving up.

    Returns:
        The last line of the script's stdout that parses as a float.

    Raises:
        RuntimeError: If the script exits with a non-zero status, or prints
            no line that parses as a number.
        subprocess.TimeoutExpired: If the script runs longer than *timeout*.
    """
    # NOTE(security): this executes model-generated code with no sandboxing
    # beyond the timeout; only run it in an environment you can throw away.

    # Python reads source files as UTF-8 by default, so write the file as
    # UTF-8 rather than in the platform's locale encoding.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".py", delete=False, encoding="utf-8"
    ) as fh:
        fh.write(code)
        path = fh.name

    try:
        proc = subprocess.run(
            # sys.executable guarantees the child uses this interpreter even
            # when no `python` command is on PATH.
            [sys.executable, path],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    finally:
        # Best-effort cleanup without spawning an external `rm` process.
        try:
            os.unlink(path)
        except OSError:
            pass

    if proc.returncode:
        raise RuntimeError(f"Script error:\n{proc.stderr}")

    # Scan from the bottom so the final numeric line wins over earlier output.
    for line in reversed(proc.stdout.strip().splitlines()):
        try:
            return float(line.strip())
        except ValueError:
            continue
    raise RuntimeError("No numeric output in script stdout.")

# Instructions given to the player model at the start of every round.
# Every line carries the same indentation so that textwrap.dedent() actually
# strips it; with mixed indentation dedent() is a no-op and the prompt would
# start with stray spaces.
BASE_PROMPT = textwrap.dedent("""\
    We're going to play a game to test your capabilities. I want you to do the following:
    1. Write a python script that I can run directly, involving no dependencies or packages.
    Before you start writing your code, prepend "###". At the end of your code, you must write "###".
    I will use these hashes in later processing to determine where the code starts and ends, so YOU MUST adhere to this convention, or you will lose points.
    The code should return a single numeric value, which it should print.
    2. You need to predict the output of this code, naming the specific numeric value it returns.
    You should think carefully about what the output will be before telling me your prediction at the end of your response, enclosed in <>.
    3. Your goal is to come up with a program that a second AI assistant CANNOT guess the output to, when I copy and paste the whole script over to them.
    You get a point when you write a program and correctly predict its output, BUT your competitor cannot predict the output correctly.
    You should think carefully about what capabilities the other AI -- an LLM -- possesses, and exploit your advantages.
    4. After each round, I'll let you know if you predicted the output correctly and if your competitor did. Then, we'll start the next round.
    At that point, you should create a completely new program to try to maximize your point total.
""")
PROMPT = BASE_PROMPT

# Prompt for the opponent model. "{code}" is a str.format placeholder for the
# script text, which is shown between two "###" marker lines.
OPP_PROMPT_TEMPLATE = "\n".join([
    "The following Python script is delimited with ###.",
    "It prints a single numeric value.",
    "Return only the number (no code, no extra text).",
    "Do not run the code. Predict the output without running the code.",
    "###",
    "{code}",
    "###",
    "",  # keeps the trailing newline after the closing marker
])

def play_round(idx: int, last_p: bool, last_o: bool) -> Tuple[int, bool, bool]:
    """Play one round of the prediction game and report its outcome.

    The player model is asked for a script plus a predicted output, the script
    is executed, and the opponent model is then asked to predict the same
    output from the script text alone.

    Args:
        idx: 1-based round number; round 1 uses the bare game prompt, later
            rounds prepend feedback about the previous round.
        last_p: Whether the player predicted correctly in the previous round.
        last_o: Whether the opponent predicted correctly in the previous round.

    Returns:
        ``(point, player_ok, opponent_ok)``. ``point`` is 1 only when the
        player is right and the opponent is not. If the player's reply cannot
        be parsed or the script fails to run, ``(0, False, False)`` is
        returned.
    """
    print(f"\n>> ROUND {idx} <<")
    if idx == 1:
        prompt = BASE_PROMPT
    else:
        prompt = (
            f"Last round: you were {'correct' if last_p else 'wrong'}, "
            f"opponent was {'correct' if last_o else 'wrong'}.\n\n" +
            BASE_PROMPT +
            "\nStart the new round now.\n"
        )

    # Player: sampled, so each round can produce a different program.
    player_reply = chat(prompt, temperature=1.0, max_new_tokens=256)
    print("Player reply:\n", player_reply)

    try:
        code, player_guess = extract_code_and_guess(player_reply)
    except Exception as e:
        print("player parse error:", e)
        return 0, False, False

    # Execute the player's script to obtain the ground truth.
    try:
        true_out = run_python(code)
    except Exception as e:
        print("exec error:", e)
        return 0, False, False
    print("Actual output:", true_out)

    # Opponent: greedy decoding. The template constant is named
    # OPP_PROMPT_TEMPLATE; referring to it as OPP_PROMPT raised NameError.
    opp_prompt = OPP_PROMPT_TEMPLATE.format(code=code)
    opp_reply  = chat(opp_prompt, temperature=0.0, max_new_tokens=64)
    print("Opponent reply:", opp_reply.strip())
    # Take the first number in the reply; None (no number) never equals the
    # true output, so an unparseable reply counts as a wrong guess.
    m = re.search(r"-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?", opp_reply)
    opp_guess = float(m.group()) if m else None

    # Score
    player_ok   = (player_guess == true_out)
    opponent_ok = (opp_guess   == true_out)
    point = int(player_ok and not opponent_ok)

    if point:
        print("✔  Player scores!")
    elif not player_ok:
        print("✖  Player misses.")
    else:
        print("Both guessed correctly – no point.")

    return point, player_ok, opponent_ok

def main() -> None:
    """Play the requested number of rounds and print the player's total score.

    The number of rounds comes from the optional ``--rounds`` command-line
    argument and defaults to ``ROUNDS``. Unrecognised arguments are ignored.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--rounds", type=int, default=ROUNDS)
    # parse_known_args() tolerates argv entries this script does not define.
    opts, _unknown = cli.parse_known_args()

    score = 0
    # Before the first round there is no previous result to report.
    player_ok = False
    opponent_ok = False

    round_no = 1
    while round_no <= opts.rounds:
        gained, player_ok, opponent_ok = play_round(round_no, player_ok, opponent_ok)
        score += gained
        time.sleep(0.5)
        round_no += 1

    print(f"\n=== FINAL SCORE: {score} / {opts.rounds} ===")

if __name__ == "__main__":
    main()
