# CORE Eval Data Exploration

Explore the CORE benchmark eval bundle (from the DCLM paper).
Run `python -m core.dataset --eval` first to download the bundle.

In [1]:
import json
import csv
import random
import yaml
from pathlib import Path
from jinja2 import Template
from transformers import AutoTokenizer

EVAL_DIR = Path("../data/eval_data")
assert EVAL_DIR.exists(), "Run 'python -m core.dataset --eval' first"

with open(EVAL_DIR / "core.yaml") as f:
    config = yaml.safe_load(f)
tasks = config["icl_tasks"]

def load_data(task):
    with open(EVAL_DIR / "eval_data" / task["dataset_uri"]) as f:
        return [json.loads(line) for line in f]

tokenizer = AutoTokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


## Task Overview: 0-shot vs Few-shot

Each CORE task specifies `num_fewshot` â€” how many examples from the same dataset are
prepended as demonstrations before the actual query. The split is:
- **0-shot**: model sees only the query, no demonstrations
- **Few-shot (3 or 10)**: model sees N solved examples before the query

In [2]:
print(f"{'Label':<38} {'Type':<20} {'N-shot':<8} {'Delimiter'}")
print("-" * 95)
for t in tasks:
    n = t['num_fewshot'][0]
    d = repr(t.get('continuation_delimiter', ' '))
    shot_label = f"{n}-shot"
    print(f"{t['label']:<38} {t['icl_task_type']:<20} {shot_label:<8} {d}")

Label                                  Type                 N-shot   Delimiter
-----------------------------------------------------------------------------------------------
hellaswag_zeroshot                     multiple_choice      0-shot   ' '
jeopardy                               language_modeling    10-shot  '\nAnswer: '
bigbench_qa_wikidata                   language_modeling    10-shot  ' '
arc_easy                               multiple_choice      10-shot  '\nAnswer: '
arc_challenge                          multiple_choice      10-shot  '\nAnswer: '
copa                                   multiple_choice      0-shot   ' '
commonsense_qa                         multiple_choice      10-shot  ' '
piqa                                   multiple_choice      10-shot  '\nAnswer: '
openbook_qa                            multiple_choice      0-shot   ' '
lambada_openai                         language_modeling    0-shot   ' '
hellaswag                              multiple_choice     

## 0-shot Multiple Choice: hellaswag_zeroshot

No demonstrations. The model only sees the query + each candidate answer.
The prompt sent to the model for each choice is just:
```
<query> <choice>
```
The model scores each by computing mean cross-entropy over the answer tokens.

In [3]:
task_0shot = next(t for t in tasks if t['label'] == 'hellaswag_zeroshot')
data_0shot = load_data(task_0shot)
ex = data_0shot[0]
delimiter = task_0shot.get('continuation_delimiter', ' ')

print(f"Task: {task_0shot['label']} ({task_0shot['num_fewshot'][0]}-shot)")
print(f"Type: {task_0shot['icl_task_type']}")
print(f"Query: {ex['query']}")
print(f"Gold: choice {ex['gold']} = {ex['choices'][ex['gold']]!r}")
print()

# Show the exact prompt the model sees for each choice
for i, choice in enumerate(ex['choices']):
    prompt = f"{ex['query']}{delimiter}{choice}"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)
    gold = " <-- GOLD" if i == ex['gold'] else ""
    print(f"--- Choice {i}{gold} ---")
    print(f"Full prompt ({len(tokens)} tokens):")
    print(prompt)
    print()

Task: hellaswag_zeroshot (0-shot)
Type: multiple_choice
Query: Roof shingle removal: A man is sitting on a roof. He
Gold: choice 3 = 'starts pulling up roofing on a roof.'

--- Choice 0 ---
Full prompt (26 tokens):
Roof shingle removal: A man is sitting on a roof. He is using wrap to wrap a pair of skis.

--- Choice 1 ---
Full prompt (21 tokens):
Roof shingle removal: A man is sitting on a roof. He is ripping level tiles off.

--- Choice 2 ---
Full prompt (23 tokens):
Roof shingle removal: A man is sitting on a roof. He is holding a rubik's cube.

--- Choice 3 <-- GOLD ---
Full prompt (24 tokens):
Roof shingle removal: A man is sitting on a roof. He starts pulling up roofing on a roof.



## 10-shot Multiple Choice: arc_easy

10 solved examples are prepended. The model sees:
```
<query_1>\nAnswer: <gold_answer_1>

<query_2>\nAnswer: <gold_answer_2>

... (10 total)

<actual_query>\nAnswer: <candidate_choice>
```
The few-shot examples teach the model the task format via in-context learning.
Loss is only measured over the candidate choice tokens (after the common prefix).

In [4]:
task_10shot = next(t for t in tasks if t['label'] == 'arc_easy')
data_10shot = load_data(task_10shot)
delimiter = task_10shot.get('continuation_delimiter', ' ')
num_fewshot = task_10shot['num_fewshot'][0]

# Pick example and sample few-shot demos (same logic as core_eval.py)
idx = 0
ex = data_10shot[idx]
rng = random.Random(1234 + idx)
available = [i for i in range(len(data_10shot)) if i != idx]
fewshot = [data_10shot[i] for i in rng.sample(available, num_fewshot)]

print(f"Task: {task_10shot['label']} ({num_fewshot}-shot)")
print(f"Type: {task_10shot['icl_task_type']}")
print(f"Delimiter: {delimiter!r}")
print(f"Query: {ex['query']}")
print(f"Gold: choice {ex['gold']} = {ex['choices'][ex['gold']]!r}")
print()

# Render using the same jinja2 template as core_eval.py
tpl = Template(
    "{%- for ex in fewshot -%}{{ ex.query }}{{ d }}{{ ex.choices[ex.gold] }}\n\n"
    "{% endfor -%}{{ item.query }}{{ d }}{{ choice }}"
)

# Show the full prompt for the gold choice
gold_prompt = tpl.render(fewshot=fewshot, d=delimiter, item=ex, choice=ex['choices'][ex['gold']])
gold_tokens = tokenizer.encode(gold_prompt, add_special_tokens=False)

print(f"=== Full 10-shot prompt for GOLD choice ({len(gold_tokens)} tokens) ===")
print(gold_prompt)
print()

Task: arc_easy (10-shot)
Type: multiple_choice
Delimiter: '\nAnswer: '
Query: Question: Which statement best explains why photosynthesis is the foundation of most food webs?
Gold: choice 0 = 'Sunlight is the source of energy for nearly all ecosystems.'

=== Full 10-shot prompt for GOLD choice (338 tokens) ===
Question: The first telescopes were invented hundreds of years ago. Which was discovered as a result of this invention?
Answer: the moons of Jupiter

Question: Some birds fly south in the fall and return in the spring. This is an example of
Answer: migration

Question: In order for cells to grow at a normal rate, they must
Answer: take in nutrients.

Question: Soil is important to a forest because the soil ___.
Answer: provides nutrients to the trees

Question: Leather basketballs are made for indoor use on smooth surfaces. Rubber basketballs are made for use on many different surfaces. Which of the following properties of rubber makes it better than leather for use on many differ

In [5]:
# Show where the scoring region is
# All choices share the same prefix (few-shot demos + query), only the answer differs
all_prompts = [tpl.render(fewshot=fewshot, d=delimiter, item=ex, choice=c) for c in ex['choices']]
all_tokens = [tokenizer.encode(p, add_special_tokens=False) for p in all_prompts]

# Find common prefix length
min_len = min(len(t) for t in all_tokens)
prefix_len = 0
for i in range(min_len):
    if all(t[i] == all_tokens[0][i] for t in all_tokens):
        prefix_len = i + 1
    else:
        break

print(f"Common prefix: {prefix_len} tokens (few-shot demos + query + delimiter)")
print(f"This is the same across all {len(ex['choices'])} choices.")
print(f"Loss is measured ONLY on tokens after position {prefix_len}.")
print()

for i, (prompt, tokens) in enumerate(zip(all_prompts, all_tokens)):
    answer_tokens = tokens[prefix_len:]
    answer_text = tokenizer.decode(answer_tokens)
    gold = " <-- GOLD" if i == ex['gold'] else ""
    print(f"Choice {i}{gold}: {answer_text!r} ({len(answer_tokens)} tokens scored)")

Common prefix: 326 tokens (few-shot demos + query + delimiter)
This is the same across all 4 choices.
Loss is measured ONLY on tokens after position 326.

Choice 0 <-- GOLD: ' Sunlight is the source of energy for nearly all ecosystems.' (12 tokens scored)
Choice 1: ' Most ecosystems are found on land instead of in water.' (11 tokens scored)
Choice 2: ' Carbon dioxide is more available than other gases.' (9 tokens scored)
Choice 3: ' The producers in all ecosystems are plants.' (8 tokens scored)


## 0-shot Language Modeling: lambada_openai

No demonstrations. The model sees a passage and must predict the final word.
Scoring: argmax prediction at every token position in the continuation must
exactly match the ground truth. Binary correct/incorrect.

In [9]:
task_lm0 = next(t for t in tasks if t['label'] == 'bigbench_dyck_languages')
data_lm0 = load_data(task_lm0)
delimiter = task_lm0.get('continuation_delimiter', ' ')
ex = data_lm0[0]

print(f"Task: {task_lm0['label']} ({task_lm0['num_fewshot'][0]}-shot)")
print(f"Type: {task_lm0['icl_task_type']}")
print()

# The prompt WITHOUT continuation (what the model conditions on)
prompt_without = ex['context'].strip() + delimiter
# The full prompt WITH continuation (what gets forwarded)
prompt_with = ex['context'].strip() + delimiter + ex['continuation']

tokens_without = tokenizer.encode(prompt_without, add_special_tokens=False)
tokens_with = tokenizer.encode(prompt_with, add_special_tokens=False)

continuation_tokens = tokens_with[len(tokens_without):]

print(f"Context ({len(tokens_without)} tokens):")
print(ex['context'][:300])
print()
print(f"Continuation: {ex['continuation']!r}")
print(f"Continuation tokens ({len(continuation_tokens)}): {continuation_tokens}")
print(f"Decoded: {tokenizer.decode(continuation_tokens)!r}")
print()
print("Model must predict EVERY continuation token correctly via argmax.")

Task: bigbench_dyck_languages (10-shot)
Type: language_modeling

Context (73 tokens):
Complete the rest of the sequence, making sure that the parentheses are closed properly. 

Input: [ < < { } > [ { [ ] ( ( ( ( < ( ( ) ) > ) ) ) [ ] ) } ] { } < [ { ( { < ( ) > } ) } ( ) ] > > {
Output:

Continuation: '} ]'
Continuation tokens (1): [2361]
Decoded: ' ]'

Model must predict EVERY continuation token correctly via argmax.


## 10-shot Language Modeling: jeopardy

10 solved examples are prepended. The model sees:
```
<context_1>\nAnswer: <answer_1>

<context_2>\nAnswer: <answer_2>

... (10 total)

<actual_context>\nAnswer: <expected_answer>
```
The few-shot examples teach the model the Q&A format.
Scoring is still exact argmax match on the answer tokens.

In [7]:
task_lm10 = next(t for t in tasks if t['label'] == 'jeopardy')
data_lm10 = load_data(task_lm10)
delimiter = task_lm10.get('continuation_delimiter', ' ')
num_fewshot = task_lm10['num_fewshot'][0]

idx = 0
ex = data_lm10[idx]
rng = random.Random(1234 + idx)
available = [i for i in range(len(data_lm10)) if i != idx]
fewshot = [data_lm10[i] for i in rng.sample(available, num_fewshot)]

# Render using same templates as core_eval.py
base_tpl = Template(
    "{%- for ex in fewshot -%}{{ ex.context | trim }}{{ d }}{{ ex.continuation }}\n\n"
    "{% endfor -%}{{ item.context | trim }}{{ d }}"
)
full_tpl = Template(
    "{%- for ex in fewshot -%}{{ ex.context | trim }}{{ d }}{{ ex.continuation }}\n\n"
    "{% endfor -%}{{ item.context | trim }}{{ d }}{{ item.continuation }}"
)

ctx = dict(fewshot=fewshot, d=delimiter, item=ex)
prompt_without = base_tpl.render(**ctx).strip()
prompt_with = full_tpl.render(**ctx)

tokens_without = tokenizer.encode(prompt_without, add_special_tokens=False)
tokens_with = tokenizer.encode(prompt_with, add_special_tokens=False)
continuation_tokens = tokens_with[len(tokens_without):]

print(f"Task: {task_lm10['label']} ({num_fewshot}-shot)")
print(f"Delimiter: {delimiter!r}")
print(f"Expected answer: {ex['continuation']!r}")
print(f"\nTotal prompt: {len(tokens_with)} tokens")
print(f"Context (demos + query): {len(tokens_without)} tokens")
print(f"Answer tokens to predict: {len(continuation_tokens)} tokens")
print()

print("=== Full 10-shot prompt ===")
print(prompt_with)
print()

print(f"--- Scoring region ---")
print(f"Continuation: {tokenizer.decode(continuation_tokens)!r}")
print(f"Token IDs: {continuation_tokens}")
print(f"Model must get ALL of these exactly right via argmax.")

Task: jeopardy (10-shot)
Delimiter: '\nAnswer: '
Expected answer: 'Admiral Richard Byrd'

Total prompt: 344 tokens
Context (demos + query): 341 tokens
Answer tokens to predict: 3 tokens

=== Full 10-shot prompt ===
WORD ORIGINS: This word for a cantankerous personality is a variation of ordinary
Answer: ornery

AMERICAN HISTORY: In 1888 he won the presidency using the campaign song Grandfathers Hat Fits Ben
Answer: Benjamin Harrison

WORLD HISTORY: In 1199 this crusader king of England was mortally wounded while besieging the castle of Chalus
Answer: Richard the Lionhearted

WORLD HISTORY: Jerusalem was captured by this king of Babylon in 597 & 586 B.C.
Answer: Nebuchadnezzar

WORLD HISTORY: In 1606, Willem Janszoon landed on Cape York Peninsula, becoming the 1st European to visit this continent
Answer: Australia

WORLD HISTORY: According to legend, he was a swineherd before he conquered the Incas
Answer: Francisco Pizarro

AMERICAN HISTORY: Victor Marie du Pont served as a captain of 

## 0-shot Schema: winograd

No demonstrations. Two different context options, same continuation.
The model must judge which context makes the continuation more likely.

In [8]:
task_schema = next(t for t in tasks if t['label'] == 'winograd')
data_schema = load_data(task_schema)
delimiter = task_schema.get('continuation_delimiter', ' ')
ex = data_schema[0]

print(f"Task: {task_schema['label']} ({task_schema['num_fewshot'][0]}-shot)")
print(f"Type: {task_schema['icl_task_type']}")
print(f"Continuation: {ex['continuation']!r}")
print(f"Gold: context option {ex['gold']}")
print()

for i, ctx_opt in enumerate(ex['context_options']):
    prompt = f"{ctx_opt}{delimiter}{ex['continuation']}"
    tokens = tokenizer.encode(prompt, add_special_tokens=False)
    gold = " <-- GOLD" if i == ex['gold'] else ""
    print(f"--- Context option {i}{gold} ---")
    print(f"Prompt ({len(tokens)} tokens): {prompt}")
    print()

# Show the scoring region (common suffix)
all_prompts = [f"{co}{delimiter}{ex['continuation']}" for co in ex['context_options']]
all_tokens = [tokenizer.encode(p, add_special_tokens=False) for p in all_prompts]

min_len = min(len(t) for t in all_tokens)
suffix_len = 0
for i in range(1, min_len + 1):
    if all(t[-i] == all_tokens[0][-i] for t in all_tokens):
        suffix_len = i
    else:
        break

suffix_tokens = all_tokens[0][-suffix_len:]
print(f"Common suffix: {suffix_len} tokens = {tokenizer.decode(suffix_tokens)!r}")
print(f"Loss is measured over this suffix for each context option.")
print(f"Lowest mean loss = model's preferred context.")

Task: winograd (0-shot)
Type: schema
Continuation: 'feared violence.'
Gold: context option 0

--- Context option 0 <-- GOLD ---
Prompt (17 tokens): The city councilmen refused the demonstrators a permit because the city councilmen feared violence.

--- Context option 1 ---
Prompt (15 tokens): The city councilmen refused the demonstrators a permit because the demonstrators feared violence.

Common suffix: 3 tokens = ' feared violence.'
Loss is measured over this suffix for each context option.
Lowest mean loss = model's preferred context.
