In [2]:
# Test whether the models can perform the counting task
# with accuracy above random chance.
import pandas as pd
import numpy as np
from pprint import pprint
import importlib
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import sys
sys.path.append('..')
from utils.evals import load_model, create_counting_prompt
from tqdm import tqdm

# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# then plain CPU. The MPS probe only runs when CUDA is unavailable.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [None]:
# Directory holding the locally stored checkpoints. The trailing slash matters:
# the checkpoint path is formed by appending the model name directly.
PERSISTENT_MODEL_DIR = "/workspace/models/"
# Despite the name, this is the path of the benchmark JSON file itself.
DATA_DIR = "/workspace/data/synthetic-data-1.json"

# Checkpoints this notebook can evaluate, one per run.
_CANDIDATE_MODELS = (
    'Qwen3-1.7B',
    'Qwen3-4B',
    'Qwen3-8B',
    'Qwen3-14B',
    'Qwen3-32B',
    'Phi-3-mini-4k-instruct',  # ideally, we'll later use this to check for generality
)
# Index 4 selects 'Qwen3-32B'.
# NOTE(review): the saved metrics output further down reports Qwen3-14B
# (index 3), so the stored outputs were not produced with this setting.
MODEL_PREFIX = _CANDIDATE_MODELS[4]

# Load the benchmarking dataset from the JSON file at DATA_DIR into a DataFrame.
# Later cells read its 'category', 'clean_list', 'clean_category_count' and
# 'list_length' columns.
count_df = pd.read_json(DATA_DIR)

# Formatting 
Get test data into appropriate form, including instruct formatting. 

Create a torch dataset to hold all test examples, and use a dataloader for efficient batched inference during evaluation.

In [4]:
# Load the tokenizer/model pair for the selected checkpoint from local storage.
tokenizer, model = load_model(f"{PERSISTENT_MODEL_DIR}{MODEL_PREFIX}")

Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def _prompt_for_row(row):
    """Build the counting prompt for a single dataset row."""
    return create_counting_prompt(
        entity_type = row['category'],
        word_list = row['clean_list_string'],
        tokenizer = tokenizer
    )


# Positional id per example; used later to join model outputs back onto count_df.
count_df['sample_ix'] = list(range(len(count_df)))
# Collapse each word list into one space-separated string.
count_df['clean_list_string'] = list(map(' '.join, count_df['clean_list']))

count_df['prompt'] = count_df.apply(_prompt_for_row, axis = 1)

# pprint(count_df['prompt'][0])

In [14]:
# load dataset into a dataloader – this will help handle batching
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset pairing each tokenized prompt with its sample index.

    Args:
        sample_index: Indexable collection of per-example identifiers.
        tokenized_prompts: Mapping with 'input_ids' and 'attention_mask'
            entries, each indexable by example position.
    """

    def __init__(self, sample_index, tokenized_prompts):
        self.sample_index = sample_index
        self.input_ids, self.attention_mask = (
            tokenized_prompts[key] for key in ('input_ids', 'attention_mask')
        )

    def __len__(self):
        # The dataset has one example per entry of input_ids.
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the id, token ids and attention mask stored at position idx."""
        fields = ('sample_index', 'input_ids', 'attention_mask')
        return {name: getattr(self, name)[idx] for name in fields}

# Tokenize every prompt up front (tokenizing is cheap); the DataLoader takes care
# of batching later. Every prompt is padded/truncated to the same fixed length so
# the whole set comes back as one rectangular tensor.
# NOTE(review): add_special_tokens is off, presumably because the prompt text
# already carries the chat-template tokens - confirm against create_counting_prompt.
MAX_PROMPT_TOKENS = 100
tokenized_prompts = tokenizer(count_df['prompt'].tolist(), add_special_tokens = False, max_length = MAX_PROMPT_TOKENS, padding = 'max_length', truncation = True, return_tensors = 'pt')

# The attention mask holds a 1 for every real (non-padding) token, so its row sum
# is the token count of that prompt.
longest_prompt = tokenized_prompts['attention_mask'].sum(dim = 1).max()
print(longest_prompt)
# Truncation is silent: a prompt that fills the whole window may have lost its
# tail, and the evaluation would then read the wrong "last" token. Fail loudly
# instead of relying on someone eyeballing the printed number.
assert longest_prompt.item() < MAX_PROMPT_TOKENS, (
    f"longest prompt uses {longest_prompt.item()} tokens, which fills the "
    f"{MAX_PROMPT_TOKENS}-token window; prompts may have been truncated"
)

count_dl = DataLoader(TextDataset(
    count_df['sample_ix'].tolist(),
    tokenized_prompts # don't move to gpu yet (or will have big mem problems)
), batch_size = 4, shuffle = False)

# pprint(next(iter(count_dl)), width = 80)

tensor(75)


# Model evaluation

In [15]:
# For every prompt, record the model's most likely next token (and its
# probability) at the position of the last real prompt token.
k = 1  # number of candidates requested from top-k; only the first one is stored below
results = []
with torch.no_grad():
    for batch in tqdm(count_dl, total = len(count_dl)):
        sample_ix = batch["sample_index"]
        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)

        logits = model(input_ids, attention_mask=attn_mask).logits

        # Index of the last attended token in each row. Multiplying the 0/1 mask
        # by the position index and taking the argmax gives the largest attended
        # position, which is correct for right-padded and left-padded batches
        # alike (`mask.sum() - 1` is only correct under right padding).
        positions = torch.arange(input_ids.size(1), device = device)
        last_pos = (attn_mask * positions).argmax(dim = 1)
        b_idx = torch.arange(input_ids.size(0), device = device)

        # Select the one position we need first, then normalise: softmax is
        # independent per position, so the values match a softmax over the full
        # logits tensor without materialising it.
        tok_probs = logits[b_idx, last_pos, :].softmax(-1)

        topk_probs, topk_idx = torch.topk(tok_probs, k = k, dim = 1)
        tokens_flat = tokenizer.convert_ids_to_tokens(topk_idx.cpu().flatten().tolist())
        # Regroup the flat token list into one list of k tokens per example.
        topk_tokens = [tokens_flat[i * k:(i + 1) * k] for i in range(len(tokens_flat) // k)]

        for s_ix, toks, ps in zip(sample_ix.tolist(), topk_tokens, topk_probs.cpu()):
            results.append(
                {
                    "sample_ix": s_ix,
                    "output_token": toks[0], # this code only supports k = 1 for now
                    "output_prob": ps.tolist()[0]
                }
            )

100%|██████████| 1248/1248 [01:59<00:00, 10.44it/s]


In [None]:
results = pd.DataFrame(results)

# Tokens accepted as a well-formed answer: a single decimal digit.
SINGLE_DIGITS = list('0123456789')

# Join the model outputs back onto the dataset rows and score them.
#   is_correct: the predicted token equals the true count (compared as text).
#   is_int:     the predicted token is a single digit, i.e. the expected format.
# Both flags use the same whitespace-stripped token so they cannot disagree:
# previously only is_correct stripped, so a token with surrounding whitespace
# could be scored correct while being flagged as badly formatted.
combined = (
    pd.merge(results, count_df, on = 'sample_ix', how = 'inner')
    .assign(
        is_correct = lambda df: np.where(
            df['output_token'].str.strip() == df['clean_category_count'].astype(str).str.strip(), 1, 0
        ),
        is_int = lambda df: np.where(df['output_token'].str.strip().isin(SINGLE_DIGITS), 1, 0),
    )
)

In [None]:
## Aggregate accuracy metrics for the evaluated model
# TODO: check accuracy for count + instruction following (does it output what we
#       expect) - answer-format following seems to be pretty good (even w/ 1.7B)
# TODO: check answer plausibility (e.g. the model says "4" when the full list
#       only has 3 items)
correct_flags = combined['is_correct']
n_rows = len(combined)
n_correct = correct_flags.sum()

# Overall accuracy: fraction of rows whose predicted token equals the true count.
acc = correct_flags.mean()

# Chance accuracy: expected hit rate of a uniform random guess, taken as
# 1 / list_length per row and averaged over the dataset (lists vary in length).
chance = (1 / combined['list_length']).mean()
accuracy_above_chance = acc - chance

# Share of outputs that are a single digit, i.e. follow the answer format.
pct_integer = combined["is_int"].mean()

# Total number of model parameters.
total = sum(p.numel() for p in model.parameters())

metrics = pd.Series(dict(
    model = MODEL_PREFIX,
    params = total / 1e9,  # in billions
    total_rows = n_rows,
    total_correct = n_correct,
    accuracy = acc,
    chance_accuracy = chance,
    accuracy_above_chance = accuracy_above_chance,
    pct_integer = pct_integer,  # correct format
))

# metrics

model                    Qwen3-14B
params                   14.768307
total_rows                    4989
total_correct                 3912
accuracy                  0.784125
chance_accuracy           0.210704
accuracy_above_chance     0.573422
pct_integer                    1.0
dtype: object

In [None]:
# Persist this run's outputs, one pair of files per model:
#   results_<model>.csv      - per-sample predictions joined with the dataset
#   agg_metrics_<model>.csv  - the aggregate metrics Series, written with
#                              pandas' defaults (index labels included)
results_csv = f"/workspace/data/results_{MODEL_PREFIX}.csv"
metrics_csv = f"/workspace/data/agg_metrics_{MODEL_PREFIX}.csv"

combined.to_csv(results_csv, index = False)
metrics.to_csv(metrics_csv)

## Plot 
dims of interest: correct length, total length, accuracy – also want to see how performance changes across model sizes.

1. Model size (params) v. accuracy 
2. Total length, accuracy (check to see how accuracy changes as list length changes)
3. Correct length (x), total length (y) – a heatmap



In [None]:
from pathlib import Path

# Collect the saved aggregate metrics of every evaluated model into one frame
# with one row per model.
# NOTE: from here on MODEL_PREFIX is a list of model names (it was a single
# name in the evaluation cells) and `combined` holds per-model metrics instead
# of per-sample results. Re-run the configuration cell before re-running any
# evaluation cell.
MODEL_PREFIX = [
    'Qwen3-1.7B',
    'Qwen3-4B',
    'Qwen3-8B',
    'Qwen3-14B'
]

# Order of the values inside each agg_metrics file.
metric_names = [
    "model", "params", "total_rows", "total_correct",
    "accuracy", "chance_accuracy", "accuracy_above_chance", "pct_integer"
]

dfs = []
for pre in MODEL_PREFIX:
    fp = Path(f"/workspace/data/agg_metrics_{pre}.csv")

    # 1. read the file raw (no header row) so its exact layout does not matter
    tmp = pd.read_csv(fp, header=None)

    # 2. drop the first row (the stray "0" header) and take the LAST column.
    #    A Series saved with its index produces two columns (label, value), one
    #    saved without it produces a single value column; in both layouts the
    #    values are in the last column. Taking column 0 would return the metric
    #    labels rather than the values for the two-column layout.
    vals = tmp.iloc[1:, -1].tolist()
    if len(vals) != len(metric_names):
        raise ValueError(
            f"{fp}: expected {len(metric_names)} metric values, found {len(vals)}"
        )

    # 3. build a tidy one-row DataFrame with proper headers
    row = pd.DataFrame([vals], columns=metric_names)
    row["model_size"] = pre                 # optional
    dfs.append(row)

combined = pd.concat(dfs, ignore_index=True)
# combined
# combined

In [50]:
# Accuracy across parameter sizes: how accuracy scales across model sizes.
# The dashed red line marks random-chance accuracy.
import plotly.express as px

# Take the chance baseline from the saved metrics instead of a hand-copied
# constant, so the line stays right if the dataset changes. The mean is used to
# reduce the per-model values to one number.
# NOTE(review): the values are expected to coincide across models if all were
# evaluated on the same dataset - confirm.
chance_baseline = combined["chance_accuracy"].astype(float).mean()

fig = px.line(combined.astype({"params":"float", "accuracy":"float"}),
              x="params", y="accuracy", markers = True,
              labels = {"params": "Model size (billions of parameters)",
                        "accuracy": "Accuracy"},
                        title = "Accuracy scaling across Qwen-3 family")

fig.add_hline(y=chance_baseline, line_dash="dash", line_color="red")  # chance baseline

fig.update_layout(template="plotly_white")
# fig.update_layout(paper_bgcolor="white", plot_bgcolor="white")
fig.show()

In [68]:
# Accuracy across list lengths (14B).
# Naive expectation: the task gets harder as the list of entities grows.
res_fp = Path("/workspace/data/results_Qwen3-14B.csv")
combined_df = pd.read_csv(res_fp)

# Per list length: mean of is_correct (accuracy) and the number of samples (n).
acc_len = (combined_df
           .groupby("list_length")["is_correct"]
           .agg(["mean", "count"])
           .reset_index()
           .rename(columns={"mean": "accuracy", "count": "n"}))

# Hit rate of a uniform random guess for a list of that length.
acc_len["chance"] = 1 / acc_len["list_length"]

fig = px.line(
    acc_len,
    x="list_length",
    y="accuracy",
    markers=True,
    labels={"list_length": "Number of items in list",
            "accuracy": "Accuracy"},
    title="Qwen-3-14B: accuracy vs. list length",
)

# Attach the sample counts and hover text to the accuracy trace now, while it is
# the only trace in the figure. Selecting it afterwards with
# selector=dict(mode="markers+lines") matched nothing: selectors compare
# property values for equality and plotly express writes the mode as
# "lines+markers", so the hover text and n were silently never applied.
fig.update_traces(
    hovertemplate="List length: %{x}<br>Accuracy: %{y:.2%}<br>n: %{customdata}",
    customdata=acc_len["n"],
)

fig.add_scatter(
    x=acc_len["list_length"],
    y=acc_len["chance"],
    mode="lines",
    line=dict(color="red", dash="dash"),
    name="Random chance",
    hovertemplate="Chance<br>List length: %{x}<br>Prob: %{y:.2%}<extra></extra>",
)

fig.update_layout(template="plotly_white")
fig.show()