In [None]:
# Test whether the models can perform the counting task
# with better-than-chance accuracy.
import pandas as pd
from pprint import pprint
import importlib
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import sys
sys.path.append('..')
from utils.evals import load_model, create_counting_prompt

# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [None]:
PERSISTENT_MODEL_DIR = "/workspace/models"
DATA_DIR = "/workspace/data/synthetic-data.json"  # path to the benchmark JSON file (a file, despite the name)

# Checkpoint directories to evaluate, one per model, all rooted at PERSISTENT_MODEL_DIR.
model_dirs = [
    f"{PERSISTENT_MODEL_DIR}/{checkpoint}"
    for checkpoint in (
        "Qwen3-1.7B",
        "Qwen3-4B",
        "Qwen3-8B",
        "Qwen3-14B",
        "Phi-3-mini-4k-instruct",  # ideally, we'll later use this to check for generality
    )
]

# Load the benchmarking dataset into a DataFrame.
# Note: despite its name, DATA_DIR holds the path to a single JSON file, not a directory.
count_df = pd.read_json(DATA_DIR)

# Formatting 
Get the test data into the appropriate form, including instruct formatting.

Create a torch dataset to hold all test examples, and use a dataloader for efficient batched inference during evaluation.

In [None]:
# Reload utils.evals so edits to the module are picked up without restarting the kernel.
import importlib
# `from utils.evals import ...` does not bind the name `utils`, so import the
# submodule explicitly; otherwise `importlib.reload(utils.evals)` raises NameError.
import utils.evals

importlib.reload(utils.evals)
# Re-bind the helpers so these names point at the freshly reloaded definitions.
from utils.evals import load_model, create_counting_prompt

In [None]:
# Load the tokenizer (`tok`) and model from model_dirs[2], i.e. the Qwen3-8B checkpoint directory.
tok, model = load_model(dir = model_dirs[2])

In [None]:
# Smoke-test prompt construction: category 'fruit' with a mixed word list.
# NOTE(review): presumably returns the formatted prompt text (it is passed to the
# tokenizer in a later cell) — confirm against utils.evals.
test = create_counting_prompt('fruit', ['banana', 'dog', 'animal', 'apple', 'guava'], tok, device)
pprint(test)

In [None]:
# Tokenize the test prompt and move the resulting tensors to the selected device.
ids = tok(test, return_tensors = 'pt').to(device)

# Inference only: disable autograd so no computation graph / activations are
# retained for a backward pass that never happens.
with torch.no_grad():
    out = model(input_ids = ids['input_ids'], attention_mask = ids['attention_mask'])


In [None]:
# NOTE(review): this bare expression is not the last statement of the cell, so the
# notebook does not display it. For a causal LM the logits are presumably
# (batch, seq_len, vocab_size) rather than "1 x 64" — confirm.
out['logits'].shape # shape check only

# Argmax over the last axis of the first sequence in the batch: one token id per
# position (presumably the model's next-token guess at each position, not an
# autoregressive generation — confirm).
predicted_token_ids = torch.argmax(out['logits'][0], dim = -1)
# Special tokens are kept in the decoded text (skip_special_tokens=False).
predicted_text = tok.decode(predicted_token_ids, skip_special_tokens=False)
print(predicted_text)

In [None]:
# load dataset into a dataloader – this will help handle batching
from torch.utils.data import Dataset, DataLoader

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Map-style dataset pairing per-example metadata with pre-tokenized prompts.

    Each constructor argument is stored as an attribute and indexed by example
    position in ``__getitem__``. ``tokenized_prompts`` must provide
    ``'input_ids'`` and ``'attention_mask'`` entries; the dataset length is the
    length of ``'input_ids'``.
    """

    # Keys returned by __getitem__, in order; each one is also the name of the
    # attribute that is indexed to produce its value.
    _ITEM_KEYS = (
        'q_indices',
        'questions',
        'choices',
        'subjects',
        'answer_chars',
        'input_ids',
        'attention_mask',
    )

    def __init__(self, q_indices, questions, choices, subjects, answer_chars, tokenized_prompts):
        self.q_indices, self.questions, self.choices = q_indices, questions, choices
        self.subjects, self.answer_chars = subjects, answer_chars
        self.input_ids = tokenized_prompts['input_ids']
        self.attention_mask = tokenized_prompts['attention_mask']

    def __len__(self):
        """Return the number of examples (length of ``input_ids``)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict keyed by field name."""
        return {key: getattr(self, key)[idx] for key in self._ITEM_KEYS}

# Tokenize every prompt up front (tokenizing is cheap); the DataLoader below handles batching.
# `tok` is the tokenizer returned by load_model(); the name `tokenizer` is never defined in this notebook.
# NOTE(review): `q_df` is not defined in this notebook either (the loaded dataset is `count_df`).
# This cell appears to be adapted from a different evaluation — confirm the frame and its
# column names ('input_prompt', 'q_ix', 'question', 'choices', 'subject', 'answer_char') before running.
tokenized_prompts = tok(
    q_df['input_prompt'].tolist(),
    add_special_tokens = False,
    max_length = 400,
    padding = 'max_length',
    truncation = True,
    return_tensors = 'pt',
)
# Longest prompt in real (non-padding) tokens: the attention mask is 1 wherever a token is present.
# This must be under max_length (400) to confirm nothing was truncated.
print(tokenized_prompts['attention_mask'].sum(dim = 1).max())

q_dl = DataLoader(
    TextDataset(
        q_df['q_ix'].tolist(),
        q_df['question'].tolist(),
        q_df['choices'].tolist(),
        q_df['subject'].tolist(),
        q_df['answer_char'].tolist(),
        tokenized_prompts,  # don't move to gpu yet (or will have big mem problems)
    ),
    batch_size = 4,
    shuffle = False,  # keep dataset order so batches line up with the source rows
)

# Uncomment to inspect the first batch:
# pprint(next(iter(q_dl)), width = 80)

# Model evaluation

In [None]:
# chance accuracy is expected prob. of a uniform random guess 
# being correct, averaged over dataset (since lists are of varying lengths)


# TODO: measure both counting accuracy and instruction following (i.e., whether the model outputs what we expect)

In [None]:
# plotting: performance across model family