# Finding and using anchor points

In this notebook, we show how to find anchor points based on your training set and how to use them to estimate the performance of new models in the test set.

## Preparing data

Loading packages

In [66]:
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from irt import *
from utils import *

random_state = 420

The leaderboard dataset we will use is composed by six scenarios (sub-datasets):
1. TruthfulQA
1. GSM8K
1. Winogrande
1. ARC
1. HellaSwag
1. MMLU

MMLU is further divided into sub-scenarios (e.g., abstract algebra, anatomy, etc). Let's check scenarios and sub-scenarios:

In [2]:
scenarios

{'harness_truthfulqa_mc_0': ['harness_truthfulqa_mc_0'],
 'gsm8k': ['harness_gsm8k_5'],
 'winogrande': ['harness_winogrande_5'],
 'arc': ['harness_arc_challenge_25'],
 'hellaswag': ['harness_hellaswag_10'],
 'mmlu': ['harness_hendrycksTest_abstract_algebra_5',
  'harness_hendrycksTest_anatomy_5',
  'harness_hendrycksTest_astronomy_5',
  'harness_hendrycksTest_business_ethics_5',
  'harness_hendrycksTest_clinical_knowledge_5',
  'harness_hendrycksTest_college_biology_5',
  'harness_hendrycksTest_college_chemistry_5',
  'harness_hendrycksTest_college_computer_science_5',
  'harness_hendrycksTest_college_mathematics_5',
  'harness_hendrycksTest_college_medicine_5',
  'harness_hendrycksTest_college_physics_5',
  'harness_hendrycksTest_computer_security_5',
  'harness_hendrycksTest_conceptual_physics_5',
  'harness_hendrycksTest_econometrics_5',
  'harness_hendrycksTest_electrical_engineering_5',
  'harness_hendrycksTest_elementary_mathematics_5',
  'harness_hendrycksTest_formal_logic_5',
 

In [3]:
SELECTED_SCENARIOS = ['gsm8k', 'arc', 'hellaswag', 'harness_truthfulqa_mc_0']

# select gsm8k, arc, hellaswag
lb_scenarios = {'lb': []}
for scenario in scenarios.keys():
    if scenario in SELECTED_SCENARIOS:
        lb_scenarios['lb'].append(scenarios[scenario][0])

In [4]:
lb_scenarios

{'lb': ['harness_truthfulqa_mc_0',
  'harness_gsm8k_5',
  'harness_arc_challenge_25',
  'harness_hellaswag_10']}

In [5]:
lb_scenario_to_readable = {}
for scenario in scenarios.keys():
    if scenario in SELECTED_SCENARIOS:
        lb_scenario_to_readable[scenarios[scenario][0]] = scenario

lb_scenario_to_readable['harness_truthfulqa_mc_0'] = 'truthfulqa'

Loading leaderboard data:

In [6]:
#with open('data/lb.pickle', 'rb') as handle:
#    data = pickle.load(handle)
with open('data/lb_scenarios.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [7]:
for s in data['data'].keys():
    print(s)
    print('Number of nan values in data:', np.sum(np.isnan(data['data'][s]['correctness'])))

harness_hellaswag_10
Number of nan values in data: 0
harness_truthfulqa_mc_0
Number of nan values in data: 104
harness_arc_challenge_25
Number of nan values in data: 0
harness_gsm8k_5
Number of nan values in data: 0


In this dataset, we have data from 395 models. Let's see the names of some of them below

In [8]:
len(data['models']),data['models'][:10]

(1,
 [['open-llm-leaderboard/details_moreh__MoMo-70B-lora-1.8.6-DPO',
   'open-llm-leaderboard/details_cloudyu__Yi-34Bx3-MoE-90B',
   'open-llm-leaderboard/details_Weyaxi__Helion-4x34B',
   'open-llm-leaderboard/details_Weyaxi__Bagel-Hermes-34B-Slerp',
   'open-llm-leaderboard/details_Weyaxi__Bagel-Hermes-2x34b',
   'open-llm-leaderboard/details_nfaheem__Marcoroni-7b-DPO-Merge',
   'open-llm-leaderboard/details_jondurbin__bagel-dpo-34b-v0.2',
   'open-llm-leaderboard/details_udkai__Turdus',
   'open-llm-leaderboard/details_gagan3012__MetaModel_moe',
   'open-llm-leaderboard/details_jeonsworld__CarbonVillain-en-10.7B-v3',
   'open-llm-leaderboard/details_TomGrc__FusionNet',
   'open-llm-leaderboard/details_kekmodel__StopCarbon-10.7B-v6',
   'open-llm-leaderboard/details_jeonsworld__CarbonVillain-en-10.7B-v1',
   'open-llm-leaderboard/details_Weyaxi__SauerkrautLM-UNA-SOLAR-Instruct',
   'open-llm-leaderboard/details_VAGOsolutions__SauerkrautLM-SOLAR-Instruct',
   'open-llm-leaderboard/de

Below, we will process the data so all correctness scores (for all scenarios) are stored in $Y$. The dictionaries `scenarios_position` and `subscenarios_position` give the position of scenarios/subscenarios correctness scores in $Y$.

In [9]:
scenarios_position, subscenarios_position = prepare_data(lb_scenarios, data)
Y = create_responses(lb_scenarios, data)
Y.shape

(393, 13350)

In [10]:
scenarios_position['lb'][0], scenarios_position['lb'][-1]

(0, 13349)

In [11]:
subscenarios_position['lb'].keys()

dict_keys(['harness_truthfulqa_mc_0', 'harness_gsm8k_5', 'harness_arc_challenge_25', 'harness_hellaswag_10'])

For example, below you can see the scores for MMLU:

In [12]:
Y[:,scenarios_position['lb']], Y[:,scenarios_position['lb']].shape

(array([[0.99999999, 0.99994655, 1.        , ..., 1.        , 0.        ,
         1.        ],
        [0.99984026, 0.99999999, 0.99908489, ..., 1.        , 1.        ,
         1.        ],
        [0.99937792, 0.99999997, 0.99732544, ..., 1.        , 1.        ,
         1.        ],
        ...,
        [0.59677787, 0.99801392, 0.20934594, ..., 0.        , 0.        ,
         0.        ],
        [0.67288482, 0.99814252, 0.3865014 , ..., 0.        , 1.        ,
         1.        ],
        [0.42345796, 0.99926741, 0.82057698, ..., 0.        , 0.        ,
         0.        ]]),
 (393, 13350))

In [13]:
# count nan values in Y
np.sum(np.isnan(Y))

104

In [14]:
# fill nan values with 0
Y[np.isnan(Y)] = 0

# print stats of Y
print('Y stats:')
print('min:', np.min(Y))
print('max:', np.max(Y))
print('mean:', np.mean(Y))
print('std:', np.std(Y))

Y stats:
min: 0.0
max: 1.0000000000000002
mean: 0.7231160348021274
std: 0.443520043003996


In [15]:
subscenarios_position['lb'].keys()

dict_keys(['harness_truthfulqa_mc_0', 'harness_gsm8k_5', 'harness_arc_challenge_25', 'harness_hellaswag_10'])

In [16]:
for s in subscenarios_position.keys():
    for k in subscenarios_position[s].keys():
        print(s, k, subscenarios_position[s][k][0], subscenarios_position[s][k][-1])

lb harness_truthfulqa_mc_0 0 816
lb harness_gsm8k_5 817 2135
lb harness_arc_challenge_25 2136 3307
lb harness_hellaswag_10 3308 13349


For scenarios that have multiple subscenarios, it is usually the case that we want to give equal importance to individual subscenarios when computing the aggregated performance in that scenario. This is equivalent to using a weighted average when computing the aggregated performance. We will create `balance_weights`, a vector of weights to help us compute those weighted averages. These weights will be different than one only for MMLU, which is the only scenario with multiple subscenarios.

In [17]:
balance_weights = np.ones(Y.shape[1])

N = len(scenarios_position['lb'])
n_sub = len(lb_scenarios['lb'])
for sub in lb_scenarios['lb']:
    n_i = len(subscenarios_position['lb'][sub])
    balance_weights[subscenarios_position['lb'][sub]] = N/(n_sub*n_i)  

We can see below that first averaging within subscenarios and then computing a simple average is equivalent to using a weighted average from the beginning:

In [18]:
accs1 = np.mean([Y[:,subscenarios_position['lb'][sub]].mean(axis=1) for sub in lb_scenarios['lb']], axis=0)
accs2 = (balance_weights*Y)[:,scenarios_position['lb']].mean(axis=1)

np.abs(accs1 - accs2).mean()

1.1356762293373007e-14

## Create IRT Model for whole response dataset

In [32]:
def validate_IRT_dimension(Y_bin_train, Y_train, scenarios, Ds, device, epochs, lr, scenarios_position, subscenarios_position):
    val_ind = list(range(0,Y_bin_train.shape[0],5)) # Validation indices
    train_ind = [i for i in range(Y_bin_train.shape[0]) if i not in val_ind]

    # Saving the training dataset in the needed format
    create_irt_dataset(Y_bin_train[train_ind], 'data/irt_val_dataset.jsonlines')

    # Trying different Ds
    errors = []  
    errors2 = []

    for D in tqdm(Ds):
        dataset_name = 'data/irt_val_dataset.jsonlines'
        model_name = 'data/irt_val_model/'
        
        # Load trained IRT model parameters
        train_irt_model(dataset_name, model_name, D, lr, epochs, device)
        A, B, Theta = load_irt_parameters(model_name)
        
        # Determine seen and unseen items for validation
        seen_items = list(range(0, Y_bin_train.shape[1], 2))
        unseen_items = list(range(1, Y_bin_train.shape[1], 2))

        # Estimate ability parameters for the validation set
        thetas = [estimate_ability_parameters(Y_bin_train[val_ind][j][seen_items], A[:, :, seen_items], B[:, :, seen_items]) for j in range(len(val_ind))]

        # Compute validation errors for each scenario and update the errors list (in the end, we give the same weight for all scenarios)
        errors2.append([])
        for scenario in scenarios.keys():
            balance_weights = np.ones(Y.shape[1])

            selected_subscenarios = scenarios[scenario]

            N = len(scenarios_position)
            n_sub = len(selected_subscenarios)
            for sub in selected_subscenarios:
                n_i = len(subscenarios_position[scenario][sub])
                balance_weights[subscenarios_position[scenario][sub]] = N/(n_sub*n_i) 
            
            ind = [u for u in unseen_items if u in scenarios_position[scenario]]
            errors2[-1].append(np.mean([abs((balance_weights*item_curve(thetas[j], A, B))[0,ind].mean()-Y_train[val_ind][j,ind].mean())for j in range(len(val_ind))]))
        errors.append(np.mean(errors2[-1]))

    return errors, errors2, Ds

def train_IRT_for_scenarios(Y_bin_train, Y_train, all_scenarios, scenarios_position, subscenarios_position):
    Ds = [15,] #2,5,10,] # Dimensions to try TODO: uncomment the rest of the dimensions
    device = 'cuda' # Either 'cuda' or 'cpu' 
    epochs = 2000  # Number of epochs for IRT model training (py-irt default is 2000)
    lr = .1  # Learning rate for IRT model training (py-irt default is .1)

    errors, errors2, Ds = validate_IRT_dimension(Y_bin_train, Y_train, all_scenarios, Ds, device, epochs, lr, scenarios_position, subscenarios_position)

    ind_D = np.argmin(np.array(errors))
    D = Ds[ind_D]

    create_irt_dataset(Y_bin_train, 'data/irt_dataset.jsonlines')

    train_irt_model(dataset_name='data/irt_dataset.jsonlines', 
                model_name=f'data/irt_model_lb/', 
                D=D, lr=lr, epochs=epochs, device=device)
    
    return errors, errors2

In [20]:
Y_test = Y[:100]
Y_train = Y[100:]

Y_bin_train = np.zeros(Y_train.shape)
Y_bin_test = np.zeros(Y_test.shape)

cs = np.linspace(0.01,.99,100)  # Threshold values to consider
for scenario in tqdm(lb_scenarios.keys()):
    ind = scenarios_position[scenario]
    # Find the best threshold value that minimizes the difference between averages
    c = cs[np.argmin([np.mean((np.abs((Y_train[:,ind]>c).mean(axis=1)-Y_train[:,ind].mean(axis=1)))) for c in tqdm(cs)])]
    # Apply the threshold to train and test responses
    Y_bin_train[:,ind] = (Y_train[:,ind]>c).astype(int)
    Y_bin_test[:,ind] = (Y_test[:,ind]>c).astype(int)

100%|██████████| 100/100 [00:01<00:00, 63.01it/s]
100%|██████████| 1/1 [00:01<00:00,  1.62s/it]


In [39]:
lb_scenarios

{'lb': ['harness_truthfulqa_mc_0',
  'harness_gsm8k_5',
  'harness_arc_challenge_25',
  'harness_hellaswag_10']}

In [40]:
all_scenarios = {
    'truthfulqa': ['harness_truthfulqa_mc_0'],
    'hellaswag': ['harness_hellaswag_10'],
    'arc': ['harness_arc_challenge_25'],
    'gsm8k': ['harness_gsm8k_5']
}

scenarios_pos = {}
subs_position = {}
for s in lb_scenarios['lb']:
    scenarios_pos[lb_scenario_to_readable[s]] = subscenarios_position['lb'][s]
    subs_position[lb_scenario_to_readable[s]] = {s: subscenarios_position['lb'][s]}

In [29]:
subs_position.keys()

dict_keys(['truthfulqa', 'gsm8k', 'arc', 'hellaswag'])

In [41]:
err, err2 = train_IRT_for_scenarios(Y_bin_train, Y_train, all_scenarios, scenarios_pos, subs_position)

  0%|          | 0/1 [00:00<?, ?it/s]

[18:32:55] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=15 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_val_dataset.jsonlines                  cli.py:111
           output directory: data/irt_val_model/                      cli.py:112
[18:32:56] amortized: False                                       dataset.py:112
[18:33:03] Vocab size: None                                       training.py:90
[18:33:04] Training Model...                                          cli.py:116
[18:33:04] args: {'device': 'cuda', 'num_items': 13350,          training.py:134
           'num_subjects': 234}                                                 
           Parsed Model Args: {'device': 'cuda', 'num_items':    training.py:147
           13350, 'num_subje

100%|██████████| 1/1 [01:32<00:00, 92.45s/it]


[18:34:31] config: model_type='multidim_2pl' epochs=2000              cli.py:109
           priors='hierarchical' initializers=[] dims=15 lr=0.1                 
           lr_decay=0.9999 dropout=0.5 hidden=100 vocab_size=None               
           log_every=200 seed=42 deterministic=True                             
           data_path: data/irt_dataset.jsonlines                      cli.py:111
           output directory: data/irt_model_lb/                       cli.py:112
[18:34:32] amortized: False                                       dataset.py:112
[18:34:41] Vocab size: None                                       training.py:90
[18:34:42] Training Model...                                          cli.py:116
[18:34:42] args: {'device': 'cuda', 'num_items': 13350,          training.py:134
           'num_subjects': 293}                                                 
           Parsed Model Args: {'device': 'cuda', 'num_items':    training.py:147
           13350, 'num_subje

In [51]:
err, err2

([0.5087422991787031],
 [[0.4745979047501622,
   0.7885986845537435,
   0.5788823331096964,
   0.19289027430121009]])

## Getting and using anchor points

Let's split the data in train and test (recent models are placed in the test set):

In [42]:
Y_test = Y[:100]
Y_train = Y[100:]

In [43]:
(balance_weights*Y_train)[:,scenarios_position['lb']].mean(axis=1).max()

0.6547273254796847

The variable `number_item` gives the number of anchor points we want to find in each scenario:

The variable `clustering` specified how the clusting is run. If `clustering="correct."`, then correctness is used. On the other hand, if `clustering="irt"`, then the IRT embeddings for examples are used.

Computing anchor points and their weights for each scenario:

In [56]:
def compute_anchor_points(clustering, selected_scenarios, scenarios_position, balance_weights, Y_train, number_item, random_state):
    anchor_points = {}
    anchor_weights = {}

    for scenario in selected_scenarios:

        if clustering=='correct.':
            X = Y_train[:,scenarios_position[scenario]].T
        elif clustering=='irt':
            A, B, _ = load_irt_parameters(f'data/irt_model_lb/')
            X = np.vstack((A.squeeze(), B.squeeze().reshape((1,-1)))).T
            X = X[scenarios_position[scenario]]
        else:
            raise NotImplementedError
            
        #Normalizing balance_weights, so their sum is one within each scenario
        norm_balance_weights = balance_weights[scenarios_position[scenario]]
        norm_balance_weights /= norm_balance_weights.sum()

        # Fitting the KMeans model
        kmeans = KMeans(n_clusters=number_item, n_init="auto", random_state=random_state)
        kmeans.fit(X, sample_weight=norm_balance_weights)

        # Calculating anchor points
        anchor_points[scenario] = pairwise_distances(kmeans.cluster_centers_, X, metric='euclidean').argmin(axis=1)

        # Calculating anchor weights
        anchor_weights[scenario] = np.array([np.sum(norm_balance_weights[kmeans.labels_==c]) for c in range(number_item)])

    return anchor_points, anchor_weights

def estimate_lambdas(err, Y_train, scenarios_position, number_item):
    lambds = {} 

    for i,scenario in tqdm(enumerate(scenarios_position.keys())):
        v = np.var(Y_train[:,scenarios_position[scenario]], axis=1).mean()
        b = np.mean(err[i]) 
        lambds[scenario] = get_lambda(b, v/(4*number_item))

    return lambds

def get_lambda(b, v):
    return (b**2)/(v+(b**2))

In [45]:
subscenarios_position['lb']['harness_hellaswag_10'][-1]

13349

In [46]:
scenarios_position

{'lb': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157

In [47]:
err, err2

([0.5087422991787031],
 [[0.4745979047501622,
   0.7885986845537435,
   0.5788823331096964,
   0.19289027430121009]])

In [48]:
subscenarios_position['lb'].keys()

dict_keys(['harness_truthfulqa_mc_0', 'harness_gsm8k_5', 'harness_arc_challenge_25', 'harness_hellaswag_10'])

Saving

In [67]:
clustering = 'irt' # 'correct.' or 'irt'

for s_displ in range(10):
    for anchor_num in [10, 15, 20, 30, 40, 50, 100]:
        all_anchor_points = {}
        all_anchor_weights = {}
        
        for scenario in lb_scenarios['lb']:
            scenario_pos = {scenario: subscenarios_position['lb'][scenario]}
            anchor_points, anchor_weights = compute_anchor_points(clustering, [scenario], scenario_pos, balance_weights, Y_train, anchor_num, random_state+s_displ)
            all_anchor_points[scenario] = anchor_points[scenario]
            all_anchor_weights[scenario] = anchor_weights[scenario]

        # save to tinybenchmark_lb file, putting 'seen_examples', 'examples_weights', 'irt_parameters', 'scenarios_position', 'subscenarios_position', 'optimal_lambdas'
        tinybenchmark_lb = {'seen_examples':all_anchor_points,
                            'examples_weights':all_anchor_weights,
                            'scenarios_position':scenarios_position,
                            'subscenarios_position':subscenarios_position,
                            }

        with open(f'data/tinybenchmark_rep/anchor_{anchor_num}_{random_state+s_displ}.pickle', 'wb') as handle:
            pickle.dump(tinybenchmark_lb, handle, protocol=pickle.HIGHEST_PROTOCOL)

        all_anchors = []
        all_weights = []
        for k in all_anchor_points.keys():
            all_anchors.append(all_anchor_points[k])
            all_weights.append(all_anchor_weights[k])

        all_anchors = np.hstack(all_anchors)
        all_weights = np.hstack(all_weights)

        A, B, Theta = load_irt_parameters(f'data/irt_model_lb/')
        ind_D = np.argmin(np.array(err))

        scenarios_pos = {}
        subs_position = {}
        for s in lb_scenarios['lb']:
            scenarios_pos[lb_scenario_to_readable[s]] = subscenarios_position['lb'][s]
            subs_position[lb_scenario_to_readable[s]] = {s: subscenarios_position['lb'][s]}
            all_anchor_weights[lb_scenario_to_readable[s]] = all_anchor_weights.pop(s)

        optimal_lambdas = estimate_lambdas(err2[ind_D], Y_train, scenarios_pos, anchor_num)

        with open(f'data/tinybenchmark_rep/tinybenchmark_{anchor_num}_{random_state+s_displ}.pickle', 'wb') as handle:
            pickle.dump({'lb': {'seen_examples': all_anchors, 
                                'examples_weights': all_anchor_weights, 
                                'irt_parameters': {'A': A, 'B': B}, 
                                'scenarios_position': scenarios_pos, 
                                'subscenarios_position': subs_position, 
                                'optimal_lambdas': optimal_lambdas}
                            }, handle, protocol=pickle.HIGHEST_PROTOCOL)

4it [00:00, 226.82it/s]
4it [00:00, 238.49it/s]
4it [00:00, 238.76it/s]
4it [00:00, 241.34it/s]
4it [00:00, 237.12it/s]
4it [00:00, 239.30it/s]
4it [00:00, 250.10it/s]
4it [00:00, 231.18it/s]
4it [00:00, 232.24it/s]
4it [00:00, 256.22it/s]
4it [00:00, 244.47it/s]
4it [00:00, 254.68it/s]
4it [00:00, 241.20it/s]
4it [00:00, 251.30it/s]
4it [00:00, 257.48it/s]
4it [00:00, 246.99it/s]
4it [00:00, 241.92it/s]
4it [00:00, 256.06it/s]
4it [00:00, 251.80it/s]
4it [00:00, 247.73it/s]
4it [00:00, 253.67it/s]
4it [00:00, 233.37it/s]
4it [00:00, 239.56it/s]
4it [00:00, 256.23it/s]
4it [00:00, 250.82it/s]
4it [00:00, 240.32it/s]
4it [00:00, 250.91it/s]
4it [00:00, 246.78it/s]
4it [00:00, 256.20it/s]
4it [00:00, 239.24it/s]
4it [00:00, 244.87it/s]
4it [00:00, 219.11it/s]
4it [00:00, 244.90it/s]
4it [00:00, 254.10it/s]
4it [00:00, 259.78it/s]
4it [00:00, 190.48it/s]
4it [00:00, 220.79it/s]
4it [00:00, 232.94it/s]
4it [00:00, 239.78it/s]
4it [00:00, 248.91it/s]
4it [00:00, 234.47it/s]
4it [00:00, 184.

In [59]:
subs_position['truthfulqa']

{'harness_truthfulqa_mc_0': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154

Checking results

In [60]:
tinybenchmark_lb['seen_examples']

{'harness_truthfulqa_mc_0': array([541, 596, 632, 810, 231, 500, 667, 357, 660, 548, 746, 222, 159,
         60,  52, 703, 459, 435, 735, 653, 171,  89,   7, 268, 198, 220,
        190, 614, 255, 524, 139, 425, 346, 727, 635,   9, 261, 476, 310,
        405, 223, 491, 694, 133, 805, 561, 671, 568,  31, 285,  71, 188,
        351,  93, 196, 452, 615, 329,  39, 130, 443, 765,  59, 336, 565,
        609,  85, 309, 257, 465, 610, 354, 716, 670, 802,  68, 337, 599,
        471, 633, 725, 742, 717, 157, 560,  64, 755, 480, 680, 816, 684,
        175, 415, 704, 164, 797, 693, 747, 103, 135]),
 'harness_gsm8k_5': array([ 494, 1257, 1101,  696, 1187,  680, 1058, 1233,  406,  569,  915,
        1304,    7, 1020,  504,  255,  949,   32,  299, 1074,  349,  560,
        1141,  445,  377,  803, 1183,  139,  426,  314,  921,  154,  240,
         448,  801,  727,  924,  753, 1184,  834, 1242, 1259,  513,  789,
         131,  435,  674,  490,  611,  591,  817,  956,  106,  198,  321,
         543,  936

In [61]:
tinybenchmark_lb['seen_examples'].keys()

dict_keys(['harness_truthfulqa_mc_0', 'harness_gsm8k_5', 'harness_arc_challenge_25', 'harness_hellaswag_10'])

In [62]:
tinybenchmark_lb['seen_examples'][scenario] + tinybenchmark_lb['subscenarios_position']['lb'][scenario][0]

array([ 4121, 10073, 12711, 10200,  9128,  3384, 12921,  3448,  5922,
        9937, 12486,  8781, 13213, 13315,  6112, 10577,  8533,  3648,
       13034,  7044, 12041, 13148,  5577,  7866,  6446, 12273,  7030,
        5402, 11687,  9829,  6852, 10344,  8628,  5995,  4969, 12955,
        9880, 11279, 11173, 12967, 10450, 10635,  7180,  8778,  6550,
        4796, 11795,  9045,  6144,  4636,  4213, 11811,  8278,  8544,
        8849,  5817,  3737, 12186,  7650,  3552,  8668,  7000,  3657,
        9578,  5094,  8749,  7316,  4050,  6085, 12852,  4774, 12202,
        3643,  8789,  5169, 13212,  7560,  5123,  8298,  9303, 11780,
       11155,  4045,  6183,  4692,  4101, 11962,  9231, 11809,  8861,
       10578,  3583,  8500,  8907, 10310, 10269, 11929,  7912,  5652,
        6755])

Using anchor points to estimate performance in the test set and reporting the average prediction error

In [63]:
for scenario in tinybenchmark_lb['seen_examples'].keys():
    Y_anchor = Y_test[:,subscenarios_position['lb'][scenario]][:,all_anchor_points[scenario]]
    Y_hat = (Y_anchor*all_anchor_weights[scenario]).sum(axis=1)
    Y_true = (Y_test)[:,subscenarios_position['lb'][scenario]].mean(axis=1)

    print(f"scenario: {scenario}, avg. error: {np.abs(Y_hat-Y_true).mean():.3f}")

KeyError: 'harness_truthfulqa_mc_0'