# Catwalk LLM evaluations

## Prerequisites

Follow the README to check out the repo, set up Beaker secrets for GitHub/Google Sheets, install gantry, etc.

Run this notebook in the associated ai2-olmo-eval conda environment.

## Local setup (edit on first-time use)

In [12]:
# ---- Per-user settings: update every value tagged "CHANGE THIS" before first use ----

# NOTE: CHANGE THIS to your local repo path!
# Directory of the local repository checkout that gantry will be run from.
MY_REPO_DIR = "/Users/yulingg/desktop/llm-eval-yulingg/ai2-olmo-eval/"

# Default Catwalk options.
MY_DEFAULT_CATWALK_OPTIONS = {
    # NOTE: CHANGE THIS to the google sheet you want to write to!
    "gsheet": "Catwalk_Evaluation_yulingg",
    "split": "validation",
    "batch-size": 32,
    "random-subsample-seed": 1234,
    "model-max-length": 2048,
    "max-batch-tokens": 20480,
}

# Default gantry options.
MY_DEFAULT_GANTRY_OPTIONS = {
    # NOTE: CHANGE THIS to the beaker workspace you want to use!
    "workspace": "ai2/yulingg-llm-eval",
    # Keep as-is.
    "beaker-image": "oyvindt/ai2-olmo-eval-image",
    # Keep as-is.
    "beaker-dataset-cache": "/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache",
    # Run on cirrascale machines for the dataset cache to work.
    "cluster": "ai2/aristo-cirrascale",
}


## Load code dependencies (run first)

In [13]:
import os
import sys
# Put <MY_REPO_DIR>/ai2_internal at the front of the module search path.
# This must run before the imports below (presumably that directory is where
# utils_internal, task_library and model_library live -- confirm in the repo).
sys.path.insert(0, os.path.join(MY_REPO_DIR, "ai2_internal"))

# utils_internal is imported both as a module (so its attributes can be
# overridden below) and via wildcard (so its helpers are callable unqualified).
import utils_internal
from utils_internal import *
from task_library import *
from model_library import *

# Override the module-level settings of utils_internal with the per-user
# values from the "Local setup" cell. These assignments must come after the
# import and before any helper that reads them is called.
utils_internal.REPO_DIR = MY_REPO_DIR
utils_internal.DEFAULT_CATWALK_OPTIONS = MY_DEFAULT_CATWALK_OPTIONS
utils_internal.DEFAULT_GANTRY_OPTIONS = MY_DEFAULT_GANTRY_OPTIONS

## Simple examples

Specify model and task directly:

In [4]:
# Model and task are described inline as plain dicts.
# "task" holds two task names separated by a space.
model_specs = {"name": "EleutherAI/pythia-160m", "checkpoint": "step140000"}
task_specs = {"task": "arc_challenge arc_easy", "limit": 1000, "num-shots": 0}

# Launch the run; the saved output below shows the "Running <name>" line it printed.
res = run_catwalk(model_specs, task_specs)

Running lmeval-pythia-160m-step140000-f0f04321b8


Use model and task library:

In [5]:
# Same call as the inline example, but the specs are looked up by key in
# MODEL_SPECS / TASK_SPECS. Note that `res` from the previous cell is overwritten.
res = run_catwalk(MODEL_SPECS["pythia-160m-step140000"], TASK_SPECS["rc20_n0_val1000"])

Running lmeval-pythia-160m-step140000-ee8f7f6d24


## Sample data analysis from google sheet

Load the Google Sheet (requires either the GDRIVE_SERVICE_ACCOUNT_JSON environment variable to be set OR an appropriate `auth_file` to be supplied).

In [14]:
# Load the sheet named by the "gsheet" entry of MY_DEFAULT_CATWALK_OPTIONS.
# NOTE: the auth_file below is a user-specific absolute path -- CHANGE THIS to
# the location of your own downloaded credentials file.
all_res = load_gsheet_as_df(MY_DEFAULT_CATWALK_OPTIONS['gsheet'], auth_file="/Users/yulingg/desktop/llm-eval-yulingg/downloaded_credentials_file.json")

In [15]:
# Display the loaded results table (the saved output shows it truncated in the middle).
all_res

Unnamed: 0,date,model,model_kwargs,full_model,task,primary_metric,metric,processing_time,num_instances,model_max_length,max_batch_tokens,batch_size,limit,split,random_subsample_seed,num_shots,unconditioned_prompt,all_metrics,beaker_id
0,2024-01-09 05:12:46 UTC,llama2-7b,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=llama2-7b,social_iqa,acc_uncond,0.480000,100.451221,1000,2048,4096,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.465, ""predicted_indices_raw"": [[...",01HKP9VR07QZZ8HCJM1NP4QV96
1,2024-01-09 05:12:46 UTC,llama2-7b,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=llama2-7b,csqa,acc_uncond,0.629000,110.525418,1000,2048,4096,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.587, ""predicted_indices_raw"": [[...",01HKP9VR07QZZ8HCJM1NP4QV96
2,2024-01-09 05:12:58 UTC,llama-7b,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=llama-7b,social_iqa,acc_uncond,0.489000,100.337260,1000,2048,4096,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.469, ""predicted_indices_raw"": [[...",01HKP9VJ3VAERXS3X91X12RS42
3,2024-01-09 05:12:58 UTC,llama-7b,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=llama-7b,csqa,acc_uncond,0.626000,110.408994,1000,2048,4096,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.589, ""predicted_indices_raw"": [[...",01HKP9VJ3VAERXS3X91X12RS42
4,2024-01-09 15:12:24 UTC,EleutherAI/pythia-160m,"{'revision': 'step140000', 'trust_remote_code'...",lm::pretrained=EleutherAI/pythia-160m,arc_challenge,acc_uncond,0.284281,3.800181,299,2048,20480,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.21070234113712374, ""predicted_in...",01HKQC8C9J6GSJTZ36GR3D5QBD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,2024-01-10 09:21:14 UTC,olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=olmo-7b-v1_5-mix-mitch-ish-mosa...,wsc,acc_raw,0.375000,12.721434,104,2048,4096,32,1000,validation,1234,0,,"{""acc_raw"": 0.375, ""predicted_indices_raw"": [[...",01HKS6YZAHS5EY2RC43FBSZ07F
354,2024-01-10 14:37:41 UTC,olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=olmo-7b-v1_5-mix-mitch-ish-mosa...,social_iqa,acc_uncond,0.465000,97.686666,1000,2048,4096,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.457, ""predicted_indices_raw"": [[...",01HKSW4Y5X6R5VJ6ZPZ6HSJQFR
355,2024-01-10 14:37:41 UTC,olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf,"{'revision': None, 'trust_remote_code': False}",lm::pretrained=olmo-7b-v1_5-mix-mitch-ish-mosa...,csqa,acc_uncond,0.618000,108.669518,1000,2048,4096,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.564, ""predicted_indices_raw"": [[...",01HKSW4Y5X6R5VJ6ZPZ6HSJQFR
356,2024-01-23 02:44:54 UTC,EleutherAI/pythia-160m,"{'revision': 'step140000', 'trust_remote_code'...",lm::pretrained=EleutherAI/pythia-160m,arc_challenge,acc_uncond,0.284281,4.655930,299,2048,20480,32,1000,validation,1234,0,Answer:,"{""acc_raw"": 0.21070234113712374, ""predicted_in...",01HMT2Y51AZECJQ7J996GCTFBE


In [16]:
import ast

import matplotlib
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# # Sample data
# data = np.random.rand(10, 5)  # 10x5 matrix of random numbers between 0 and 1
# df = pd.DataFrame(data)

# Use the built-in "PiYG" colormap as the base palette; its entries are
# lightened further down in this cell before being used for cell backgrounds.
cmap = plt.colormaps.get_cmap('PiYG')

# Function to blend color with white
def blend_with_white(color, factor=0.5):
    """Lighten a color by linearly blending it toward white.

    Args:
        color: Sequence of color components on a 0-1 scale, e.g. the RGBA
            tuple returned by calling a matplotlib colormap, or an RGB triple.
        factor: Weight given to white. 0 returns the color unchanged, 1
            returns pure white.

    Returns:
        numpy.ndarray of floats with the same shape as ``color``.
    """
    color_arr = np.asarray(color, dtype=float)
    # Size "white" from the input so RGB triples work as well as RGBA tuples;
    # a fixed 4-component white cannot broadcast against a 3-component color.
    white = np.ones_like(color_arr)
    # Blending with white is a weighted average of white and the color.
    return white * factor + color_arr * (1 - factor)

# Adjust the colormap: blend each of the base colormap's cmap.N entries 30%
# toward white, then build a new colormap of the same size from the result.
new_colors = [blend_with_white(cmap(i), 0.3) for i in range(cmap.N)]
new_cmap = mcolors.LinearSegmentedColormap.from_list("adjusted_PiYG", new_colors, N=cmap.N)

# If taking value from 0 to 1
# # Function to apply the color mapping
# def colorize(val):
#     color = new_cmap(val)
#     return f'background-color: {matplotlib.colors.rgb2hex(color)}'

# Function to apply the color mapping
# If taking value from 0 to 100
def colorize(val):
    """Return the CSS background-color declaration for a value on a 0-100 scale.

    Args:
        val: A number or numeric string in the 0-100 range, or the literal
            string "NA", which is colored as if it were 0.

    Returns:
        str: ``'background-color: <hex>'`` using the color that ``new_cmap``
        assigns to ``val / 100``.
    """
    if val == "NA":
        fraction = 0.0
    else:
        # Rescale from [0, 100] to the [0, 1] range the colormap is called with
        fraction = float(val) / 100
    hex_color = matplotlib.colors.rgb2hex(new_cmap(fraction))
    return f'background-color: {hex_color}'

In [17]:
# Define the subset of models and tasks you want to display.
# Candidates are the sorted unique model names (nulls dropped), so the
# selection order is the same on every kernel restart; iterating over a set of
# strings gives an order that can change from run to run.
candidate_models = sorted(all_res['model'].dropna().unique())
models_to_plot = [
    model_name
    for model_name in candidate_models
    if "olmo" not in model_name and "7b" in model_name  # e.g., focus on 7B models
]
# Add the single OLMo checkpoint to compare against the other 7B models
models_to_plot += ["olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf"]
# models_to_plot = [('metric', item.capitalize()) for item in models_to_plot] # match models_by_task.columns format
print(models_to_plot)

# The "task" entry of a task spec is a space-separated string of task names
tasks_to_plot = TASK_SPECS["rc_plus_n0_val1000"]["task"].split() # e.g., focus on the tasks of the rc_plus suite
print(tasks_to_plot)

['tiiuae/falcon-rw-7b', 'llama2-7b', 'HuggingFaceH4/zephyr-7b-beta', 'llama-7b', 'Salesforce/xgen-7b-4k-base', 'tiiuae/falcon-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b', 'Salesforce/xgen-7b-8k-inst', 'olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf']
['arc_challenge', 'arc_easy', 'boolq', 'copa', 'headqa_en', 'hellaswag', 'logiqa', 'mathqa', 'mrpc', 'openbookqa', 'piqa', 'qnli', 'qqp', 'rte', 'sciq', 'sst', 'wic', 'winogrande', 'wnli', 'wsc', 'social_iqa', 'csqa']


In [18]:
# Filter the DataFrame to include only the selected models and tasks.
# The explicit .copy() makes the filtered frame independent of all_res, so
# assigning columns on it later does not raise pandas' SettingWithCopyWarning
# and all_res itself is never modified.
rows_to_keep = all_res['model'].isin(models_to_plot) & all_res['task'].isin(tasks_to_plot)
processed_model_name_res = all_res.loc[rows_to_keep].copy()

def get_model_name_with_revision(row):
    """Build the display name for one results row.

    The name is the part of ``row['model']`` after the last "/", passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
    with ``-<revision>`` appended when ``row['model_kwargs']`` carries a
    non-None ``'revision'`` entry.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with a ``'model'`` string
            and a ``'model_kwargs'`` entry that is either a dict or the string
            representation of a dict literal.

    Returns:
        str: The display name.

    Raises:
        ValueError, SyntaxError: If ``model_kwargs`` is a non-blank string that
            is not a valid Python literal.
    """
    # Split the "model" name at "/", take whatever is after it, and capitalize it
    model_name = row['model'].split("/")[-1].capitalize()

    raw_kwargs = row['model_kwargs']
    if isinstance(raw_kwargs, str) and raw_kwargs.strip():
        # The value comes from a spreadsheet cell. literal_eval only accepts
        # Python literals, so cell content can never execute code (eval could).
        model_kwargs = ast.literal_eval(raw_kwargs.strip())
    else:
        model_kwargs = raw_kwargs
    if not isinstance(model_kwargs, dict):
        # Blank cell, missing value, or a non-dict literal: no kwargs to read
        model_kwargs = {}

    # Extract and concatenate the revision if it exists
    revision = model_kwargs.get('revision')
    if revision is not None:
        model_name += f"-{revision}"
    return model_name

# Replace each row's model with its display name (model name plus revision).
# assign() builds a new frame instead of writing into the filtered one, so no
# SettingWithCopyWarning is raised.
processed_model_name_res = processed_model_name_res.assign(
    model=processed_model_name_res.apply(get_model_name_with_revision, axis=1)
)
# Remove duplicate models and task eval, keeping the first instance of each set of such cases
all_res_dedup = processed_model_name_res[~processed_model_name_res.duplicated(subset=['full_model', 'model_kwargs', 'task', 'metric'], keep='first')]

# Rows - task, Columns - model.
# Take the first score per (task, model) rather than summing: the
# de-duplication above only drops rows whose metric value is identical, so
# repeated runs with differing scores survive it, and sum() would add those
# scores together (or concatenate them if the sheet delivered strings).
models_by_task = (
    all_res_dedup.loc[:, ['model', 'task', 'metric']]
    .groupby(['task', 'model'])
    .first()
    .unstack('model')
)

# Move Olmo model columns to be behind
word_to_move = "Olmo"
# Create a boolean mask that is True for columns starting with the word
# (level 1 of the column index holds the model display names)
mask = models_by_task.columns.get_level_values(1).str.startswith(word_to_move)
# Select columns that do NOT start with the word, followed by columns that DO
cols_without_word = models_by_task.columns[~mask]
cols_with_word = models_by_task.columns[mask]
# Combine the columns in the desired order and reindex the DataFrame
new_column_order = cols_without_word.tolist() + cols_with_word.tolist()
models_by_task = models_by_task.reindex(columns=new_column_order)

# Map to 0 to 100: scores become strings with two decimals; missing or
# non-numeric scores are coerced to NaN and rendered as "0.00"
models_by_task = models_by_task.apply(pd.to_numeric, errors='coerce')
models_by_task = models_by_task.apply(lambda x: x.map(lambda y: '{:.2f}'.format(y*100) if pd.notnull(y) else '{:.2f}'.format(0.0)))

# Remove the top header row that says "metric"
models_by_task.columns = models_by_task.columns.droplevel(0)

In [19]:
# Display the task-by-model table of scores (formatted on a 0-100 scale)
models_by_task

model,Falcon-7b,Falcon-rw-7b,Llama-7b,Llama2-7b,Mpt-7b,Mpt-7b-instruct,Xgen-7b-4k-base,Xgen-7b-8k-inst,Zephyr-7b-beta,Olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
arc_challenge,47.49,43.14,44.48,48.49,46.49,46.15,45.82,47.83,57.86,48.49
arc_easy,70.35,65.09,67.89,69.47,70.53,70.0,67.02,67.72,79.65,65.44
boolq,74.6,70.2,75.4,80.2,74.2,73.4,73.6,77.4,86.6,73.4
copa,86.0,87.0,91.0,86.0,85.0,90.0,80.0,80.0,88.0,90.0
csqa,64.6,61.2,62.6,62.9,63.4,65.8,59.3,59.8,62.2,61.8
headqa_en,38.6,36.5,38.7,39.5,37.4,38.3,40.8,38.9,47.2,37.3
hellaswag,75.9,73.3,76.2,76.8,77.6,77.5,67.2,76.2,82.1,76.4
logiqa,23.66,21.81,19.51,26.11,22.89,23.5,22.89,24.88,32.57,23.35
mathqa,30.0,27.5,30.2,31.6,28.9,28.9,27.8,28.6,40.0,26.6
mrpc,62.75,39.95,68.63,69.12,67.65,68.14,52.7,41.67,71.81,68.38


In [20]:
# Apply the coloring to the DataFrame: colorize is called on each cell value
# and returns the CSS background-color declaration for that cell
styled_df = models_by_task.style.map(colorize)

# Display the DataFrame
styled_df

model,Falcon-7b,Falcon-rw-7b,Llama-7b,Llama2-7b,Mpt-7b,Mpt-7b-instruct,Xgen-7b-4k-base,Xgen-7b-8k-inst,Zephyr-7b-beta,Olmo-7b-v1_5-mix-mitch-ish-mosaic-step557000-hf
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
arc_challenge,47.49,43.14,44.48,48.49,46.49,46.15,45.82,47.83,57.86,48.49
arc_easy,70.35,65.09,67.89,69.47,70.53,70.0,67.02,67.72,79.65,65.44
boolq,74.6,70.2,75.4,80.2,74.2,73.4,73.6,77.4,86.6,73.4
copa,86.0,87.0,91.0,86.0,85.0,90.0,80.0,80.0,88.0,90.0
csqa,64.6,61.2,62.6,62.9,63.4,65.8,59.3,59.8,62.2,61.8
headqa_en,38.6,36.5,38.7,39.5,37.4,38.3,40.8,38.9,47.2,37.3
hellaswag,75.9,73.3,76.2,76.8,77.6,77.5,67.2,76.2,82.1,76.4
logiqa,23.66,21.81,19.51,26.11,22.89,23.5,22.89,24.88,32.57,23.35
mathqa,30.0,27.5,30.2,31.6,28.9,28.9,27.8,28.6,40.0,26.6
mrpc,62.75,39.95,68.63,69.12,67.65,68.14,52.7,41.67,71.81,68.38
