In [None]:
%load_ext autoreload
%autoreload 2

import figs
import itertools
import seaborn as sns

# Apply seaborn's "whitegrid" theme globally so the figures below share it.
sns.set_theme(style="whitegrid")

# Overview

In [None]:
# Overview figure: HellaSwag loss plotted against FineWeb-Edu validation loss,
# grouped by the "Intervention" column.
# NOTE(review): `df` is not defined in any cell visible here — presumably the
# results table is loaded elsewhere; confirm it exists on a fresh kernel.
filtered_df = df.copy()

# Rows are kept when Intervention is None or exactly one of these 1-tuples.
single_interventions = [
    None,
    ("Architecture",),
    ("Pretraining Data",),
    ("Tokenizer",),
    ("Size",),
    ("Optimizer",),
    ("Context Length",),
]
keep_rows = (
    ~filtered_df["Hugging Face"]
    & ~filtered_df["Overtraining"]
    & (filtered_df["Optimizer"] != "Adam")
    & (filtered_df["Context Length"] != 1024)
    & filtered_df["Intervention"].isin(single_interventions)
)
filtered_df = filtered_df[keep_rows]

# Group key -> display name, listed in the order handed to the plot.
group_labels = {
    "nan": "None",
    "Pretraining Data": "Pretraining Data",
    "Tokenizer": "Tokenizer",
    "Architecture": "Architecture",
    "Size": "Size",
    "Context Length": "Context Length",
    "Optimizer": "Optim. Settings",
}

# Groups that get a fitted curve ("Architecture" is currently excluded).
fitted_groups = [
    "None",
    "Pretraining Data",
    "Tokenizer",
    # "Architecture",
]

figs.plot_intervention(
    filtered_df,
    "FineWeb-Edu Validation Loss",
    {"HellaSwag Test Loss": ["HellaSwag Loss"]},
    "Intervention",
    plot_group="first",
    group_order=list(group_labels),
    group_names=list(group_labels.values()),
    # Seven entries, the same length as the group lists above.
    z_order=[3, 3, 3, 2, 2, 2, 3],
    x_range=(0, 4.5),
    # entropy_df=entropy_df,
    fit_curve=fitted_groups,
    subsample=3,
    title="Loss-to-Loss Scaling",
    verbose=True,
    legend_kwargs={
        "loc": "upper center",
        "bbox_to_anchor": (0.5, -0.2),
        "ncol": 2,
    },
    save_path="fig-overview",
)

# Schematic

In [None]:
# Draw the schematic figure; "fig-schematic" is handed to figs as `save_path`
# (presumably the output file stem — confirm in figs.plot_schematic).
figs.plot_schematic(save_path="fig-schematic")

# General Scaling Trends

In [None]:
# For every (architecture, pretraining dataset) pair present in the filtered
# table, plot a fixed set of validation and test losses against that
# dataset's own "<dataset> Validation Loss" column.
filtered_df = df.copy()
allowed_interventions = [
    None,
    ("Pretraining Data",),
    ("Architecture",),
    ("Architecture", "Pretraining Data"),
]
keep_rows = (
    ~filtered_df["Hugging Face"]
    & ~filtered_df["Overtraining"]
    & filtered_df["Intervention"].isin(allowed_interventions)
)
filtered_df = filtered_df[keep_rows]

archs = filtered_df["Architecture"].unique()
pretrains = filtered_df["Pretraining Data"].unique()


def _legend_spec(title):
    """Build the legend keyword arguments for one panel with the given title."""
    return {
        "loc": "upper center",
        "bbox_to_anchor": (0.5, -0.2),
        "ncol": 1,
        "title": title,
    }


for arch, pretrain in itertools.product(archs, pretrains):
    x_column = f"{pretrain} Validation Loss"

    # Restrict to this pair first, then drop rows whose x-axis loss is >= 4.5.
    pair_mask = (filtered_df["Architecture"] == arch) & (
        filtered_df["Pretraining Data"] == pretrain
    )
    _df = filtered_df[pair_mask]
    _df = _df[_df[x_column] < 4.5]

    models = _df["Name"].unique()
    if len(models) == 0:
        # Nothing trained with this combination; no figure to draw.
        continue
    print(models)

    arch_slug = arch.lower().replace(" ", "-")
    pretrain_slug = pretrain.lower().replace(" ", "-")

    figs.plot_l2l(
        _df,
        x_column,
        {
            "Validation Loss": [
                "The Pile UC Validation Loss",
                "RefineWeb Validation Loss",
                "Slimpajama Validation Loss",
                "C4 Validation Loss",
            ],
            # Not included: CommonSenseQA Loss, Social IQa Loss, MMLU Loss.
            "Test Loss": [
                "ARC-Challenge Loss",
                "ARC-Easy Loss",
                "OpenBookQA Loss",
                "PIQA Loss",
                "COPA Loss",
                "Winogrande Loss",
                "HellaSwag Loss",
            ],
        },
        fit_curves=True,
        # entropy_df=entropy_df,
        titles=["Train-to-Train", "Train-to-Test"],
        legend_kwargs=[
            _legend_spec("Validation Set"),
            _legend_spec("Test Set"),
        ],
        e_min=0.1,
        save_path=f"fig-l2l-all_{arch_slug}_{pretrain_slug}",
        # verbose=True,
    )

# Intervening on Pretraining Data, Tokenizer, and Architecture

In [None]:
# Work on a copy of the results and keep only the rows whose "Overtraining"
# flag is False.
filtered_df = df.copy()
not_overtrained = ~filtered_df["Overtraining"]
filtered_df = filtered_df[not_overtrained]


def tuple_contains_any_combination(row, vals):
    """Tell whether an entry is built only from allowed values.

    Args:
        row: ``None`` or an iterable of values (e.g. a tuple of factor names).
        vals: Container of allowed values; include ``None`` in it to accept a
            ``None`` row.

    Returns:
        ``None in vals`` when ``row`` is ``None``. Otherwise ``True`` only if
        every element of ``row`` is in ``vals`` (an empty ``row`` therefore
        passes), else ``False``.
    """
    if row is None:
        return None in vals
    for item in row:
        if item not in vals:
            return False
    return True


# Keep rows whose Intervention is None or is made up solely of these factors.
allowed_factors = [None, "Pretraining Data", "Tokenizer", "Architecture"]
uses_allowed_factors = filtered_df["Intervention"].apply(
    tuple_contains_any_combination,
    vals=allowed_factors,
)
filtered_df = filtered_df[uses_allowed_factors]

# [Architecture, Tokenizer] pairs handed to the plot, in this order.
matched_settings = [
    ["Llama", "tiktoken"],
    ["Llama", "gpt2"],
    ["Llama", "gpt2-HF"],  # could kick this
    ["Mamba", "tiktoken"],
    ["Mamba", "gpt2"],
    ["GPT", "gpt-neox"],
]

# Panel label -> loss columns; one validation column and one test column.
y_columns = {
    "C4 Validation Loss": ["C4 Validation Loss"],
    "ARC-Easy Test Loss": ["ARC-Easy Loss"],
}

figs.plot_intervention_matched_dims(
    filtered_df,
    "Pretraining Data",
    ["Architecture", "Tokenizer"],
    matched_settings,
    "FineWeb-Edu Validation Loss",
    y_columns,
    ["Train-to-Train", "Train-to-Test"],
    title_x_pos=0.6,
    fit_curve=True,
    x_lim_upper=4.5,
    x_name="FW-Edu Val. Loss",
    # verbose=True,
    legend_kwargs={
        "loc": "upper center",
        "bbox_to_anchor": (0.5, 0),
        "ncol": 5,
    },
    # Saving is currently switched off for this figure:
    # save_path="fig-intervention-matched_pretraining_specific2",
)

In [None]:
# Tokenizer intervention figure: C4 validation loss and ARC-Easy loss plotted
# against FineWeb-Edu validation loss for a list of
# [Architecture, Pretraining Data] pairs.
filtered_df = df.copy()
filtered_df = filtered_df[~filtered_df["Overtraining"]]

# Keep rows whose Intervention is None or is made up solely of these factors.
# The helper comes from `figs`, as in the later cells of this notebook, rather
# than from a second notebook-local definition of the same function.
# NOTE(review): assumes figs.tuple_contains_any_combination handles None and
# tuple entries the same way as the notebook-local version — confirm.
allowed_factors = [None, "Pretraining Data", "Tokenizer", "Architecture"]
filtered_df = filtered_df[
    filtered_df["Intervention"].apply(
        figs.tuple_contains_any_combination,
        vals=allowed_factors,
    )
]

# [Architecture, Pretraining Data] pairs handed to the plot, in this order.
# ["Mamba", "FineWeb-Edu"] is currently left out.
matched_settings = [
    ["Llama", "C4"],
    ["Llama", "FineWeb-Edu"],
    ["Llama", "The Pile UC"],
    ["Mamba", "C4"],
    ["Mamba", "The Pile UC"],
    ["GPT", "The Pile"],
]

# Panel label -> loss columns; one validation column and one test column.
y_columns = {
    "C4 Validation Loss": ["C4 Validation Loss"],
    "ARC-Easy Test Loss": ["ARC-Easy Loss"],
}

figs.plot_intervention_matched_dims(
    filtered_df,
    "Tokenizer",
    ["Architecture", "Pretraining Data"],
    matched_settings,
    "FineWeb-Edu Validation Loss",
    y_columns,
    ["Train-to-Train", "Train-to-Test"],
    title_x_pos=0.6,
    fit_curve=True,
    x_lim_upper=4.5,
    # verbose=True,
    x_name="FW-Edu Val. Loss",
    shorten_col_titles=True,
    legend_kwargs={
        "loc": "upper center",
        "bbox_to_anchor": (0.5, 0),
        "ncol": 4,
    },
    save_path="fig-intervention-matched_tokenizer_specific2",
)

In [None]:
# Architecture intervention figure: C4 validation loss and ARC-Easy loss
# plotted against FineWeb-Edu validation loss for a list of
# [Pretraining Data, Tokenizer] pairs.
filtered_df = df.copy()
not_overtrained = ~filtered_df["Overtraining"]
filtered_df = filtered_df[not_overtrained]

# Keep rows whose Intervention passes figs.tuple_contains_any_combination for
# these values.
allowed_factors = [None, "Pretraining Data", "Tokenizer", "Architecture"]
uses_allowed_factors = filtered_df["Intervention"].apply(
    figs.tuple_contains_any_combination,
    vals=allowed_factors,
)
filtered_df = filtered_df[uses_allowed_factors]

# [Pretraining Data, Tokenizer] pairs handed to the plot, in this order.
matched_settings = [
    ["C4", "gpt2"],
    ["C4", "tiktoken"],
    ["FineWeb-Edu", "gpt2"],
    ["FineWeb-Edu", "tiktoken"],
    ["The Pile UC", "tiktoken"],
    ["The Pile", "gpt-neox"],
]

# Panel label -> loss columns; one validation column and one test column.
y_columns = {
    "C4 Validation Loss": ["C4 Validation Loss"],
    "ARC-Easy Test Loss": ["ARC-Easy Loss"],
}

figs.plot_intervention_matched_dims(
    filtered_df,
    "Architecture",
    ["Pretraining Data", "Tokenizer"],
    matched_settings,
    "FineWeb-Edu Validation Loss",
    y_columns,
    ["Train-to-Train", "Train-to-Test"],
    shorten_col_titles=True,
    title_x_pos=0.6,
    fit_curve=True,
    x_lim_upper=4.5,
    # verbose=True,
    x_name="FW-Edu Val. Loss",
    legend_kwargs={
        "loc": "upper center",
        "bbox_to_anchor": (0.5, 0),
        "ncol": 4,
    },
    save_path="fig-intervention-matched_architecture_specific2",
)

# Model Size

In [None]:
# Model-size intervention figure: the listed test-loss columns, grouped under
# one "Average Test Loss" entry, against FineWeb-Edu validation loss.
filtered_df = df.copy()

# Drop rows flagged "Hugging Face" or "Overtraining" before the helper runs,
# so it only sees the Intervention values of the remaining rows.
filtered_df = filtered_df[~filtered_df["Hugging Face"] & ~filtered_df["Overtraining"]]

# Keep rows whose Intervention passes figs.tuple_contains_any_combination for
# these values.
allowed_factors = [None, "Size", "Pretraining Data", "Architecture"]
filtered_df = filtered_df[
    filtered_df["Intervention"].apply(
        figs.tuple_contains_any_combination,
        vals=allowed_factors,
    )
]

# Key spelled "Average Test Loss" (with the space), in line with
# "Average Validation Loss" used by the optimizer and context-length figures.
# Alternative y-group (not plotted here): "Average Validation Loss" over the
# The Pile UC, RefineWeb, Slimpajama and C4 validation-loss columns.
y_columns = {
    "Average Test Loss": [
        "ARC-Challenge Loss",
        "ARC-Easy Loss",
        "OpenBookQA Loss",
        "PIQA Loss",
        "COPA Loss",
        "Winogrande Loss",
        "HellaSwag Loss",
    ],
}

figs.plot_intervention_size(
    filtered_df,
    "FineWeb-Edu Validation Loss",
    y_columns,
    markersize=5,
    tokenizer="tiktoken",
    # arch="Mamba",
    # pretrain=["The Pile UC", "FineWeb-Edu"],
    save_path="fig-intervention-size_fw_test",
)

# Optimizer

In [None]:
# Optimizer intervention figure: the listed validation-loss columns, grouped
# under one "Average Validation Loss" entry, against FineWeb-Edu validation
# loss.
filtered_df = df.copy()

# Drop rows flagged "Hugging Face" or "Overtraining" before the helper runs,
# so it only sees the Intervention values of the remaining rows.
filtered_df = filtered_df[~filtered_df["Hugging Face"] & ~filtered_df["Overtraining"]]

# Keep rows whose Intervention passes figs.tuple_contains_any_combination for
# these values.
allowed_factors = [None, "Optimizer", "Pretraining Data", "Architecture"]
uses_allowed_factors = filtered_df["Intervention"].apply(
    figs.tuple_contains_any_combination,
    vals=allowed_factors,
)
filtered_df = filtered_df[uses_allowed_factors]

# Row-level filter: keep only rows whose C4 validation loss is below 7 (rows
# with a missing value fail the comparison and are dropped as well).
# A per-model variant, keeping every row of a model whose minimum C4 loss is
# below 7, is left here for reference:
# filtered_df = filtered_df.groupby("Name").filter(lambda x: x["C4 Validation Loss"].min() < 7)
c4_below_7 = filtered_df["C4 Validation Loss"] < 7
filtered_df = filtered_df[c4_below_7]

# Number of distinct model names left after filtering.
print(filtered_df["Name"].nunique(dropna=False))

# Alternative y-group (not plotted here): "AverageTest Loss" over the
# ARC-Challenge, ARC-Easy, OpenBookQA, PIQA, COPA, Winogrande and HellaSwag
# loss columns.
y_columns = {
    "Average Validation Loss": [
        "The Pile UC Validation Loss",
        # "FineWeb-Edu Validation Loss",
        "RefineWeb Validation Loss",
        "Slimpajama Validation Loss",
        "C4 Validation Loss",
    ],
}

figs.plot_intervention_optim(
    filtered_df,
    "FineWeb-Edu Validation Loss",
    y_columns,
    tokenizer="tiktoken",
    # arch="Mamba",
    pretrain="FineWeb-Edu",
    # pretrain=["The Pile UC", "FineWeb-Edu"],
    save_path="fig-intervention-optim_fw",
)

# Context Length


In [None]:
# Context-length intervention figure: the listed validation-loss columns,
# grouped under one "Average Validation Loss" entry, against FineWeb-Edu
# validation loss.
filtered_df = df.copy()

# Drop rows flagged "Hugging Face" or "Overtraining" before the helper runs,
# so it only sees the Intervention values of the remaining rows.
filtered_df = filtered_df[~filtered_df["Hugging Face"] & ~filtered_df["Overtraining"]]

# Keep rows whose Intervention passes figs.tuple_contains_any_combination for
# these values.
allowed_factors = [None, "Context Length", "Pretraining Data", "Architecture"]
uses_allowed_factors = filtered_df["Intervention"].apply(
    figs.tuple_contains_any_combination,
    vals=allowed_factors,
)
filtered_df = filtered_df[uses_allowed_factors]

# Alternative y-group (not plotted here): "AverageTest Loss" over the
# ARC-Challenge, ARC-Easy, OpenBookQA, PIQA, COPA, Winogrande and HellaSwag
# loss columns.
y_columns = {
    "Average Validation Loss": [
        "The Pile UC Validation Loss",
        # "FineWeb-Edu Validation Loss",
        "RefineWeb Validation Loss",
        "Slimpajama Validation Loss",
        "C4 Validation Loss",
    ],
}

figs.plot_intervention_ctx(
    filtered_df,
    "FineWeb-Edu Validation Loss",
    y_columns,
    tokenizer="tiktoken",
    markersize=20,
    # arch="Mamba",
    # pretrain=["The Pile UC", "FineWeb-Edu"],
    save_path="fig-intervention-ctx_fw",
)