In [None]:
%load_ext autoreload
%autoreload 2

import figs
import utils

import seaborn as sns
import pandas as pd

# Apply seaborn's "whitegrid" style as the plotting theme for this session.
sns.set_theme(style="whitegrid")

# Grids

In [None]:
# Columns to compare in the grid: the `*_val_loss` columns followed by the
# per-task `*/loss` columns.
columns_to_compare = [
    "c4_val_loss",
    "pile_uc_val_loss",
    "fineweb_edu_100bt_val_loss",
    "refineweb_val_loss",
    "slimpajama_val_loss",
    "arc_challenge/loss",
    "arc_easy/loss",
    "hellaswag/loss",
    "piqa/loss",
    "openbookqa/loss",
    "winogrande/loss",
]

# Start from a fresh copy of the full frame, like every other cell in this notebook.
# Copying `filtered_df` itself (as this cell used to) raises NameError on a fresh
# kernel and otherwise silently inherits the filters of whichever cell ran last.
# NOTE(review): `df` is not assigned anywhere above this cell in the notebook as
# shown -- confirm which loader has to run first.
filtered_df = df.copy()

# Optional filters (disabled for this figure)
# filtered_df, valid_models = utils.filter_min_tokens(filtered_df, min_tokens=7.5e9)
# filtered_df = utils.filter_within_chinchilla(filtered_df)
# filtered_df = filtered_df[filtered_df["size"].between(400e6, 430e6)]
# filtered_df = filtered_df[filtered_df["n_layers"].between(23, 26)]

# Active filter: keep only rows whose tokenizer is "tiktoken"
filtered_df = filtered_df[filtered_df["tokenizer"] == "tiktoken"]
# filtered_df = filtered_df[filtered_df["position_embedding"] == "rope"]
# filtered_df = filtered_df[filtered_df["norm_type"] == "rms_norm"]

# Create the grid plot
utils.create_grid_comparison(
    filtered_df,
    columns_to_compare,
    # transform="log",
    save_path="grid-all_tiktoken.png",
)

In [None]:
# Validation-loss columns shown in this grid
columns_to_compare = [
    "c4_val_loss",
    "pile_uc_val_loss",
    "fineweb_edu_100bt_val_loss",
    "refineweb_val_loss",
    "slimpajama_val_loss",
]

# No filter is active for this figure: work on a copy of the full frame.
filtered_df = df.copy()
# Filters that can be switched on here:
#   filtered_df, valid_models = utils.filter_min_tokens(filtered_df, min_tokens=7.5e9)
#   filtered_df = utils.filter_within_chinchilla(filtered_df)
#   filtered_df = filtered_df[filtered_df["size"].between(400e6, 430e6)]
#   filtered_df = filtered_df[filtered_df["n_layers"].between(23, 26)]

# Draw the grid, passing "grid-val.png" as save_path.
# (Optional extra argument, currently off: transform="log")
utils.create_grid_comparison(filtered_df, columns_to_compare, save_path="grid-val.png")

In [None]:
# Per-task loss columns shown in this grid
columns_to_compare = [
    "arc_challenge/loss",
    "arc_easy/loss",
    "hellaswag/loss",
    "piqa/loss",
    "openbookqa/loss",
    "winogrande/loss",
]

# Run a copy of the full frame through the two active filters:
# utils.filter_min_tokens (min_tokens=7.5e9), then utils.filter_within_chinchilla.
# `valid_models` is the second value returned by filter_min_tokens; it is not used
# further in this cell.
filtered_df, valid_models = utils.filter_min_tokens(df.copy(), min_tokens=7.5e9)
filtered_df = utils.filter_within_chinchilla(filtered_df)
# Filters that can be switched on here:
#   filtered_df = filtered_df[filtered_df["size"].between(400e6, 430e6)]
#   filtered_df = filtered_df[filtered_df["n_layers"].between(23, 26)]

# Draw the grid. No save_path is passed for this figure.
# Optional arguments, currently off:
#   transform="log"
#   save_path="grid-task_min-tokens7.5e9_chinchilla.png"
utils.create_grid_comparison(filtered_df, columns_to_compare)

# Loss-to-Loss plots
Joint plots as in the "Scaling Laws for Loss-to-Loss" paper.

In [None]:
# Copy the full frame and keep only the rows whose tokenizer is "tiktoken".
filtered_df = df.copy().loc[lambda runs: runs["tokenizer"] == "tiktoken"]
# Filters that can be switched on here:
#   filtered_df = filtered_df[filtered_df["position_embedding"].isin(["rope", np.nan])]
#   filtered_df = filtered_df[filtered_df["norm_type"] == "rms_norm"]

# Joint plot, with "train2train2_tiktoken.png" passed as save_path.
utils.plot_joint_train2train_v2(filtered_df, save_path="train2train2_tiktoken.png")

# Figures

In [None]:
# Copy the full frame and keep rows that use the "tiktoken" tokenizer AND have a
# context length of 2048 (both conditions applied as one combined mask).
filtered_df = df.copy().loc[
    lambda runs: (runs["tokenizer"] == "tiktoken") & (runs["context_length"] == 2048)
]

figs.plot_fit_vs_accuracy(
    filtered_df,
    "llama",
    "c4",
    "fig_fit_vs_accuracy_llama_c4",
)

## Intervene on Architecture

In [None]:
filtered_df = df.copy()

# TODO: fix this once we have evals with the same tokenizer

# --- Pretraining data -------------------------------------------------------
# GPT rows: names containing "deduped" get "The Pile deduped", all other GPT rows
# get "The Pile".
is_gpt = filtered_df["arch"] == "GPT"
has_deduped = filtered_df["name"].str.contains("deduped")
filtered_df.loc[is_gpt & has_deduped, "pretraining_data"] = "The Pile deduped"
filtered_df.loc[is_gpt & ~has_deduped, "pretraining_data"] = "The Pile"

# llama rows currently labelled "The Pile" are relabelled "The Pile deduped".
llama_on_pile = (filtered_df["arch"] == "llama") & (
    filtered_df["pretraining_data"] == "The Pile"
)
filtered_df.loc[llama_on_pile, "pretraining_data"] = "The Pile deduped"

# --- Architecture labels ----------------------------------------------------
# Tell the GPT variants apart by a fragment of the model name. The order matters:
# every name containing "neox" also contains "neo", so "neox" has to be applied
# after "neo" to overwrite it.
for name_fragment, arch_label in (
    ("neo", "GPT neo"),
    ("neox", "GPT neox"),
    ("pythia", "GPT pythia"),
    ("gpt-j", "GPT j"),
):
    matches = filtered_df["name"].str.contains(name_fragment)
    filtered_df.loc[matches, "arch"] = arch_label

# For now, keep only Mamba, Mamba2, GPT pythia and GPT neox.
# NOTE(review): the original comment was cut off after "all of which use the same";
# presumably "the same tokenizer" (see the TODO above) -- confirm.
kept_archs = ["Mamba", "Mamba2", "GPT pythia", "GPT neox"]
filtered_df = filtered_df[filtered_df["arch"].isin(kept_archs)]

figs.plot_intervention_arch(
    filtered_df,
    avg_only=False,
    save_path="fig_intervention_arch",
)

## Jointly Intervene on Architecture, Tokenizer, Pretraining Data

In [None]:
# Load the "df_combined" table and normalise its labels and column names for the
# figures below. The cell starts by (re)loading `df`, so re-running it always
# starts from the loaded data.
df = utils.load_and_prepare_latest("df_combined")

# Amend data
# Make architecture explicit
df.loc[df["Architecture"] == "mamba", "Architecture"] = "Mamba"
df.loc[df["Architecture"] == "llama", "Architecture"] = "LLaMA"

# Make tokenizer explicit: only rows flagged "Hugging Face" are touched, and the
# tokenizer is chosen from a regex match on the model name.
# (In the first pattern "mamba2" is already covered by "mamba".)
df.loc[
    df["Hugging Face"] & (df["Name"].str.contains("neox|pythia|mamba|mamba2")),
    "Tokenizer",
] = "gpt-neox"
df.loc[
    df["Hugging Face"] & (df["Name"].str.contains("gpt-j|neo-|ablation")), "Tokenizer"
] = "gpt2-HF"
df.loc[df["Hugging Face"] & (df["Name"].str.contains("Qwen")), "Tokenizer"] = "qwen"

# Make pretraining data explicit
# (A former "FineWeb-EDU" -> "FineWeb-EDU" assignment was a no-op and was dropped.)
df.loc[df["Pretraining Data"] == "fineweb_edu_100bt", "Pretraining Data"] = (
    "FineWeb-EDU"
)
df.loc[df["Pretraining Data"] == "c4", "Pretraining Data"] = "C4"
df.loc[df["Pretraining Data"] == "pile_uc", "Pretraining Data"] = "The Pile UC"
# GPT models were trained on the Pile or the Pile deduped depending on the name
df.loc[
    (df["Architecture"] == "GPT") & (df["Name"].str.contains("deduped")),
    "Pretraining Data",
] = "The Pile Deduped"
df.loc[
    (df["Architecture"] == "GPT") & ~(df["Name"].str.contains("deduped")),
    "Pretraining Data",
] = "The Pile"
# LLaMA models from Hugging Face that are labelled "The Pile" were trained on the
# Pile deduped
df.loc[
    (df["Architecture"] == "LLaMA")
    & (df["Pretraining Data"] == "The Pile")
    & df["Hugging Face"],
    "Pretraining Data",
] = "The Pile Deduped"

# Make base models explicit: a row is a base model when its name matches none of
# the size / context-length / optimizer intervention patterns.
df["Size Intervention"] = df["Name"].str.contains("_d_|_l_")
df["Context Length Intervention"] = df["Name"].str.contains("_ctx_")
df["Optimizer Intervention"] = df["Name"].str.contains("_lr_|_wd_|adam|cosine")
df["Is Base"] = ~(
    df["Size Intervention"]
    | df["Context Length Intervention"]
    | df["Optimizer Intervention"]
)

# Rename columns: replace raw dataset/task keys inside column names with their
# display names.
legible_columns = {
    "c4": "C4",
    "pile_uc": "The Pile UC",
    "fineweb_edu_100bt": "FineWeb-EDU",
    "refineweb": "RefineWeb",
    "slimpajama": "Slimpajama",
    "arc_challenge": "ARC-Challenge",
    "arc_easy": "ARC-Easy",
    "openbookqa": "OpenBookQA",
    "piqa": "PIQA",
    "copa": "COPA",
    "winogrande": "Winogrande",
    "hellaswag": "HellaSwag",
    "mmlu": "MMLU",
    "social_iqa": "Social IQa",
    "commonsenseqa": "CommonSenseQA",
}
# Build the full old -> new mapping first and rename once. The previous loop
# renamed in place once per matching key while still looking up the stale column
# name, so a column matching two keys only received the first replacement.
renamed_columns = {}
for col in df.columns:
    new_col = col
    for old, new in legible_columns.items():
        new_col = new_col.replace(old, new)
    if new_col != col:
        renamed_columns[col] = new_col
df = df.rename(columns=renamed_columns)

In [None]:
# Copy the full frame and keep only the rows flagged in the "Is Base" column.
filtered_df = df.copy().loc[lambda runs: runs["Is Base"]]

# Task-loss columns passed as `ydata`; "C4 Validation Loss" is passed as `xdata`.
task_loss_columns = [
    "ARC-Challenge Loss",
    "ARC-Easy Loss",
    "OpenBookQA Loss",
    "PIQA Loss",
    "COPA Loss",
    "Winogrande Loss",
    "HellaSwag Loss",
]

# Optional argument, currently off:
#   save_path="debug_arch_tokenizer_pretraining_val2test"
figs.plot_intervention_arch_tokenizer_pretraining(
    filtered_df,
    xdata="C4 Validation Loss",
    ydata=task_loss_columns,
)

## Intervene on Optimizer

In [None]:
# Load the "df_train_scratch" table via utils.load_latest (presumably the most
# recently saved version -- confirm in utils).
# NOTE: this rebinds `df`; any `df` assigned earlier in the session is no longer
# reachable under that name after this cell runs.
df = utils.load_latest("df_train_scratch")

In [None]:
# Add optimizer columns
# TODO: nothing is added yet -- this cell only displays the existing "lr" column.
# Notes on the values to encode, one line per setting:
# lr 3e-3, 3e-4
df["lr"]
# wd 0.1 0.033
# optimizer adam, adamw
# scheduler cosine, wsd

In [None]:
# Show the distinct values in the "name" column.
df["name"].unique()