## Imports

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import torch
import numpy as np
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from safetensors.torch import load_file

from bergson.approx_unrolling.utils import TensorDict

import os

## Load model

In [56]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Hugging Face model id and the training step whose checkpoint is loaded.
model_str = "EleutherAI/pythia-14m"
step = 5000

# The model and the tokenizer are both requested at the same Hub revision.
revision = f"step{step}"

model = GPTNeoXForCausalLM.from_pretrained(model_str, revision=revision, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_str, revision=revision)

## 1. Examining the matrices

In [128]:
def test_comparison(path_1, path_2):
    files_1 = os.listdir(path_1)
    files_2 = os.listdir(path_2)

    for file_1 in files_1:
        if file_1 in files_2:
            if file_1.endswith(".safetensors"):
                tensor_1 = TensorDict(
                    load_file(
                        os.path.join(path_1, file_1),
                        device="cuda",
                    )
                )
                tensor_2 = TensorDict(
                    load_file(
                        os.path.join(path_2, file_1),
                        device="cuda",
                    )
                )
                diff = tensor_1 - tensor_2
                all_close = tensor_1.allclose(tensor_2, rtol=1e-5, atol=1e-5)
                all_close_values = all(all_close.values())
                if not all_close_values:
                    print(file_1)
                    print("Differences found:")
                    print(diff.max())
                # check if all_close has any key that is False


In [129]:
# The two factor directories differ only in the cache_1 / cache_2 path component.
_FACTORS_SUBPATH = ".models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"
path_1 = f"/root/bergson/tests/caches/cache_1/{_FACTORS_SUBPATH}"
path_2 = f"/root/bergson/tests/caches/cache_2/{_FACTORS_SUBPATH}"

In [None]:
# Compare the .safetensors files shared by the two factor directories
# (note the argument order: path_2 is passed first).
test_comparison(path_2, path_1)

In [None]:
# Repeat of the comparison in the preceding cell, with the same arguments.
test_comparison(path_2, path_1)

In [84]:
# Load gradient_covariance.safetensors from each factor directory onto the GPU.
tensor_1, tensor_2 = (
    TensorDict(load_file(os.path.join(factor_dir, "gradient_covariance.safetensors"), device="cuda"))
    for factor_dir in (path_1, path_2)
)

In [None]:
# Location of the largest signed difference; presumably reported per key by
# TensorDict.argmax — verify against the TensorDict implementation.
(tensor_1 - tensor_2).argmax()

In [None]:
# Print max, min, mean and std of the signed difference for every key.
delta = tensor_1 - tensor_2
for k, v in delta.items():
    stats = (v.max(), v.min(), v.mean(), v.std())
    print(k, *(stat.item() for stat in stats))

In [None]:
# Load the two gradient covariance files to compare onto the GPU.
d = TensorDict(
    load_file(
        "/root/bergson/bergson/approx_unrolling/.models/EleutherAI/pythia-14m/segment_0/influence_results/factors_ekfac_half/average_gradient_covariance.safetensors",
        device="cuda",
    )
)

d_2 = TensorDict(
    load_file(
        "/root/bergson/.models/EleutherAI/influence_results/factors_ekfac_half/gradient_covariance.safetensors",
        device="cuda",
    )
)

diff = d - d_2

# Report every key whose largest absolute difference reaches 1e-5.
# The magnitude is used (not the signed max) so that keys where d_2 is larger
# than d are not silently skipped.
for k, v in diff.items():
    max_abs_diff = v.abs().max()
    if max_abs_diff < 1e-5:
        continue
    print(k)
    print(max_abs_diff)
    print("----" * 10)

In [None]:
number = 15335424
# NOTE(review): this was labelled as a prime decomposition, but the cell only
# divides `number` by 7488; no factorisation is performed.
number / 7488


In [5]:
from bergson.approx_unrolling.utils import TensorDict


# Re-wrap the existing `d` in a TensorDict; `d` must already have been defined
# by an earlier cell, so this cell fails on a fresh kernel.
d = TensorDict(d)

In [7]:
# A list holding two references to the same `d` object.
test_list = [d, d]

In [None]:
# Built-in sum() starts from the integer 0, so this only works if the list
# elements support `0 + element` — presumably via __radd__; verify.
sum(test_list)

In [42]:
path = "/home/louis/bergson/bergson/approx_unrolling/influence_results/wikitext/factors_ekfac_half"

import os

# Despite its name, `subfolders` collects the full path of every directory
# AND every file found anywhere below `path`.
subfolders = []
for dirpath, dirnames, filenames in os.walk(path):
    subfolders.extend(os.path.join(dirpath, entry) for entry in dirnames)
    subfolders.extend(os.path.join(dirpath, entry) for entry in filenames)


In [6]:
# Bare reference to the variable left over from the os.walk loop; it is not
# the last expression of the cell, so nothing is displayed.
filenames

# Split the collected paths by file extension in a single pass.
filenames_json = []
filesnames_safetensors = []
for entry in subfolders:
    if entry.endswith(".json"):
        filenames_json.append(entry)
    if entry.endswith(".safetensors"):
        filesnames_safetensors.append(entry)

In [None]:
# Display the collected .json paths.
filenames_json

In [None]:
import json

# Parse the first collected .json file and display its contents.
json_file = filenames_json[0]
with open(json_file, "r") as f:
    data = json.loads(f.read())
data

In [None]:
# Display the collected .safetensors paths.
filesnames_safetensors

In [None]:
from safetensors import safe_open

# Print the tensor names stored in each .safetensors file.
# `tensors` is rebuilt for every file, so after the loop it holds the last file only.
for path in filesnames_safetensors:
    with safe_open(path, framework="pt", device=0) as f:
        tensors = {k: f.get_tensor(k) for k in f.keys()}
    print(tensors.keys())

In [11]:
# Read the second-to-last .safetensors file into `tensors`.
# The existing dict is updated in place, so entries left over from the
# preceding loop are kept unless a key is overwritten.
path = filesnames_safetensors[-2]
with safe_open(path, framework="pt", device=0) as f:
    tensors.update((k, f.get_tensor(k)) for k in f.keys())

In [29]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source (consider weights_only=True
# if the file holds a plain state dict — verify what model.pth contains).
gpt_2 = torch.load("/home/louis/bergson/bergson/approx_unrolling/checkpoints/model.pth")

In [6]:
# Full state dict of the loaded model.
all_weights = model.state_dict()

# Keep only the entries whose key contains the substring "mlp".
all_mlps = dict(filter(lambda item: "mlp" in item[0], all_weights.items()))


In [None]:
# Number of state-dict entries whose key contains "mlp".
len(all_mlps)

In [None]:
# Inspect the type of the object returned by model.named_modules().
type(model.named_modules())

In [31]:
# Names of all modules: the first element of each pair yielded by named_modules().
module_keys = [module_name for module_name, *_ in model.named_modules()]

In [48]:
# Switches selecting which module families are collected.
track_attention = True
track_mlp = True

# Collect module names containing "attention" and/or "mlp" (case-insensitive),
# ignoring any name that mentions dropout or layernorm. A name matching both
# families is appended once per match.
total_modules = []
for m in module_keys:
    lowered = m.lower()
    if any(excluded in lowered for excluded in ("dropout", "layernorm")):
        continue

    if track_attention and "attention" in lowered:
        total_modules.append(m)
    if track_mlp and "mlp" in lowered:
        total_modules.append(m)


In [4]:
from examples.wikitext.pipeline import get_wikitext_dataset

# Request the "eval_train" split from the wikitext example pipeline.
train_dataset = get_wikitext_dataset(split="eval_train")

In [7]:
# Take the first element (index 0) of train_dataset.
sample = train_dataset[0]


## Debugging kronfluence/bergson

In [10]:
# Factor directory under /root/quelle and the two covariance files inside it.
path_kronfluence = "/root/quelle/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac"
activation_path = f"{path_kronfluence}/activation_covariance.safetensors"
gradient_path = f"{path_kronfluence}/gradient_covariance.safetensors"

# Sharded results directory and the two covariance files inside it.
path_sharded = "/root/bergson-approx-unrolling/bergson/hessians/influence_results_sharded"
activation_path_sharded = f"{path_sharded}/activation_covariance.safetensors"
gradient_path_sharded = f"{path_sharded}/gradient_covariance.safetensors"

# A module name kept for reference.
name = "gpt_neox.layers.0.attention.dense"

In [11]:
# Location of the total_processed.safetensors file in the training_data results.
_TRAINING_RESULTS_DIR = "/root/bergson-approx-unrolling/bergson/hessians/training_data/influence_results"
total_processed_path = f"{_TRAINING_RESULTS_DIR}/total_processed.safetensors"

In [16]:
# NOTE(review): the variable is called `ekfac_new`, but the file it points to
# is activation_covariance.safetensors — confirm the name is intended.
ekfac_new_path = "/root/bergson-approx-unrolling/bergson/hessians/training_data/influence_results/activation_covariance.safetensors"

# Load onto the GPU, then wrap in a TensorDict.
_ekfac_new_raw = load_file(ekfac_new_path, device="cuda")
ekfac_new = TensorDict(_ekfac_new_raw)

In [12]:
# Load the file onto the GPU as the plain dict returned by load_file
# (not wrapped in a TensorDict).
total_processed = load_file(total_processed_path, device="cuda")

In [13]:
# Display the loaded dict.
total_processed

{'total_processed': tensor(22343, device='cuda:0')}

In [14]:
# File stored in the factors_ekfac directory; its name suggests it holds
# per-module counts of processed gradient-covariance items — TODO confirm
# what unit is being counted.
num_path = "/root/quelle/bergson/approx_unrolling/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac/num_gradient_covariance_processed.safetensors"
num_processed = TensorDict(load_file(num_path, device="cuda"))

In [15]:
# Display the loaded per-module values.
num_processed

TensorDict({'gpt_neox.layers.0.attention.dense': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.0.attention.query_key_value': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.0.mlp.dense_4h_to_h': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.0.mlp.dense_h_to_4h': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.1.attention.dense': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.1.attention.query_key_value': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.1.mlp.dense_4h_to_h': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.1.mlp.dense_h_to_4h': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.2.attention.dense': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.2.attention.query_key_value': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.2.mlp.dense_4h_to_h': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.2.mlp.dense_h_to_4h': tensor([2439168], device='cuda:0'), 'gpt_neox.layers.3.attention.dense': tensor([2439168], device='cuda:0'

In [32]:
# Quick arithmetic check; the meaning of 102400 and 2048 is not recorded in
# this notebook — presumably a token count and a sequence length, verify.
102400 / 2048

50.0

In [37]:
# Evaluates to 2439168, the same value displayed for the modules in the
# num_processed output.
(32 * 2048) * 37 + 7 * 2048

2439168

In [34]:
def _load_factors(file_path):
    """Load a .safetensors file onto the GPU and wrap it in a TensorDict."""
    return TensorDict(load_file(file_path, device="cuda"))


# Activation and gradient covariances from the kronfluence and sharded directories.
act_kronfluence = _load_factors(activation_path)
grad_kronfluence = _load_factors(gradient_path)
act_sharded = _load_factors(activation_path_sharded)
grad_sharded = _load_factors(gradient_path_sharded)


In [35]:
# Per-key closeness of the two sets of activation covariances.
# NOTE(review): atol=1 is far looser than the atol=1e-5 used in
# test_comparison — confirm this tolerance is intended.
act_kronfluence.allclose(act_sharded, rtol=1e-5, atol=1).values()

dict_values([True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])

In [36]:
# Per-key closeness of the two sets of gradient covariances.
# NOTE(review): atol=1 is far looser than the atol=1e-5 used in
# test_comparison — confirm this tolerance is intended.
grad_kronfluence.allclose(grad_sharded, rtol=1e-5, atol=1).values()

dict_values([True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True])

In [53]:
# Random 2048 x 2048 matrix sampled on the GPU and then cast to bfloat16.
x = torch.randn(2048, 2048, device="cuda").to(torch.bfloat16)
# Confirm the dtype after the cast.
x.dtype

torch.bfloat16

In [54]:
# Demonstrates that torch.linalg.eigh rejects a bfloat16 CUDA tensor.
# The error is caught and printed so that the cell shows the failure without
# aborting a Restart & Run All; if the call succeeds, its result is displayed.
try:
    eigh_result = torch.linalg.eigh(x)
except RuntimeError as err:
    eigh_result = None
    print(f"RuntimeError: {err}")
eigh_result

RuntimeError: "linalg_eigh_cuda" not implemented for 'BFloat16'

## Debugging final algorithm

In [5]:
# Directory read by the next cell; going by its name it holds sharded
# gradient eigen-factor files — TODO confirm.
path = "/root/bergson-approx-unrolling/bergson/hessians/training_data/influence_results/gradient_eigen_sharded"


In [6]:
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `tensors` (and therefore `tensors[0]`) reproducible across runs and machines.
files = sorted(os.listdir(path))

# Load every .safetensors file in the directory onto the GPU as a TensorDict.
tensors = []
for file in files:
    if file.endswith(".safetensors"):
        tensor = TensorDict(load_file(os.path.join(path, file), device="cuda"))
        tensors.append(tensor)

In [7]:
# Work with the first TensorDict in the list; which file that is depends on
# the order in which the preceding cell listed the directory.
d = tensors[0]


In [17]:
# Show the per-key tensor shapes of `d`.
d.size()

TensorDict({'layers.0.mlp.down_proj': torch.Size([448, 18944]), 'layers.0.mlp.gate_proj': torch.Size([0, 3584]), 'layers.0.mlp.up_proj': torch.Size([0, 3584]), 'layers.1.mlp.down_proj': torch.Size([448, 18944]), 'layers.1.mlp.gate_proj': torch.Size([0, 3584]), 'layers.1.mlp.up_proj': torch.Size([0, 3584]), 'layers.10.mlp.down_proj': torch.Size([448, 18944]), 'layers.10.mlp.gate_proj': torch.Size([0, 3584]), 'layers.10.mlp.up_proj': torch.Size([0, 3584]), 'layers.11.mlp.down_proj': torch.Size([448, 18944]), 'layers.11.mlp.gate_proj': torch.Size([0, 3584]), 'layers.11.mlp.up_proj': torch.Size([0, 3584]), 'layers.12.mlp.down_proj': torch.Size([448, 18944]), 'layers.12.mlp.gate_proj': torch.Size([0, 3584]), 'layers.12.mlp.up_proj': torch.Size([0, 3584]), 'layers.13.mlp.down_proj': torch.Size([448, 18944]), 'layers.13.mlp.gate_proj': torch.Size([0, 3584]), 'layers.13.mlp.up_proj': torch.Size([0, 3584]), 'layers.14.mlp.down_proj': torch.Size([448, 18944]), 'layers.14.mlp.gate_proj': torch.Si

In [None]:
# Check which device the model is on.
model.device

device(type='cuda', index=2)

In [157]:
# Toy dimensions: input width `a`, output width `b`, and the batch size.
a = 8
b = 25
batch = 1

# Linear layer mapping `a` input features to `b` output features.
L = torch.nn.Linear(in_features=a, out_features=b)

In [171]:
# Random (batch, a) and (batch, b) matrices and their (b, a) product Ds^T v.
v = torch.rand(batch, a)
Ds = torch.rand(batch, b)
prod = torch.matmul(Ds.T, v)

# A second, independent pair of random matrices.
v_prime = torch.rand(batch, a)
Ds_prime = torch.rand(batch, b)

# Outer products of the first row of each: A is (a, a), B is (b, b).
v_prime_row = v_prime[0]
Ds_prime_row = Ds_prime[0]
A = torch.outer(v_prime_row, v_prime_row)
B = torch.outer(Ds_prime_row, Ds_prime_row)

In [172]:
# Multiply the (b, a) product by B on the left and by A on the right.
final_result = B @ prod @ A

In [175]:
# Apply B and A to the two factors separately, then multiply: (B Ds^T)(v A).
transformed_Ds = torch.matmul(B, Ds.T)
transformed_v = torch.matmul(v, A)
transformed_prod = torch.matmul(transformed_Ds, transformed_v)

# Should agree with B (Ds^T v) A computed in one go.
torch.allclose(final_result, transformed_prod)

True

In [179]:
# sharded toy example
A_1, A_2 = torch.chunk(A, 2, dim=0)
B_1, B_2 = torch.chunk(B, 2, dim=0)

In [203]:
# Split v by columns to line up with the row blocks of A; the partial
# products then sum to the full product v @ A.
v_1, v_2 = v.chunk(2, dim=1)
Av_1 = torch.matmul(v_1, A_1)
Av_2 = torch.matmul(v_2, A_2)
Av = Av_1 + Av_2
torch.allclose(Av, torch.matmul(v, A))

True

In [207]:
# Each row block of B gives the matching rows of B @ Ds^T; stacking them
# along dim 0 rebuilds the full product.
# Note: `d` and `d_2` are rebound to plain tensors here.
d_1 = torch.matmul(B_1, Ds.T)
d_2 = torch.matmul(B_2, Ds.T)
d = torch.cat((d_1, d_2), dim=0)
torch.allclose(d, torch.matmul(B, Ds.T))

True

In [206]:
# Check that the product of the sharded pieces, d @ Av, matches final_result.
torch.allclose(final_result, d @ Av)

True