## Imports

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported source files (e.g. the
# `bergson` modules used below) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import gc
import hashlib
import json
import os
import random
from contextlib import nullcontext
from dataclasses import asdict
from typing import Literal, Optional

import numpy as np
import torch
import torch.distributed as dist
import torch.nn.functional as F
from datasets import Dataset, DatasetDict, IterableDatasetDict, load_dataset
from jaxtyping import Float
from safetensors import safe_open
from safetensors.torch import load_file, save_file
from torch import Tensor
from torch.profiler import (
    ProfilerActivity,
    profile,
    record_function,
    schedule,
    tensorboard_trace_handler,
)
from tqdm.auto import tqdm
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, PreTrainedModel

from bergson.data import DataConfig, IndexConfig, create_index, load_gradients, pad_and_tensor, tokenize
from bergson.gradients import (
    GradientProcessor,
)
from bergson.hessians.collector import EkfacCollector
from bergson.hessians.logger import get_logger
from bergson.hessians.utils import TensorDict
from bergson.utils import assert_type

## 0. Load EKFAC

In [None]:
ekfac_path = ""

# Fail early with an actionable message: an empty placeholder would otherwise
# be resolved against an unrelated directory and raise a confusing
# FileNotFoundError further down.
if not ekfac_path:
    raise ValueError("ekfac_path is empty; set it to the directory that contains the sharded EKFAC results.")


def _load_ekfac_shards(root: str, subdir: str, num_shards: int, device: str = "cuda"):
    """Load every per-rank safetensors shard stored under ``root/subdir``.

    Args:
        root: Directory that holds the sharded EKFAC sub-directories.
        subdir: Name of the sub-directory to read, e.g. ``"activation_eigen_sharded"``.
        num_shards: Number of ``shard_{rank}.safetensors`` files to load, for
            ranks ``0 .. num_shards - 1``.
        device: Device argument forwarded to ``load_file``.

    Returns:
        A ``(paths, shards)`` tuple where ``paths[rank]`` is the file that was
        read and ``shards[rank]`` is what ``load_file`` returned for it.
    """
    paths = [os.path.join(root, subdir, f"shard_{rank}.safetensors") for rank in range(num_shards)]
    return paths, [load_file(path, device=device) for path in paths]


# all paths inside ekfac_path
# The shard count is taken from the number of entries in the activation
# covariance directory (assumes one entry per rank -- TODO confirm).
world_size = len(os.listdir(os.path.join(ekfac_path, "activation_covariance_sharded")))

eigen_a_paths, eigen_a = _load_ekfac_shards(ekfac_path, "activation_eigen_sharded", world_size)
eigen_g_paths, eigen_g = _load_ekfac_shards(ekfac_path, "gradient_eigen_sharded", world_size)
lambda_factor_paths, lambda_factor = _load_ekfac_shards(ekfac_path, "eigenvalue_correction_sharded", world_size)


In [15]:
# Root directory of the EKFAC results for this run; the number of shards is
# read off the activation covariance directory.
ekfac_path = "/root/bergson-approx-unrolling/bergson/hessians/peft_fin_mis_fin/influence_results"
world_size = len(os.listdir(ekfac_path + "/activation_covariance_sharded"))

# Load the inverse eigenvalue correction shard of every rank onto the GPU.
lambda_factor_paths = [
    ekfac_path + f"/inverse_eigenvalue_correction_sharded/shard_{rank}.safetensors" for rank in range(world_size)
]
lambda_factor = [load_file(shard_path, device="cuda") for shard_path in lambda_factor_paths]

# For each key of the first shard, concatenate the per-rank tensors along dim 0.
lambda_factor_tensor = {
    name: torch.cat([shard[name] for shard in lambda_factor], dim=0)
    for name in lambda_factor[0]
}


In [16]:
# Print one line per key: name followed by mean, std, min and max of its tensor.
for name, factor in lambda_factor_tensor.items():
    stats = (factor.mean(), factor.std(), factor.min(), factor.max())
    print(name, *(stat.item() for stat in stats))

layers.0.mlp.down_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.0.mlp.down_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.0.mlp.gate_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.0.mlp.gate_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.0.mlp.up_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.0.mlp.up_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.1.mlp.down_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.1.mlp.down_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.1.mlp.gate_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.1.mlp.gate_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.1.mlp.up_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.1.mlp.up_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.10.mlp.down_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.10.mlp.down_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.10.mlp.gate_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.10.mlp.gate_proj.lora_B.default 10.0 0.0 10.0 10.0
layers.10.mlp.up_proj.lora_A.default 10.0 0.0 10.0 10.0
layers.10.mlp.up_proj.lora_B.default

## 1. Load the gradient


In [None]:
gradient_path = ""

# Load the gradients through `load_gradients`, then parse the `info.json`
# file located in the same directory.
mmap = load_gradients(gradient_path)
info_file = os.path.join(gradient_path, "info.json")
with open(info_file) as handle:
    info = json.load(handle)


NameError: name 'load_gradients' is not defined

## 2. Apply EKFAC