In [9]:
%load_ext autoreload
%autoreload 2

# Stable imports (won't reload)
import logging
import os

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch
from dataclasses import dataclass
from typing import Optional

# Force reload of project modules
import importlib
import sys

modules_to_reload = [
    'calibration', 'compress_qk', 'compress_mlp', 'compress_vo',
    'compression_utils', 'eval', 'model_utils', 'patchers.patch'
]

for module_name in modules_to_reload:
    if module_name in sys.modules:
        del sys.modules[module_name]

from calibration import get_model_attrs, load_calibs
from compress_qk import compress_qk, compress_qk_svd
from compress_mlp import compress_mlp
from compress_vo import compress_vo
from compression_utils import allocate_global_sparsity
from eval import compute_perplexity, load_calibration_texts, load_eval_texts
from model_utils import load_model, reload_compressed_model, save_compressed_model, save_model
from patchers.patch import patch_config

logger = logging.getLogger("MoDeGPT")
logger.setLevel(logging.INFO)
if not logger.handlers:
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)
    os.makedirs("logs", exist_ok=True)
    file = logging.FileHandler("logs/run_modegpt.log")
    file.setFormatter(formatter)
    logger.addHandler(file)


@dataclass
class Config:
    model: str = "meta-llama/Llama-2-7b-hf"
    compression_ratio: float = 0.4
    calib_size: str = "16"
    eval_size: str = "8"
    output_dir: str = "./compressed_output/llama2-7b"
    device: int = 0
    skip: str = "mlp"
    local_model_path: str = ""
    load_calibs_from: str = "./calibs/llama2-7b_sz16.pt"
    calibs_save_path: str = ""
    calibs_batch_size: int = 8

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
newest


In [2]:
args = Config(
    model="meta-llama/Llama-2-7b-hf",
    compression_ratio=0.4,
    calib_size="16",
    eval_size="8",
    output_dir="./compressed_output/llama2-7b",
    device=0,
    skip="mlp",
    load_calibs_from="",
    calibs_save_path="./calibs/llama2-7b_qk-updated_sz16.pt",
    calibs_batch_size=8,
)

## Setup

In [3]:
model, tokenizer, config = load_model(args.model, device=args.device)

2025-11-09 20:07:45,803 - INFO - Loading model from: meta-llama/Llama-2-7b-hf


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.35it/s]
2025-11-09 20:08:15,732 - INFO - ✔ Loaded model on cuda:0 with float16.
2025-11-09 20:08:15,734 - INFO - No pad_token found. Set pad_token = eos_token.


In [4]:
calib_texts, eval_texts, cov_mlp, cov_q, cov_k, cov_x, bi_scores, layer_keep_ratios = (
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
)

In [5]:
rotary_mask = None

In [6]:
model_compressed, tokenizer_compressed = None, None

In [7]:
def loader():
    global calib_texts, eval_texts, cov_mlp, cov_q, cov_k, cov_x, bi_scores, layer_keep_ratios
    logger.info("Loading calibration and evaluation texts...")
    calib_texts = load_calibration_texts(
        args.calib_size, model, tokenizer, batch_size=int(args.calibs_batch_size)
    )
    eval_texts = load_eval_texts(
        args.eval_size, model, tokenizer, batch_size=args.calibs_batch_size
    )

    cov_mlp, cov_q, cov_k, cov_x, bi_scores = load_calibs(
        model,
        tokenizer,
        calib_texts,
        int(args.calibs_batch_size),
        load_calibs_from=args.load_calibs_from,
        calibs_save_path=args.calibs_save_path,
    )

    layer_keep_ratios = allocate_global_sparsity(
        bi_scores, compression_ratio=args.compression_ratio
    )


def compress():
    global rotary_mask

    slice_dims = True
    ridge_lambda = 1e-2

    # compress_mlp(
    #     model=model,
    #     cov=cov_mlp,
    #     keep_ratios=layer_keep_ratios,
    #     ridge_lambda=1e-3,
    #     slice_dims=True,
    # )

    logger.info("Compress QK")
    rotary_mask = compress_qk(
        model=model,
        cov=(cov_q, cov_k),
        keep_ratios=layer_keep_ratios,
        ridge_lambda=ridge_lambda,
        slice_dims=slice_dims,
    )

    logger.info("Compress VO")
    compress_vo(
        model=model,
        cov=cov_x,
        keep_ratios=layer_keep_ratios,
        ridge_lambda=ridge_lambda,
        slice_dims=slice_dims,
    )


def reload_save_comp():
    global model, model_compressed, tokenizer_compressed

    og_config = patch_config(model)

    rebuild_path = "./patchers/LlamaRebuild.py"

    if model_compressed is not None:
        torch.cuda.empty_cache()

    save_compressed_model(
        model,
        tokenizer,
        rotary_masks=rotary_mask,
        rebuild_path=rebuild_path,
        save_dir=args.output_dir,
        source_model_name=args.model,
    )

    model.config = og_config

    model_compressed, tokenizer_compressed = reload_compressed_model(args.output_dir)

In [10]:
loader()
compress()

2025-11-09 20:47:27,859 - INFO - Loading calibration and evaluation texts...


2025-11-09 20:47:42,925 - INFO - Calibrating model...
2025-11-09 20:47:42,926 - INFO - Calibrating model
2025-11-09 20:47:42,926 - INFO - n_layers=32, n_heads=32, d_model=4096, head_dim=128
2025-11-09 20:47:42,927 - INFO - Detected architecture: llama
2025-11-09 20:47:42,928 - INFO - n_inner = 11008
2025-11-09 20:47:42,928 - INFO - n_inner = 11008
2025-11-09 20:47:42,977 - INFO - n_inner = 11008
2025-11-09 20:47:42,977 - INFO - n_inner = 11008
2025-11-09 20:47:43,018 - INFO - n_inner = 11008
2025-11-09 20:47:43,018 - INFO - n_inner = 11008
2025-11-09 20:47:43,059 - INFO - n_inner = 11008
2025-11-09 20:47:43,059 - INFO - n_inner = 11008
2025-11-09 20:47:43,100 - INFO - n_inner = 11008
2025-11-09 20:47:43,101 - INFO - n_inner = 11008


i wonder if i'll see this
hello from __calibrate_model


2025-11-09 20:47:43,141 - INFO - n_inner = 11008
2025-11-09 20:47:43,142 - INFO - n_inner = 11008
2025-11-09 20:47:43,183 - INFO - n_inner = 11008
2025-11-09 20:47:43,183 - INFO - n_inner = 11008
2025-11-09 20:47:43,224 - INFO - n_inner = 11008
2025-11-09 20:47:43,225 - INFO - n_inner = 11008
2025-11-09 20:47:43,268 - INFO - n_inner = 11008
2025-11-09 20:47:43,269 - INFO - n_inner = 11008
2025-11-09 20:47:43,309 - INFO - n_inner = 11008
2025-11-09 20:47:43,310 - INFO - n_inner = 11008
2025-11-09 20:47:43,351 - INFO - n_inner = 11008
2025-11-09 20:47:43,352 - INFO - n_inner = 11008
2025-11-09 20:47:43,392 - INFO - n_inner = 11008
2025-11-09 20:47:43,393 - INFO - n_inner = 11008
2025-11-09 20:47:43,433 - INFO - n_inner = 11008
2025-11-09 20:47:43,434 - INFO - n_inner = 11008
2025-11-09 20:47:43,474 - INFO - n_inner = 11008
2025-11-09 20:47:43,475 - INFO - n_inner = 11008
2025-11-09 20:47:43,515 - INFO - n_inner = 11008
2025-11-09 20:47:43,516 - INFO - n_inner = 11008
2025-11-09 20:47:43,

: 

In [32]:
reload_save_comp()

2025-11-09 19:43:56,892 - INFO - n_layers=32, n_heads=32, d_model=4096, head_dim=128
2025-11-09 19:44:18,980 - INFO - ✔ Model, tokenizer, and tokenizer_source.txt saved to ./compressed_output/llama2-7b
2025-11-09 19:44:18,982 - INFO - Reloading compressed model from: ./compressed_output/llama2-7b


mask_path = /blue/sgao1/cc22bc.fsu/prog/MoDeGPT/compressed_output/llama2-7b/rotary_masks.pt


Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  4.44it/s]
2025-11-09 19:44:29,413 - INFO - ✔ Reloaded compressed model to cuda:0 successfully.


In [34]:
model_compressed.cuda()
compressed_ppl = compute_perplexity(
    model_compressed, tokenizer_compressed, eval_texts, device=args.device
)
logger.info(f"Compressed model perplexity on WikiText2: {compressed_ppl:.2f}")

1
inputs['input_ids'].shape = torch.Size([8, 2048])
q.shape = torch.Size([8, 32, 2048, 110]), k.shape = torch.Size([8, 32, 2048, 110])
rotary_mask.numel() = 3520
tensor([[[[ 47,  48,  41,  ...,  71,  87,  81]],

         [[ 31,  40,  22,  ...,  93,  66, 100]],

         [[ 47,  61,  63,  ...,  86,  84,  73]],

         ...,

         [[ 52,  58,  57,  ...,  95,  74,  75]],

         [[ 19,   2,  12,  ..., 115, 111, 114]],

         [[  2,   0,   5,  ...,  70,  90, 116]]]], device='cuda:0')
seq_len = 2048, n_heads = 32, head_dims = 110
original: rotary_mask.shape = torch.Size([1, 32, 1, 110])
original: cos.shape = torch.Size([1, 2048, 128])
original: sin.shape = torch.Size([1, 2048, 128])
transformed: rotary_mask.shape = torch.Size([1, 32, 2048, 110])
transformed: cos.shape = torch.Size([1, 32, 2048, 128])
transformed: sin.shape = torch.Size([1, 32, 2048, 128])
post-gather: cos.shape = torch.Size([1, 32, 2048, 110])
post-gather: sin.shape = torch.Size([1, 32, 2048, 110])
q_embed.shape =

2025-11-09 19:45:24,400 - INFO - Compressed model perplexity on WikiText2: 3620.45


q.shape = torch.Size([8, 32, 2048, 64]), k.shape = torch.Size([8, 32, 2048, 64])
rotary_mask.numel() = 2048
tensor([[[[57, 49, 36,  ..., 71, 72, 74]],

         [[60, 62, 59,  ..., 75, 76, 57]],

         [[60, 52, 59,  ..., 82, 77, 66]],

         ...,

         [[60, 61, 49,  ..., 77, 72, 51]],

         [[61, 60, 46,  ..., 73, 69, 62]],

         [[56, 63, 50,  ..., 79, 75, 57]]]], device='cuda:0')
seq_len = 2048, n_heads = 32, head_dims = 64
original: rotary_mask.shape = torch.Size([1, 32, 1, 64])
original: cos.shape = torch.Size([1, 2048, 128])
original: sin.shape = torch.Size([1, 2048, 128])
transformed: rotary_mask.shape = torch.Size([1, 32, 2048, 64])
transformed: cos.shape = torch.Size([1, 32, 2048, 128])
transformed: sin.shape = torch.Size([1, 32, 2048, 128])
post-gather: cos.shape = torch.Size([1, 32, 2048, 64])
post-gather: sin.shape = torch.Size([1, 32, 2048, 64])
q_embed.shape = torch.Size([8, 32, 2048, 64]), k_embed.shape = torch.Size([8, 32, 2048, 64])
q.shape = torch.S

In [None]:
logger.info(f"Compressed model perplexity on WikiText2: {compressed_ppl:.2f}")

2025-11-09 19:45:47,425 - INFO - Compressed model perplexity on WikiText2: 3620.45


In [None]:
a = torch.arange(16).view(4, 4)
a

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15]])

In [20]:
torch.sum(a, dim=1)

tensor([ 6, 22, 38, 54])

In [1]:
from model_utils import reload_compressed_model

import torch
from eval import compute_perplexity, load_calibration_texts, load_eval_texts

model, tokenizer = reload_compressed_model("./compressed_output/llama2-7b", device=0)
model.to("cuda:0")

eval_texts = load_eval_texts(
    2, model, tokenizer, batch_size=2
)



  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


mask_path = /blue/sgao1/cc22bc.fsu/prog/MoDeGPT/compressed_output/llama2-7b/rotary_masks.pt


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.64s/it]


In [None]:
compressed_ppl = compute_perplexity(model, tokenizer, eval_texts, device=0)

In [None]:
from compression_utils import get_Q_K_weights, get_V_O_weights
for i in range(32):
    W_q, W_k = get_Q_K_weights(model, i)
    W_v, W_o = get_V_O_weights(model, i)

    if (~torch.isfinite(W_q)).any():
        print(f"W_q has Nan/Inf")

    # Check W_k
    if (~torch.isfinite(W_k)).any():
        print(f"W_k has Nan/Inf")

    # Check W_v
    if (~torch.isfinite(W_v)).any():
        print(f"W_v has Nan/Inf")

    # Check W_o
    if (~torch.isfinite(W_o)).any():
        print(f"W_o has Nan/Inf")

In [None]:
a = torch.tensor([True, False], dtype=torch.bool)
if not a:
    print(f"hi")

RuntimeError: Boolean value of Tensor with more than one value is ambiguous