In [1]:
%load_ext autoreload
%autoreload 2

# Stable imports (won't reload)
import logging
import os

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch
from dataclasses import dataclass
from typing import Optional

# Force reload of project modules
import importlib
import sys

modules_to_reload = [
    'calibration', 'compress_qk', 'compress_mlp', 'compress_vo',
    'compression_utils', 'eval', 'model_utils', 'patchers.patch'
]

for module_name in modules_to_reload:
    if module_name in sys.modules:
        del sys.modules[module_name]

from calibration import get_model_attrs, load_calibs
from compress_qk import compress_qk, compress_qk_svd
from compress_mlp import compress_mlp
from compress_vo import compress_vo
from compression_utils import allocate_global_sparsity
from eval import compute_perplexity, load_calibration_texts, load_eval_texts
from model_utils import load_model, reload_compressed_model, save_compressed_model, save_model
from patchers.patch import patch_config

logger = logging.getLogger("MoDeGPT")
logger.setLevel(logging.INFO)
if not logger.handlers:
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)
    os.makedirs("logs", exist_ok=True)
    file = logging.FileHandler("logs/run_modegpt.log")
    file.setFormatter(formatter)
    logger.addHandler(file)


@dataclass
class Config:
    model: str = "meta-llama/Llama-2-7b-hf"
    compression_ratio: float = 0.4
    calib_size: str = "16"
    eval_size: str = "8"
    output_dir: str = "./compressed_output/llama2-7b"
    device: int = 0
    skip: str = "mlp"
    local_model_path: str = ""
    load_calibs_from: str = "./calibs/llama2-7b_sz16.pt"
    calibs_save_path: str = ""
    calibs_batch_size: int = 8

  from .autonotebook import tqdm as notebook_tqdm


over here too


In [2]:
args = Config(
    model="meta-llama/Llama-2-7b-hf",
    compression_ratio=0.4,
    calib_size="16",
    eval_size="8",
    output_dir="./compressed_output/llama2-7b",
    device=0,
    skip="mlp",
    load_calibs_from="./calibs/llama2-7b_sz16.pt",
    calibs_batch_size=8,
)

## Setup

In [3]:
model, tokenizer, config = load_model(args.model, device=args.device)

2025-11-09 15:10:07,380 - INFO - Loading model from: meta-llama/Llama-2-7b-hf


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.59it/s]
2025-11-09 15:10:25,291 - INFO - ✔ Loaded model on cuda:0 with float16.
2025-11-09 15:10:25,292 - INFO - No pad_token found. Set pad_token = eos_token.


In [4]:
calib_texts, eval_texts, cov_mlp, cov_q, cov_k, cov_x, bi_scores, layer_keep_ratios = (
    None,
    None,
    None,
    None,
    None,
    None,
    None,
    None,
)

In [5]:
rotary_mask = None

In [6]:
model_compressed, tokenizer_compressed = None, None

In [11]:
def loader():
    global calib_texts, eval_texts, cov_mlp, cov_q, cov_k, cov_x, bi_scores, layer_keep_ratios
    logger.info("Loading calibration and evaluation texts...")
    calib_texts = load_calibration_texts(
        args.calib_size, model, tokenizer, batch_size=int(args.calibs_batch_size)
    )
    eval_texts = load_eval_texts(
        args.eval_size, model, tokenizer, batch_size=args.calibs_batch_size
    )

    cov_mlp, cov_q, cov_k, cov_x, bi_scores = load_calibs(
        model,
        tokenizer,
        calib_texts,
        int(args.calibs_batch_size),
        load_calibs_from=args.load_calibs_from,
        calibs_save_path=args.calibs_save_path,
    )

    layer_keep_ratios = allocate_global_sparsity(
        bi_scores, compression_ratio=args.compression_ratio
    )


def compress():
    global rotary_mask

    slice_dims = True
    ridge_lambda = 1e-2

    # compress_mlp(
    #     model=model,
    #     cov=cov_mlp,
    #     keep_ratios=layer_keep_ratios,
    #     ridge_lambda=1e-3,
    #     slice_dims=True,
    # )

    logger.info("Compress QK")
    rotary_mask = compress_qk(
        model=model,
        cov=(cov_q, cov_k),
        keep_ratios=layer_keep_ratios,
        ridge_lambda=ridge_lambda,
        slice_dims=slice_dims,
    )

    logger.info("Compress VO")
    compress_vo(
        model=model,
        cov=cov_x,
        keep_ratios=layer_keep_ratios,
        ridge_lambda=ridge_lambda,
        slice_dims=slice_dims,
    )


def reload_save_comp():
    global model, model_compressed, tokenizer_compressed

    og_config = patch_config(model)

    rebuild_path = "./patchers/LlamaRebuild.py"

    if model_compressed is not None:
        torch.cuda.empty_cache()

    save_compressed_model(
        model,
        tokenizer,
        rotary_masks=rotary_mask,
        rebuild_path=rebuild_path,
        save_dir=args.output_dir,
        source_model_name=args.model,
    )

    model.config = og_config

    model_compressed, tokenizer_compressed = reload_compressed_model(args.output_dir)

In [None]:
# loader()
# compress()
reload_save_comp()

# model_compressed.cuda()
# compressed_ppl = compute_perplexity(
#     model_compressed, tokenizer_compressed, eval_texts, device=args.device
# )
# logger.info(f"Compressed model perplexity on WikiText2: {compressed_ppl:.2f}")

2025-11-09 15:18:42,074 - INFO - n_layers=32, n_heads=32, d_model=4096, head_dim=128


2025-11-09 15:19:04,413 - INFO - ✔ Model, tokenizer, and tokenizer_source.txt saved to ./compressed_output/llama2-7b
2025-11-09 15:19:04,413 - INFO - Reloading compressed model from: ./compressed_output/llama2-7b


mask_path = /blue/sgao1/cc22bc.fsu/prog/MoDeGPT/compressed_output/llama2-7b/rotary_masks.pt


Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 147.07it/s]
2025-11-09 15:19:09,003 - INFO - ✔ Reloaded compressed model to cuda:0 successfully.


1
inputs['input_ids'].shape = torch.Size([8, 2048])
q.shape = torch.Size([8, 32, 2048, 110]), k.shape = torch.Size([8, 32, 2048, 110])
rotary_mask.numel() = 3520
tensor([[[[ 47,  48,  41,  ...,  71,  87,  81]],

         [[ 31,  40,  22,  ...,  93,  66, 100]],

         [[ 47,  61,  63,  ...,  86,  84,  73]],

         ...,

         [[ 52,  58,  57,  ...,  95,  74,  75]],

         [[ 19,   2,  12,  ..., 115, 111, 114]],

         [[  2,   0,   5,  ...,  70,  90, 116]]]], device='cuda:0')
seq_len = 2048, n_heads = 32, head_dims = 110
original: rotary_mask.shape = torch.Size([1, 32, 1, 110])
original: cos.shape = torch.Size([1, 2048, 128])
original: sin.shape = torch.Size([1, 2048, 128])


RuntimeError: The expanded size of the tensor (110) must match the existing size (128) at non-singleton dimension 3.  Target sizes: [-1, 32, -1, 110].  Tensor sizes: [1, 1, 2048, 128]