In [1]:
# Copyright 2023 Ontocord.AI, Apache 2 License
# Create Use sparsification on a specific data distribution, and SVD to create Loras from sparsified network.

!git clone https://github.com/IST-DASLab/sparsegpt


txt = """Abraham Lincoln (/ˈlɪŋkən/ LINK-ən; February 12, 1809 – April 15, 1865) was an American lawyer, politician, and statesman who served as the 16th president of the United States from 1861 until his assassination in 1865. Lincoln led the Union through the American Civil War to defend the nation as a constitutional union and succeeded in abolishing slavery, bolstering the federal government, and modernizing the U.S. economy.

Lincoln was born into poverty in a log cabin in Kentucky and was raised on the frontier, primarily in Indiana. He was self-educated and became a lawyer, Whig Party leader, Illinois state legislator, and U.S. Congressman from Illinois. In 1849, he returned to his successful law practice in Springfield, Illinois. In 1854, he was angered by the Kansas–Nebraska Act, which opened the territories to slavery, and he re-entered politics. He soon became a leader of the new Republican Party. He reached a national audience in the 1858 Senate campaign debates against Stephen A. Douglas. Lincoln ran for president in 1860, sweeping the North to gain victory. Pro-slavery elements in the South viewed his election as a threat to slavery, and Southern states began seceding from the nation. During this time, the newly formed Confederate States of America began seizing federal military bases in the south. Just over one month after Lincoln assumed the presidency, the Confederate States attacked Fort Sumter, a U.S. fort in South Carolina. Following the bombardment, Lincoln mobilized forces to suppress the rebellion and restore the union.


Marriage and children

Lincoln had pledged in 1846 to serve only one term in the House. Realizing Clay was unlikely to win the presidency, he supported General Zachary Taylor for the Whig nomination in the 1848 presidential election.[85] Taylor won and Lincoln hoped in vain to be appointed Commissioner of the General Land Office.[86] The administration offered to appoint him secretary or governor of the Oregon Territory as consolation.[87] This distant territory was a Democratic stronghold, and acceptance of the post would have disrupted his legal and political career in Illinois, so he declined and resumed his law practice.[88]

Lincoln's second child was named"""

try:
  import accelerate, bitsandbytes
  from transformers import AutoTokenizer, AutoModelForCausalLM
except:
  !pip install -q transformers accelerate bitsandbytes
  !pip install -q datasets
  !pip install -q sentencepiece
  !pip install -q zstandard
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch



Cloning into 'sparsegpt'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 37 (delta 18), reused 9 (delta 9), pack-reused 11[K
Receiving objects: 100% (37/37), 21.78 KiB | 10.89 MiB/s, done.
Resolving deltas: 100% (18/18), done.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [2]:
%%writefile sparsegpt/datautils.py
import random

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, LlamaTokenizer


def set_seed(seed):
    np.random.seed(seed)
    torch.random.manual_seed(seed)

def get_tokenizer(model):
    if "llama" in model.lower():
        tokenizer = LlamaTokenizer.from_pretrained(model, use_fast=False)
        # fix for transformer 4.28.0.dev0 compatibility
        if tokenizer.bos_token_id != 1 or tokenizer.eos_token_id != 2:
            try:
                tokenizer.bos_token_id = 1
                tokenizer.eos_token_id = 2
            except AttributeError:
                pass
    else:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    return tokenizer

def get_wikitext2(nsamples, seed, seqlen, model, tokenizer):

    traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

    trainenc = tokenizer(" ".join(traindata['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc

def get_ptb(nsamples, seed, seqlen, model, tokenizer):
    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')

    trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
    testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc

import tqdm
def get_c4(nsamples, seed, seqlen, model, tokenizer):
    traindata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
    )
    valdata = load_dataset(
        'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation'
    )

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seqlen)]

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids
    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_generic(nsamples, seed, seqlen, model, tokenizer, dataset_name, train, validation):

    traindata = load_dataset(
        dataset_name, split=train,
    )
    valdata = load_dataset(
        dataset_name, split=validation,
    )
    random.seed(seed)
    trainloader = []
    for _ in tqdm.tqdm(range(nsamples)):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if trainenc.input_ids.shape[1] > seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt')
    valenc = valenc.input_ids[:, :(256 * seqlen)]

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids
    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc

def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model=''):
    tokenizer = get_tokenizer(model)
    if 'wikitext2' in name:
        return get_wikitext2(nsamples, seed, seqlen, model, tokenizer)
    elif 'ptb' in name:
        return get_ptb(nsamples, seed, seqlen, model, tokenizer)
    elif 'c4' in name:
        return get_c4(nsamples, seed, seqlen, model, tokenizer)
    else:
        name, train, validiation = name.split(",")
        return get_generic(nsamples, seed, seqlen, model, tokenizer, name, train, validiation)


Overwriting sparsegpt/datautils.py


In [3]:
%%writefile sparsegpt/llama.py
import time

import torch
import torch.nn as nn

from sparsegpt import *
from modelutils import *
from quant import *

try:
    import wandb
    has_wandb = True
except:
    has_wandb = False


def get_llama(model):
    import torch
    def skip(*args, **kwargs):
        pass
    torch.nn.init.kaiming_uniform_ = skip
    torch.nn.init.uniform_ = skip
    torch.nn.init.normal_ = skip
    from transformers import LlamaForCausalLM
    model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto')
    model.seqlen = 2048
    return model


@torch.no_grad()
def llama_sequential(model, dataloader, dev):
    print("Starting...")

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    model.model.norm = model.model.norm.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {"i": 0, "attention_mask": None}

    class Catcher(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache["i"]] = inp
            cache["i"] += 1
            cache["attention_mask"] = kwargs["attention_mask"]
            raise ValueError

    layers[0] = Catcher(layers[0])
    for batch in dataloader:
        try:
            model(batch[0].to(dev))
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    model.model.norm = model.model.norm.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache["attention_mask"]

    print("Ready.")

    quantizers = {}
    for i in range(len(layers)):
        layer = layers[i].to(dev)
        full = find_layers(layer)

        if args.true_sequential:
            sequential = [
                ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
                ["self_attn.o_proj"],
                ["mlp.up_proj", "mlp.gate_proj"],
                ["mlp.down_proj"],
            ]
        else:
            sequential = [list(full.keys())]

        for names in sequential:
            subset = {n: full[n] for n in names}

            gpts = {}
            for name in subset:
                if (
                    not (args.minlayer <= i < args.maxlayer and args.prune_only in name)
                ) == (not args.invert):
                    continue
                gpts[name] = SparseGPT(subset[name])
                if args.wbits < 16:
                    gpts[name].quantizer = Quantizer()
                    gpts[name].quantizer.configure(
                        args.wbits, perchannel=True, sym=False, mse=False
                    )

            def add_batch(name):
                def tmp(_, inp, out):
                    gpts[name].add_batch(inp[0].data, out.data)

                return tmp

            handles = []
            for name in subset:
                handles.append(subset[name].register_forward_hook(add_batch(name)))
            for j in range(args.nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
            for h in handles:
                h.remove()

            for name in subset:
                print(i, name)
                print("Pruning ...")
                sparsity = args.sparsity
                gpts[name].fasterprune(
                    sparsity,
                    prunen=args.prunen,
                    prunem=args.prunem,
                    percdamp=args.percdamp,
                    blocksize=args.blocksize,
                )
                gpts[name].free()

        for j in range(args.nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]

        layers[i] = layer.cpu()
        del layer
        del gpts
        torch.cuda.empty_cache()

        inps, outs = outs, inps

    model.config.use_cache = use_cache

    return quantizers


@torch.no_grad()
def llama_eval(model, testenc, dev,  dataset: str, log_wandb: bool = False):
    print("Evaluating ...")

    testenc = testenc.input_ids
    nsamples = testenc.numel() // model.seqlen

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )
    cache = {"i": 0, "attention_mask": None}

    class Catcher(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache["i"]] = inp
            cache["i"] += 1
            cache["attention_mask"] = kwargs["attention_mask"]
            raise ValueError

    layers[0] = Catcher(layers[0])
    for i in range(nsamples):
        batch = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)].to(dev)
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache["attention_mask"]

    for i in range(len(layers)):
        print(i)
        layer = layers[i].to(dev)

        if args.gmp:
            subset = find_layers(layer)
            for name in subset:
                W = subset[name].weight.data
                thresh = torch.sort(torch.abs(W.flatten()))[0][
                    int(W.numel() * args.sparsity)
                ]
                W.data[torch.abs(W.data) <= thresh] = 0

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        inps, outs = outs, inps

    if model.model.norm is not None:
        model.model.norm = model.model.norm.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.norm is not None:
            hidden_states = model.model.norm(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[:, (i * model.seqlen) : ((i + 1) * model.seqlen)][:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
        )
        neg_log_likelihood = loss.float() * model.seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
    print(f"Perplexity: {ppl.item():3f}")
    if log_wandb:
        wandb.log({f"{dataset}/perplexity": ppl.item()})

    model.config.use_cache = use_cache


if __name__ == "__main__":
    import argparse
    from datautils import *

    parser = argparse.ArgumentParser()

    parser.add_argument("model", type=str, help="LlaMA model to load")
    parser.add_argument(
        "dataset",
        type=str,
        #choices=["wikitext2", "ptb", "c4"],
        help="Where to extract calibration data from.",
    )
    parser.add_argument(
        "--seed", type=int, default=0, help="Seed for sampling the calibration data."
    )
    parser.add_argument(
        "--nsamples", type=int, default=128, help="Number of calibration data samples."
    )
    parser.add_argument(
        "--percdamp",
        type=float,
        default=0.01,
        help="Percent of the average Hessian diagonal to use for dampening.",
    )
    parser.add_argument("--sparsity", type=float, default=0, help="Target sparsity")
    parser.add_argument("--prunen", type=int, default=0, help="N for N:M pruning.")
    parser.add_argument("--prunem", type=int, default=0, help="M for N:M pruning.")
    parser.add_argument(
        "--blocksize",
        type=int,
        default=128,
        help="Blocksize to use for adaptive mask selection.",
    )
    parser.add_argument(
        "--gmp", action="store_true", help="Whether to run the GMP baseline."
    )
    parser.add_argument(
        "--wbits", type=int, default=16, help="Whether to quantize as well."
    )
    parser.add_argument(
        "--minlayer", type=int, default=-1, help="Prune all layers with id >= this."
    )
    parser.add_argument(
        "--maxlayer", type=int, default=1000, help="Prune all layers with id < this."
    )
    parser.add_argument(
        "--prune_only",
        type=str,
        default="",
        help="Prune only layers that contain this text.",
    )
    parser.add_argument("--invert", action="store_true", help="Invert subset.")
    parser.add_argument("--save", type=str, default="", help="Path to saved model.")
    parser.add_argument(
        "--true-sequential",
        action="store_true",
        help="Whether to run in true sequential model.",
    )
    parser.add_argument(
        "--log_wandb", action="store_true", help="Whether to log to wandb."
    )

    args = parser.parse_args()

    # init W&B logging
    if args.log_wandb:
        assert has_wandb, "wandb not installed try `pip install wandb`"
        wandb.init(config=args)

    model = get_llama(args.model)
    model.eval()

    dataloader, testloader = get_loaders(
        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
    )

    if (args.sparsity or args.prunen) and not args.gmp:
        tick = time.time()
        llama_sequential(model, dataloader, DEV)
        for n, p in model.named_parameters():
            print(n, torch.mean((p == 0).float()))
            if 'down_proj' in n:
                break
        print(time.time() - tick)

    for dataset in ["wikitext2", "ptb", "c4"]:
        dataloader, testloader = get_loaders(
            dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
        )
        print("Dataset:", dataset)
        llama_eval(model, testloader, DEV, dataset, args.log_wandb)

    if args.save:
        model.save_pretrained(args.save)


Overwriting sparsegpt/llama.py


In [4]:
import torch
import torch.nn as nn

class LoraLinear(nn.Linear):
  def __init__(self, in_features, out_features, bias, linear, lora):
    super().__init__(in_features, out_features, bias)
    self.weight.data = linear.weight.data
    if bias:
      self.bias.data = linear.bias.data
    self.lora = lora

  def forward(self, input_tensor):
    #print(f"input_tensor.shape = {input_tensor.shape}")
    out = super().forward(input_tensor)
    #print(f"out.shape = {out.shape} , self.lora(input_tensor).shape = {self.lora(input_tensor).shape}")
    out = (out + self.lora(input_tensor))/2.0
    return out

In [5]:
def create_factorized_compression_for_linear(source_linear, rank=None, rank_factor=0.3,  dtype=torch.float32):
    with torch.no_grad():
      if rank is None:
        rank = max(1, int(min(source_linear.weight.shape)*rank_factor))
      if hasattr(source_linear, 'bias'):
        bias = source_linear.bias
      else:
        bias = None
      source_linear = source_linear.weight.data
      device=source_linear.device
      assert rank < min(source_linear.shape)
      source_linear = source_linear.float()
      U, S, Vh = torch.linalg.svd(source_linear)
      U = U[:, :rank]
      S = S[:rank]
      U = U @ torch.diag(S)
      Vh = Vh[:rank, :]
      U_flatten = U.flatten()
      Vh_flatten = Vh.flatten()
      max_quant_size = 2^23
      #print ("ranked")
      if len(U_flatten) + len(Vh_flatten) >= max_quant_size:
        dist2 = U_flatten[:min(len(U_flatten), max_quant_size)]
        dist3 = Vh_flatten[:min(len(Vh_flatten), max_quant_size)]
        hi_val = max(torch.quantile(dist3, 1), torch.quantile(dist2, 1))
      else:
        dist = torch.cat([U_flatten, Vh_flatten])
        hi_val = torch.quantile(dist, 1)
      low_val = -hi_val
      #print ("quantile")
      U = U.clamp(low_val, hi_val)
      Vh = Vh.clamp(low_val, hi_val)
      #print ("clammped")
      print(f"U.shape = {U.shape}")
      print(f"Vh.shape = {Vh.shape}")

      lora_down = nn.Linear(Vh.shape[1], Vh.shape[0], dtype=dtype, bias=False, device=source_linear.device)
      lora_up = nn.Linear(U.shape[1], U.shape[0], dtype=dtype, bias=bias is not None, device=source_linear.device)
      #print ("Set up linear")
      lora_up.weight.data = U.to(device=device, dtype=dtype)
      lora_down.weight.data = Vh.to(device=device, dtype=dtype)
      if bias is not None:
        lora_up.bias = nn.Parameter(bias.clone())
      return nn.Sequential(lora_down, lora_up)

In [6]:
def lord_decompose(layer, proxy_data, rank):
    """
    Be aware when performing LoRD/AFM on Square Matrices:

        In square matrices, input and output dimensions are coupled so compression is more challenging.
        The optimal low rank approximation may differ significantly from the original matrix due to the coupling.
        Square matrices have less intrinsic redundancy between inputs and outputs to exploit.
        Decomposing square matrices risks distorting dimensions that interact in complex ways.
        The approximation error of AFM tends to be lowest for tall matrices and highest for square ones.
        For square matrices, it can help to decompose blocks of interactions rather than the whole matrix.

    Paper reference:
        Low Rank Decomposition Of Monolingual Code LLMs For One-Shot Compression
        ( http://arxiv.org/abs/2309.14021 )

    Credit: AI chatbot
    """

    y = layer(proxy_data) # Forward proxy data

    cov_y = torch.cov(y.T) # Compute output covariance
    cov_y = cov_y.float()

    # The matrix to be eigendecomposed is symmetric,
    # so we can use torch.linalg.eigh instead of torch.linalg.eig
    # We shouldn't get imaginary part.
    eigenvalues, eigenvectors = torch.linalg.eigh(cov_y)

    # Take top rank eigenvectors in descending order
    # selects the last rank indices, which correspond to the largest rank values.
    top_idx = torch.argsort(eigenvalues, descending=True)
    U = eigenvectors[:, top_idx]

    # Convert layer weight to complex before decomposition, needed if using torch.linalg.eig()
    #layer.weight = nn.Parameter(layer.weight.to(torch.complex64))
    layer.weight = nn.Parameter(layer.weight.to(torch.float32))

    # Decompose
    w1 = U.T @ layer.weight
    w2 = U
    #print(f"w1.shape = {w1.shape}")
    #print(f"w2.shape = {w2.shape}")

    # Create LoRD layers
    lord_up = nn.Linear(w2.shape[1], w2.shape[0])
    lord_down = nn.Linear(w1.shape[1], w1.shape[0])

    lord_up.weight.data = w2
    lord_down.weight.data = w1
    #lord_up.weight.data = torch.real(torch.abs(w2))
    #lord_down.weight.data = torch.real(torch.abs(w1))

    return nn.Sequential(lord_down, lord_up)

In [7]:
from typing import Optional, Tuple, Union
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.models.gpt_neox.configuration_gpt_neox import *


model_name = "EleutherAI/pythia-410m" #"EleutherAI/pythia-70m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True ).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)
#model.embed_out  = create_factorized_compression_for_linear(model.embed_out, rank_factor=0.9).cuda().to(torch.bfloat16)
#model.gpt_neox.embed_in = create_factorized_compression_for_linear(model.gpt_neox.embed_in, rank_factor=0.2).cuda().to(torch.bfloat16)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [8]:
# Generate proxy dataset

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


proxy_texts = [
"The cat sat on the mat.",
"The quick brown fox jumps over the lazy dog.",
"It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.",
"In my younger and more vulnerable years my father gave me some advice that I've been turning over in my mind ever since.",
"Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.",
"In a hole in the ground there lived a hobbit.",
"Happy families are all alike; every unhappy family is unhappy in its own way.",
"It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of light, it was the season of darkness, it was the spring of hope, it was the winter of despair.",
"I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.",
"In the beginning God created the heaven and the earth.",
"To be, or not to be: that is the question.",
"Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.",
"We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.",
"I only know that while I am asleep and you are awake, we are fine.",
"The Answer to the Great Question Of...Life, the Universe and Everything...Is...Forty-two,' said Deep Thought, with infinite majesty and calm.",
"Not all those who wander are lost.",
"I took a deep breath and listened to the old brag of my heart. I am, I am, I am.",
"If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
"Don't cry because it's over, smile because it happened.",
"You have brains in your head. You have feet in your shoes. You can steer yourself any direction you choose.",
"Think left and think right and think low and think high. Oh, the thinks you can think up if only you try!",
"Everything you can imagine is real.",
"We accept the love we think we deserve.",
"May the odds be ever in your favor.",
"Keep your face always toward the sunshine - and shadows will fall behind you.",
"Be kind whenever possible. It is always possible.",
"Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that.",
"Don't walk behind me; I may not lead. Don't walk in front of me; I may not follow. Just walk beside me and be my friend.",
"You've gotta dance like there's nobody watching, love like you'll never be hurt, sing like there's nobody listening, and live like it's heaven on earth.",
"You know you're in love when you can't fall asleep because reality is finally better than your dreams.",
"Being deeply loved by someone gives you strength, while loving someone deeply gives you courage.",
"It matters not what someone is born, but what they grow to be.",
"Love looks not with the eyes, but with the mind.",
"We are what we repeatedly do. Excellence, then, is not an act, but a habit.",
"The mind is everything. What you think you become.",
"Simplicity is the ultimate sophistication.",
"Whatever you do, do it well.",
"What we think, we become.",
"Change will not come if we wait for some other person or some other time. We are the ones we've been waiting for. We are the change that we seek.",
"The question isn't who is going to let me; it's who is going to stop me.",
"The best and most beautiful things in the world cannot be seen or even touched - they must be felt with the heart.",
"It's no use going back to yesterday, because I was a different person then.",
"Logic will get you from A to Z; imagination will get you everywhere.",
"One small step for man, one giant leap for mankind.",
"Either write something worth reading or do something worth writing.",
"You can never cross the ocean until you have the courage to lose sight of the shore.",
"A room without books is like a body without a soul.",
"You only live once, but if you do it right, once is enough.",
"Be who you are and say what you feel, because those who mind don't matter and those who matter don't mind.",
"Everybody is a genius. But if you judge a fish by its ability to climb a tree, it will live its whole life believing that it is stupid.",
"Do what you can, with what you have, where you are.",
"Do not go where the path may lead, go instead where there is no path and leave a trail.",
"There is no greater agony than bearing an untold story inside you.",
"If you want to make peace with your enemy, you have to work with your enemy. Then he becomes your partner.",
"I have always imagined that Paradise will be a kind of library.",
"I am no bird; and no net ensnares me: I am a free human being with an independent will.",
"Beware; for I am fearless, and therefore powerful.",
"The truth is, everyone is going to hurt you. You just got to find the ones worth suffering for.",
"Not all those who wander are lost.",
"I am the wisest man alive, for I know one thing, and that is that I know nothing.",
"It is never too late to be what you might have been.",
"We must use time wisely and forever realize that the time is always ripe to do right.",
"Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that.",
"The truth is rarely pure and never simple.",
"The trouble with having an open mind, of course, is that people will insist on coming along and trying to put things in it.",
"If you hear a voice within you say 'you cannot paint,' then by all means paint and that voice will be silenced.",
"Act as if what you do makes a difference. It does.",
"Do what you can, with what you have, where you are.",
"Happiness can be found, even in the darkest of times, if one only remembers to turn on the light.",
"The best and most beautiful things in the world cannot be seen or even touched - they must be felt with the heart.",
"The opposite of love is not hate, it's indifference.",
"Always do what is right. It will gratify half of mankind and astound the other.",
"Tell the truth, work hard, and come to dinner on time.",
"Courage is found in unlikely places.",
"The truth is you don't know what is going to happen tomorrow. Life is a crazy ride, and nothing is guaranteed.",
"You never really understand a person until you consider things from his point of view.",
"Do one thing every day that scares you.",
"Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that.",
"Not all those who wander are lost.",
"I have not failed. I've just found 10,000 ways that won't work.",
"Love is that condition in which the happiness of another person is essential to your own.",
"You are confined only by the walls you build yourself.",
"The question isn't who is going to let me; it's who is going to stop me.",
"There is some good in this world, and it's worth fighting for.",
"Words are, in my not-so-humble opinion, our most inexhaustible source of magic.",
"It matters not what someone is born, but what they grow to be.",
"I fear not the man who has practiced 10,000 kicks once, but I fear the man who has practiced one kick 10,000 times.",
"You have power over your mind, not outside events. Realize this and you will find strength.",
"Knowing what must be done does away with fear.",
"Life is really simple, but we insist on making it complicated.",
"You miss 100% of the shots you don't take.",
"Life isn't about finding yourself. Life is about creating yourself.",
"Simplicity is the ultimate sophistication.",
"In the end, we will remember not the words of our enemies, but the silence of our friends.",
"Never let the fear of striking out keep you from playing the game.",
"You can do anything, but not everything.",
"The truth is, everyone is going to hurt you. You just got to find the ones worth suffering for.",
"True terror is to wake up one morning and discover that your high school class is running the country.",
"Remember that happiness is a way of travel, not a destination.",
"If you can't explain it simply, you don't understand it well enough.",
"You are never too old to set another goal or to dream a new dream.",
"Our lives begin to end the day we become silent about things that matter.",
"I learned that courage was not the absence of fear, but the triumph over it.",
"You may not control all the events that happen to you, but you can decide not to be reduced by them.",
"The future belongs to those who believe in the beauty of their dreams.",
"It is during our darkest moments that we must focus to see the light.",
"Whoever is happy will make others happy too.",
"Do not go where the path may lead, go instead where there is no path and leave a trail.",
"You will face many defeats in life, but never let yourself be defeated.",
"In the end, it's not the years in your life that count. It's the life in your years.",
"Life is what happens to us while we are making other plans.",
"The only impossible journey is the one you never begin.",
"Your time is limited, so don't waste it living someone else's life.",
"No one can make you feel inferior without your consent.",
"What you do makes a difference, and you have to decide what kind of difference you want to make.",
"The only way to do great work is to love what you do.",
"If you look at what you have in life, you'll always have more. If you look at what you don't have in life, you'll never have enough.",
"When one door of happiness closes, another opens; but often we look so long at the closed door that we do not see the one which has been opened for us.",
"Life isn't about getting and having, it's about giving and being.",
"Strive not to be a success, but rather to be of value.",
"The best and most beautiful things in the world cannot be seen or even touched - they must be felt with the heart.",
"In the end, we will remember not the words of our enemies, but the silence of our friends.",
"Not all those who wander are lost.",
"It does not do to dwell on dreams and forget to live.",
"To the well-organized mind, death is but the next great adventure.",
"Of course it is happening inside your head, Harry, but why on earth should that mean that it is not real?",
"Knowing what must be done does away with fear.",
"You only live once, but if you do it right, once is enough.",
"Be the change that you wish to see in the world.",
"Happiness can be found, even in the darkest of times, if one only remembers to turn on the light.",
"We are what we repeatedly do. Excellence, then, is not an act, but a habit.",
"There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.",
"You have to write the book that wants to be written. And if the book will be too difficult for grown-ups, then you write it for children.",
"It is never too late to be what you might have been.",
"A room without books is like a body without a soul.",
"You've gotta dance like there's nobody watching.",
"It matters not what someone is born, but what they grow to be.",
"Don't walk behind me; I may not lead. Don't walk in front of me; I may not follow. Just walk beside me and be my friend.",
"You know you're in love when you can't fall asleep because reality is finally better than your dreams.",
"Stay afraid, but do it anyway. What's important is the action. You don't have to wait to be confident. Just do it and eventually the confidence will follow.",
"Love all, trust a few, do wrong to none.",
"The truth is rarely pure and never simple.",
"Be bold. If not you, who else?",
"We gotta dance our way to freedom.",
"Don't talk about your problems. Eighty percent of people don't care; the other twenty percent will think you deserve them.",
"Friendship marks a life even more deeply than love. Love risks degenerating into obsession, friendship is never anything but sharing.",
"You cannot find peace by avoiding life.",
"Always forgive your enemies - nothing annoys them so much.",
"However difficult life may seem, there is always something you can do and succeed at.",
"Keep love in your heart. A life without it is like a sunless garden when the flowers are dead.",
"You cannot swim for new horizons until you have courage to lose sight of the shore.",
"You gain strength, courage, and confidence by every experience in which you really stop to look fear in the face.",
"I can't give you a sure-fire formula for success, but I can give you a formula for failure: try to please everybody all the time.",
"The universe is change; our life is what our thoughts make it.",
"No one is useless in this world who lightens the burdens of another.",
"Don't let the noise of others' opinions drown out your own inner voice.",
"Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that.",
"Either write something worth reading or do something worth writing about."
# Add more samples from diverse books, speeches, genres etc.
]

In [9]:
USE_LORD = 1

for param in model.parameters():
  param.requires_grad = False

if USE_LORD:
  rank_factor=0.5

  print(f"len(model.gpt_neox.layers) = {len(model.gpt_neox.layers)}")

  #for layer in model.gpt_neox.layers:
  for i in range(0, len(model.gpt_neox.layers), 2):
    layer = model.gpt_neox.layers[i]

    print(f"now processing layer [{i}]")

    input_dim = layer.attention.dense.in_features
    rank = int(input_dim * rank_factor)

    input_dim_4h = layer.mlp.dense_4h_to_h.in_features
    rank_4h = int(input_dim_4h * rank_factor)


    max_length_dense_h_to_4h = layer.mlp.dense_h_to_4h.in_features # Max sequence length
    proxy_data_h = tokenizer(proxy_texts, padding="max_length", truncation=True, max_length=max_length_dense_h_to_4h, return_tensors="pt").to("cuda")
    proxy_data_h_bf16 = proxy_data_h['input_ids'].to(torch.bfloat16)

    max_length_dense_4h_to_h = layer.mlp.dense_4h_to_h.in_features # Max sequence length
    proxy_data_4h = tokenizer(proxy_texts, padding="max_length", truncation=True, max_length=max_length_dense_4h_to_h, return_tensors="pt").to("cuda")
    proxy_data_4h_bf16 = proxy_data_4h['input_ids'].to(torch.bfloat16)


    layer.attention.dense = LoraLinear(layer.attention.dense.in_features, layer.attention.dense.out_features, layer.attention.dense.bias is not None,  layer.attention.dense, \
                                      lord_decompose(layer.attention.dense, proxy_data_h_bf16, rank)).cuda().to(torch.bfloat16)

    #layer.attention.query_key_value = LoraLinear(layer.attention.query_key_value.in_features, layer.attention.query_key_value.out_features, layer.attention.query_key_value.bias is not None, layer.attention.query_key_value, \
    #                                   lord_decompose(layer.attention.query_key_value, proxy_data, rank)).cuda().to(torch.bfloat16)

    layer.mlp.dense_h_to_4h = LoraLinear(layer.mlp.dense_h_to_4h.in_features, layer.mlp.dense_h_to_4h.out_features, layer.mlp.dense_h_to_4h.bias is not None, layer.mlp.dense_h_to_4h, \
                                      lord_decompose(layer.mlp.dense_h_to_4h, proxy_data_h_bf16, rank)).cuda().to(torch.bfloat16)

    layer.mlp.dense_4h_to_h = LoraLinear(layer.mlp.dense_4h_to_h.in_features, layer.mlp.dense_4h_to_h.out_features, layer.mlp.dense_4h_to_h.bias is not None, layer.mlp.dense_4h_to_h, \
                                      lord_decompose(layer.mlp.dense_4h_to_h, proxy_data_4h_bf16, rank_4h)).cuda().to(torch.bfloat16)

else:

  #for layer in model.gpt_neox.layers:
  for i in range(0, len(model.gpt_neox.layers), 2):
    layer = model.gpt_neox.layers[i]

    print(f"now processing layer [{i}]")

    layer.attention.dense = LoraLinear(layer.attention.dense.in_features, layer.attention.dense.out_features, layer.attention.dense.bias is not None,  layer.attention.dense, \
                                      create_factorized_compression_for_linear(layer.attention.dense, rank_factor=0.2)).cuda().to(torch.bfloat16)

    #layer.attention.query_key_value = LoraLinear(layer.attention.query_key_value.in_features, layer.attention.query_key_value.out_features, layer.attention.query_key_value.bias is not None, layer.attention.query_key_value, \
    #                                   create_factorized_compression_for_linear(layer.attention.query_key_value, rank_factor=0.5)).cuda().to(torch.bfloat16)

    layer.mlp.dense_h_to_4h = LoraLinear(layer.mlp.dense_h_to_4h.in_features, layer.mlp.dense_h_to_4h.out_features, layer.mlp.dense_h_to_4h.bias is not None, layer.mlp.dense_h_to_4h, \
                                      create_factorized_compression_for_linear(layer.mlp.dense_h_to_4h, rank_factor=0.5)).cuda().to(torch.bfloat16)

    layer.mlp.dense_4h_to_h = LoraLinear(layer.mlp.dense_4h_to_h.in_features, layer.mlp.dense_4h_to_h.out_features, layer.mlp.dense_4h_to_h.bias is not None, layer.mlp.dense_4h_to_h, \
                                      create_factorized_compression_for_linear(layer.mlp.dense_4h_to_h, rank_factor=0.5)).cuda().to(torch.bfloat16)


len(model.gpt_neox.layers) = 24
now processing layer [0]
now processing layer [2]
now processing layer [4]
now processing layer [6]
now processing layer [8]
now processing layer [10]
now processing layer [12]
now processing layer [14]
now processing layer [16]
now processing layer [18]
now processing layer [20]
now processing layer [22]


In [10]:
input_ids = tokenizer(txt, return_tensors="pt").to("cuda")
print(f"input_ids.input_ids.shape = {input_ids.input_ids.shape}")

with torch.no_grad():
  print(tokenizer.batch_decode(model.generate(**input_ids,  no_repeat_ngram_size=2, repetition_penalty=1.1, min_length=input_ids.input_ids.shape[1]+256, max_new_tokens=512))[0])


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


input_ids.input_ids.shape = torch.Size([1, 477])
Abraham Lincoln (/ˈlɪŋkən/ LINK-ən; February 12, 1809 – April 15, 1865) was an American lawyer, politician, and statesman who served as the 16th president of the United States from 1861 until his assassination in 1865. Lincoln led the Union through the American Civil War to defend the nation as a constitutional union and succeeded in abolishing slavery, bolstering the federal government, and modernizing the U.S. economy.

Lincoln was born into poverty in a log cabin in Kentucky and was raised on the frontier, primarily in Indiana. He was self-educated and became a lawyer, Whig Party leader, Illinois state legislator, and U.S. Congressman from Illinois. In 1849, he returned to his successful law practice in Springfield, Illinois. In 1854, he was angered by the Kansas–Nebraska Act, which opened the territories to slavery, and he re-entered politics. He soon became a leader of the new Republican Party. He reached a national audience in the 

In [11]:
model_orig = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True ).cuda()
print('compression', sum(x.shape[0]*x.shape[1] if len(x.shape) == 2 else x.shape[0] for x in model.parameters())/ sum(x.shape[0]*x.shape[1] if len(x.shape) == 2 else x.shape[0] for x in model_orig.parameters()))

compression 1.8385333245754534


In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 453206016 || all params: 745220096 || trainable%: 60.81505563693226


In [13]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): LoraLinear(
            in_features=1024, out_features=1024, bias=True
            (lora): Sequential(
              (0): Linear(in_features=1024, out_features=1024, bias=True)
              (1): Linear(in_features=1024, out_features=1024, bias=True)
            )
          )
          (attention_dropout): Dropout(p=0.0, inpl

In [14]:
model_orig

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  