In [1]:
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F

import datasets
from datasets import load_dataset

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    LlamaTokenizerFast,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainingArguments
)

from transformers import GPTQConfig, BitsAndBytesConfig

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel,
    TaskType
)

from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
# Set the default matplotlib font size to 16 for figures drawn after this cell.
plt.rcParams['font.size'] = 16

# Sentinel label value. Presumably handed to the loss as `ignore_index`
# (-100 is the documented default of torch's cross-entropy) — TODO confirm at the use site.
IGNORE_INDEX = -100

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from transformers import OPTForCausalLM

# The checkpoint directory gets its own name so that `model` only ever refers to
# the loaded network, never to a path string.
opt_gptq_checkpoint = "/home/sparse_quant_methods/weights/opt350m_gptq_w4_a16"

# Load the checkpoint with its parameters held in bfloat16.
model = OPTForCausalLM.from_pretrained(opt_gptq_checkpoint, torch_dtype=torch.bfloat16)

In [13]:
# Display the key-projection weight of decoder layer 0 of `model`.
# The dotted path addresses the same submodule as the attribute chain
# model.model.decoder.layers[0].self_attn.k_proj.
model.get_submodule("model.decoder.layers.0.self_attn.k_proj").weight

Parameter containing:
tensor([[-0.0669, -0.1167, -0.0835,  ...,  0.0334,  0.0000, -0.1001],
        [ 0.0669,  0.1167,  0.0835,  ..., -0.0334,  0.0334,  0.0835],
        [ 0.0500,  0.1338,  0.0835,  ..., -0.0334,  0.0167,  0.0835],
        ...,
        [-0.0669,  0.0669,  0.0334,  ..., -0.0669,  0.0167,  0.0835],
        [ 0.0500,  0.0669,  0.0167,  ...,  0.0669, -0.0500, -0.0669],
        [-0.0669,  0.0669, -0.0167,  ..., -0.0669,  0.0167,  0.0669]],
       dtype=torch.bfloat16, requires_grad=True)

In [11]:
# Display the key-projection weight of decoder layer 0 of `model`.
# NOTE(review): the saved output of this cell carries execution count 11, i.e. it ran
# before the checkpoint load numbered 12, and its values differ from the identical cell
# numbered 13. Presumably a different checkpoint was bound to `model` at that time —
# confirm which one, and re-run the notebook top to bottom.
model.get_submodule("model.decoder.layers.0.self_attn.k_proj").weight

Parameter containing:
tensor([[-0.0659, -0.1113, -0.0879,  ...,  0.0640, -0.0153, -0.0859],
        [ 0.0586,  0.1182,  0.0894,  ..., -0.0498,  0.0315,  0.0791],
        [ 0.0466,  0.1250,  0.0845,  ..., -0.0337,  0.0084,  0.0884],
        ...,
        [-0.0640,  0.0640,  0.0334,  ..., -0.0635,  0.0344,  0.0625],
        [ 0.0562,  0.0654,  0.0208,  ...,  0.0623, -0.0625, -0.0625],
        [-0.0625,  0.0620, -0.0104,  ..., -0.0630,  0.0337,  0.0625]],
       dtype=torch.bfloat16, requires_grad=True)

In [2]:
# Local directory of the Llama-2-7b checkpoint in Hugging Face format.
model_name = "/home/LLaMA/huggingface/Llama-2-7b-hf"

# Keyword arguments prepared for a tokenizer loader; nothing is loaded in this cell.
# NOTE(review): `trust_remote_code=True` presumably permits running code shipped with
# the checkpoint — confirm it is actually needed for a local Llama-2 directory.
tokenizer_kwargs = dict(
    use_fast=True,
    revision='main',
    trust_remote_code=True,
)

In [None]:
python  /home/spars_quant/sparsegpt/opt.py \
    --model /home/LLaMA/huggingface/opt-350m \
    --dataset wikitext2 \
    --sparsity 0.5 \
    --wbits 4 \
    --save /home/sparse_quant_methods/weights/opt350m_sparsegpt_w4_a16

In [None]:
python /home/Quantization/smoothquant/examples/generate_act_scales.py \
    --model-name /home/LLaMA/huggingface/tulu-2-7b \
    --output-path /home/LLaMA/huggingface/act_scales/tulut-2-7b-hf.pt \
    --dataset-path /home/LLM_compression/outliers_identification/datasets/val.jsonl.zst

In [None]:
python /home/sparse_quant_methods/quik/experiments/fake_quant/opt.py \
    --model /home/LLaMA/huggingface/opt-350m \
    --path_to_act_scales /home/sparse_quant_methods/quik/experiments/act_scales/opt_350m.pt \
    --path_to_save_quant_model /home/sparse_quant_methods/weights/opt350m_w4_a16 \
    --fp_features 128 \
    --a_bits 16 \
    --w_bits 4 \
    --w_clip \
    --dataset wikitext2

In [None]:
python /home/sparse_quant_methods/wanda/main_opt.py \
    --model /home/LLaMA/huggingface/opt-350m \
    --prune_method wanda \
    --sparsity_ratio 0.5 \
    --sparsity_type unstructured \
    --save_model /home/sparse_quant_methods/weights/opt350m_wanda_50

In [3]:
python /home/sparse_quant_methods/wanda/main_opt.py \
    --model /home/LLaMA/huggingface/opt-350m \
    --prune_method sparsegpt \
    --sparsity_ratio 0.5 \
    --sparsity_type unstructured \
    --save_model /home/sparse_quant_methods/weights/opt350m_sparsegpt_50

SyntaxError: invalid decimal literal (2429943922.py, line 2)

In [None]:
lm_eval --model hf \
    --model_args "pretrained=/home/sparse_quant_methods/weights/opt350m_gptq_w4_a16" \
    --tasks winogrande \
    --batch_size 4 \
    --num_fewshot 0 \
    --device cuda

In [None]:
    (python /home/LLM_Compression/QUIK/experiments/fake_quant/llama.py --model /home/llm_compression/LLaMA/Llama-2-13b-hf --path_to_act_scales /home/LLM_Compression/QUIK/experiments/act_scales/Llama-2-13b-hf.pt --path_to_save_quant_model /home/llm_compression/Quantization/Quik/weights_llama13b/llama13b_3w_16a_quant_params --fp_features 128 --a_bits 16 --w_bits 3 --w_clip --dataset wikitext2)

In [None]:
!python /home/sparse_quant_methods/wanda/main.py --model /home/LLaMA/huggingface/Llama-2-7b-hf --prune_method wanda --sparsity_ratio 0.5 --sparsity_type unstructured --save ./wanda/llama7b_sparsity_50 --save_model ./wanda/llama7b_sparsity_50