In [1]:
import os
os.environ["OPTIMUM_GPTQ_DISABLE_TRITON_KERNEL"] = "1"  

from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
gptq_config = GPTQConfig(
    bits=4, group_size=128, dataset="c4", tokenizer=tokenizer,
    use_triton=False         
)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    device_map="auto",
    torch_dtype="float16",
    quantization_config=gptq_config
)

print("✓ Модель отквантована")
model.save_pretrained("opt-125m-gptq")
tokenizer.save_pretrained("opt-125m-gptq")

  from .autonotebook import tqdm as notebook_tqdm



[32mINFO[0m  ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.
[32mINFO[0m  ENV: Auto setting CUDA_DEVICE_ORDER=PCI_BUS_ID for correctness.          


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]INFO:optimum.gptq.quantizer:Start quantizing block model.decoder.layers 1/12
INFO:optimum.gptq.quantizer:Module to quantize [['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.q_proj'], ['self_attn.out_proj'], ['fc1'], ['fc2']]
INFO:optimum.gptq.quantizer:Quantizing self_attn.k_proj in block 1/12...
INFO:optimum.gptq.quantizer:Quantizing self_attn.v_proj in block 1/12...
INFO:optimum.gptq.quantizer:Quantizing self_attn.q_proj in block 1/12...
INFO:optimum.gptq.quantizer:Quantizing self_attn.out_proj in block 1/12...
INFO:optimum.gptq.quantizer:Quantizing fc1 in block 1/12...
INFO:optimum.gptq.quantizer:Quantizing fc2 in block 1/12...
Quantizing model.decoder.layers blocks :   8%|▊         

[32mINFO[0m  Packing Kernel: Auto-selection: adding candidate `TritonV2QuantLinear`   


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.q_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.self_attn.v_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.0.fc1
INFO:optimum.gptq.quantizer:model.decoder.layers.0.fc2
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.out_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.q_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.self_attn.v_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.1.fc1
INFO:optimum.gptq.quantizer:model.decoder.layers.1.fc2
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_attn.k_proj
INFO:optimum.gptq.quantizer:model.decoder.layers.2.self_att

[32mINFO[0m  Optimize: `TritonV2QuantLinear` compilation triggered.                   
✓ Модель отквантована


('opt-125m-gptq/tokenizer_config.json',
 'opt-125m-gptq/special_tokens_map.json',
 'opt-125m-gptq/vocab.json',
 'opt-125m-gptq/merges.txt',
 'opt-125m-gptq/added_tokens.json',
 'opt-125m-gptq/tokenizer.json')

In [2]:
import torch
import time
from datasets import load_dataset
import math 
import os
from tqdm import tqdm 

In [3]:
quantized_model_dir = "opt-125m-gptq" 

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
if tokenizer.pad_token is None:

    tokenizer.pad_token = tokenizer.eos_token 
    print("Tokenizer pad_token set to eos_token.")

print(f"Tokenizer '{quantized_model_dir}' loaded.")


Tokenizer 'opt-125m-gptq' loaded.


In [4]:
def get_folder_size_mb(folder_path):
    total_size_bytes = 0
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total_size_bytes += os.path.getsize(fp)
    return total_size_bytes / (1024 * 1024) 

model_disk_size_mb = get_folder_size_mb(quantized_model_dir)
print(f"Quantized model size on disk: {model_disk_size_mb:.2f} MB")

Quantized model size on disk: 123.91 MB


In [None]:
def calculate_perplexity(model, tokenizer, dataset_name="wikitext", dataset_config="wikitext-2-raw-v1", split="test", device="cpu"):
    """
    Calculates perplexity of a model on a given dataset using a sliding window approach.
    Based on Hugging Face's perplexity calculation tutorial.
    """
    
    if hasattr(model.config, 'n_positions'): # OPT
        max_length = model.config.n_positions
    elif hasattr(model.config, 'max_position_embeddings'):
        max_length = model.config.max_position_embeddings
    else:
        max_length = 1024
    
    stride = 512 

    try:
        print(f"Loading dataset: {dataset_name} ({dataset_config}), split: {split}")
        dataset = load_dataset(dataset_name, dataset_config, split=split)
        encodings = tokenizer("\n\n".join(dataset["text"]), return_tensors="pt")
        print(f"Dataset loaded and tokenized. Total tokens: {encodings.input_ids.size(1)}")
    except Exception as e:
        print(f"Failed to load dataset {dataset_name}/{dataset_config}: {e}")
        print("Using a short sample text for perplexity calculation as a fallback.")
        sample_text = "This is a sample text to calculate perplexity. Language models are fascinating because they can generate human-like text based on the patterns they learned from vast amounts of data during their training process."
        encodings = tokenizer(sample_text, return_tensors="pt")

    model.eval()

    nlls = [] 
    
    print(f"Calculating perplexity on {device} with max_length={max_length}, stride={stride}")
    for i in tqdm(range(0, encodings.input_ids.size(1), stride), desc="Perplexity Batches"):
        begin_loc = i
        end_loc = min(i + max_length, encodings.input_ids.size(1))
        
        if end_loc - begin_loc < 2: 
            continue

        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone() 

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            if outputs.loss is not None and not torch.isnan(outputs.loss):
                nlls.append(outputs.loss)
            else:
                print(f"Warning: NaN or None loss detected for batch starting at token {i}. Skipping this batch.")
    
    if not nlls:
        print("No valid NLLs collected. Cannot compute perplexity.")
        return float('nan')

    perplexity = torch.exp(torch.stack(nlls).mean())
    return perplexity.item()


perplexity_gpu = float('nan')
perplexity_cpu = float('nan') 
time_perplexity_gpu = float('nan')
time_perplexity_cpu = float('nan') 
vram_perplexity_gpu_mb = float('nan')

if torch.cuda.is_available():
    print("\n--- Calculating Perplexity on GPU ---")
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats() 
    
    target_gpu_device = "cuda:0" 
    print(f"Loading model for GPU perplexity onto device: {target_gpu_device}")
    model_gpu_ppl = AutoModelForCausalLM.from_pretrained(
        quantized_model_dir,
        device_map=target_gpu_device, 
        torch_dtype="auto"
    )
    
    start_time_gpu = time.time()
    perplexity_gpu = calculate_perplexity(model_gpu_ppl, tokenizer, device=target_gpu_device)
    time_perplexity_gpu = time.time() - start_time_gpu
    
    vram_perplexity_gpu_mb = torch.cuda.max_memory_allocated(target_gpu_device) / (1024**2) 
    
    print(f"Perplexity (GPU): {perplexity_gpu:.4f}")
    print(f"Perplexity calculation time (GPU): {time_perplexity_gpu:.2f} seconds")
    print(f"Peak VRAM for perplexity (GPU): {vram_perplexity_gpu_mb:.2f} MB")
    
    del model_gpu_ppl 
    torch.cuda.empty_cache()
else:
    print("\nGPU not available, skipping GPU perplexity calculation.")



--- Calculating Perplexity on GPU ---
Loading model for GPU perplexity onto device: cuda:0


NameError: name 'quantized_model_dir' is not defined

In [8]:
def measure_inference_speed(model, tokenizer, device="cpu", num_tokens_to_generate=50, num_runs=10, prompt="The future of AI is"):
    """
    Measures inference speed for text generation.
    Returns average time in milliseconds per generation call.
    """
    model.eval()
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    total_time_seconds = 0
    
    print(f"Performing warm-up run on {device}...")
    with torch.no_grad():
        _ = model.generate(input_ids, max_new_tokens=num_tokens_to_generate, pad_token_id=tokenizer.eos_token_id, do_sample=False) 

    print(f"Measuring inference speed on {device} for {num_runs} runs, generating {num_tokens_to_generate} new tokens each.")
    for i in tqdm(range(num_runs), desc=f"Inference Runs ({device})"):
        start_run_time = time.time()
        with torch.no_grad():
            output = model.generate(
                input_ids, 
                max_new_tokens=num_tokens_to_generate, 
                pad_token_id=tokenizer.eos_token_id,
                do_sample=False 
            )
        end_run_time = time.time()
        total_time_seconds += (end_run_time - start_run_time)
        
    avg_time_ms_per_call = (total_time_seconds / num_runs) * 1000
    
    print(f"Average time per generation call ({num_tokens_to_generate} new tokens) on {device}: {avg_time_ms_per_call:.2f} ms")
    return avg_time_ms_per_call


inf_time_gpu_ms = float('nan')
inf_time_cpu_ms = float('nan') 
vram_inference_gpu_mb = float('nan')


if torch.cuda.is_available():
    print("\n--- Measuring Inference Speed on GPU ---")
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
    target_gpu_device = "cuda:0" 
    print(f"Loading model for GPU inference onto device: {target_gpu_device}")

    model_gpu_inf = AutoModelForCausalLM.from_pretrained(
        quantized_model_dir,
        device_map=target_gpu_device,
        torch_dtype="auto" 
    )
    
    inf_time_gpu_ms = measure_inference_speed(model_gpu_inf, tokenizer, device=target_gpu_device, num_tokens_to_generate=50, num_runs=20)
    vram_inference_gpu_mb = torch.cuda.max_memory_allocated(target_gpu_device) / (1024**2) 
    
    print(f"Final Inference Time (GPU, for 50 new tokens): {inf_time_gpu_ms:.2f} ms")
    print(f"Peak VRAM during inference (GPU): {vram_inference_gpu_mb:.2f} MB")

    del model_gpu_inf
    torch.cuda.empty_cache()
else:
    print("\nGPU not available, skipping GPU inference speed test.")




--- Measuring Inference Speed on GPU ---
Loading model for GPU inference onto device: cuda:0


NameError: name 'quantized_model_dir' is not defined

### Модель без GTPQ

In [1]:
import torch
import time
from datasets import load_dataset
import math # or from numpy import exp
import os
from tqdm import tqdm # For progress bars
from transformers import AutoTokenizer, AutoModelForCausalLM # Убедитесь, что это импортировано
import shutil # Для удаления временной папки

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
baseline_model_id = "facebook/opt-125m"
baseline_model_save_dir = "opt-125m-fp16-baseline" 
try:
    tokenizer = AutoTokenizer.from_pretrained(baseline_model_id)
except Exception as e:
    print(f"Could not load tokenizer for {baseline_model_id}, trying from 'opt-125m-gptq'. Error: {e}")
    tokenizer = AutoTokenizer.from_pretrained("opt-125m-gptq") 

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token 
    print("Tokenizer pad_token set to eos_token.")

print(f"Using Tokenizer for baseline evaluation (from '{tokenizer.name_or_path}').")

Using Tokenizer for baseline evaluation (from 'facebook/opt-125m').


In [3]:
def get_folder_size_mb(folder_path):
    total_size_bytes = 0
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total_size_bytes += os.path.getsize(fp)
    return total_size_bytes / (1024 * 1024)

print(f"Loading baseline model: {baseline_model_id} with torch.float16 for size measurement...")

temp_model_for_saving = AutoModelForCausalLM.from_pretrained(
    baseline_model_id,
    torch_dtype=torch.float16, 
    low_cpu_mem_usage=True 
)

print(f"Saving baseline model to {baseline_model_save_dir} to measure disk size...")
if os.path.exists(baseline_model_save_dir):
    shutil.rmtree(baseline_model_save_dir) 
os.makedirs(baseline_model_save_dir, exist_ok=True)

temp_model_for_saving.save_pretrained(baseline_model_save_dir)
baseline_model_disk_size_mb = get_folder_size_mb(baseline_model_save_dir)
print(f"Baseline model ({baseline_model_id} in FP16) size on disk: {baseline_model_disk_size_mb:.2f} MB")

# Очистка
del temp_model_for_saving
if os.path.exists(baseline_model_save_dir):
    print(f"Cleaning up temporary directory: {baseline_model_save_dir}")
    shutil.rmtree(baseline_model_save_dir)
torch.cuda.empty_cache() 

Loading baseline model: facebook/opt-125m with torch.float16 for size measurement...
Saving baseline model to opt-125m-fp16-baseline to measure disk size...
Baseline model (facebook/opt-125m in FP16) size on disk: 238.90 MB
Cleaning up temporary directory: opt-125m-fp16-baseline


In [None]:

baseline_perplexity_gpu = float('nan')
baseline_time_perplexity_gpu = float('nan')
baseline_vram_perplexity_gpu_mb = float('nan')

if torch.cuda.is_available():
    print(f"\n--- Calculating Perplexity for Baseline Model ({baseline_model_id} FP16) on GPU ---")
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats() 
    
    target_gpu_device = "cuda:0" 
    print(f"Loading baseline model for GPU perplexity onto device: {target_gpu_device}")
    
    model_gpu_ppl_baseline = AutoModelForCausalLM.from_pretrained(
        baseline_model_id,
        device_map=target_gpu_device, 
        torch_dtype=torch.float16
    )
    
    start_time_gpu = time.time()
    baseline_perplexity_gpu = calculate_perplexity(model_gpu_ppl_baseline, tokenizer, device=target_gpu_device)
    baseline_time_perplexity_gpu = time.time() - start_time_gpu
    
    baseline_vram_perplexity_gpu_mb = torch.cuda.max_memory_allocated(target_gpu_device) / (1024**2) 
    
    print(f"Baseline Perplexity (GPU, FP16): {baseline_perplexity_gpu:.4f}")
    print(f"Baseline Perplexity calculation time (GPU, FP16): {baseline_time_perplexity_gpu:.2f} seconds")
    print(f"Baseline Peak VRAM for perplexity (GPU, FP16): {baseline_vram_perplexity_gpu_mb:.2f} MB")
    
    del model_gpu_ppl_baseline 
    torch.cuda.empty_cache()
else:
    print("\nGPU not available, skipping GPU perplexity calculation for baseline.")

print("\n--- Skipping Perplexity Calculation on CPU for Baseline ---")


--- Calculating Perplexity for Baseline Model (facebook/opt-125m FP16) on GPU ---
Loading baseline model for GPU perplexity onto device: cuda:0
Loading dataset: wikitext (wikitext-2-raw-v1), split: test
Dataset loaded and tokenized. Total tokens: 287645
Calculating perplexity on cuda:0 with max_length=2048, stride=512


Perplexity Batches: 100%|██████████| 562/562 [00:04<00:00, 117.88it/s]


Baseline Perplexity (GPU, FP16): 27.6175
Baseline Perplexity calculation time (GPU, FP16): 12.70 seconds
Baseline Peak VRAM for perplexity (GPU, FP16): 1579.18 MB

--- Skipping Perplexity Calculation on CPU for Baseline ---


In [None]:

baseline_inf_time_gpu_ms = float('nan')
baseline_vram_inference_gpu_mb = float('nan')

if torch.cuda.is_available():
    print(f"\n--- Measuring Inference Speed for Baseline Model ({baseline_model_id} FP16) on GPU ---")
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
    target_gpu_device = "cuda:0" 
    print(f"Loading baseline model for GPU inference onto device: {target_gpu_device}")

    model_gpu_inf_baseline = AutoModelForCausalLM.from_pretrained(
        baseline_model_id,
        device_map=target_gpu_device,
        torch_dtype=torch.float16 
    )
    
    baseline_inf_time_gpu_ms = measure_inference_speed(model_gpu_inf_baseline, tokenizer, device=target_gpu_device, num_tokens_to_generate=50, num_runs=20)
    baseline_vram_inference_gpu_mb = torch.cuda.max_memory_allocated(target_gpu_device) / (1024**2) 
    
    print(f"Baseline Final Inference Time (GPU, FP16, for 50 new tokens): {baseline_inf_time_gpu_ms:.2f} ms")
    print(f"Baseline Peak VRAM during inference (GPU, FP16): {baseline_vram_inference_gpu_mb:.2f} MB")

    del model_gpu_inf_baseline
    torch.cuda.empty_cache()
else:
    print("\nGPU not available, skipping GPU inference speed test for baseline.")

print("\n--- Skipping Inference Speed Calculation on CPU for Baseline ---")


--- Measuring Inference Speed for Baseline Model (facebook/opt-125m FP16) on GPU ---
Loading baseline model for GPU inference onto device: cuda:0


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Performing warm-up run on cuda:0...
Measuring inference speed on cuda:0 for 20 runs, generating 50 new tokens each.


Inference Runs (cuda:0): 100%|██████████| 20/20 [00:05<00:00,  3.42it/s]

Average time per generation call (50 new tokens) on cuda:0: 291.26 ms
Baseline Final Inference Time (GPU, FP16, for 50 new tokens): 291.26 ms
Baseline Peak VRAM during inference (GPU, FP16): 574.84 MB

--- Skipping Inference Speed Calculation on CPU for Baseline ---





In [10]:
print(f"\n\n--- Baseline Validation Summary for {baseline_model_id} (FP16) ---")
print("--------------------------------------------------")

print(f"Размер весов (прибл. MB): {baseline_model_disk_size_mb:.2f}")

if torch.cuda.is_available():
    if not math.isnan(baseline_perplexity_gpu):
        print(f"Качество (Perplexity GPU): {baseline_perplexity_gpu:.2f} (calc time: {baseline_time_perplexity_gpu:.2f}s)")
    else:
        print(f"Качество (Perplexity GPU): Not calculated or error.")
    
    if not math.isnan(baseline_inf_time_gpu_ms):
        print(f"Время инференса (GPU, ms, 50 токенов): {baseline_inf_time_gpu_ms:.2f}")
    else:
        print(f"Время инференса (GPU, ms, 50 токенов): Not calculated or error.")

    if not math.isnan(baseline_vram_perplexity_gpu_mb):
         print(f"VRAM (Perplexity, MB): {baseline_vram_perplexity_gpu_mb:.2f}")
    else:
        print(f"VRAM (Perplexity, MB): Not calculated or error.")
    
    if not math.isnan(baseline_vram_inference_gpu_mb):
         print(f"VRAM (Inference, MB): {baseline_vram_inference_gpu_mb:.2f}")
    else:
        print(f"VRAM (Inference, MB): Not calculated or error.")
else:
    print(f"GPU метрики не доступны.")
print("--------------------------------------------------")

# Сохраняем результаты в словарь для удобства
baseline_validation_results = {
    "model_id": baseline_model_id,
    "method": "FP16",
    "disk_size_mb": round(baseline_model_disk_size_mb, 2) if not math.isnan(baseline_model_disk_size_mb) else 'N/A',
    "perplexity_gpu": round(baseline_perplexity_gpu, 2) if torch.cuda.is_available() and not math.isnan(baseline_perplexity_gpu) else 'N/A',
    "inference_time_gpu_ms_50_tokens": round(baseline_inf_time_gpu_ms, 2) if torch.cuda.is_available() and not math.isnan(baseline_inf_time_gpu_ms) else 'N/A',
    "vram_peak_perplexity_mb": round(baseline_vram_perplexity_gpu_mb, 2) if torch.cuda.is_available() and not math.isnan(baseline_vram_perplexity_gpu_mb) else 'N/A',
    "vram_peak_inference_mb": round(baseline_vram_inference_gpu_mb, 2) if torch.cuda.is_available() and not math.isnan(baseline_vram_inference_gpu_mb) else 'N/A',
}
print("\nBaseline validation results dictionary:")
import json
print(json.dumps(baseline_validation_results, indent=2, ensure_ascii=False))



--- Baseline Validation Summary for facebook/opt-125m (FP16) ---
--------------------------------------------------
Размер весов (прибл. MB): 238.90
Качество (Perplexity GPU): 27.62 (calc time: 12.70s)
Время инференса (GPU, ms, 50 токенов): 291.26
VRAM (Perplexity, MB): 1579.18
VRAM (Inference, MB): 574.84
--------------------------------------------------

Baseline validation results dictionary:
{
  "model_id": "facebook/opt-125m",
  "method": "FP16",
  "disk_size_mb": 238.9,
  "perplexity_gpu": 27.62,
  "inference_time_gpu_ms_50_tokens": 291.26,
  "vram_peak_perplexity_mb": 1579.18,
  "vram_peak_inference_mb": 574.84
}


### AWQ

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import time, os, torch, shutil, psutil, gc

base_model  = "facebook/opt-125m"
awq_dir     = "opt-125m-awq"

quant_cfg = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM" 
}

tic = time.time()
model = AutoAWQForCausalLM.from_pretrained(
        base_model, 
        torch_dtype="float16",
        low_cpu_mem_usage=True,
        safetensors=False
)
tok = AutoTokenizer.from_pretrained(base_model)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model.quantize(tok, quant_config=quant_cfg) 
model.save_quantized(awq_dir)
tok.save_pretrained(awq_dir)
print(f"AWQ-модель сохранена в {awq_dir}, время: {time.time()-tic:.1f} c")

Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 78398.21it/s]
Repo card metadata block was not found. Setting CardData to empty.
Generating validation split: 100%|██████████| 214670/214670 [00:11<00:00, 18266.06 examples/s]
AWQ:   0%|          | 0/12 [00:00<?, ?it/s]


AttributeError: 'OPTModel' object has no attribute 'rotary_emb'