In [1]:
%pip install --upgrade transformers==4.46.2 bitsandbytes>0.37.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from tqdm import tqdm

import torch
from torch.autograd import profiler
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [17]:
# from huggingface_hub import notebook_login
# notebook_login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Config

In [3]:
test_dataset_path = "/kaggle/input/ptbdataset/ptb.test.txt"

In [4]:
model_name = "allenai/OLMo-1B-hf"

In [5]:
bit4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype = torch.bfloat16
)

bit8_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cuda


In [7]:
os.makedirs("models", exist_ok=True)

## Utils

In [8]:
def read_data(filepath, limit=None):
    with open(filepath, "r") as f:
        data = f.readlines()
    data = [line.strip().replace("\n", "<eos>") for line in data]
    if limit:
        data = data[:limit]
    data = "\n".join(data)
    return data

In [9]:
def calculate_perplexity(model, encodings):
    max_length = model.config.max_position_embeddings
    stride = max_length // 2  # To avoid too much truncation
    nlls = []

    encodings = encodings.to(device)
    
    with profiler.profile(use_device=str(device), use_cpu=False, use_kineto=True) as prof:
        for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
            begin_loc = i
            end_loc = min(i + max_length, encodings.input_ids.size(1))
            trg_len = end_loc - begin_loc  # Target length
            input_ids = encodings.input_ids[:, begin_loc:end_loc]
            
            with torch.no_grad():
                outputs = model(input_ids, labels=input_ids)
                neg_log_likelihood = outputs.loss * trg_len
            
            nlls.append(neg_log_likelihood)

    perplexity = torch.exp(torch.stack(nlls).sum() / end_loc)
    profiler_obj = prof.total_average()
    
    return perplexity.item(), profiler_obj, len(range(0, encodings.input_ids.size(1), stride))

In [10]:
def testing(model, test_encodings, desc=""):
    # Run the perplexity calculation and profiling
    perplexity, profiler_obj, n_items = calculate_perplexity(model, test_encodings)
    memory_footprint_before_quantization = model.get_memory_footprint() / 1e6

    cuda_time_ms = profiler_obj.device_time / 1e3
    inference_latency_ms = cuda_time_ms / n_items

    print(desc)
    print(f"Perplexity: {perplexity}")
    print(f"Cuda Time: {cuda_time_ms:.4f} ms")
    print(f"Inference Latency: {inference_latency_ms:.4f} ms per inference")
    print(f"Memory Footprint: {memory_footprint_before_quantization:.2f} MB")

## Main

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [12]:
test_data = read_data(test_dataset_path)
test_encodings = tokenizer(test_data, return_tensors="pt")

In [13]:
desc = "Before Quantization:"
testing(model, test_encodings, desc)

100%|██████████| 100/100 [01:09<00:00,  1.43it/s]


Before Quantization:
Perplexity: 218.36021423339844
Cuda Time: 0.7250 ms
Inference Latency: 0.0072 ms per inference
Memory Footprint: 4707.06 MB


In [14]:
del model
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bit8_config, 
    torch_dtype=torch.bfloat16, 
    low_cpu_mem_usage=True,
    device_map="auto",
)

In [15]:
desc = "After 8-bit Quantization:"
testing(model, test_encodings, desc)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


After 8-bit Quantization:
Perplexity: 224.71824645996094
Cuda Time: 0.6221 ms
Inference Latency: 0.0062 ms per inference
Memory Footprint: 1279.79 MB


In [16]:
model.save_pretrained("models/model_after_8-bit_quantization")

In [16]:
del model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bit4_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

In [17]:
desc = "After 4-bit Quantization:"
testing(model, test_encodings, desc)

100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


After 4-bit Quantization:
Perplexity: 270.70428466796875
Cuda Time: 0.5517 ms
Inference Latency: 0.0055 ms per inference
Memory Footprint: 742.92 MB


In [19]:
model.save_pretrained("models/model_after_4-bit_quantization")

In [18]:
del model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

In [19]:
desc = "After NF4-bit Quantization:"
testing(model, test_encodings, desc)

100%|██████████| 100/100 [01:41<00:00,  1.01s/it]


After NF4-bit Quantization:
Perplexity: 243.39186096191406
Cuda Time: 0.5557 ms
Inference Latency: 0.0056 ms per inference
Memory Footprint: 742.92 MB


In [18]:
model.save_pretrained("models/model_after_nf-4_quantization") #, push_to_hub=True, repo_id="bhavberi/OLMo-1B-NF4")

model.safetensors:   0%|          | 0.00/810M [00:00<?, ?B/s]

In [20]:
model = AutoModelForCausalLM.from_pretrained(
    "bhavberi/OLMo-1B-NF4",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/810M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [21]:
desc = "After NF4-bit Quantization HF:"
testing(model, test_encodings, desc)

100%|██████████| 100/100 [01:41<00:00,  1.01s/it]


After NF4-bit Quantization HF:
Perplexity: 243.39186096191406
Cuda Time: 0.5556 ms
Inference Latency: 0.0056 ms per inference
Memory Footprint: 742.92 MB
