In [1]:
## ref : https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/optimize-llama-2-gptq.ipynb 

In [2]:
!sudo pip install -q transformers --upgrade

In [3]:
!sudo -H pip install auto-gptq --no-cache-dir

In [4]:
!sudo -H pip install --upgrade optimum

In [5]:
import torch
# Ask PyTorch to release cached, currently unused CUDA memory before the
# memory-heavy steps that follow.
# NOTE(review): on a fresh kernel nothing has been allocated yet, so this is
# presumably only useful when re-running cells in a live session — confirm.
torch.cuda.empty_cache()


In [6]:
import gc
# Force a garbage-collection pass. As the last expression of the cell, the
# integer returned by gc.collect() is what the notebook displays.
gc.collect()

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer, load_quantized_model
import torch

In [8]:
# Model selection.
# NOTE(review): the two commented-out lines below reference AutoAWQForCausalLM
# and `model_path`, neither of which is defined in the visible cells. They look
# like leftovers from a different (AWQ) workflow — confirm before removing.
# model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage":True})
# model = AutoAWQForCausalLM.from_pretrained(model_path, safetensors=True, device_map='auto')
# Identifier passed to `from_pretrained` for both the tokenizer and the model.
model_name = "mistralai/Mistral-7B-v0.1"

In [9]:
# Load the tokenizer for `model_name`. This single instance is passed to the
# quantizer and is also used by both predict_* helpers further down.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [None]:
# Load the base model with float16 weights; `device_map='auto'` leaves device
# placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16,device_map='auto')

In [11]:
import os
# Set the CUDA caching-allocator option `max_split_size_mb` to 512.
# NOTE(review): this runs after torch was imported and after the model was
# loaded in the previous cell. The allocator configuration may only be read
# when CUDA memory is first initialised, in which case this assignment has no
# effect here — confirm, and consider moving it to the top of the notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [12]:
# Build a 4-bit GPTQ quantizer, with "wikitext2" given as its dataset argument.
quantizer = GPTQQuantizer(bits=4, dataset="wikitext2")
# NOTE(review): `quant_method` is set by hand after construction; presumably a
# workaround so the saved config is recognised as GPTQ when the model is
# reloaded — confirm it is still needed with the installed optimum version.
quantizer.quant_method = "gptq"

In [None]:
# Quantize `model`, passing the tokenizer along to the quantizer.
# NOTE(review): quantize_model may modify `model` in place and return the same
# object. If so, `model` is no longer the float16 baseline after this cell,
# which matters for the "original model" timing later on — confirm.
quantized_model = quantizer.quantize_model(model, tokenizer)

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
# Sanity check: show the allocator setting currently stored in the environment.
print(os.environ["PYTORCH_CUDA_ALLOC_CONF"])


In [None]:
# Directory the quantized model is written to.
# NOTE(review): absolute, machine-specific path. The inference section repeats
# the same string as `model_id`, so the two must be kept in sync.
quant_path = "/llmmodels/quantized_model/gptq_mistral"

In [None]:
# Save the quantized model to `quant_path` with safe serialization enabled.
# Only the model is written here; the tokenizer is not saved alongside it, and
# the inference cells below keep using the in-memory `tokenizer`.

quantized_model.save_pretrained(quant_path, safe_serialization=True)

### Inference on quantized model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

In [None]:
# Loading options for the 4-bit GPTQ checkpoint: request the ExLlama kernels.
gptq_config = GPTQConfig(bits=4, use_exllama=True)

# Directory the quantized model was saved to earlier in the notebook.
model_id = "/llmmodels/quantized_model/gptq_mistral"
# Hand the config to from_pretrained so the `use_exllama` setting actually
# takes effect; a GPTQConfig that is only constructed is never applied.
quant_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.float16,
    quantization_config=gptq_config,
)

In [None]:
def predict_from_quant(user_query, max_length=1000):
    """Generate a completion for ``user_query`` with the quantized model.

    Relies on the notebook-level ``tokenizer`` and ``quant_model``.

    Args:
        user_query: Prompt text to feed to the model.
        max_length: Forwarded to ``generate`` as ``max_length``. Defaults to
            1000, the value that used to be hard-coded here.

    Returns:
        The first generated sequence, decoded with the tokenizer's default
        decode settings.
    """
    # Calling the tokenizer (instead of .encode) also produces the attention
    # mask, and the tensors follow the model's own device rather than a
    # hard-coded 'cuda'.
    encoded = tokenizer(user_query, return_tensors="pt").to(quant_model.device)
    generated = quant_model.generate(
        **encoded,
        max_length=max_length,
        # Passed explicitly as in the original call: EOS doubles as the pad id.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(generated[0])

import time

In [None]:
# Time a single generation with the quantized model.
# perf_counter() is the clock intended for measuring elapsed intervals; it is
# not affected by system clock adjustments the way time.time() is.
start = time.perf_counter()
output1 = predict_from_quant("what is science")
print("time taken is :", time.perf_counter()-start)

### Inference on original model

In [None]:
def predict_from_normal(user_query, max_length=1000):
    """Generate a completion for ``user_query`` with the notebook's ``model``.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    NOTE(review): ``model`` is the object that was passed to
    ``quantizer.quantize_model`` earlier. If that call quantizes in place,
    this function does not measure the float16 baseline — confirm.

    Args:
        user_query: Prompt text to feed to the model.
        max_length: Forwarded to ``generate`` as ``max_length``. Defaults to
            1000, the value that used to be hard-coded here.

    Returns:
        The first generated sequence, decoded with the tokenizer's default
        decode settings.
    """
    # `model` is loaded with device_map='auto', so follow the model's own
    # device instead of assuming 'cuda'. Calling the tokenizer (instead of
    # .encode) also produces the attention mask.
    encoded = tokenizer(user_query, return_tensors="pt").to(model.device)
    generated = model.generate(
        **encoded,
        max_length=max_length,
        # Passed explicitly as in the original call: EOS doubles as the pad id.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(generated[0])

In [None]:
# Time a single generation through predict_from_normal.
# perf_counter() is the clock intended for measuring elapsed intervals; it is
# not affected by system clock adjustments the way time.time() is.
# Note: this reuses the name `output1`, replacing the text produced by the
# quantized-model run above.
start = time.perf_counter()
output1 = predict_from_normal("what is science")
print("time taken is :", time.perf_counter()-start)