In [1]:
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import cfg as c
import custom_functions as cfoos

In [2]:
# Configure GPU quantization: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

# Resolve the download cache directory once and share it between both loaders
# (check_create_dir is used throughout this notebook to create a directory on
# demand and hand back its path).
dev_cache_dir = cfoos.check_create_dir(c.MODELS_DEV_PATH)

# Load tokenizer and model; anything not yet in the cache directory is fetched
# from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally - confirm it is actually required for c.MODEL.
tokenizer = AutoTokenizer.from_pretrained(
    c.MODEL,
    trust_remote_code=True,
    cache_dir=dev_cache_dir,
)
model = AutoModelForCausalLM.from_pretrained(
    c.MODEL,
    trust_remote_code=True,
    cache_dir=dev_cache_dir,
    device_map="auto",
    quantization_config=quant_config,
)

# Fixed: the closing parenthesis now wraps only the model id instead of the
# whole "saved to" clause.
print(f"---Model & tokenizer ({c.MODEL}) saved to: {c.MODELS_DEV_PATH}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

---Model & tokenizer (meta-llama/Llama-3.1-8B-Instruct saved to: /home/adi/projects/projectx/dev_models)


In [4]:
# Report the memory footprint of the loaded quantized model via the project
# helper (presumably converts bytes to GB - see the "(GB)" label it prints).
# NOTE(review): a previously recorded run printed 29.92 GB, which looks high
# for a 4-bit quantized 8B model - confirm how check_model_size measures size
# and that `model` was loaded with quant_config in that session.
cfoos.check_model_size(model)

Model memory (GB): 29.92


In [None]:
# Persist the GPU-quantized model together with its tokenizer.
# The destination directory is resolved first (created if it doesn't exist).
gpu_quant_dir = cfoos.check_create_dir(c.MODELS_QUANT_PATH / c.MODEL_NAME)

cfoos.save_quant_model(
    model=model,
    tokenizer=tokenizer,
    model_path=gpu_quant_dir,
    # Uncomment to empty dev_models afterwards:
    # clear_dev_models_path=c.MODELS_DEV_PATH,
)

---Saving quantized model...
---Saved model to: /home/adi/projects/projectx/quant_models/Llama-3.1-8B-Instruct


In [3]:
# Fetch a pre-existing CPU-quantized (GGUF) model file from the Hugging Face Hub
# into the dev models cache directory.
gguf_filename = "Meta-Llama-3.1-8B-Instruct-Q3_K_XL.gguf"
gguf_cache_dir = cfoos.check_create_dir(c.MODELS_DEV_PATH)

# Kept as the cell's last expression so the returned value (the local path of
# the downloaded file) is still displayed as the cell output.
hf_hub_download(repo_id=c.MODEL_CPU, filename=gguf_filename, cache_dir=gguf_cache_dir)

Meta-Llama-3.1-8B-Instruct-Q3_K_XL.gguf:   0%|          | 0.00/4.78G [00:00<?, ?B/s]

'/home/adi/projects/projectx/quant_models/Meta-Llama-3.1-8B-Instruct-GGUF/models--bartowski--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/bf5b95e96dac0462e2a09145ec66cae9a3f12067/Meta-Llama-3.1-8B-Instruct-Q3_K_XL.gguf'

In [None]:
# Copy the downloaded CPU-quantized model into the production (quant models) folder.
# The destination directory is resolved first (created if it doesn't exist).
cpu_quant_dir = cfoos.check_create_dir(c.MODELS_QUANT_PATH / c.MODEL_NAME_CPU)

cfoos.save_quant_cpu_model(
    model_name=c.MODEL_NAME_CPU,
    dev_models_dir=c.MODELS_DEV_PATH,
    model_path=cpu_quant_dir,
    # Uncomment to empty dev_models afterwards
    # (NOTE(review): confirm the keyword name against save_quant_cpu_model's signature):
    # clear_dev_models_dir=c.MODELS_DEV_PATH,
)

---Saving cpu-quantized model...
---Saved model to: /home/adi/projects/projectx/quant_models/Meta-Llama-3.1-8B-Instruct-GGUF


In [2]:
# Load the embedding model, caching its files in the dev models directory.
# The cache directory goes through check_create_dir like every other loader in
# this notebook, so this cell also works when it is the first one to touch
# that directory.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally - confirm it is actually required for c.MODEL_EMB.
emb_model = SentenceTransformer(
    c.MODEL_EMB,
    trust_remote_code=True,
    cache_folder=cfoos.check_create_dir(c.MODELS_DEV_PATH),
)

In [3]:
# Store the embedding model in the production (quant models) folder.
# The destination directory is resolved first (created if it doesn't exist).
emb_target_dir = cfoos.check_create_dir(c.MODELS_QUANT_PATH / c.MODEL_NAME_EMB)

# Positional order is kept exactly: model name, dev models dir, target dir.
cfoos.save_emb_model(c.MODEL_NAME_EMB, c.MODELS_DEV_PATH, emb_target_dir)

---Saved embedding model to: /home/adi/projects/projectx/quant_models/paraphrase-multilingual-MiniLM-L12-v2
