In [5]:
import pickle
import os
import transformers
import torch
from transformers import(
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments)

In [8]:
# Anchor every path at the current working directory.
current_dir = os.getcwd()

# Folder that receives the pickled model/tokenizer bundles; created if missing.
pkl_dir = os.path.join(current_dir, "pkl_files")
os.makedirs(pkl_dir, exist_ok=True)

# Names (not full paths) of whatever is already stored in that folder.
pkl_files = os.listdir(pkl_dir)

In [9]:
# Background
# ----------
# PEFT (parameter-efficient fine-tuning) methods shrink the number of trainable
# parameters of a model while aiming to retain its performance.
# LoRA keeps the original weight matrix frozen and trains two much smaller,
# low-rank update matrices in its place.
#
# Quantization compresses an LLM by storing its weights at lower precision.
# QLoRA (LoRA on top of a 4-bit quantized base model) is reported to match
# 16-bit fine-tuning performance while cutting the memory footprint by about 90%.

# 4-bit bitsandbytes settings in the QLoRA style.
quantization_config = BitsAndBytesConfig(
    # NF4 (4-bit NormalFloat): quantile-based levels, so each level receives
    # roughly the same number of weights; this preserves more precision than
    # plain 4-bit formats for normally distributed weights.
    bnb_4bit_quant_type='nf4',
    # Double quantization: the quantization constants are quantized as well.
    # Recommended when memory is tight.
    bnb_4bit_use_double_quant=True,
    # Dtype used for computation on the de-quantized weights; float16 is chosen for speed.
    bnb_4bit_compute_dtype=torch.float16,
    # Store the weights in 4 bits, i.e. about a quarter of the 16-bit memory usage.
    load_in_4bit=True,
)

In [10]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag (e.g. a ``torch.nn.Module``).

    Returns:
        None. A single summary line with the trainable count, the total count
        and the trainable percentage is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [11]:
def model_to_pkl(model):
    """
    Load a causal language model and its tokenizer, then pickle both together.

    The bundle is written to ``<current_dir>/pkl_files/<name>.pkl``, where
    ``<name>`` is the last path component of ``model``. The pickled object is
    a dict: ``{"model_name": <model object>, "tokenizer": <tokenizer object>}``.
    Note that the ``"model_name"`` key holds the loaded model itself, not a string.

    Args:
        model (str): Model id or path understood by ``from_pretrained``,
            e.g. ``"meta-llama/Llama-3.2-1B"``.

    Returns:
        None. The trainable-parameter summary is printed and the pickle file
        is written as a side effect.

    Raises:
        OSError: If the output directory or file cannot be created.
    """
    # NOTE(review): num_labels is a classification setting and looks like a
    # leftover for a causal LM; kept so the stored config is unchanged - confirm
    # before removing.
    output_model = AutoModelForCausalLM.from_pretrained(model, num_labels=3, torch_dtype="auto")
    print_trainable_parameters(output_model)

    # quantization_config / device_map are model-loading options; a tokenizer
    # has no weights to quantize or place on a device, so none are passed here.
    # NOTE(review): the model above is loaded unquantized. To load it in 4-bit,
    # those options would have to go to AutoModelForCausalLM.from_pretrained.
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Use the last path component so ids without an organisation prefix
    # (no "/") and multi-segment paths both map to a flat file name.
    key_name = model.rstrip("/").rsplit("/", 1)[-1]

    # Do not rely on another cell having created the output folder.
    out_dir = os.path.join(current_dir, "pkl_files")
    os.makedirs(out_dir, exist_ok=True)
    file_path = os.path.join(out_dir, key_name + ".pkl")

    model_dict = {
        "model_name": output_model,
        "tokenizer": tokenizer
    }
    # Pickle files execute code on load: only unpickle bundles you created yourself.
    with open(file_path, "wb") as f:
        pickle.dump(model_dict, f)
        

In [8]:
# Load meta-llama/Llama-3.2-1B with its tokenizer and pickle both under pkl_files/.
model_to_pkl("meta-llama/Llama-3.2-1B")

2025-03-24 15:49:04.127182: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742831344.141189 1115431 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742831344.145845 1115431 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742831344.159342 1115431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742831344.159361 1115431 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742831344.159364 1115431 computation_placer.cc:177] computation placer alr

trainable params: 1235814400 || all params: 1235814400 || trainable%: 100.00


In [12]:
# Load meta-llama/Llama-3.2-3B with its tokenizer and pickle both under pkl_files/.
model_to_pkl("meta-llama/Llama-3.2-3B")

2025-03-26 17:43:47.032167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743011027.046933 2253641 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743011027.051389 2253641 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743011027.064552 2253641 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743011027.064567 2253641 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743011027.064570 2253641 computation_placer.cc:177] computation placer alr

trainable params: 3212749824 || all params: 3212749824 || trainable%: 100.00


In [40]:
#model_to_pkl("meta-llama/Llama-3.2-3B-Instruct")

In [39]:
#model_to_pkl("meta-llama/Llama-3.1-8B")

In [9]:
# Load mistralai/Mistral-7B-Instruct-v0.1 with its tokenizer and pickle both under pkl_files/.
model_to_pkl("mistralai/Mistral-7B-Instruct-v0.1")

Loading checkpoint shards: 100%|██████| 2/2 [00:00<00:00,  5.43it/s]


trainable params: 7241732096 || all params: 7241732096 || trainable%: 100.00


In [None]:
# lla_model321=AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=3, torch_dtype="auto")
# file_path=os.path.join(current_dir, "pkl_files", "llama3_2.pkl")
# with open(file_path, "wb") as f:
#     pickle.dump(lla_model321, f)