In [35]:
import pickle
import os
import transformers
import torch
from transformers import(
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments)

In [37]:
# Pickled models live in a "pkl_files" folder next to wherever the notebook
# is being run from; create the folder on first use.
current_dir = os.getcwd()
pkl_dir = os.path.join(current_dir, "pkl_files")
os.makedirs(pkl_dir, exist_ok=True)

# Names of the entries currently present in the pickle folder.
pkl_files = os.listdir(pkl_dir)

In [27]:
# Parameter-efficient fine-tuning (PEFT) trains only a small fraction of a
# model's parameters while aiming to retain its performance.
# LoRA keeps each original weight matrix frozen and trains two small,
# low-rank update matrices in its place.
#
# Quantization compresses the stored weights. QLoRA combines both ideas and is
# reported to match 16-bit fine-tuning quality at a far smaller memory
# footprint.
quantization_config = BitsAndBytesConfig(
    # Load the weights in 4-bit precision (roughly a quarter of the memory
    # needed for 16-bit weights).
    load_in_4bit=True,
    # "nf4" (4-bit NormalFloat) is the QLoRA data type intended for weights
    # that are approximately normally distributed.
    bnb_4bit_quant_type="nf4",
    # Double quantization: the quantization constants are quantized as well,
    # which is recommended when memory is tight.
    bnb_4bit_use_double_quant=True,
    # Weights are stored in 4 bits, but computation runs in bfloat16 for speed.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag
            (for example a ``torch.nn.Module``).

    Prints one summary line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports ``0.00``
    percent instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [28]:
def model_to_pkl(model, quantization_config=None):
    """
    Load a causal-LM checkpoint and its tokenizer, then pickle them together.

    Args:
        model: Hugging Face model identifier such as
            ``"meta-llama/Llama-3.2-1B"``. The part after the last ``/`` is
            used as the pickle's file name; identifiers without a namespace
            (no ``/``) are used as-is.
        quantization_config: Optional quantization settings forwarded to the
            *model* loader. Defaults to ``None``, which loads the model
            unquantized.

    Side effects:
        Prints the trainable-parameter summary and writes
        ``<current_dir>/pkl_files/<name>.pkl``. The pickle holds a dict with
        the keys ``"model_name"`` (the loaded model object, despite the key's
        name) and ``"tokenizer"``.
    """
    # NOTE(review): num_labels is normally a classification-head setting; it is
    # kept unchanged here -- confirm whether a causal LM needs it.
    model_kwargs = {"num_labels": 3, "torch_dtype": "auto"}
    if quantization_config is not None:
        model_kwargs["quantization_config"] = quantization_config
    output_model = AutoModelForCausalLM.from_pretrained(model, **model_kwargs)
    print_trainable_parameters(output_model)

    # Quantization concerns model weights, so the tokenizer is loaded without
    # any quantization argument.
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Take the last path component so ids without "/" do not raise IndexError.
    key_name = model.rsplit("/", 1)[-1]
    file_path = os.path.join(current_dir, "pkl_files", key_name + ".pkl")
    # Make sure the target folder exists even if the setup cell was not run.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    model_dict = {
        "model_name": output_model,
        "tokenizer": tokenizer
    }
    # NOTE(review): pickles execute arbitrary code when loaded; only load these
    # files from trusted locations.
    with open(file_path, "wb") as f:
        pickle.dump(model_dict, f)
        

In [36]:
# Pickle meta-llama/Llama-3.2-1B together with its tokenizer (written under pkl_files/).
model_to_pkl("meta-llama/Llama-3.2-1B")

trainable params: 1235814400 || all params: 1235814400 || trainable%: 100.00


In [41]:
#model_to_pkl("meta-llama/Llama-3.2-3B")

In [40]:
#model_to_pkl("meta-llama/Llama-3.2-3B-Instruct")

In [39]:
#model_to_pkl("meta-llama/Llama-3.1-8B")

In [38]:
#model_to_pkl("mistralai/Mistral-7B-Instruct-v0.1")

In [None]:
# lla_model321=AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=3, torch_dtype="auto")
# file_path=os.path.join(current_dir, "pkl_files", "llama3_2.pkl")
# with open(file_path, "wb") as f:
#     pickle.dump(lla_model321, f)