# **Step 1:** install required Dependencies

In [1]:
pip install -q accelerate==0.34.2 peft==0.6.2 bitsandbytes transformers trl==0.9.6 torch datasets

Note: you may need to restart the kernel to use updated packages.


# **Step 2:** Clone repository required for conversion to gguf file

In [2]:
!git clone https://github.com/ggerganov/llama.cpp.git

fatal: destination path 'llama.cpp' already exists and is not an empty directory.


# **Step 3:** Merge the Lora Adapters with the base Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hugging Face token and model details
hf_token = 
MODEL_NAME = "codellama/CodeLlama-7b-Instruct-hf"

# Load base model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Load the tokenizer for the base model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load adapter model
adapter_model = PeftModel.from_pretrained(
    model,
    "./resources/trained_model_adapt_param",
    device_map="cuda:0"
)

# Convert adapter weights to full precision
for name, param in adapter_model.named_parameters():
    param.data = param.data.float()  # Convert back to full precision

# Merge the adapter weights into the base model
newmodel = adapter_model.merge_and_unload()

# Save the merged model and tokenizer
output_dir = "./resources/trained_model"
newmodel.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.65s/it]


Model and tokenizer saved to ./codellama7b_finetuning/trained_model


# **Step 4:** install required Dependencies for Model file Conversion

In [2]:
# Install dependencies
!pip install -r llama.cpp/requirements.txt

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu


# **Step 5:** Convert Model to GGUF Format

NB: if you encounter the problem "chat_template is duplicated", just delete "chat_template" entry from tokenizer.config in './resources/trained_model'

In [5]:
# Convert Hugging Face model to GGUF format
!python llama.cpp/convert_hf_to_gguf.py ./resources/trained_model --outfile ./resources/ggufModelFormat/unitTUMcodeLlamaV1.gguf --outtype q8_0

INFO:hf-to-gguf:Loading model: trained_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00006.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float32 --> Q8_0, shape = {4096, 32016}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float32 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float32 --> Q8_0, shape = {11008, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float32 --> Q8_0, shape = {4096, 11008}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float32 --> Q8_0, shape = {4096, 11008}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float32 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float32 --> Q8_0, shape = {4096, 4096}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float32 --> Q8_0, sha