# **Step 1:** Install required dependencies

In [1]:
pip install -q accelerate==0.34.2 peft==0.6.2 bitsandbytes transformers trl==0.9.6 torch datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# **Step 2:** Clone the llama.cpp repository required for conversion to a GGUF file

In [2]:
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 41356, done.
remote: Counting objects: 100% (17/17), done.
remote: Compressing objects: 100% (15/15), done.
remote: Total 41356 (delta 7), reused 2 (delta 2), pack-reused 41339 (from 2)
Receiving objects: 100% (41356/41356), 73.92 MiB | 7.66 MiB/s, done.
Resolving deltas: 100% (30095/30095), done.


# **Step 3:** Merge the LoRA adapters with the base model

In [None]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hugging Face access token, read from the environment so the secret is never
# stored in the notebook. When HF_TOKEN is unset the value is None and no
# explicit token is passed to the Hub.
hf_token = os.environ.get("HF_TOKEN")
MODEL_NAME = "tiiuae/falcon-7b-instruct"
# Directory holding the trained LoRA adapter weights.
ADAPTER_DIR = "./resources/trained_model_adapt_param"
# Destination for the merged model; the GGUF conversion step reads from here.
output_dir = "./resources/trained_model"

# Load the tokenizer for the base model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)

# Load the base model. No quantization config is passed here: the adapter is
# merged into regular (non-quantized) weights.
print("Loading the base model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=hf_token,  # `use_auth_token` is deprecated in favour of `token`
)

# Attach the trained LoRA adapter to the base model
print("Loading the adapter model...")
adapter_model = PeftModel.from_pretrained(
    model,
    ADAPTER_DIR,
    # NOTE(review): the base model above is loaded without a device_map;
    # confirm this argument has the intended effect on this machine.
    device_map="cuda:0",
)

# Cast every parameter (base and adapter) to float32 so the merge below is
# done in full precision.
for param in adapter_model.parameters():
    param.data = param.data.float()

# Fold the adapter weights into the base model and drop the PEFT wrappers
print("Merging adapter weights into the base model...")
new_model = adapter_model.merge_and_unload()

# Save the merged model and tokenizer for later conversion
print(f"Saving the merged model and tokenizer to {output_dir}...")
new_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model and tokenizer have been successfully saved.")


  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


Loading the base model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.42s/it]


Loading the adapter model...
Merging adapter weights into the base model...
Saving the merged model and tokenizer to ./resources/trained_model...
Model and tokenizer have been successfully saved.


# **Step 4:** Install required dependencies for model file conversion

In [4]:
# Install dependencies
!pip install -r llama.cpp/requirements.txt

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu
Collecting sentencepiece~=0.2.0 (from -r /home/amine-ben-abda/TUM/WS_2425/Praktikum/finetuning_falcon_7b/llama.cpp/requirements/requirements-convert_legacy_llama.txt (line 2))
  Using cached sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting gguf>=0.1.0 (from -r /home/amine-ben-abda/TUM/WS_2425/Praktikum/finetuning_falcon_7b/llama.cpp/requirements/requirements-convert_legacy_llama.txt (line 4))
  Using cached gguf-0.14.0-py3-none-any.whl.metadata (3.7 kB)
Collecting protobuf<5.0.0,>=4.21.0 (from -r /home/amine-ben-abda/TUM/WS_2425/Praktikum/finetuning_falcon_7b/llama.cpp/requirements/requirements-convert_legacy_llama.txt (line 5))
  Using cached protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting torch~=2.2

# **Step 5:** Convert Model to GGUF Format

In [7]:
# Convert Hugging Face model to GGUF format
!python llama.cpp/convert_hf_to_gguf.py ./resources/trained_model --outfile ./resources/ggufModelFormat/unitTUMfalconV1.gguf --outtype q8_0

INFO:hf-to-gguf:Loading model: trained_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00006.safetensors'
INFO:hf-to-gguf:blk.0.attn_norm.bias,      torch.float32 --> F32, shape = {4544}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.float32 --> F32, shape = {4544}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.float32 --> Q8_0, shape = {18176, 4544}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.float32 --> Q8_0, shape = {4544, 18176}
INFO:hf-to-gguf:blk.0.attn_output.weight,  torch.float32 --> Q8_0, shape = {4544, 4544}
Traceback (most recent call last):
  File "/home/amine-ben-abda/TUM/WS_2425/Praktikum/finetuning_falcon_7b/llama.cpp/convert_hf_to_gguf.py", line 5077, in <module>
    main()
  File "/home/amine-ben-abda/TUM/WS_2425/Praktikum/finetuning_falcon_7b/llama.cpp/convert