# Import packages

In [1]:
import os
import shlex

from google.colab import userdata
from huggingface_hub import create_repo, HfApi

# Main code

In [2]:
# User-tunable settings: the Hub repository to quantize and the
# llama.cpp quantization presets to produce.
MODEL_ID = "model_to_quantize"
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m"]

# Derived value: the last path component of MODEL_ID (the repo name
# without its owner prefix); used as the local directory name.
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

In [3]:
# Install llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt


Cloning into 'llama.cpp'...
remote: Enumerating objects: 30876, done.[K
remote: Counting objects: 100% (6127/6127), done.[K
remote: Compressing objects: 100% (214/214), done.[K
remote: Total 30876 (delta 6030), reused 5923 (delta 5913), pack-reused 24749[K
Receiving objects: 100% (30876/30876), 53.46 MiB | 19.15 MiB/s, done.
Resolving deltas: 100% (22132/22132), done.
Already up to date.
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 

In [4]:
# Download model
# Initialise Git LFS first, then clone the Hugging Face repository named by
# MODEL_ID into the current working directory. Later cells refer to the
# checkout by MODEL_NAME, so the clone directory is expected to match the
# last path component of MODEL_ID.
!git lfs install
!git clone https://huggingface.co/{MODEL_ID}

Git LFS initialized.
Cloning into 'EvolCodeLlama-7b'...
remote: Enumerating objects: 35, done.[K
remote: Total 35 (delta 0), reused 0 (delta 0), pack-reused 35 (from 1)[K
Unpacking objects: 100% (35/35), 483.38 KiB | 2.75 MiB/s, done.
Filtering content: 100% (5/5), 4.70 GiB | 5.02 MiB/s, done.
Encountered 1 file(s) that may not have been copied correctly on Windows:
	pytorch_model-00001-of-00002.bin

See: `git lfs help smudge` for more details.


In [14]:
# Convert to fp16
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert_hf_to_gguf_update.py {MODEL_NAME} --outtype f16 --outfile {fp16}

INFO:convert_hf_to_gguf_update:Usage: python convert_hf_to_gguf_update.py <huggingface_token>


In [15]:
# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/quantize {fp16} {qtype} {method}

/bin/bash: line 1: ./llama.cpp/quantize: No such file or directory
/bin/bash: line 1: ./llama.cpp/quantize: No such file or directory


In [None]:
model_list = [file for file in os.listdir(MODEL_NAME) if "gguf" in file]

prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")

# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"

## Push to Hugging Face

In [None]:
# install huggingface_hub
!pip install -q huggingface_hub


In [None]:
username = "username"

# Target repository for the quantized files; defined once so the create and
# upload steps cannot drift apart.
repo_id = f"{username}/{MODEL_NAME}-GGUF"

# Token is defined in the secrets tab in Google Colab
api = HfApi(token=userdata.get("hf_secret_key"))

# Create the repo if it does not exist yet. Go through `api` so the request is
# authenticated with the token above; the module-level create_repo() would not
# see that token.
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)

# Upload only the .gguf files from the model directory
api.upload_folder(
    folder_path=MODEL_NAME,
    repo_id=repo_id,
    allow_patterns="*.gguf",
)