In [None]:
# Hugging Face repo to convert, written as "<user>/<model>"; later cells split it on "/".
repo_id = "codersan/Enlighten_Instruct_merged"

In [1]:
# Login to hub
# The token is read from the Kaggle secret named "HUGGINGFACE_TOKEN", so it is never written into the notebook.
# The push-to-hub cell creates a repo and uploads files, so the token presumably needs write access.
from kaggle_secrets import UserSecretsClient
import huggingface_hub
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
huggingface_hub.login(secret_hf)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Quantize GGUF
Based on TheBloke's script for ggml conversion and quantization

In [None]:
# Install llama.cpp
%cd /kaggle/
!git clone https://github.com/ggerganov/llama.cpp
%cd /kaggle/llama.cpp
!pip install -r requirements.txt
!make
%cd /kaggle/

/kaggle
Cloning into 'llama.cpp'...
remote: Enumerating objects: 15902, done.[K
remote: Counting objects: 100% (6331/6331), done.[K
remote: Compressing objects: 100% (520/520), done.[K
remote: Total 15902 (delta 6114), reused 5874 (delta 5810), pack-reused 9571[K
Receiving objects: 100% (15902/15902), 18.11 MiB | 20.33 MiB/s, done.
Resolving deltas: 100% (11133/11133), done.
/kaggle/llama.cpp
Collecting numpy~=1.24.4 (from -r ./requirements/requirements-convert.txt (line 1))
  Obtaining dependency information for numpy~=1.24.4 from https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting gguf>=0.1.0 (from -r ./requirements/requirements-convert.txt (line 4))
  Obtaining dependency information for gguf>=0.1.0 from https://files.pythonhosted.o

In [None]:
# Download the complete model repository
from huggingface_hub import snapshot_download

# repo_id is "<user>/<model>"; both halves are reused by later cells.
repo_parts = repo_id.split('/')
username = repo_parts[0]
base_model_name = repo_parts[1]

# Branch to fetch
revision = "main"

# Fetch that revision into a local folder named after the repo ("/" replaced by "_").
download_dir = f"./{repo_id.replace('/', '_')}"
snapshot_download(repo_id=repo_id, revision=revision, local_dir=download_dir)

print(f"Model dir: './{repo_id.replace('/', '_')}'")

In [None]:
# This step is necessary only if your base model is a standard 32000 vocab model AND the uploader accidentally kept added_tokens.json in the repo

# Remove added_tokens.json
%cd /kaggle/TheBloke_Llama-2-13B-fp16
%rm added_tokens.json
%cd /kaggle/

In [None]:

# Folder the download cell wrote the model into ("./<user>_<model>", relative to the working directory).
input_dir = f"./{username}_{base_model_name}"
# Read by quantize(): when True, the intermediate fp16 GGUF is deleted after the quantised files are made.
remove_fp16 = True

# Run quantize
import os
import subprocess

def quantize(model, outbase, outdir, quant_types=("q4_K_S", "q5_K_M")):
    """Convert a downloaded model folder to GGUF and quantise it with llama.cpp.

    An fp16 GGUF is produced first (skipped when the file already exists), then
    one quantised GGUF per entry of ``quant_types`` is derived from it.

    Args:
        model: Path of the model folder; it must contain ``config.json``.
        outbase: Base file name shared by every output file.
        outdir: Folder that receives the GGUF files; created when missing.
        quant_types: Quantisation type names passed to the llama.cpp
            ``quantize`` tool; one output file is written per name.

    Raises:
        Exception: If ``model`` is not a directory or lacks ``config.json``.
        subprocess.CalledProcessError: If the conversion script or the
            quantize tool exits with a non-zero status.

    Note:
        Reads the notebook-level flag ``remove_fp16``; when it is true the
        intermediate fp16 file is deleted after the quantised files are made.
    """
    # Local import: the cell's own import lines only bring in os and subprocess.
    import sys

    llamabase = "/kaggle/llama.cpp"

    if not os.path.isdir(model):
        raise Exception(f"Could not find model dir at {model}")

    if not os.path.isfile(f"{model}/config.json"):
        raise Exception(f"Could not find config.json in {model}")

    os.makedirs(outdir, exist_ok=True)
    fp16 = f"{outdir}/{outbase}.fp16.gguf"

    print(f"Making unquantised GGUF at {fp16}")
    if not os.path.isfile(fp16):
        # Arguments are passed as a list without a shell, so paths containing
        # spaces or shell metacharacters reach the tool unchanged.
        # sys.executable is the interpreter running this kernel.
        subprocess.run(
            [sys.executable, f"{llamabase}/convert.py", model,
             "--outtype", "f16", "--outfile", fp16],
            check=True,
        )
    else:
        print(f"Unquantised GGUF already exists at: {fp16}")

    print("Making quants")
    for quant_type in quant_types:
        outfile = f"{outdir}/{outbase}.{quant_type}.gguf"
        print(f"Making {quant_type} : {outfile}")
        subprocess.run([f"{llamabase}/quantize", fp16, outfile, quant_type], check=True)

    if remove_fp16:
        os.remove(fp16)

# Convert and quantise the downloaded model; output files go to the "quantized" folder under the working directory.
quantize(input_dir, base_model_name, "quantized")

In [None]:

# Publish the quantised files to the Hugging Face Hub
from huggingface_hub import create_repo, HfApi

# Destination repo: the source model name with a "-GGUF" suffix, under the same user.
gguf_repo_id = f"{username}/{base_model_name}-GGUF"

# Create the private model repo first; exist_ok=True is passed so an already existing repo is accepted.
create_repo(repo_id=gguf_repo_id, private=True, repo_type="model", exist_ok=True)

# Upload everything found in the quantisation output folder.
api = HfApi()
api.upload_folder(folder_path="/kaggle/quantized", repo_id=gguf_repo_id)