# Quantize with GGUF

This notebook lets you quantize Hugging Face models into the GGUF format and upload them to the Hub.
The GGUF format lets you run models on the CPU.

## Login to Huggingface Hub

In [None]:
# Install the Hugging Face Hub client that the login and upload cells import.
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Open the Hugging Face login prompt inside the notebook.
notebook_login()

In [None]:
# Start with an empty cache path; a later cell points it at a Drive folder.
cache_dir = ''

# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Folder on the mounted Drive that is passed as cache_dir when loading the model.
cache_dir = "/content/drive/My Drive/huggingface_cache"
os.makedirs(cache_dir, exist_ok=True) # Ensure directory exists

In [None]:
import locale

def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Report UTF-8 as the preferred encoding, whatever ``do_setlocale`` is."""
    return "UTF-8"


# Replace the stdlib lookup so every caller of locale.getpreferredencoding
# receives "UTF-8".
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/acclerate.git
!pip install -q -U einops
!pip install numpy==1.24
!pip install sentencepiece==0.1.98

In [None]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig , AutoModelForCausalLM

## Model Loading

In [None]:
# Hugging Face repo id of the model to quantize, in "owner/name" form,
# e.g. model_name = "meta-llama/Llama-2-7b-hf"
model_name = ""

# Fail early with a clear message instead of an opaque loader error.
if not model_name:
    raise ValueError(
        'Set model_name to a Hugging Face repo id such as "owner/model" before running this cell.'
    )

# Load the full model on the CPU in bfloat16, caching downloads in cache_dir.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,  # NOTE: allows code shipped in the model repo to run; use trusted repos only
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    offload_folder="offload",
    cache_dir=cache_dir,
)

In [None]:
# NOTE(review): `!cd` runs in a temporary subshell, and `cache_dir` here is the
# literal text rather than the Python variable, so this line does not change
# the notebook's working directory. Use `%cd` if a directory change is intended.
!cd cache_dir

In [None]:
# Fetch the llama.cpp sources; the convert and quantize steps later in the
# notebook run from inside this checkout.
!git clone https://github.com/ggerganov/llama.cpp.git

In [None]:
# Work inside the llama.cpp checkout; the relative paths used afterwards
# (./models, ./quantize) resolve against it.
%cd llama.cpp

In [None]:
# Save the loaded model into ./models/ (relative to the current directory).
model.save_pretrained("./models/")

## Download Tokenizer config

In [None]:
import os
import requests

def download_file_from_huggingface(
    model_name,
    filename,
    save_path,
    timeout=60,
):
    """Download one file from a Hugging Face model repo into ``save_path``.

    The file is fetched from the ``main`` revision of ``model_name``. No
    authentication header is sent with the request.

    Args:
        model_name: Repo id used in the URL, e.g. ``"owner/name"``.
        filename: Name of the file inside the repo; also used as the local
            file name.
        save_path: Existing directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and written, False on a non-200
        response or a network error.
    """
    url = f"https://huggingface.co/{model_name}/resolve/main/{filename}"
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        r = requests.get(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the bool contract: report the failure instead of raising.
        print(f"Error downloading {filename}: {exc}")
        return False
    if r.status_code != 200:
        print(f"Error downloading {filename}. HTTP Status Code: {r.status_code}")
        return False
    with open(os.path.join(save_path, filename), "wb") as f:
        f.write(r.content)
    return True

def main():
    """Download the tokenizer files of ``model_name`` into ``./models/``.

    Each file is attempted independently; a file that cannot be downloaded
    is reported and the loop moves on to the next one.
    """
    files_to_download = [
        "tokenizer.json",
        "tokenizer.model",
        "tokenizer_config.json",
        "special_tokens_map.json",
        # Hugging Face tokenizers store extra tokens in added_tokens.json.
        "added_tokens.json",
    ]

    save_path = "./models/"
    # exist_ok makes this a no-op when the directory is already there.
    os.makedirs(save_path, exist_ok=True)

    for filename in files_to_download:
        success = download_file_from_huggingface(model_name, filename, save_path)
        if success:
            print(f"Successfully downloaded {filename}")
        else:
            print(f"Error downloading {filename}")

if __name__ == "__main__":
    main()

## Converting to GGUF

In [None]:
# Refresh the package index, then install the build toolchain plus the
# OpenBLAS and Eigen development packages.
!apt update -y
!apt install build-essential git cmake libopenblas-dev libeigen3-dev -y

In [None]:
# Build in the current directory with the LLAMA_OPENBLAS=1 make option set.
!make LLAMA_OPENBLAS=1

In [None]:
# List the current directory (presumably to check what the build produced).
!ls

In [None]:
# Install the Python packages listed in requirements.txt in the current directory.
!python3 -m pip install -r requirements.txt

In [None]:
# Run the conversion script on models/; the quantize step that follows reads
# models/ggml-model-f16.gguf, presumably produced here.
!python convert.py models/

In [None]:
import subprocess

# Keep only the repo name ("owner/name" -> "name"). Indexing with [-1] also
# works for ids that have no owner prefix, where [1] would raise IndexError.
parts = model_name.split('/')
model_name_pure = parts[-1]

# Quantization type passed to ./quantize and embedded in the output file name.
quant_type = "Q4_K"
quantized_model = f'models/{model_name_pure}.{quant_type}.gguf'
print(f'Preparing {quantized_model} with {quant_type} quantization.')

# Arguments: input f16 GGUF file, output file, quantization type.
command = ["./quantize","models/ggml-model-f16.gguf",quantized_model,quant_type]

# check=True raises CalledProcessError when quantize exits non-zero, so a
# failed quantization stops the notebook instead of being silently ignored.
subprocess.run(command, check=True)

## Push to Hub

In [None]:
from huggingface_hub import HfApi

# Hub client used by the upload loop.
api = HfApi()

# Destination repo: "<model name>-GGUF" under the AdithyaSK namespace.
quant_name = f"{model_name.split('/')[-1]}-GGUF"
repo_id = f"AdithyaSK/{quant_name}"

base_path = "./models"

# Everything to publish: tokenizer files, the llama vocab GGUF, and the
# quantized model file.
local_file_paths = [
    f"{base_path}/{name}"
    for name in (
        "tokenizer.json",
        "tokenizer.model",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "ggml-vocab-llama.gguf",
        f"{model_name_pure}.{quant_type}.gguf",
    )
]

In [None]:
# Upload every prepared file to the root of the target model repo.
# NOTE(review): the notebook never creates repo_id; presumably it must already
# exist on the Hub. Confirm, or create it first.
for local_file_path in local_file_paths:

    # Take the name from the current path; calling .split on the list itself
    # would raise AttributeError.
    file_name = local_file_path.split("/")[-1]

    # Store the file under its bare name in the repo.
    path_in_repo = file_name

    api.upload_file(
        path_or_fileobj=local_file_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="model",
    )

    print(f"Uploaded {file_name} to {repo_id}")