# RPT - Converter GGUF (CPU, sem GPU)

**Pré-requisito:** suba `bitnet_2b_pruned.tar.gz` para a raiz do seu Google Drive (`MyDrive/`).

Runtime: **CPU** (não precisa de GPU)

Correções críticas aplicadas:
- Bug #8: `weight_quant()` desabilitado (os pesos já são ternários)
- Bug #10: arquitetura `bitnet` -> `bitnet-b1.58` (grafo computacional correto)

In [None]:
# CELL 1: MONTAR DRIVE + EXTRAIR TAR + INSTALAR DEPS
import os, json, shutil

from google.colab import drive
drive.mount('/content/drive')

# Extrair tar.gz do Drive
TAR_FILE = '/content/drive/MyDrive/bitnet_2b_pruned.tar.gz'
assert os.path.exists(TAR_FILE), 'ERRO: suba bitnet_2b_pruned.tar.gz na raiz do Google Drive!'
print(f'Tar encontrado: {os.path.getsize(TAR_FILE)/1e9:.1f} GB')

print('Extraindo...')
!cd /content && tar xzf {TAR_FILE}

# Renomear pra nome que o converter espera
MODEL_DIR = '/content/BitNet-b1.58-2B-4T'
if not os.path.exists(MODEL_DIR):
    os.rename('/content/bitnet_2b_pruned', MODEL_DIR)
print(f'Modelo OK: {MODEL_DIR}')

# Fix config.json
cfg_path = os.path.join(MODEL_DIR, 'config.json')
with open(cfg_path) as f:
    cfg = json.load(f)
if cfg.get('architectures', [''])[0] == 'BitNetForCausalLM':
    cfg['architectures'] = ['BitnetForCausalLM']
    with open(cfg_path, 'w') as f:
        json.dump(cfg, f, indent=2)
    print('Fix: BitNetForCausalLM -> BitnetForCausalLM')

# Fix tokenizer_config.json
tok_path = os.path.join(MODEL_DIR, 'tokenizer_config.json')
with open(tok_path) as f:
    tok = json.load(f)
if tok.get('tokenizer_class') == 'TokenizersBackend':
    tok['tokenizer_class'] = 'PreTrainedTokenizerFast'
    with open(tok_path, 'w') as f:
        json.dump(tok, f, indent=2)
    print('Fix: TokenizersBackend -> PreTrainedTokenizerFast')

# Instalar clang
print('Instalando clang...')
!sudo apt-get update -qq && sudo apt-get install -y -qq clang lld 2>&1 | tail -1

# Clonar BitNet
if not os.path.exists('BitNet'):
    !git clone --recursive https://github.com/microsoft/BitNet.git
else:
    !cd BitNet && git submodule update --init --recursive 2>&1 | tail -1

# FIX 10: Architecture bitnet -> bitnet-b1.58 (ANTES do pip install!)
print('Fix architecture no gguf-py...')
constants_file = 'BitNet/3rdparty/llama.cpp/gguf-py/gguf/constants.py'
with open(constants_file) as f:
    code = f.read()
if 'MODEL_ARCH.BITNET:         "bitnet",' in code:
    code = code.replace(
        'MODEL_ARCH.BITNET:         "bitnet",',
        'MODEL_ARCH.BITNET:         "bitnet-b1.58",'
    )
    with open(constants_file, 'w') as f:
        f.write(code)
    print('Fix: architecture bitnet -> bitnet-b1.58')
else:
    print('Fix architecture ja aplicado')

# Instalar deps (force-reinstall gguf-py APOS o patch!)
!pip install -q -r BitNet/requirements.txt
!pip install -q --force-reinstall --no-deps BitNet/3rdparty/llama.cpp/gguf-py

print('=== SETUP OK ===')

In [None]:
# CELL 2: GERAR KERNELS + COMPILAR
import os

print('[1/3] Gerando kernels...')
!cd BitNet && python3 utils/codegen_tl2.py --model bitnet_b1_58-3B --BM 160,320,320 --BK 96,96,96 --bm 32,32,32

# Fix const correctness
print('[2/3] Fix compilacao...')
mad_file = 'BitNet/src/ggml-bitnet-mad.cpp'
if os.path.exists(mad_file):
    with open(mad_file) as f:
        code = f.read()
    code = code.replace('int8_t * y_col = y + col * by;', 'const int8_t * y_col = y + col * by;')
    with open(mad_file, 'w') as f:
        f.write(code)

print('[3/3] Compilando...')
!cd BitNet && cmake -B build -DBITNET_X86_TL2=OFF -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ 2>&1 | tail -3
!cd BitNet && cmake --build build --config Release -j$(nproc) 2>&1 | tail -3

assert os.path.exists('BitNet/build/bin/llama-quantize'), 'ERRO: compilacao falhou!'
print('Compilacao OK!')

In [None]:
# CELL 3: FIX CONVERTER + CONVERTER GGUF
import os

MODEL_DIR = '/content/BitNet-b1.58-2B-4T'

# Fix vocab fallback no converter
print('[1/4] Aplicando fix no converter...')
conv_file = 'BitNet/utils/convert-hf-to-gguf-bitnet.py'
with open(conv_file) as f:
    code = f.read()

bitnet_pos = code.rfind('class BitnetModel')
if bitnet_pos > 0:
    old_pos = code.find('self._set_vocab_sentencepiece()', bitnet_pos)
    if old_pos > 0:
        method_start = code.rfind('def set_vocab(self):', 0, old_pos)
        method_end = old_pos + len('self._set_vocab_sentencepiece()')
        code = code[:method_start] + '''def set_vocab(self):
        try:
            self._set_vocab_sentencepiece()
        except FileNotFoundError:
            try:
                self._set_vocab_llama_hf()
            except (FileNotFoundError, TypeError):
                self._set_vocab_gpt2()''' + code[method_end:]
        with open(conv_file, 'w') as f:
            f.write(code)
        print('Fix aplicado')

# Fix weight_quant (BUG 8): converter re-quantiza pesos ja ternarios com escala errada
print('[2/4] Desabilitando weight_quant (pesos ja ternarios)...')
with open(conv_file) as f:
    code = f.read()
if 'data_torch = self.weight_quant(data_torch)' in code:
    code = code.replace(
        'data_torch = self.weight_quant(data_torch)',
        'pass  # RPT fix: pesos ja ternarios, skip re-quantizacao'
    )
    with open(conv_file, 'w') as f:
        f.write(code)
    print('Fix: weight_quant desabilitado')

# Converter HF -> f32 GGUF
print('[3/4] Convertendo para GGUF f32...')
!cd BitNet && python3 utils/convert-hf-to-gguf-bitnet.py {MODEL_DIR} --outtype f32

f32_gguf = os.path.join(MODEL_DIR, 'ggml-model-f32.gguf')
assert os.path.exists(f32_gguf), 'ERRO: conversao falhou!'
print(f'f32 OK: {os.path.getsize(f32_gguf)/1e9:.1f} GB')

# Quantizar f32 -> i2_s
print('[4/4] Quantizando para i2_s...')
i2s_gguf = os.path.join(MODEL_DIR, 'ggml-model-i2_s.gguf')
!BitNet/build/bin/llama-quantize {f32_gguf} {i2s_gguf} I2_S 1

assert os.path.exists(i2s_gguf), 'ERRO: quantizacao falhou!'
print(f'\nGGUF i2_s: {os.path.getsize(i2s_gguf)/1e6:.0f} MB')

os.remove(f32_gguf)
print('CONVERSAO OK!')

In [None]:
# CELL 4: TESTAR INFERENCIA
import os
i2s_gguf = '/content/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf'
!cd BitNet && python3 run_inference.py -m {i2s_gguf} -p "The capital of France is" -n 50 -t 4

In [None]:
# CELL 5: SAVE GGUF TO DRIVE + DOWNLOAD
#
# Copies the quantized model into a folder on the mounted Drive, reports
# the size of the copy, then triggers a browser download of the local file.
import os, shutil

i2s_gguf = '/content/BitNet-b1.58-2B-4T/ggml-model-i2_s.gguf'
drive_dest = '/content/drive/MyDrive/bitnet_2b_pruned_gguf/'

# Copy to Drive; copy2 returns the path of the file it created inside
# the destination directory.
os.makedirs(drive_dest, exist_ok=True)
saved_path = shutil.copy2(i2s_gguf, drive_dest)
print(f'GGUF salvo no Drive: {drive_dest}')

size_mb = os.path.getsize(saved_path) / 1e6
print(f'Tamanho: {size_mb:.0f} MB')

# Direct download
from google.colab import files
files.download(i2s_gguf)