In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate
!pip install -q -U gguf 
!pip install -q -U sentencepiece 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m114.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.0/557.0 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

base_model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_path = "/kaggle/input/adapters/adapters" 
merged_dir = "./merged_model"

print(f" Checking for adapters in: {adapter_path} ")
if os.path.exists(adapter_path):
    print("Found folder! Contents:", os.listdir(adapter_path))
else:
    print(f" Error: Could not find folder at {adapter_path}")
    print("Please check the 'Input' sidebar in Kaggle and copy the path exactly.")

--- Checking for adapters in: /kaggle/input/adapters/adapters ---
Found folder! Contents: ['adapter_model.safetensors', 'adapter_config.json', 'README.md']


In [21]:
print(" Loading Base Model (FP16) ")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path, 
    torch_dtype=torch.float16, 
    device_map="cpu" 
)


 Loading Base Model (FP16) 


In [22]:
try:
    print("--- Merging LoRA Adapters into Base ---")
    model = PeftModel.from_pretrained(base_model, adapter_path)
    merged_model = model.merge_and_unload()

    merged_model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)
    print(f" SUCCESS: Merged model saved to {merged_dir}")
except Exception as e:
    print(f" MERGE ERROR: {e}")

--- Merging LoRA Adapters into Base ---
 SUCCESS: Merged model saved to ./merged_model


In [24]:
from transformers import BitsAndBytesConfig
import shutil

merged_dir = "./merged_model"
output_base = "./quantized"

print(" Starting INT8 Quantization ")
model_8bit = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    load_in_8bit=True,
    device_map="auto"
)
model_8bit.save_pretrained(f"{output_base}/model-int8")
print("INT8 model saved to ./quantized/model-int8")

del model_8bit
torch.cuda.empty_cache()


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


 Starting INT8 Quantization 
INT8 model saved to ./quantized/model-int8


In [25]:
print("\n Starting INT4 (NF4) Quantization ")
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model_4bit = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    quantization_config=nf4_config,
    device_map="auto"
)
model_4bit.save_pretrained(f"{output_base}/model-int4")
print(" INT4 model saved to ./quantized/model-int4")

tokenizer = AutoTokenizer.from_pretrained(merged_dir)
tokenizer.save_pretrained(f"{output_base}/model-int8")
tokenizer.save_pretrained(f"{output_base}/model-int4")

print("\n Summary of Files Created ")
!du -sh ./quantized/*


 Starting INT4 (NF4) Quantization 
 INT4 model saved to ./quantized/model-int4

 Summary of Files Created 
732M	./quantized/model-int4
1.2G	./quantized/model-int8


In [27]:
print("--- Building llama.cpp with CMake ---")
!cd llama.cpp && mkdir -p build && cd build && cmake .. && cmake --build . --config Release -j

quantize_path = "./llama.cpp/build/bin/llama-quantize"
if os.path.exists(quantize_path):
    print(f" Found quantizer at {quantize_path}")
else:
    quantize_path = "./llama.cpp/build/llama-quantize"
    print(f"Checking fallback path: {quantize_path}")

print("\n--- Step A: Converting to GGUF (FP16) ---")
!python llama.cpp/convert_hf_to_gguf.py ./merged_model \
    --outfile ./quantized/model.fp16.gguf \
    --outtype f16


--- Building llama.cpp with CMake ---
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version 

In [28]:

print("\n Step B: Quantizing GGUF to Q4_0 ")
!{quantize_path} ./quantized/model.fp16.gguf ./quantized/model.gguf q4_0

print("\n Step C: Quantizing GGUF to Q8_0 ")
!{quantize_path} ./quantized/model.fp16.gguf ./quantized/model.gguf_q8_0 q8_0

if os.path.exists("./quantized/model.fp16.gguf"):
    os.remove("./quantized/model.fp16.gguf")

print("\n Final GGUF files in ./quantized/:")
!ls -lh ./quantized/


 Step B: Quantizing GGUF to Q4_0 
main: build = 1 (3e4bb29)
main: built with GNU 11.4.0 for Linux x86_64
main: quantizing './quantized/model.fp16.gguf' to './quantized/model.gguf' as Q4_0
llama_model_loader: direct I/O is enabled, disabling mmap
llama_model_loader: loaded meta data with 32 key-value pairs and 201 tensors from ./quantized/model.fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model
llama_model_loader: - kv   3:                         general.size_label str              = 1.1B
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv 

In [29]:
import time

def get_file_size(path):
    if os.path.isfile(path):
        return os.path.getsize(path) / (1024 * 1024) 
    elif os.path.isdir(path):
        return sum(os.path.getsize(os.path.join(dirpath, f)) for dirpath, _, filenames in os.walk(path) for f in filenames) / (1024 * 1024)
    return 0


In [30]:
def benchmark_transformers(model_path, is_int4=False, is_int8=False):
    print(f" Benchmarking {model_path} ")
    tokenizer = AutoTokenizer.from_pretrained("./merged_model")
    
    if is_int4:
        from transformers import BitsAndBytesConfig
        config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
        model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=config, device_map="auto")
    elif is_int8:
        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_8bit=True, device_map="auto")
    else:
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

    prompt = "Explain the importance of model quantization in one sentence."
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    _ = model.generate(**inputs, max_new_tokens=1)
    
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
    end = time.time()
    
    tokens = len(outputs[0]) - len(inputs.input_ids[0])
    tps = tokens / (end - start)
    
    del model
    torch.cuda.empty_cache()
    return round(tps, 2)


In [33]:

results = {
    "FP16": {"Size (MB)": get_file_size("./merged_model")},
    "INT8": {"Size (MB)": get_file_size("./quantized/model-int8")},
    "INT4": {"Size (MB)": get_file_size("./quantized/model-int4")},
    "GGUF": {"Size (MB)": get_file_size("./quantized/model.gguf")},
}

results["FP16"]["Speed (TPS)"] = benchmark_transformers("./merged_model")
results["INT8"]["Speed (TPS)"] = benchmark_transformers("./quantized/model-int8", is_int8=True)
results["INT4"]["Speed (TPS)"] = benchmark_transformers("./quantized/model-int4", is_int4=True)


 Benchmarking ./merged_model 


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


 Benchmarking ./quantized/model-int8 
 Benchmarking ./quantized/model-int4 


In [34]:
import os

report_content = """# QUANTISATION-REPORT.md

## Project: TinyLlama Fine-Tuning & Optimization (Day 3)

### 1. Formats Generated
- **INT8 (8-bit):** Quantized using `bitsandbytes`. Balanced for accuracy and memory.
- **INT4 (4-bit):** Quantized using `NF4` (NormalFloat4). Maximum compression for GPU.
- **GGUF (Q4_0):** Converted via `llama.cpp`. Optimized for CPU/Edge inference.

### 2. Deliverables Location
- `/quantized/model-int8/`
- `/quantized/model-int4/`
- `/quantized/model.gguf`

### 3. Summary of Methodology
1. **Model Merging:** Successfully merged Day 2 LoRA adapters into the FP16 base TinyLlama model.
2. **Post-Training Quantization:** Applied 8-bit and 4-bit quantization to the weights.
3. **Format Conversion:** Used `llama.cpp`'s conversion script to move from Safetensors to the GGUF binary format.
4. **Compression:** Effectively reduced the model footprint from ~2.2GB (FP16) down to ~650MB (INT4).
"""

with open("./quantized/QUANTISATION-REPORT.md", "w") as f:
    f.write(report_content)

print("QUANTISATION-REPORT.md has been created in ./quantized/")

print("\n Final Deliverables Checklist ")
!ls -R ./quantized/

QUANTISATION-REPORT.md has been created in ./quantized/

 Final Deliverables Checklist 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


./quantized/:
model.gguf  model.gguf_q8_0  model-int4  model-int8  QUANTISATION-REPORT.md

./quantized/model-int4:
chat_template.jinja	model.safetensors	 tokenizer.json
config.json		special_tokens_map.json  tokenizer.model
generation_config.json	tokenizer_config.json

./quantized/model-int8:
chat_template.jinja	model.safetensors	 tokenizer.json
config.json		special_tokens_map.json  tokenizer.model
generation_config.json	tokenizer_config.json


In [35]:
!zip -r day3_complete_project.zip ./quantized ./merged_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: quantized/ (stored 0%)
  adding: quantized/model.gguf (deflated 5%)
  adding: quantized/model.gguf_q8_0 (deflated 4%)
  adding: quantized/model-int8/ (stored 0%)
  adding: quantized/model-int8/config.json (deflated 56%)
  adding: quantized/model-int8/tokenizer_config.json (deflated 69%)
  adding: quantized/model-int8/tokenizer.json (deflated 85%)
  adding: quantized/model-int8/special_tokens_map.json (deflated 79%)
  adding: quantized/model-int8/chat_template.jinja (deflated 60%)
  adding: quantized/model-int8/tokenizer.model (deflated 55%)
  adding: quantized/model-int8/generation_config.json (deflated 29%)
  adding: quantized/model-int8/model.safetensors (deflated 14%)
  adding: quantized/model-int4/ (stored 0%)
  adding: quantized/model-int4/config.json (deflated 55%)
  adding: quantized/model-int4/tokenizer_config.json (deflated 69%)
  adding: quantized/model-int4/tokenizer.json (deflated 85%)
  adding: quantized/model-int4/special_tokens_map.json (deflated 79%)
  adding: