In [None]:
################################### Approach 1 Baseline ###################################

In [None]:
import time
import os
import glob
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from huggingface_hub import snapshot_download

In [None]:
# ✅ Select Baseline model (non-quantized)
# Approach 1 settings: where the snapshot is cached locally, and which Hub
# repository provides the full-precision reference checkpoint.
baseline_model_dir = "./models/baseline"
model_name = "meta-llama/Llama-3.2-3B-Instruct"

print(f"Model name (Baseline): {model_name}")

In [None]:
# **Calculate download time**
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted mid-download.
# Note: when the snapshot is already cached, snapshot_download() returns
# almost immediately, so this only reflects a real download on a cold cache.
start_download = time.perf_counter()
snapshot_path = snapshot_download(
    repo_id=model_name,
    cache_dir=baseline_model_dir,
    # meta-llama repos typically also ship the original consolidated checkpoint
    # under `original/`. Nothing in this notebook reads it, so fetching it would
    # inflate the measured download with weights that are never used.
    ignore_patterns=["original/*"],
)
end_download = time.perf_counter()
print(f"Baseline model download time: {end_download - start_download:.2f} seconds")

In [None]:
# **Find model files**
# List the top-level entries of the downloaded snapshot for a quick sanity check.
model_files = glob.glob(f"{snapshot_path}/*")
print(f"üìÇ Downloaded model files: {model_files}")

# **Calculate loading time**
# perf_counter() is the monotonic clock intended for interval measurements.
start_load = time.perf_counter()

# Use vLLM for inference instead of AutoModelForCausalLM.
llm = LLM(model=snapshot_path, tensor_parallel_size=1)  # Single GPU
# NOTE(review): the vLLM generate call in the next cell takes the raw prompt and
# does not use this tokenizer. It is kept inside the timed region so the load
# step covers the same components as the quantized approach (model + tokenizer).
tokenizer = AutoTokenizer.from_pretrained(snapshot_path)

end_load = time.perf_counter()
print(f"Baseline model load time: {end_load - start_load:.2f} seconds")

In [None]:
# **Calculate inference time (Serve)**
# Timed with the monotonic perf_counter() so the interval is not affected by
# wall-clock adjustments.
start_serve = time.perf_counter()

prompt = "Hello, how are you?"
# Sampling settings for this run: temperature 0.7, at most 100 new tokens.
sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
outputs = llm.generate([prompt], sampling_params)

end_serve = time.perf_counter()
print(f"Serve (inference) time: {end_serve - start_serve:.2f} seconds")

# First (only) prompt, first (only) completion.
print(f"Generated response: {outputs[0].outputs[0].text}")

In [None]:
################################### Approach 2 Quantized model ###################################

In [None]:
import time
import os
import glob
# from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download

In [None]:
# Approach 2 settings: local cache directory and the pre-quantized (bnb 4-bit)
# checkpoint to benchmark.
# (Disabled alternative kept for reference: "meta-llama/Llama-3.1-8B-Instruct".)
quantized_model_dir = "./models/quantized"
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

print(f"Model name (Quantized): {model_name}")

In [None]:
# **Calculate download time**
# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for elapsed-time measurements (time.time() tracks the wall clock).
# Note: with a warm cache snapshot_download() returns almost immediately.
start_download = time.perf_counter()
snapshot_path = snapshot_download(repo_id=model_name, cache_dir=quantized_model_dir)
end_download = time.perf_counter()
print(f"Quantized model download time: {end_download - start_download:.2f} seconds")

In [None]:
# **Find model files**
# List the top-level entries of the downloaded snapshot for a quick sanity check.
model_files = glob.glob(f"{snapshot_path}/*")
print(f"üìÇ Downloaded model files: {model_files}")

# **Calculate loading time**
start_load = time.perf_counter()

# Load model.
# The checkpoint is already stored in bitsandbytes 4-bit form, so no
# `quantization_config` argument is passed: from_pretrained() picks up the
# quantization settings saved in the checkpoint's own config.json.
# (assumes the repo's config.json carries a `quantization_config` entry, as
# pre-quantized bnb checkpoints do — TODO confirm for other repos.)
# The previous value, the string "llama-4bit", is not a valid
# `quantization_config`; that argument expects a quantization config object
# (e.g. BitsAndBytesConfig) or a dict.
model = AutoModelForCausalLM.from_pretrained(
    snapshot_path,
    device_map="auto",  # let accelerate place the weights on the available device(s)
)
tokenizer = AutoTokenizer.from_pretrained(snapshot_path)

end_load = time.perf_counter()
print(f"Quantized model load time: {end_load - start_load:.2f} seconds")

In [None]:
# **Calculate inference time (Serve)**
start_serve = time.perf_counter()

prompt = "Hello, how are you?"
# Put the inputs on whatever device the model was placed on, rather than
# assuming a literal "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Use the same generation budget and sampling settings as the vLLM baseline
# (temperature=0.7, up to 100 new tokens). Without these, generate() falls back
# to its default length limit and decoding mode, so the two serve timings would
# measure different amounts of work.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
)

end_serve = time.perf_counter()
print(f"Serve (inference) time: {end_serve - start_serve:.2f} seconds")

# outputs[0] holds prompt + continuation token ids for the single input.
print(f"Generated response: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

In [None]:
################################### Approach 3 Lazy Loading ###################################

In [None]:
import os
import time
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel
from huggingface_hub import hf_hub_download
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb

In [None]:
# 🏷️ Select model & set storage path
model_name = "meta-llama/llama-3.2-3b-instruct"
model_cache_dir = "/content/models/llama-3.2-3b-instruct/"

# Select the range of decoder layers to load: range(low_layer, high_layer),
# i.e. `high_layer` is exclusive.
low_layer = 0
high_layer = 15  # Only download the weights of the first 15 layers

### **1. Hugging Face authentication (optional)**
use_auth = False
if use_auth:
    from huggingface_hub import login

    # Never keep the token in the notebook: read it from the HF_TOKEN environment
    # variable. If the variable is unset, login(token=None) asks for the token
    # interactively instead.
    login(token=os.environ.get("HF_TOKEN"))

### **2. Create a cache directory**
# exist_ok=True already makes this a no-op when the directory exists, so no
# separate os.path.exists() check is needed.
os.makedirs(model_cache_dir, exist_ok=True)

In [None]:
### **3. Download only the weights that are needed**
import json

print("\nüì• Downloading partial model weights...")

# Download model configuration and tokenizer.
# `local_dir` places each file directly at <model_cache_dir>/<filename>. With
# `cache_dir` the files end up in the nested hub-cache layout
# (models--<org>--<name>/snapshots/<revision>/...), which is not where the
# loading step looks for config.json.
for aux_filename in ("config.json", "tokenizer.json"):
    hf_hub_download(repo_id=model_name, filename=aux_filename, local_dir=model_cache_dir)

# Download only the weight shards that hold layers `low_layer`..`high_layer - 1`.
# The repository has no per-layer files such as `model.layers.0.weight`; the
# weights are published as safetensors shards, and `model.safetensors.index.json`
# maps every tensor name to the shard that stores it.
# NOTE(review): assumes a sharded checkpoint (index file present). A single-file
# checkpoint would expose `model.safetensors` and no index.
start_download = time.perf_counter()
index_path = hf_hub_download(
    repo_id=model_name,
    filename="model.safetensors.index.json",
    local_dir=model_cache_dir,
)
with open(index_path, "r", encoding="utf-8") as index_file:
    weight_map = json.load(index_file)["weight_map"]

# Tensors of the selected layers, plus the embedding / final norm / output head
# that the partial model wraps around them.
wanted_prefixes = tuple(f"model.layers.{i}." for i in range(low_layer, high_layer)) + (
    "model.embed_tokens.",
    "model.norm.",
    "lm_head.",
)
needed_shards = sorted(
    {
        shard_name
        for tensor_name, shard_name in weight_map.items()
        if tensor_name.startswith(wanted_prefixes)
    }
)
for shard_name in needed_shards:
    hf_hub_download(repo_id=model_name, filename=shard_name, local_dir=model_cache_dir)
end_download = time.perf_counter()
download_time = end_download - start_download
print(f"üì• Partial Model Download Time: {download_time:.2f} sec")


In [None]:
### **4. Load part of the model**
import json
import struct

from transformers import AutoModelForCausalLM


def _read_safetensors_tensors(shard_path, tensor_names):
    """Read only `tensor_names` from a .safetensors file.

    File layout: an 8-byte little-endian header length, a JSON header mapping
    each tensor name to its dtype / shape / byte offsets, then the raw tensor
    bytes. Only the requested byte ranges are read, so unrelated tensors in the
    shard are never loaded into memory.
    (Same result as safetensors' `safe_open(...).get_tensor(name)`; written out
    here to avoid adding an import this notebook does not already have.)

    Args:
        shard_path: path of the .safetensors file.
        tensor_names: iterable of tensor names to read.

    Returns:
        dict mapping each requested name to a CPU torch.Tensor.

    Raises:
        KeyError: if a name is not in the file or its dtype tag is not mapped.
    """
    torch_dtype_by_tag = {
        "F64": torch.float64,
        "F32": torch.float32,
        "F16": torch.float16,
        "BF16": torch.bfloat16,
        "I64": torch.int64,
        "I32": torch.int32,
        "I16": torch.int16,
        "I8": torch.int8,
        "U8": torch.uint8,
        "BOOL": torch.bool,
    }
    tensors = {}
    with open(shard_path, "rb") as shard_file:
        (header_size,) = struct.unpack("<Q", shard_file.read(8))
        header = json.loads(shard_file.read(header_size))
        data_start = 8 + header_size  # offsets in the header are relative to this
        for name in tensor_names:
            entry = header[name]
            begin, end = entry["data_offsets"]
            shard_file.seek(data_start + begin)
            # bytearray: torch.frombuffer wants a writable buffer.
            raw = bytearray(shard_file.read(end - begin))
            tensors[name] = torch.frombuffer(
                raw, dtype=torch_dtype_by_tag[entry["dtype"]]
            ).reshape(entry["shape"])
    return tensors


print(f"\nüìÇ Loading only layers {low_layer} to {high_layer} ({high_layer - low_layer} layers)...")
start_load = time.perf_counter()

config_path = os.path.join(model_cache_dir, "config.json")
config = AutoConfig.from_pretrained(config_path)
# Build only decoder layers 0..high_layer-1, so layers past the selected range
# are never allocated.
config.num_hidden_layers = high_layer
# The causal-LM class is needed here: it exposes the decoder under `.model` and
# the output projection as `.lm_head`, which is what the partial-model wrapper
# reads. AutoModel returns the bare decoder, which has neither attribute.
model = AutoModelForCausalLM.from_config(config)

# Load the selected layers, plus embedding / final norm / output head, from the
# safetensors shards listed in the checkpoint index.
index_path = os.path.join(model_cache_dir, "model.safetensors.index.json")
with open(index_path, "r", encoding="utf-8") as index_file:
    weight_map = json.load(index_file)["weight_map"]

wanted_prefixes = tuple(f"model.layers.{i}." for i in range(low_layer, high_layer)) + (
    "model.embed_tokens.",
    "model.norm.",
    "lm_head.",
)
names_by_shard = {}
for tensor_name, shard_name in weight_map.items():
    if tensor_name.startswith(wanted_prefixes):
        names_by_shard.setdefault(shard_name, []).append(tensor_name)

# One shard at a time keeps peak host memory at "model + one shard's selection".
for shard_name, tensor_names in sorted(names_by_shard.items()):
    shard_state = _read_safetensors_tensors(
        os.path.join(model_cache_dir, shard_name), tensor_names
    )
    # strict=False: each shard covers only part of the model, so "missing" keys
    # are expected; a checkpoint tensor the model does not have is not.
    load_report = model.load_state_dict(shard_state, strict=False)
    if load_report.unexpected_keys:
        raise RuntimeError(
            f"Checkpoint tensors not present in the model: {load_report.unexpected_keys}"
        )
    del shard_state

end_load = time.perf_counter()
load_time = end_load - start_load
print(f"üìÇ Partial Model Loading Time: {load_time:.2f} sec")

In [None]:
### **5. Redefine Partial Model**
class PartialLlamaModel(torch.nn.Module):
    """Run a contiguous slice of a Llama model's decoder layers.

    The wrapper reuses the source model's token embedding, rotary embedding,
    final norm and (when present) output head, and applies only decoder layers
    ``low_layer`` .. ``high_layer - 1`` between them.

    Args:
        model: either a causal-LM wrapper (decoder under ``.model``, output
            projection under ``.lm_head``) or the bare decoder itself.
        low_layer: index of the first decoder layer to keep (inclusive).
        high_layer: index one past the last decoder layer to keep (exclusive).
    """

    def __init__(self, model, low_layer, high_layer):
        super().__init__()
        # Accept both the *ForCausalLM wrapper and the bare decoder module.
        decoder = getattr(model, "model", model)
        self.embed_tokens = decoder.embed_tokens
        self.layers = torch.nn.ModuleList(decoder.layers[low_layer:high_layer])
        self.norm = decoder.norm
        # Module that turns position ids into the (cos, sin) tables the decoder
        # layers take via `position_embeddings`.
        self.rotary_emb = decoder.rotary_emb
        # None when `model` is a bare decoder; forward() then returns hidden states.
        self.lm_head = getattr(model, "lm_head", None)

    @staticmethod
    def _additive_causal_mask(attention_mask, hidden_states):
        """Build a (batch, 1, seq, seq) additive mask for the decoder layers.

        Entries are 0 where attention is allowed and the dtype's most negative
        value where it is blocked: future positions always, and key positions
        marked 0 in the optional (batch, seq) ``attention_mask``.
        """
        batch_size, seq_len = hidden_states.shape[:2]
        blocked = torch.finfo(hidden_states.dtype).min
        mask = torch.full(
            (seq_len, seq_len),
            blocked,
            dtype=hidden_states.dtype,
            device=hidden_states.device,
        )
        mask = torch.triu(mask, diagonal=1)  # zero on and below the diagonal
        mask = mask[None, None, :, :].expand(batch_size, 1, seq_len, seq_len)
        if attention_mask is not None:
            is_padding = attention_mask[:, None, None, :].to(hidden_states.device) == 0
            mask = mask.masked_fill(is_padding, blocked)
        return mask

    def forward(self, input_ids, attention_mask=None, position_ids=None):
        """Embed ``input_ids``, run the kept layers, normalize and project.

        Args:
            input_ids: (batch, seq) token ids.
            attention_mask: optional (batch, seq) mask, 1 = real token, 0 = padding.
            position_ids: optional (batch, seq) positions; defaults to 0..seq-1.

        Returns:
            Output of ``lm_head`` when the wrapped model has one, otherwise the
            normalized hidden states.
        """
        if position_ids is None:
            position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).unsqueeze(0)

        hidden_states = self.embed_tokens(input_ids)

        # The rotary tables depend only on positions, so compute them once and
        # share them across layers. (apply_rotary_pos_emb is not the right tool
        # here: it rotates query/key tensors using cos/sin, it does not produce them.)
        cos, sin = self.rotary_emb(hidden_states, position_ids)

        # Decoder layers expect a 4-D additive mask, not the tokenizer's 2-D 0/1 mask.
        # NOTE(review): suitable for the eager/SDPA attention paths; a
        # flash-attention backend expects a different mask format.
        layer_mask = self._additive_causal_mask(attention_mask, hidden_states)

        for layer in self.layers:
            layer_output = layer(
                hidden_states,
                attention_mask=layer_mask,
                position_embeddings=(cos, sin),
            )
            # Depending on the transformers version a decoder layer returns either
            # a tuple (hidden_states, ...) or the tensor itself; indexing a tensor
            # with [0] would silently drop the batch dimension.
            hidden_states = layer_output[0] if isinstance(layer_output, tuple) else layer_output

        hidden_states = self.norm(hidden_states)
        if self.lm_head is None:
            return hidden_states
        return self.lm_head(hidden_states)

# Create a Partial Model
partial_model = PartialLlamaModel(model, low_layer, high_layer).to("cuda")
partial_model.eval()  # inference only

### **6. Inference test**
print("\n‚ö° Running Inference with Partial Model...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_cache_dir)

prompt = "Hello, how are you?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# CUDA kernels are launched asynchronously: without synchronizing before and
# after, the timer would stop while the GPU is still working and under-report
# the forward pass.
# Note: this is a single cold call, so it also includes one-time CUDA warm-up.
torch.cuda.synchronize()
start_infer = time.perf_counter()
# inference_mode() skips autograd bookkeeping, which is not needed for a
# forward-only benchmark and would otherwise cost time and memory.
with torch.inference_mode():
    outputs = partial_model(**inputs)
torch.cuda.synchronize()
end_infer = time.perf_counter()
inference_time = end_infer - start_infer
print(f"‚ö° Inference Time: {inference_time:.2f} sec")

In [None]:
### **7. Output complete statistics**
# Build the report lines first, then print them in order.
# The memory figure is a fixed 0.4 GB-per-layer rule of thumb (labelled
# "Estimate" in the output), not a measurement.
summary_lines = (
    "\n=== üèÅ Timing Summary ===",
    f"üì• Partial Download Time: {download_time:.2f} sec",
    f"üìÇ Partial Load Time: {load_time:.2f} sec",
    f"‚ö° Inference Time: {inference_time:.2f} sec",
    f"üîπ Model: {model_name}",
    f"üîπ Loaded Layers: {low_layer} to {high_layer} ({high_layer - low_layer} layers)",
    f"üîπ Model Cache Directory: {model_cache_dir}",
    f"üîπ Approx. GPU Memory Usage: ~{(high_layer - low_layer) * 0.4:.1f} GB (Estimate)",
)
for summary_line in summary_lines:
    print(summary_line)