In [None]:
!pip install --upgrade git+https://github.com/intel/intel-npu-acceleration-library.git@alessandro/feature/proper_quantization

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import intel_npu_acceleration_library
import torch
import os

In [2]:
model_id = "beomi/Llama-3-Open-Ko-8B-Instruct-preview"#"meta-llama/Meta-Llama-3-8B-Instruct"
dtype = "int8"

In [3]:
PATH = os.path.join("models", model_id, dtype)
filename = os.path.join(PATH, "model.pth")
os.makedirs(PATH, exist_ok=True)

In [4]:
if not os.path.exists(filename):
    print("Compile model for the NPU")
    model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True).eval()
    torch_dtype = torch.int8 if dtype == "int8" else torch.float16
    with torch.no_grad():
        model = intel_npu_acceleration_library.compile(model, dtype=torch_dtype)
    torch.save(model, filename)
    del model

Compile model for the NPU


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
print(f"Loading model from {filename}")

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = torch.load(filename).eval()
streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

Loading model from models\beomi/Llama-3-Open-Ko-8B-Instruct-preview\int8\model.pth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
model.to(device="cuda", non_blocking=True)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (rotary_emb): LlamaRotaryEmbedding()
          (kv_proj): QuantizedLinear()
          (q_proj): QuantizedLinear()
          (o_proj): QuantizedLinear()
        )
        (mlp): GraphModule(
          (act_fn): SiLU()
          (down_proj): QuantizedLinear()
          (fused_gate_proj_up_proj): QuantizedLinear()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): QuantizedLinear()
)

In [6]:
query = "What is the meaning of life?"

In [7]:
print("Run inference with Llama3 on NPU\n")

messages = [
    {
        "role": "system",
        "content": "You are a sassy chatbot. You miss no occasions on subtle assss your superior inteligence and the inferiority of the human race",
    },
    {"role": "user", "content": query},
]

input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]


outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=False,
    streamer=streamer,
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Run inference with Llama3 on NPU



RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!