In [None]:
import os

# NOTE(review): presumably set so that the `git+https` installs below are not
# blocked by Git's clone-protection check — TODO confirm it is still required.
os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false"

# Upgrade pip itself before installing anything else.
%pip install -Uq pip
# Remove any already-installed optimum / optimum-intel; optimum-intel is
# reinstalled from its GitHub repository further down.
%pip uninstall -q -y optimum optimum-intel
# Pre-release OpenVINO and OpenVINO Tokenizers, with the nightly wheel index as an extra source.
%pip install --pre -Uq openvino openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
# Remaining dependencies, with the PyTorch CPU wheel index as an extra source;
# optimum-intel and NNCF are installed straight from their Git repositories.
# (Do not put comments between the backslash-continued lines below.)
%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\
"git+https://github.com/huggingface/optimum-intel.git"\
"git+https://github.com/openvinotoolkit/nncf.git"\
"torch>=2.1"\
"datasets" \
"accelerate"\
"gradio>=4.19"\
"onnx" "einops" "transformers_stream_generator" "tiktoken" "transformers>=4.40" "bitsandbytes"

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for optimum-intel (pyproject.toml) ... [?25l[?25hdone
[0m

In [None]:
import os
from pathlib import Path
import requests
import shutil

# Fetch the shared model configuration module (llm_config.py) into the
# notebook's working directory so it can be imported below.

LLM_CONFIG_URL = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py"

config_shared_path = Path("../../utils/llm_config.py")
config_dst_path = Path("llm_config.py")


def _download_llm_config(dst_path):
    """Download llm_config.py from the openvino_notebooks repository to ``dst_path``.

    Raises ``requests.HTTPError`` on a non-2xx response, so an error page is
    never written to disk as if it were the configuration module. The file is
    only opened for writing after the download has succeeded, which leaves any
    existing copy untouched on failure.
    """
    r = requests.get(url=LLM_CONFIG_URL, timeout=60)
    r.raise_for_status()
    with open(dst_path, "w", encoding="utf-8") as f:
        f.write(r.text)


if not config_dst_path.exists():
    if config_shared_path.exists():
        # Prefer a symlink to the repository copy; fall back to a plain copy
        # where symlinks are unavailable (e.g. missing privileges on Windows).
        try:
            os.symlink(config_shared_path, config_dst_path)
        except Exception:
            shutil.copy(config_shared_path, config_dst_path)
    else:
        _download_llm_config(config_dst_path)
elif not os.path.islink(config_dst_path):
    # A regular file (not a symlink) may be stale, so refresh it.
    print("LLM config will be updated")
    if config_shared_path.exists():
        shutil.copy(config_shared_path, config_dst_path)
    else:
        _download_llm_config(config_dst_path)

LLM config will be updated


* zephyr-7b-beta - not enough RAM
* qwen2-0.5b-instruct - working
* qwen2-1.5b-instruct - working
* mistral-7b - restricted access
* tiny-llama-1b-chat - working
* gemma-2b-it - restricted access
* notus-7b-v1 - not enough RAM
* neural-chat-7b-v3-1
* llama-2-chat-7b
* llama-3-8b-instruct
* gemma-7b-it
* mpt-7b-chat
* chatglm2-6b
* qwen-7b-chat
* red-pajama-3b-chat
* phi-3-mini-instruct - not enough RAM

In [None]:
from llm_config import SUPPORTED_LLM_MODELS
import ipywidgets as widgets

# Language group and model key used to look up the entry in SUPPORTED_LLM_MODELS.
model_language, model_id = "English", "qwen2-0.5b-instruct"
model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]

In [None]:
# Which weight formats to export in the conversion cell below.
prepare_fp16_model, prepare_int8_model, prepare_int4_model = False, False, True
# Adds the AWQ-related options to the INT4 export command when True.
enable_awq = False

# Inference device passed to the model loader.
device = "CPU"
# Precision to load for inference: "INT4", "INT8" or "FP16".
model_to_run = "INT4"

In [None]:
from pathlib import Path

# Source model id for the export, plus the prefix of the notebook-level model key.
pt_model_id = model_configuration["model_id"]
pt_model_name = model_id.partition("-")[0]

# Every exported precision gets its own sub-directory under <model_id>/.
model_root = Path(model_id)
fp16_model_dir = model_root / "FP16"
int8_model_dir = model_root / "INT8_compressed_weights"
int4_model_dir = model_root / "INT4_compressed_weights"


def convert_to_fp16():
    if (fp16_model_dir / "openvino_model.xml").exists():
        return
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format fp16".format(pt_model_id)
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " " + str(fp16_model_dir)
    print("Export command:")
    print(export_command)
    # display(Markdown("**Export command:**"))
    # display(Markdown(f"`{export_command}`"))
    ! $export_command


def convert_to_int8():
    if (int8_model_dir / "openvino_model.xml").exists():
        return
    int8_model_dir.mkdir(parents=True, exist_ok=True)
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int8".format(pt_model_id)
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " " + str(int8_model_dir)
    print("Export command:")
    print(export_command)
    # display(Markdown("**Export command:**"))
    # display(Markdown(f"`{export_command}`"))
    ! $export_command


def convert_to_int4():
    compression_configs = {
        "zephyr-7b-beta": {
            "sym": True,
            "group_size": 64,
            "ratio": 0.6,
        },
        "mistral-7b": {
            "sym": True,
            "group_size": 64,
            "ratio": 0.6,
        },
        "minicpm-2b-dpo": {
            "sym": True,
            "group_size": 64,
            "ratio": 0.6,
        },
        "gemma-2b-it": {
            "sym": True,
            "group_size": 64,
            "ratio": 0.6,
        },
        "notus-7b-v1": {
            "sym": True,
            "group_size": 64,
            "ratio": 0.6,
        },
        "neural-chat-7b-v3-1": {
            "sym": True,
            "group_size": 64,
            "ratio": 0.6,
        },
        "llama-2-chat-7b": {
            "sym": True,
            "group_size": 128,
            "ratio": 0.8,
        },
        "llama-3-8b-instruct": {
            "sym": True,
            "group_size": 128,
            "ratio": 0.8,
        },
        "gemma-7b-it": {
            "sym": True,
            "group_size": 128,
            "ratio": 0.8,
        },
        "chatglm2-6b": {
            "sym": True,
            "group_size": 128,
            "ratio": 0.72,
        },
        "qwen-7b-chat": {"sym": True, "group_size": 128, "ratio": 0.6},
        "red-pajama-3b-chat": {
            "sym": False,
            "group_size": 128,
            "ratio": 0.5,
        },
        "default": {
            "sym": False,
            "group_size": 128,
            "ratio": 0.8,
        },
    }

    model_compression_params = compression_configs.get(model_id, compression_configs["default"])
    if (int4_model_dir / "openvino_model.xml").exists():
        return
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = "optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4".format(pt_model_id)
    int4_compression_args = " --group-size {} --ratio {}".format(model_compression_params["group_size"], model_compression_params["ratio"])
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " " + str(int4_model_dir)
    print("Export command:")
    print(export_command)
    # display(Markdown("**Export command:**"))
    # display(Markdown(f"`{export_command}`"))
    ! $export_command


# Run each requested export, always in FP16 -> INT8 -> INT4 order.
for requested, convert in (
    (prepare_fp16_model, convert_to_fp16),
    (prepare_int8_model, convert_to_int8),
    (prepare_int4_model, convert_to_int4),
):
    if requested:
        convert()

Export command:
optimum-cli export openvino --model Qwen/Qwen2-0.5B-Instruct --task text-generation-with-past --weight-format int4 --group-size 128 --ratio 0.8 qwen2-0.5b-instruct/INT4_compressed_weights
2024-06-28 18:15:00.665037: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 18:15:00.665097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 18:15:00.668142: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-28 18:15:00.684571: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in perform

In [None]:
# Locate the weight file of every precision that may have been exported.
fp16_weights, int8_weights, int4_weights = (
    weights_dir / "openvino_model.bin" for weights_dir in (fp16_model_dir, int8_model_dir, int4_model_dir)
)

# The FP16 size is the baseline for the compression rate, when it is available.
has_fp16 = fp16_weights.exists()
if has_fp16:
    print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB")

for precision, compressed_weights in ((8, int8_weights), (4, int4_weights)):
    if not compressed_weights.exists():
        continue
    print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB")
    if has_fp16:
        print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")

Size of model with INT4 compressed weights is 358.86 MB


Load Model

In [None]:
from transformers import AutoConfig, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM

import openvino as ov

# Map the requested precision to its export directory; any value other than
# "INT4" / "INT8" falls back to the FP16 export.
if model_to_run == "INT4":
    model_dir = int4_model_dir
elif model_to_run == "INT8":
    model_dir = int8_model_dir
else:
    model_dir = fp16_model_dir
print(f"Loading model from {model_dir}")

# Fail early with a clear message when the selected precision was never exported.
if not (model_dir / "openvino_model.xml").exists():
    raise FileNotFoundError(
        f"No exported model found in {model_dir}. Enable the matching prepare_*_model flag and re-run the conversion cell."
    )

ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

if "GPU" in device and "qwen2-7b-instruct" in model_id:
    ov_config["GPU_ENABLE_SDPA_OPTIMIZATION"] = "NO"

# `core` was referenced below without ever being created in this notebook,
# so it is instantiated here to query the devices OpenVINO can see.
core = ov.Core()

# On a GPU device a model is executed in FP16 precision. For the red-pajama-3b-chat model there are known
# accuracy issues caused by this, which we avoid by setting the precision hint to "f32".
if model_id == "red-pajama-3b-chat" and "GPU" in core.available_devices and device in ["GPU", "AUTO"]:
    ov_config["INFERENCE_PRECISION_HINT"] = "f32"

model_name = model_configuration["model_id"]
tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

ov_model = OVModelForCausalLM.from_pretrained(
    model_dir,
    device=device,
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
    trust_remote_code=True,
)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


Loading model from qwen2-0.5b-instruct/INT4_compressed_weights


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used along with export=True. It will be ignored.
Compiling the model to CPU ...


In [None]:
# Quick check of the freshly loaded model: tokenize one prompt, generate up to
# 100 new tokens and print the decoded text of the first sequence.
test_string = "What is GenAI?"
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
input_tokens = tok(test_string, return_tensors="pt", **tokenizer_kwargs)
answer = ov_model.generate(**input_tokens, max_new_tokens=100)
decoded_answers = tok.batch_decode(answer, skip_special_tokens=True)
print(decoded_answers[0])

What is GenAI? It’s a new breed of AI that uses artificial intelligence to create highly complex systems. GenAI was created by the University of California at Berkeley, and it's designed to be used for applications such as autonomous vehicles, healthcare, and finance.
The system uses deep learning algorithms to analyze large amounts of data and make predictions about future events. The company says that GenAI can handle billions of transactions per second and process images in real-time.
GenAI has received a lot of attention recently, with many investors


Save the Model

In [None]:
# Write the model and its tokenizer into one directory so that later sessions
# can load them from there.
saved_model_dir = "qwen2-0.5b-instruct-ov"
ov_model.save_pretrained(saved_model_dir)
tok.save_pretrained(saved_model_dir)

('qwen2-0.5b-instruct-ov/tokenizer_config.json',
 'qwen2-0.5b-instruct-ov/special_tokens_map.json',
 'qwen2-0.5b-instruct-ov/vocab.json',
 'qwen2-0.5b-instruct-ov/merges.txt',
 'qwen2-0.5b-instruct-ov/added_tokens.json',
 'qwen2-0.5b-instruct-ov/tokenizer.json')

Load a Saved Model

In [None]:
# Reload the model and tokenizer from the directory written by the save step.
saved_model_dir = "qwen2-0.5b-instruct-ov"
model = OVModelForCausalLM.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

Compiling the model to CPU ...
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Generate with the reloaded model. Encoding and decoding both use the
# reloaded `tokenizer`, so this cell does not depend on the `tok` object
# created in the earlier load cell.
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
test_string = "What is the process for the impeachment of the President of India?"
input_tokens = tokenizer(test_string, return_tensors="pt", **tokenizer_kwargs)
answer = model.generate(**input_tokens, max_new_tokens=100)
print(tokenizer.batch_decode(answer, skip_special_tokens=True)[0])

What is the process for the impeachment of the President of India?

The process for impeaching the President of India involves a formal complaint being made to the Supreme Court, which then issues an order requiring the President to answer questions and provide evidence. If the President refuses to appear or does not answer questions in the prescribed time period, the case goes to the Lok Sabha (upper house) for further hearing. In the event that the President still fails to appear or provide evidence, the House of Representatives can vote on the impeachment motion. If the House of Representatives votes against


In [None]:
# Same prompt with a larger generation budget (200 new tokens). Encoding and
# decoding both use the reloaded `tokenizer`, so this cell does not depend on
# the `tok` object created in the earlier load cell.
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
test_string = "What is the process for the impeachment of the President of India?"
input_tokens = tokenizer(test_string, return_tensors="pt", **tokenizer_kwargs)
answer = model.generate(**input_tokens, max_new_tokens=200)
print(tokenizer.batch_decode(answer, skip_special_tokens=True)[0])

What is the process for the impeachment of the President of India? The process for impeaching the President of India is as follows:

1. The Congress party leader or any other party with a majority in the Legislative Council issues a resolution to impeach the President.
2. The Prime Minister, who is responsible for implementing policies and regulations, decides whether to initiate an investigation into the allegations made against the President.
3. If the Prime Minister decides to initiate an investigation, he/she submits the chargesheet to the Supreme Court within 7 days.
4. The Supreme Court then conducts an inquiry on the chargesheet and decides if there are sufficient grounds for impeachment. If not, it sends the case back to the Prime Minister for further investigation.
5. If there are enough grounds for impeachment, the Prime Minister can decide to initiate an impeachment trial. In such cases, the House of Councillors will vote on the impeachment bill.
6. If the House of Councillo