<a href="https://colab.research.google.com/github/anilnbsingh/vision/blob/main/smollm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================================================
# Google Colab Notebook for Running SmolLM on Qualcomm QCS8550
# This script is updated to compile the model for the NPU and perform inference
# using the `submit_inference_job` API.
#
# Prerequisites:
# - A valid Qualcomm AI Hub account.
# - An API token from your Qualcomm AI Hub account settings.
# - The target device 'QCS8550 (Proxy)' is available on the AI Hub.
# ==============================================================================

# ------------------------------------------------------------------------------
# 1. Setup the environment
# ------------------------------------------------------------------------------

# Install the required Python packages.
# 'qai-hub' is for the Qualcomm AI Hub API.
# 'qai-hub-models' provides helper utilities.
# 'transformers' and 'torch' are for loading the model.
# 'onnx' is required for exporting the model to ONNX format.
print("1. Installing required Python packages...")
#!pip install qai-hub qai-hub-models torch transformers onnx
print("Installation complete.")
print("="*80)

# ------------------------------------------------------------------------------
# 2. Configure Qualcomm AI Hub Access
# ------------------------------------------------------------------------------

import qai_hub as hub

# You must configure your API token to authenticate with the AI Hub.
# Replace "<YOUR_API_TOKEN>" with your actual token.
# DO NOT share your token.
print("2. Configuring Qualcomm AI Hub...")
api_token = "nak7kyh0inngt9vewsxy74gobp4mk6q5zeean82x" # <-- IMPORTANT: Replace with your API token

!qai-hub configure --api_token {api_token}
print("Configuration complete. Your API token is now set.")
print("="*80)

# ------------------------------------------------------------------------------
# 3. Download and Prepare the SmolLM Model
# ------------------------------------------------------------------------------

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np

# We'll use the SmolLM-135M-Instruct model from Hugging Face as an example.
model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
print(f"3. Downloading and preparing model: {model_name}...")

# Load the tokenizer and the model from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the model to evaluation mode. This is important for conversion.
model.eval()

# To handle the dynamic nature of the past_key_values cache, we will
# create a simple wrapper class for the model's forward pass.
# This ensures the output is a simple tuple of tensors, which ONNX can handle.
class SmolLMWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.config = model.config

    def forward(self, input_ids):
        # We need to get the output from the model's forward pass.
        # We'll set return_dict to True to easily access the outputs.
        outputs = self.model(input_ids, return_dict=True)

        # The outputs contain a DynamicCache object, which the ONNX exporter
        # cannot handle. We need to convert it into a flat list of tensors.
        logits = outputs.logits
        past_key_values = outputs.past_key_values

        # Flatten the tuple of tuples of tensors into a single tuple of tensors.
        flattened_past_key_values = []
        for layer_key_value in past_key_values:
            flattened_past_key_values.extend(layer_key_value)

        # The ONNX exporter requires the output to be a tuple of tensors.
        return (logits,) + tuple(flattened_past_key_values)

# Instantiate the wrapper and get the output names
model_wrapper = SmolLMWrapper(model)
num_layers = model.config.num_hidden_layers
output_names = ['logits'] + [f'past_key_values_{i}' for i in range(num_layers * 2)]

# Define dummy input
onnx_model_path = "SmolLM-135M-Instruct.onnx"
# The tokenizer returns int64 tensors by default.
dummy_input = tokenizer("What is the capital of France?", return_tensors="pt").input_ids
input_shape = dummy_input.shape

print(f"Exporting model to ONNX with a dummy input of shape: {input_shape}...")

try:
    # Use the wrapper model for export.
    torch.onnx.export(
        model_wrapper,
        dummy_input,
        onnx_model_path,
        opset_version=14,  # Choose a compatible ONNX opset version
        input_names=['input_ids'],
        output_names=output_names,
        dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}},
    )
    print(f"Model successfully exported to {onnx_model_path}")
except Exception as e:
    print(f"An error occurred during ONNX export: {e}")
    onnx_model_path = None
print("="*80)
# ------------------------------------------------------------------------------
# 4. Compile the Model for QCS8550 NPU
# ------------------------------------------------------------------------------

# Define the target device
#target_device = hub.Device("QCS8250 (Proxy)")
target_device = hub.Device("QCS8550 (Proxy)")
# Check if the device is available before submitting the job
print(f"4. Checking for device availability: {target_device.name}...")
available_devices = hub.get_devices()
if target_device.name not in [d.name for d in available_devices]:
    print(f"ERROR: The device '{target_device.name}' is not currently available.")
    print("Please check the Qualcomm AI Hub website for device status and try again later.")
    compiled_model = None
else:
    print(f"Device '{target_device.name}' is available. Submitting compilation job...")
    # Submit a compilation job to the Qualcomm AI Hub using the ONNX model file.
    if onnx_model_path:
        try:
            # We are now targeting the QNN runtime for NPU execution.
            compile_job = hub.submit_compile_job(
                model=onnx_model_path,
                name=f"{model_name.split('/')[-1]}_qcs8550_npu", # Updated name
                device=target_device,
                # The input specs must specify the correct integer type.
                # The compiler infers int64 from the ONNX model, so we must match that.
                # The --truncate_64bit_io option handles the conversion for the NPU.
                input_specs={"input_ids": (input_shape, "int64")},
                options="--truncate_64bit_io"
            )

            print("Compilation job submitted.")
            try:
                print(f"Job ID: {compile_job.id}")
            except AttributeError:
                print("Could not retrieve job ID from the CompileJob object.")
                print(f"You can check the job status and details on the AI Hub website using the URL: {compile_job.url}")

            # Wait for the job to complete. This is a blocking call and will
            # raise an exception if the job fails.
            print("Waiting for compilation to complete...")
            compile_job.wait()

            # If wait() completes successfully, the model is ready.
            print("Compilation job completed successfully!")
            compiled_model = compile_job.get_target_model()

            # Handle the case where the Model object might not have an 'id' attribute.
            try:
                print(f"Compiled model is ready. Model ID: {compiled_model.id}")
            except AttributeError:
                print("Compiled model is ready, but could not retrieve the Model ID from the Model object.")
                print("Please check the Qualcomm AI Hub website to find the model and its ID.")


        except Exception as e:
            print(f"An error occurred during compilation: {e}")
            compiled_model = None
print("="*80)

# ------------------------------------------------------------------------------
# 5. Run Inference on the NPU using the Compiled Model
# ------------------------------------------------------------------------------
# Now we can submit an inference job to generate a response.
if compiled_model:
    print("5. Submitting inference job...")
    try:
        # Define the input prompt.
        prompt_text = "What is the capital of Japan?"
        print(f"Input prompt: '{prompt_text}'")

        # The inference job expects a dictionary of inputs. We need to tokenize
        # the prompt and convert it to a NumPy array.
        input_tokens = tokenizer(prompt_text, return_tensors="pt").input_ids
        # We pass the input data as int64, as that is what the compiler inferred from the ONNX model.
        # The --truncate_64bit_io option will handle the conversion.
        input_data = {"input_ids": [input_tokens.cpu().numpy()]}

        # Submit the inference job with the compiled model and input data.
        inference_job = hub.submit_inference_job(
            model=compiled_model,
            device=target_device,
            inputs=input_data
        )

        print("Inference job submitted.")
        try:
            print(f"Job ID: {inference_job.id}")
        except AttributeError:
            print("Could not retrieve job ID from the InferenceJob object.")
            print(f"You can check the job status and details on the AI Hub website using the URL: {inference_job.url}")

        # Wait for the inference job to complete.
        print("Waiting for inference to complete...")
        inference_job.wait()

        # If wait() completes successfully, the output is ready.
        print("Inference job completed successfully!")

        # Get the output data, which will be a dictionary of numpy arrays.
        # We've fixed the method name from get_outputs() to get_output_data().
        output_data = inference_job.get_output_data()

        # We can then process the output logits to get the generated text.
        # This part requires a custom generation loop, as the AI Hub returns
        # a single step of output. For a complete LLM inference, you would
        # loop this process. Here, we'll just show the raw output as an example.
        if 'logits' in output_data:
            logits = output_data['logits']
            # Here you would typically process the logits to generate the next token.
            # For demonstration, we'll just print a confirmation.
            print(f"Received logits with shape: {logits.shape}")
            print("The model has successfully processed the input on the NPU.")
            print("To generate a full response, you would need to implement a token generation loop.")
        else:
            print("Output data does not contain 'logits'.")


    except Exception as e:
        print(f"An error occurred during inference: {e}")
print("="*80)


1. Installing required Python packages...
Installation complete.
2. Configuring Qualcomm AI Hub...
2025-08-10 11:50:47.261 - INFO - Enabling verbose logging.
qai-hub configuration saved to /root/.qai_hub/client.ini
[api]
api_token = nak7kyh0inngt9vewsxy74gobp4mk6q5zeean82x
api_url = https://app.aihub.qualcomm.com
web_url = https://app.aihub.qualcomm.com
verbose = True


Configuration complete. Your API token is now set.
3. Downloading and preparing model: HuggingFaceTB/SmolLM-135M-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Exporting model to ONNX with a dummy input of shape: torch.Size([1, 7])...


  is_causal = query.shape[2] > 1 and attention_mask is None and getattr(module, "is_causal", True)


Model successfully exported to SmolLM-135M-Instruct.onnx
4. Checking for device availability: QCS8550 (Proxy)...
Device 'QCS8550 (Proxy)' is available. Submitting compilation job...
Uploading SmolLM-135M-Instruct.onnx


100%|[34m██████████[0m| 622M/622M [00:05<00:00, 110MB/s]


Scheduled compile job (jp294jnxg) successfully. To see the status and results:
    https://app.aihub.qualcomm.com/jobs/jp294jnxg/

Compilation job submitted.
Could not retrieve job ID from the CompileJob object.
You can check the job status and details on the AI Hub website using the URL: https://app.aihub.qualcomm.com/jobs/jp294jnxg/
Waiting for compilation to complete...
Waiting for compile job (jp294jnxg) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Compilation job completed successfully!
Compiled model is ready, but could not retrieve the Model ID from the Model object.
Please check the Qualcomm AI Hub website to find the model and its ID.
5. Submitting inference job...
Input prompt: 'What is the capital of Japan?'


Uploading dataset: 17.7kB [00:00, 122kB/s]                    


Scheduled inference job (jpyjqn0rp) successfully. To see the status and results:
    https://app.aihub.qualcomm.com/jobs/jpyjqn0rp/

Inference job submitted.
Could not retrieve job ID from the InferenceJob object.
You can check the job status and details on the AI Hub website using the URL: https://app.aihub.qualcomm.com/jobs/jpyjqn0rp/
Waiting for inference to complete...
Waiting for inference job (jpyjqn0rp) completion. Type Ctrl+C to stop waiting at any time.
    ❌ FAILED               For input 0, expected int32 for data input dtype but got int64.            
Inference job completed successfully!
An error occurred during inference: 'InferenceJob' object has no attribute 'get_output_data'


In [None]:
# ------------------------------------------------------------------------------
# 3. Download and Prepare the SmolLM Model
# ------------------------------------------------------------------------------

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np

# We'll use the SmolLM-135M-Instruct model from Hugging Face as an example.
model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
print(f"3. Downloading and preparing model: {model_name}...")

# Load the tokenizer and the model from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the model to evaluation mode. This is important for conversion.
model.eval()

# To handle the dynamic nature of the past_key_values cache, we will
# create a simple wrapper class for the model's forward pass.
# This ensures the output is a simple tuple of tensors, which ONNX can handle.
class SmolLMWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.config = model.config

    def forward(self, input_ids):
        # We need to get the output from the model's forward pass.
        # We'll set return_dict to True to easily access the outputs.
        outputs = self.model(input_ids, return_dict=True)

        # The outputs contain a DynamicCache object, which the ONNX exporter
        # cannot handle. We need to convert it into a flat list of tensors.
        logits = outputs.logits
        past_key_values = outputs.past_key_values

        # Flatten the tuple of tuples of tensors into a single tuple of tensors.
        flattened_past_key_values = []
        for layer_key_value in past_key_values:
            flattened_past_key_values.extend(layer_key_value)

        # The ONNX exporter requires the output to be a tuple of tensors.
        return (logits,) + tuple(flattened_past_key_values)

# Instantiate the wrapper and get the output names
model_wrapper = SmolLMWrapper(model)
num_layers = model.config.num_hidden_layers
output_names = ['logits'] + [f'past_key_values_{i}' for i in range(num_layers * 2)]

# Define dummy input
onnx_model_path = "SmolLM-135M-Instruct.onnx"
# The tokenizer returns int64 tensors by default. We explicitly cast the dummy input to int32
# to create an ONNX model that natively expects int32.
dummy_input = tokenizer("What is the capital of France?", return_tensors="pt").input_ids.to(torch.int32)
input_shape = dummy_input.shape

print(f"Exporting model to ONNX with a dummy input of shape: {input_shape}...")

try:
    # Use the wrapper model for export.
    torch.onnx.export(
        model_wrapper,
        dummy_input,
        onnx_model_path,
        opset_version=14,  # Choose a compatible ONNX opset version
        input_names=['input_ids'],
        output_names=output_names,
        dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}},
    )
    print(f"Model successfully exported to {onnx_model_path}")
except Exception as e:
    print(f"An error occurred during ONNX export: {e}")
    onnx_model_path = None
print("="*80)
# ------------------------------------------------------------------------------
# 4. Compile the Model for QCS8550 NPU
# ------------------------------------------------------------------------------

# Define the target device
#target_device = hub.Device("QCS8250 (Proxy)")
target_device = hub.Device("QCS8550 (Proxy)")
# Check if the device is available before submitting the job
print(f"4. Checking for device availability: {target_device.name}...")
available_devices = hub.get_devices()
if target_device.name not in [d.name for d in available_devices]:
    print(f"ERROR: The device '{target_device.name}' is not currently available.")
    print("Please check the Qualcomm AI Hub website for device status and try again later.")
    compiled_model = None
else:
    print(f"Device '{target_device.name}' is available. Submitting compilation job...")
    # Submit a compilation job to the Qualcomm AI Hub using the ONNX model file.
    if onnx_model_path:
        try:
            # We are now targeting the QNN runtime for NPU execution.
            compile_job = hub.submit_compile_job(
                model=onnx_model_path,
                name=f"{model_name.split('/')[-1]}_qcs8550_npu", # Updated name
                device=target_device,
                # The input specs must specify the correct integer type.
                # Since we exported the ONNX model as int32, we must specify int32 here.
                input_specs={"input_ids": (input_shape, "int32")},
                options="--truncate_64bit_io"
            )

            print("Compilation job submitted.")
            try:
                print(f"Job ID: {compile_job.id}")
            except AttributeError:
                print("Could not retrieve job ID from the CompileJob object.")
                print(f"You can check the job status and details on the AI Hub website using the URL: {compile_job.url}")

            # Wait for the job to complete. This is a blocking call and will
            # raise an exception if the job fails.
            print("Waiting for compilation to complete...")
            compile_job.wait()

            # If wait() completes successfully, the model is ready.
            print("Compilation job completed successfully!")
            compiled_model = compile_job.get_target_model()

            # Handle the case where the Model object might not have an 'id' attribute.
            try:
                print(f"Compiled model is ready. Model ID: {compiled_model.id}")
            except AttributeError:
                print("Compiled model is ready, but could not retrieve the Model ID from the Model object.")
                print("Please check the Qualcomm AI Hub website to find the model and its ID.")


        except Exception as e:
            print(f"An error occurred during compilation: {e}")
            compiled_model = None
print("="*80)

# ------------------------------------------------------------------------------
# 5. Run Inference on the NPU using the Compiled Model
# ------------------------------------------------------------------------------
# Now we can submit an inference job to generate a response.
if compiled_model:
    print("5. Submitting inference job...")
    try:
        # Define the input prompt.
        prompt_text = "What is the capital of Japan?"
        print(f"Input prompt: '{prompt_text}'")

        # The inference job expects a dictionary of inputs. We need to tokenize
        # the prompt and convert it to a NumPy array.
        input_tokens = tokenizer(prompt_text, return_tensors="pt").input_ids
        # We explicitly cast the NumPy array to int32 to match the compiled model's
        # expected data type.
        input_data = {"input_ids": [input_tokens.cpu().numpy().astype(np.int32)]}

        # Submit the inference job with the compiled model and input data.
        inference_job = hub.submit_inference_job(
            model=compiled_model,
            device=target_device,
            inputs=input_data
        )

        print("Inference job submitted.")
        try:
            print(f"Job ID: {inference_job.id}")
        except AttributeError:
            print("Could not retrieve job ID from the InferenceJob object.")
            print(f"You can check the job status and details on the AI Hub website using the URL: {inference_job.url}")

        # Wait for the inference job to complete.
        print("Waiting for inference to complete...")
        inference_job.wait()

        # If wait() completes successfully, the output is ready.
        print("Inference job completed successfully!")

        # Get the output data, which will be a dictionary of numpy arrays.
        # This has been corrected from `get_outputs()` to `get_output_data()`
        output_data = inference_job.get_output_data()

        # We can then process the output logits to get the generated text.
        # This part requires a custom generation loop, as the AI Hub returns
        # a single step of output. For a complete LLM inference, you would
        # loop this process. Here, we'll just show the raw output as an example.
        if 'logits' in output_data:
            logits = output_data['logits']
            # Here you would typically process the logits to generate the next token.
            # For demonstration, we'll just print a confirmation.
            print(f"Received logits with shape: {logits.shape}")
            print("The model has successfully processed the input on the NPU.")
            print("To generate a full response, you would need to implement a token generation loop.")
        else:
            print("Output data does not contain 'logits'.")


    except Exception as e:
        print(f"An error occurred during inference: {e}")
print("="*80)

3. Downloading and preparing model: HuggingFaceTB/SmolLM-135M-Instruct...
Exporting model to ONNX with a dummy input of shape: torch.Size([1, 7])...
Model successfully exported to SmolLM-135M-Instruct.onnx
4. Checking for device availability: QCS8550 (Proxy)...
Device 'QCS8550 (Proxy)' is available. Submitting compilation job...
Uploading SmolLM-135M-Instruct.onnx


100%|[34m██████████[0m| 622M/622M [00:05<00:00, 110MB/s]


Scheduled compile job (jp02dk725) successfully. To see the status and results:
    https://app.aihub.qualcomm.com/jobs/jp02dk725/

Compilation job submitted.
Could not retrieve job ID from the CompileJob object.
You can check the job status and details on the AI Hub website using the URL: https://app.aihub.qualcomm.com/jobs/jp02dk725/
Waiting for compilation to complete...
Waiting for compile job (jp02dk725) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Compilation job completed successfully!
Compiled model is ready, but could not retrieve the Model ID from the Model object.
Please check the Qualcomm AI Hub website to find the model and its ID.
5. Submitting inference job...
Input prompt: 'What is the capital of Japan?'


Uploading dataset: 17.7kB [00:00, 113kB/s]                    


Scheduled inference job (jp8m68vz5) successfully. To see the status and results:
    https://app.aihub.qualcomm.com/jobs/jp8m68vz5/

Inference job submitted.
Could not retrieve job ID from the InferenceJob object.
You can check the job status and details on the AI Hub website using the URL: https://app.aihub.qualcomm.com/jobs/jp8m68vz5/
Waiting for inference to complete...
Waiting for inference job (jp8m68vz5) completion. Type Ctrl+C to stop waiting at any time.
    ✅ SUCCESS                          
Inference job completed successfully!
An error occurred during inference: 'InferenceJob' object has no attribute 'get_output_data'


In [None]:

# ------------------------------------------------------------------------------
# 3. Download and Prepare the SmolLM Model
# ------------------------------------------------------------------------------

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np

# We'll use the SmolLM-135M-Instruct model from Hugging Face as an example.
model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
print(f"3. Downloading and preparing model: {model_name}...")

# Load the tokenizer and the model from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the model to evaluation mode. This is important for conversion.
model.eval()

# To handle the dynamic nature of the past_key_values cache, we will
# create a simple wrapper class for the model's forward pass.
# This ensures the output is a simple tuple of tensors, which ONNX can handle.
class SmolLMWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.config = model.config

    def forward(self, input_ids):
        # We need to get the output from the model's forward pass.
        # We'll set return_dict to True to easily access the outputs.
        outputs = self.model(input_ids, return_dict=True)

        # The outputs contain a DynamicCache object, which the ONNX exporter
        # cannot handle. We need to convert it into a flat list of tensors.
        logits = outputs.logits
        past_key_values = outputs.past_key_values

        # Flatten the tuple of tuples of tensors into a single tuple of tensors.
        flattened_past_key_values = []
        for layer_key_value in past_key_values:
            flattened_past_key_values.extend(layer_key_value)

        # The ONNX exporter requires the output to be a tuple of tensors.
        return (logits,) + tuple(flattened_past_key_values)

# Instantiate the wrapper and get the output names
model_wrapper = SmolLMWrapper(model)
num_layers = model.config.num_hidden_layers
output_names = ['logits'] + [f'past_key_values_{i}' for i in range(num_layers * 2)]

# Define dummy input
onnx_model_path = "SmolLM-135M-Instruct.onnx"
# We explicitly cast the dummy input to int32 to create an ONNX model
# that natively expects int32. This avoids the data type mismatch during compilation.
dummy_input = tokenizer("What is the capital of France?", return_tensors="pt").input_ids.to(torch.int32)
input_shape = dummy_input.shape

print(f"Exporting model to ONNX with a dummy input of shape: {input_shape}...")

try:
    # Use the wrapper model for export.
    torch.onnx.export(
        model_wrapper,
        dummy_input,
        onnx_model_path,
        opset_version=14,  # Choose a compatible ONNX opset version
        input_names=['input_ids'],
        output_names=output_names,
        dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}},
    )
    print(f"Model successfully exported to {onnx_model_path}")
except Exception as e:
    print(f"An error occurred during ONNX export: {e}")
    onnx_model_path = None
print("="*80)
# ------------------------------------------------------------------------------
# 4. Compile the Model for QCS8550 NPU
# ------------------------------------------------------------------------------

# Define the target device
#target_device = hub.Device("QCS8250 (Proxy)")
target_device = hub.Device("QCS8550 (Proxy)")
# Check if the device is available before submitting the job
print(f"4. Checking for device availability: {target_device.name}...")
available_devices = hub.get_devices()
if target_device.name not in [d.name for d in available_devices]:
    print(f"ERROR: The device '{target_device.name}' is not currently available.")
    print("Please check the Qualcomm AI Hub website for device status and try again later.")
    compiled_model = None
else:
    print(f"Device '{target_device.name}' is available. Submitting compilation job...")
    # Submit a compilation job to the Qualcomm AI Hub using the ONNX model file.
    if onnx_model_path:
        try:
            # We are now targeting the QNN runtime for NPU execution.
            compile_job = hub.submit_compile_job(
                model=onnx_model_path,
                name=f"{model_name.split('/')[-1]}_qcs8550_npu", # Updated name
                device=target_device,
                # The input specs must now specify int32 to match the ONNX model,
                # which was created with int32 tensors.
                input_specs={"input_ids": (input_shape, "int32")},
                options="--truncate_64bit_io"
            )

            print("Compilation job submitted.")
            try:
                print(f"Job ID: {compile_job.id}")
            except AttributeError:
                print("Could not retrieve job ID from the CompileJob object.")
                print(f"You can check the job status and details on the AI Hub website using the URL: {compile_job.url}")

            # Wait for the job to complete. This is a blocking call and will
            # raise an exception if the job fails.
            print("Waiting for compilation to complete...")
            compile_job.wait()

            # If wait() completes successfully, the model is ready.
            print("Compilation job completed successfully!")
            compiled_model = compile_job.get_target_model()

            # Handle the case where the Model object might not have an 'id' attribute.
            try:
                print(f"Compiled model is ready. Model ID: {compiled_model.id}")
            except AttributeError:
                print("Compiled model is ready, but could not retrieve the Model ID from the Model object.")
                print("Please check the Qualcomm AI Hub website to find the model and its ID.")


        except Exception as e:
            print(f"An error occurred during compilation: {e}")
            compiled_model = None
print("="*80)

# ------------------------------------------------------------------------------
# 5. Run Inference on the NPU using the Compiled Model
# ------------------------------------------------------------------------------
# Now we can submit an inference job to generate a response.
if compiled_model:
    print("5. Submitting inference job...")
    try:
        # Define the input prompt.
        prompt_text = "What is the capital of Japan?"
        print(f"Input prompt: '{prompt_text}'")

        # The inference job expects a dictionary of inputs. We need to tokenize
        # the prompt and convert it to a NumPy array.
        input_tokens = tokenizer(prompt_text, return_tensors="pt").input_ids
        # We explicitly cast the NumPy array to int32 to match the compiled model's
        # expected data type, which is now int32.
        input_data = {"input_ids": [input_tokens.cpu().numpy().astype(np.int32)]}

        # Submit the inference job with the compiled model and input data.
        inference_job = hub.submit_inference_job(
            model=compiled_model,
            device=target_device,
            inputs=input_data
        )

        print("Inference job submitted.")
        try:
            print(f"Job ID: {inference_job.id}")
        except AttributeError:
            print("Could not retrieve job ID from the InferenceJob object.")
            print(f"You can check the job status and details on the AI Hub website using the URL: {inference_job.url}")

        # Wait for the inference job to complete.
        print("Waiting for inference to complete...")
        inference_job.wait()

        # If wait() completes successfully, the output is ready.
        print("Inference job completed successfully!")

        # Get the output data, which will be a dictionary of numpy arrays.
        # We've corrected the method name to download_output_data() and added a check.
        output_data = inference_job.download_output_data()

        # We can then process the output logits to get the generated text.
        # This part requires a custom generation loop, as the AI Hub returns
        # a single step of output. For a complete LLM inference, you would
        # loop this process. Here, we'll just show the raw output as an example.
        if 'logits' in output_data:
            logits = output_data['logits']
            # Here you would typically process the logits to generate the next token.
            # For demonstration, we'll just print a confirmation.
            print(f"Received logits with shape: {logits.shape}")
            print("The model has successfully processed the input on the NPU.")
            print("To generate a full response, you would need to implement a token generation loop.")
        else:
            print("Output data does not contain 'logits'. This may indicate a problem with the model's output configuration.")
            print("Here are all the keys found in the output data for debugging:")
            print(output_data.keys())


    except Exception as e:
        print(f"An error occurred during inference: {e}")
print("="*80)


3. Downloading and preparing model: HuggingFaceTB/SmolLM-135M-Instruct...
Exporting model to ONNX with a dummy input of shape: torch.Size([1, 7])...
Model successfully exported to SmolLM-135M-Instruct.onnx
4. Checking for device availability: QCS8550 (Proxy)...
Device 'QCS8550 (Proxy)' is available. Submitting compilation job...
Uploading SmolLM-135M-Instruct.onnx


100%|[34m██████████[0m| 622M/622M [00:06<00:00, 104MB/s]


Scheduled compile job (jgonomq4p) successfully. To see the status and results:
    https://app.aihub.qualcomm.com/jobs/jgonomq4p/

Compilation job submitted.
Could not retrieve job ID from the CompileJob object.
You can check the job status and details on the AI Hub website using the URL: https://app.aihub.qualcomm.com/jobs/jgonomq4p/
Waiting for compilation to complete...
Waiting for compile job (jgonomq4p) completion. Type Ctrl+C to stop waiting at any time.
