# Chapter-4 Model Optimization using ONNX Simplifier and ONNX Runtime

#### In this notebook, we will try to optimize GPT2 ONNX model using ONNX Simplifier and ONNX Runtime. We will also see the impact of these optimizations on the model.

## Part-1 : Export GPT2 ONNX Model

In [12]:
# Install prerequisites
!pip install onnx==1.18.0 onnxruntime==1.22.0 onnx-simplifier==0.4.36
!pip install netron==8.4.3 transformers==4.53.2



In [13]:
# Load GPT2 model from HuggingFace: https://huggingface.co/openai-community/gpt2

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# Encode the input text (prompt) into tokens
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
print("Input Ids shape: ", input_ids.shape)

# Generate text using the model
output = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode the generated tokens back into text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("--"*30)
print(f"Given input: {input_text}")
print(f"Generated output: {generated_text}")
print("--"*30)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input Ids shape:  torch.Size([1, 4])
------------------------------------------------------------
Given input: Once upon a time
Generated output: Once upon a time, the world was a place of great beauty and great danger. The world of the gods was the place where the great gods were born, and where they were to live.

The world that was created was not the same
------------------------------------------------------------


In [14]:
# Export GPT2 model to ONNX

import os
import torch
os.makedirs("./exported_models/", exist_ok=True)
static_shape_output_path = "./exported_models/gpt2_hf_static_shape.onnx"
dynamic_shape_output_path = "./exported_models/gpt2_hf_dynamic_shape.onnx"

# Export the model to ONNX with static shapes
dummy_static_input_ids = torch.ones([1, 128], dtype=torch.int32)
torch.onnx.export(
    model,
    args=(dummy_static_input_ids,),
    f=static_shape_output_path,
    kwargs={'logits_to_keep': None},
    input_names=["input_ids"],
    output_names=["logits"],
    opset_version=14
)

# Export the model to ONNX with dynamic shapes
torch.onnx.export(
    model,
    args=(input_ids,),
    f=dynamic_shape_output_path,
    kwargs={'logits_to_keep': None},
    input_names=["input_ids"],
    output_names=["logits"],
    # Dynamic axes for batch size and sequence length
    dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence_length"},  
                  "logits": {0: "batch_size", 1: "sequence_length"}},
    opset_version=14
)

print(f"Model with static shapes successfully exported to {static_shape_output_path}")
print(f"Model with dynamic shapes successfully exported to {dynamic_shape_output_path}")

Model with static shapes successfully exported to ./exported_models/gpt2_hf_static_shape.onnx
Model with dynamic shapes successfully exported to ./exported_models/gpt2_hf_dynamic_shape.onnx


In [15]:
# Visualize the exported model with static shapes

import IPython
import netron

port = 6006
netron.start(static_shape_output_path, port, browse=False)
IPython.display.IFrame(f"http://localhost:{port}", width=1000, height=500)

In [16]:
# Visualize the exported model with dynamic shapes

import IPython
import netron

port = 6006
netron.start(dynamic_shape_output_path, port, browse=False)
IPython.display.IFrame(f"http://localhost:{port}", width=1000, height=500)

In [17]:
# Below code is used to compare the original model with optimized model

import onnxruntime as ort
import numpy as np
import random
import time

def check_performance(model_path, input_data, num_iter=100):
    # Perform inference and measure timing
    session = ort.InferenceSession(model_path)
    start = time.time()
    for i in range(num_iter):
        outputs = session.run(None, input_data)
    end = time.time()

    time_diff = (end-start)/num_iter
    print(f"Inference time: {time_diff:.4f} seconds")

## Part-2 : Optimize model using ONNX Simplifier

In [18]:
import onnx
from onnxsim import simplify

def optimize_model_using_simplifier(model_path, output_path):
    # Load onnx model
    onnx_model = onnx.load(model_path)

    # Simplify model using ONNX Simplifier
    simplified_model, status = simplify(onnx_model)

    # Save simplified model
    onnx.save(simplified_model, output_path)
    print(f"Before Nodes: {len(onnx_model.graph.node)}")
    print(f"After Nodes: {len(simplified_model.graph.node)}")

print("Model with static shapes:")
opt_model_onnxsim_static_shape = "./exported_models/gpt2_hf_static_shapes_onnxsim.onnx"
optimize_model_using_simplifier(static_shape_output_path, opt_model_onnxsim_static_shape)

print("Model with dynamic shapes:")
opt_model_onnxsim_dynamic_shape = "./exported_models/gpt2_hf_dynamic_shapes_onnxsim.onnx"
optimize_model_using_simplifier(dynamic_shape_output_path, opt_model_onnxsim_dynamic_shape)

Model with static shapes:
Before Nodes: 1168
After Nodes: 673
Model with dynamic shapes:
Before Nodes: 2766
After Nodes: 1435


In [19]:
# Visualize the optimized model with static shapes

import IPython
import netron

port = 6006
netron.start(opt_model_onnxsim_static_shape, port, browse=False)
IPython.display.IFrame(f"http://localhost:{port}", width=1000, height=500)

In [20]:
# Visualize the optimized model with dynamic shapes

import IPython
import netron

port = 6006
netron.start(opt_model_onnxsim_dynamic_shape, port, browse=False)
IPython.display.IFrame(f"http://localhost:{port}", width=1000, height=500)

In [21]:
# Let us make a dummy input tensor of shape [1, 128] for checking the performance of the models
input_data_for_static_shape = {"input_ids" : np.random.randint(low=0, high=100, size=(1, 128), dtype=np.int32)}

# Check performance for static shape model
print("Original model with static shapes")
check_performance(static_shape_output_path, input_data_for_static_shape)

print("Optimized model with static shapes")
check_performance(opt_model_onnxsim_static_shape, input_data_for_static_shape)

Original model with static shapes
Inference time: 0.6316 seconds
Optimized model with static shapes
Inference time: 0.5367 seconds


In [22]:
input_data_for_dynamic_shape = {"input_ids" : np.random.randint(low=0, high=100, size=(1, 128), dtype=np.int64)}

# Check performance for dynamic shape model
print("Original model with dynamic shapes")
check_performance(dynamic_shape_output_path, input_data_for_dynamic_shape)

print("Optimized model with dynamic shapes")
check_performance(opt_model_onnxsim_dynamic_shape, input_data_for_dynamic_shape)

Original model with dynamic shapes
Inference time: 0.5671 seconds
Optimized model with dynamic shapes
Inference time: 0.5315 seconds


## Part-3 : Optimize model using ONNX Runtime

In [23]:
import onnxruntime as rt

def optimize_model_using_ort(model_path, output_path):
    # Load onnx model
    sess_options = rt.SessionOptions()
    sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

    # Below are the different levels of optimizations in onnxruntime
    # rt.GraphOptimizationLevel.ORT_DISABLE_ALL -> Disables all optimizations
    # rt.GraphOptimizationLevel.ORT_ENABLE_BASIC -> Enables basic optimizations
    # rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED -> Enables basic and extended optimizations
    # rt.GraphOptimizationLevel.ORT_ENABLE_ALL -> Enables all available optimizations including layout optimizations

    # To enable model serialization after graph optimization set this
    sess_options.optimized_model_filepath = output_path

    session = rt.InferenceSession(model_path, sess_options)

    # No need to run the model. Initializing the session will generate the optimized model

    orig_model = onnx.load(model_path)
    opt_model = onnx.load(output_path)
    print(f"Before Nodes: {len(orig_model.graph.node)}")
    print(f"After Nodes: {len(opt_model.graph.node)}")


In [24]:
print("Model with static shapes:")
opt_model_ort_static_shape = "./exported_models/gpt2_hf_static_shapes_ort.onnx"
optimize_model_using_ort(static_shape_output_path, opt_model_ort_static_shape)

print("Model with dynamic shapes:")
opt_model_ort_dynamic_shape = "./exported_models/gpt2_hf_dynamic_shapes_ort.onnx"
optimize_model_using_ort(dynamic_shape_output_path, opt_model_ort_dynamic_shape)

Model with static shapes:
Before Nodes: 1168
After Nodes: 365
Model with dynamic shapes:
Before Nodes: 2766
After Nodes: 1120


In [25]:
# Visualize the optimized model with dynamic shapes

import IPython
import netron

port = 6006
netron.start(opt_model_ort_static_shape, port, browse=False)
IPython.display.IFrame(f"http://localhost:{port}", width=1000, height=500)

In [26]:
# Visualize the optimized model with dynamic shapes

import IPython
import netron

port = 6006
netron.start(opt_model_ort_dynamic_shape, port, browse=False)
IPython.display.IFrame(f"http://localhost:{port}", width=1000, height=500)

In [27]:
# Check performance for static shape model
print("Original model with static shapes")
check_performance(static_shape_output_path, input_data_for_static_shape)

print("Optimized model with static shapes")
check_performance(opt_model_ort_static_shape, input_data_for_static_shape)

Original model with static shapes
Inference time: 0.5829 seconds
Optimized model with static shapes
Inference time: 0.5348 seconds


In [28]:
# Check performance for dynamic shape model
print("Original model with dynamic shapes")
check_performance(dynamic_shape_output_path, input_data_for_dynamic_shape)

print("Optimized model with dynamic shapes")
check_performance(opt_model_ort_dynamic_shape, input_data_for_dynamic_shape)

Original model with dynamic shapes
Inference time: 0.5383 seconds
Optimized model with dynamic shapes
Inference time: 0.5283 seconds
