Let's export the trained model in ONNX and safetensors formats for compatibility with downstream inference engines. First, we'll define some variables.

In [1]:
# Name given to the exported artifacts (ONNX file, export folder, Hub repo).
model_name = "LightGPT-Small"

# Location of the base model checkpoint on disk.
checkpoint_path = "./checkpoints/checkpoint.pt"

# Optional LoRA checkpoint to fold into the export; leave as None to skip it.
# Example: "./checkpoints/lora_instruct.pt"
lora_path = None

# Folder that receives the exported files.
exports_path = "./exports"

Then, we'll load the base model checkpoint into memory from disk.

In [None]:
import torch

from model import LightGPT

# Deserialize the checkpoint onto the CPU. weights_only=True restricts
# unpickling to tensors and plain containers rather than arbitrary objects.
checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)

# Rebuild the network from the constructor arguments stored in the checkpoint.
# NOTE(review): the model is compiled *before* the weights are loaded --
# presumably so the state-dict keys match a checkpoint saved from a compiled
# module; confirm against the training script.
model = torch.compile(LightGPT(**checkpoint["model_args"]))
model.load_state_dict(checkpoint["model"])

print("Base checkpoint loaded successfully")

Now, we'll load any LoRA checkpoints we wish to incorporate into the exported model.

In [3]:
from model import LightGPTInstruct

# Only wrap the base model when a LoRA checkpoint path has been configured.
# Identity comparison (`is not None`) is the idiomatic None check.
if lora_path is not None:
    checkpoint = torch.load(lora_path, map_location="cpu", weights_only=True)

    # Wrap the base model using the adapter arguments stored in the checkpoint.
    model = LightGPTInstruct(model, **checkpoint["lora_args"])

    model = torch.compile(model)

    # strict=False tolerates keys missing from the checkpoint -- presumably it
    # holds only the LoRA parameters, not the base weights (TODO confirm).
    model.load_state_dict(checkpoint["lora"], strict=False)

    # Merge the loaded LoRA parameters into the model before exporting.
    model.merge_lora_parameters()

    print("LoRA checkpoint loaded successfully")

For the ONNX format, we'll use TorchDynamo to trace the FX graph of our model using some example data and then translate the intermediate representation to ONNX.

In [None]:
from os import makedirs, path

from model import ONNXModel

from torch.onnx import dynamo_export, ExportOptions

# Random token ids used only to trace the model. torch.randint's upper bound
# is exclusive, so passing vocabulary_size covers every id in
# [0, vocabulary_size - 1] (assumes vocabulary_size is the token count).
example_input = torch.randint(0, model.vocabulary_size, (1, 1024))

onnx_model = ONNXModel(model)  # Nicer inferencing API

onnx_model.eval()  # Turn off dropout and other train-time operations

export_options = ExportOptions(
    dynamic_shapes=True
)  # Necessary for variable batch and sequence lengths

onnx_model = dynamo_export(onnx_model, example_input, export_options=export_options)

# Make sure the destination folder exists; saving fails otherwise on a
# fresh checkout where ./exports has not been created yet.
makedirs(exports_path, exist_ok=True)

onnx_path = path.join(exports_path, f"{model_name}.onnx")

onnx_model.save(onnx_path)

print(f"Model saved to {onnx_path}")

Next, compare the logits produced by the PyTorch model with those produced by ONNX Runtime to confirm that they match within a small numerical tolerance.

In [None]:
import onnxruntime

from numpy.testing import assert_allclose

# Reference logits from the PyTorch model for the same example input.
pytorch_logits = model.predict(example_input).detach().numpy()

session = onnxruntime.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

# Ask the session for its input name instead of hard-coding the
# auto-generated one ("l_x_"), which depends on how the graph was traced.
input_name = session.get_inputs()[0].name

onnx_input = {input_name: example_input.numpy()}

# Passing None as the output list requests every graph output.
onnx_logits = session.run(None, onnx_input)

onnx_logits = onnx_logits[0]

# Loose tolerances: the two runtimes are not expected to agree bit-for-bit.
assert_allclose(pytorch_logits, onnx_logits, rtol=1e-2, atol=1e-03)

print("Looks good!")

Next, let's export the model in Hugging Face format so that it can be used with the Hugging Face ecosystem.

In [None]:
from os import path

# Destination folder for the export: <exports_path>/<model_name>.
hf_path = path.join(exports_path, model_name)
model.save_pretrained(hf_path)
print(f"Model saved to {hf_path}")

Lastly, we'll log in to the Hugging Face Hub and upload the model under our account.

In [None]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable rather than
# hard-coding it here, so it is never saved or committed with the notebook.
# When the variable is unset, None is passed and login() prompts for a token.
login(token=os.environ.get("HF_TOKEN"))

# Upload the model to the Hub under the repository name in model_name.
model.push_to_hub(model_name)