In [1]:
# Command to free neuron core (forces Kernel restart)
# import IPython
# IPython.Application.instance().kernel.do_shutdown(True)

In [1]:
# Folder the input images are read from.
IMAGE_FOLDER = 'images'

# When True, read_test_image resizes every image to RESIZE_DIMENSIONS.
RESIZE_IMAGE = True
# RESIZE_DIMENSIONS = [968, 1296]
# Target size as [height, width] (read_test_image swaps them for cv2.resize).
RESIZE_DIMENSIONS = [1080, 1920]

# Passed to SuperPoint as "max_keypoints".
NUM_KEYPOINTS = 2000
# Torch device the models and input tensors are moved to.
DEVICE = 'cpu'

# Input preparation

## Load images

In [2]:
import torch
import cv2
import os
import pprint
import sys

sys.path.append('/home/ubuntu/SuperGluePretrainedNetwork')

from models.superpoint import SuperPoint
from models.superglue import SuperGlue
from torch.utils.benchmark import Timer

In [3]:
# Inference only: switch autograd off globally for every following cell.
torch.set_grad_enabled(False)
print(f"Running inference on device {DEVICE}")

Running inference on device cpu


In [4]:
# Helper functions
def group_image_files(folder):
    """Group the entry names found in ``folder`` by their prefix.

    The prefix is the part of the name before the first underscore. Names
    without an underscore are all collected under ``'no_underscore'``.

    Args:
        folder: Path of the directory to list.

    Returns:
        list[list[str]]: One list of names per prefix. Names are processed in
        sorted order, so the grouping is deterministic.
    """
    groups = {}
    # List the folder that was passed in (not the global IMAGE_FOLDER) and sort,
    # because os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(folder)):
        prefix = file.split('_')[0] if '_' in file else 'no_underscore'
        groups.setdefault(prefix, []).append(file)

    return list(groups.values())

def read_test_image(file_path, resize_image=False, resize_to_dimensions=[968, 1296]):
    """Read an image as grayscale and prepare it as a model input.

    Args:
        file_path: Path of the image file to read.
        resize_image: When True, resize the image and save a copy of the
            resized image in an ``image_resized`` folder located next to the
            folder that contains ``file_path``.
        resize_to_dimensions: Target size as ``[height, width]``. Only read,
            never modified.

    Returns:
        dict: ``'file_name'`` (base name of ``file_path``), ``'image'`` (the
        grayscale array; float32 after a resize) and ``'inp'`` (the image
        scaled by 1/255 as a float tensor with two leading singleton
        dimensions, moved to ``DEVICE``).

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``file_path``.
    """
    print(f"Reading image from {file_path}")
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise FileNotFoundError(f"Could not read an image from {file_path}")
    print(f"Original image size: {image.shape}")

    # basename works with any path separator, unlike splitting on "/".
    file_name = os.path.basename(file_path)

    if resize_image:
        target_height, target_width = resize_to_dimensions[0], resize_to_dimensions[1]
        # cv2.resize expects the size as (width, height).
        image = cv2.resize(image.astype('float32'), (target_width, target_height))
        print(f"Image resized to {image.shape}")
        # Save the resized image
        original_folder_path = os.path.dirname(file_path)
        parent_folder_path = os.path.dirname(original_folder_path)
        resized_folder_path = os.path.join(parent_folder_path, 'image_resized')
        os.makedirs(resized_folder_path, exist_ok=True)
        resized_file_name = f"resized_{resize_to_dimensions[0]}_{resize_to_dimensions[1]}_{file_name}"
        resized_file_path = os.path.join(resized_folder_path, resized_file_name)
        # The encoder only supports 8-bit data here; convert explicitly rather
        # than relying on imwrite's fallback conversion (which logs a warning).
        cv2.imwrite(resized_file_path, image.round().clip(0, 255).astype('uint8'))
        print(f"Saved resized image to {resized_file_path}")
    else:
        print("Not resizing image")

    return {
        "file_name": file_name,
        'image': image,
        'inp': torch.from_numpy(image/255.).float()[None, None].to(DEVICE)
    }


In [5]:
# Load and preprocess the four test images (img1/img2 and img3/img4 form the two pairs).
test_groups = group_image_files(IMAGE_FOLDER)

img1, img2, img3, img4 = [
    read_test_image(
        file_path=os.path.join(IMAGE_FOLDER, image_name),
        resize_image=RESIZE_IMAGE,
        resize_to_dimensions=RESIZE_DIMENSIONS,
    )
    for image_name in ("IMG_9320.jpg", "IMG_9321.jpg", "IMG_9323.jpg", "IMG_9324.jpg")
]

Reading image from images/IMG_9320.jpg
Original image size: (3024, 4032)
Image resized to (1080, 1920)
Saved resized image to image_resized/resized_1080_1920_IMG_9320.jpg
Reading image from images/IMG_9321.jpg
Original image size: (3024, 4032)
Image resized to (1080, 1920)
Saved resized image to image_resized/resized_1080_1920_IMG_9321.jpg
Reading image from images/IMG_9323.jpg
Original image size: (3024, 4032)
Image resized to (1080, 1920)
Saved resized image to image_resized/resized_1080_1920_IMG_9323.jpg
Reading image from images/IMG_9324.jpg
Original image size: (3024, 4032)
Image resized to (1080, 1920)
Saved resized image to image_resized/resized_1080_1920_IMG_9324.jpg


[ WARN:0@5.552] global loadsave.cpp:1063 imwrite_ Unsupported depth image for selected encoder is fallbacked to CV_8U.


## SuperPoint inference

In [6]:
# SuperPoint in eval mode on DEVICE, configured with NUM_KEYPOINTS as "max_keypoints".
super_point_model = SuperPoint( {"max_keypoints":NUM_KEYPOINTS}).eval().to(DEVICE)

Loaded SuperPoint model


In [7]:
# Process images with SuperPoint
def process_image_with_superpoint(image):
    """Run the module-level ``super_point_model`` on one loaded image.

    Args:
        image: Dict with an ``'inp'`` tensor (fed to the model) and a
            ``'file_name'`` (used for the printed report).

    Returns:
        The model output; the shapes of its first ``'keypoints'`` and
        ``'descriptors'`` entries are printed.
    """
    superpoint_output = super_point_model({'image': image['inp']})
    print(f"\n{image['file_name']}:")
    first_keypoints = superpoint_output['keypoints'][0]
    print(f"\tKey Points shape: {first_keypoints.shape}")
    first_descriptors = superpoint_output['descriptors'][0]
    print(f"\tDescriptors shape: {first_descriptors.shape}")
    return superpoint_output

In [8]:
# Extract SuperPoint keypoints/descriptors for each of the four images, in order.
points_image_1, points_image_2, points_image_3, points_image_4 = [
    process_image_with_superpoint(loaded_image)
    for loaded_image in (img1, img2, img3, img4)
]


IMG_9320.jpg:
	Key Points shape: torch.Size([2000, 2])
	Descriptors shape: torch.Size([256, 2000])

IMG_9321.jpg:
	Key Points shape: torch.Size([2000, 2])
	Descriptors shape: torch.Size([256, 2000])

IMG_9323.jpg:
	Key Points shape: torch.Size([2000, 2])
	Descriptors shape: torch.Size([256, 2000])

IMG_9324.jpg:
	Key Points shape: torch.Size([2000, 2])
	Descriptors shape: torch.Size([256, 2000])


In [14]:
def save_superpoint_outputs(points, file_name, folder='assets/superpoint_outputs'):
    """Write ``points`` to ``folder`` with ``torch.save``.

    ``.pt`` is appended to ``file_name`` when it is missing, and ``folder`` is
    created if it does not exist yet.
    """
    target_name = file_name if file_name.endswith('.pt') else file_name + '.pt'
    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, target_name)
    torch.save(points, file_path)
    print(f"saved superpoints outputs {file_path}")

# One file per image, named <image stem>_<NUM_KEYPOINTS>_<height>_<width>.
for points_image, img in zip(
    [points_image_1, points_image_2, points_image_3, points_image_4],
    [img1, img2, img3, img4],
):
    save_superpoint_outputs(
        points_image,
        f'{img["file_name"].split(".")[0]}_{NUM_KEYPOINTS}_{img["image"].shape[0]}_{img["image"].shape[1]}',
    )

saved superpoints outputs assets/superpoint_outputs/IMG_9320_2000_1080_1920.pt
saved superpoints outputs assets/superpoint_outputs/IMG_9321_2000_1080_1920.pt
saved superpoints outputs assets/superpoint_outputs/IMG_9323_2000_1080_1920.pt
saved superpoints outputs assets/superpoint_outputs/IMG_9324_2000_1080_1920.pt


In [10]:
def load_points_image(file_name='points_image_1', folder='assets/superpoint_outputs'):
    """Load a file written by ``torch.save`` from ``folder``.

    ``.pt`` is appended to ``file_name`` when it is missing.

    Raises:
        FileNotFoundError: If the resulting path does not exist.
    """
    suffix = '' if file_name.endswith('.pt') else '.pt'
    file_path = os.path.join(folder, file_name + suffix)

    if os.path.exists(file_path):
        points = torch.load(file_path)
        print(f"Points loaded from {file_path}")
        return points

    raise FileNotFoundError(f"No file found at {file_path}")

# Reload the saved SuperPoint outputs, using the same file-name scheme as the save step.
points_image_1, points_image_2, points_image_3, points_image_4 = [
    load_points_image(
        f'{loaded_image["file_name"].split(".")[0]}_{NUM_KEYPOINTS}_{loaded_image["image"].shape[0]}_{loaded_image["image"].shape[1]}'
    )
    for loaded_image in (img1, img2, img3, img4)
]

Points loaded from assets/superpoint_outputs/IMG_9320_2000_1080_1920.pt
Points loaded from assets/superpoint_outputs/IMG_9321_2000_1080_1920.pt
Points loaded from assets/superpoint_outputs/IMG_9323_2000_1080_1920.pt
Points loaded from assets/superpoint_outputs/IMG_9324_2000_1080_1920.pt


## Prepare SuperGlue inputs

In [9]:
import torch

def make_superglue_input(image_pairs):
    """
    Build one batched SuperGlue input dictionary from pairs of images.

    :param image_pairs: A list whose items are
                        ``((superpoint_data_0, img_data_0), (superpoint_data_1, img_data_1))``.
                        The ``'keypoints'``, ``'descriptors'`` and ``'scores'`` entries of
                        the SuperPoint data and the ``'inp'`` tensor of the image data
                        are collected.
    :return: A dictionary whose values are stacked along a new leading batch dimension.
    """
    # (output key, side of the pair, SuperPoint field), in output order.
    layout = (
        ('keypoints0', 0, 'keypoints'),
        ('keypoints1', 1, 'keypoints'),
        ('descriptors0', 0, 'descriptors'),
        ('descriptors1', 1, 'descriptors'),
        ('scores0', 0, 'scores'),
        ('scores1', 1, 'scores'),
    )
    batched = {key: [] for key, _, _ in layout}
    batched['image0'] = []
    batched['image1'] = []

    for (points_a, image_a), (points_b, image_b) in image_pairs:
        sides = (points_a, points_b)
        for key, side, field in layout:
            # extend() iterates its argument, exactly like `list += value`.
            batched[key].extend(sides[side][field])
        batched['image0'].extend(image_a['inp'])
        batched['image1'].extend(image_b['inp'])

    # Stack the collected tensors along a new batch dimension (dim=0).
    for key, collected in batched.items():
        if isinstance(collected, (list, tuple)):
            batched[key] = torch.stack(collected, dim=0)
    for key, tensor in batched.items():
        print(f"{key} shape: {tensor.shape}")

    return batched

In [10]:
# Batch size 1: each image pair becomes its own SuperGlue input dictionary.
superglue_raw_input_bs1_1 = [((points_image_1, img1), (points_image_2, img2))] # pair 1 (img1 and img2)
superglue_raw_input_bs1_2 = [((points_image_3, img3), (points_image_4, img4))] # pair 2 (img3 and img4)

print("\nBatch size 1 - Pair 1")
super_glue_input_bs1_1 = make_superglue_input(superglue_raw_input_bs1_1)
print("\nBatch size 1 - Pair 2")
super_glue_input_bs1_2 = make_superglue_input(superglue_raw_input_bs1_2)


Batch size 1 - Pair 1
keypoints0 shape: torch.Size([1, 2000, 2])
keypoints1 shape: torch.Size([1, 2000, 2])
descriptors0 shape: torch.Size([1, 256, 2000])
descriptors1 shape: torch.Size([1, 256, 2000])
scores0 shape: torch.Size([1, 2000])
scores1 shape: torch.Size([1, 2000])
image0 shape: torch.Size([1, 1, 1080, 1920])
image1 shape: torch.Size([1, 1, 1080, 1920])

Batch size 1 - Pair 2
keypoints0 shape: torch.Size([1, 2000, 2])
keypoints1 shape: torch.Size([1, 2000, 2])
descriptors0 shape: torch.Size([1, 256, 2000])
descriptors1 shape: torch.Size([1, 256, 2000])
scores0 shape: torch.Size([1, 2000])
scores1 shape: torch.Size([1, 2000])
image0 shape: torch.Size([1, 1, 1080, 1920])
image1 shape: torch.Size([1, 1, 1080, 1920])


In [11]:
# Batch size 2: both image pairs are stacked into a single SuperGlue input dictionary.
superglue_raw_input_bs2 = [((points_image_1, img1), (points_image_2, img2)), ((points_image_3, img3), (points_image_4, img4))]

print("\nBatch size 2 - Pair 1 & 2")
super_glue_input_bs2 = make_superglue_input(superglue_raw_input_bs2)


Batch size 2 - Pair 1 & 2
keypoints0 shape: torch.Size([2, 2000, 2])
keypoints1 shape: torch.Size([2, 2000, 2])
descriptors0 shape: torch.Size([2, 256, 2000])
descriptors1 shape: torch.Size([2, 256, 2000])
scores0 shape: torch.Size([2, 2000])
scores1 shape: torch.Size([2, 2000])
image0 shape: torch.Size([2, 1, 1080, 1920])
image1 shape: torch.Size([2, 1, 1080, 1920])


## Benchmark Utils

In [12]:
import matplotlib.cm as cm
import numpy as np

def plot_matches(
        img_0,
        img_1,
        kpts_0,
        kpts_1,
        matches_0,
        scores_0,
        num_keypoints,
        hardware_type,
    ):
    """Draw the matches between two images and save them under ``assets/matches``.

    Args:
        img_0, img_1: Image dicts with ``'image'`` and ``'file_name'`` entries.
        kpts_0, kpts_1: Keypoint tensors of the two images.
        matches_0: For every keypoint of image 0, the index of its match in
            image 1; entries equal to -1 are not drawn.
        scores_0: Per-keypoint values mapped through the ``jet`` colormap to
            colour the match lines.
        num_keypoints, hardware_type: Only used to build the output file name.
    """
    keypoints_0 = kpts_0.cpu().numpy()
    keypoints_1 = kpts_1.cpu().numpy()
    # Index image-1 keypoints by match index; rows for -1 entries are skipped when drawing.
    matched_keypoints_1 = keypoints_1[matches_0.cpu().numpy()]
    line_colors = cm.jet(scores_0.cpu())

    filename=f'{img_0["file_name"].split(".")[0]}_{img_1["file_name"].split(".")[0]}_matches_{hardware_type}_{num_keypoints}_{img_0["image"].shape[0]}_{img_0["image"].shape[1]}'

    os.makedirs('assets/matches', exist_ok=True)
    path = os.path.join('assets/matches', f'{filename}.png')
    make_matching_plot(
        img_0['image'],
        img_1['image'],
        keypoints_0,
        keypoints_1,
        keypoints_0,
        matched_keypoints_1,
        line_colors,
        [],
        matches_0,
        path,
        opencv_display=False,
    )

# Visualize the matches.
def make_matching_plot(image0, image1, kpts0, kpts1, mkpts0,
                            mkpts1, color, text, matches_0, path=None,
                            show_keypoints=False, margin=10,
                            opencv_display=False, opencv_title='',
                            small_text=[]):
    """Render two grayscale images side by side with their matches drawn on top.

    Args:
        image0, image1: 2-D image arrays; ``image1`` is placed to the right of
            ``image0`` with ``margin`` white columns in between.
        kpts0, kpts1: Keypoints, drawn only when ``show_keypoints`` is True.
        mkpts0, mkpts1: Line end-points in image 0 / image 1, one row per match.
        color: Per-match colours; the first three channels are scaled by 255
            and reversed before drawing.
        text: Lines of large text for the top-left corner.
        matches_0: Per-match index; rows whose entry is -1 are skipped.
        path: Output file; nothing is written when None.
        opencv_display, opencv_title: Show the result in an OpenCV window.
        small_text: Lines of small text for the bottom-left corner (only read).

    Returns:
        The rendered 3-channel image.
    """
    height0, width0 = image0.shape
    height1, width1 = image1.shape
    canvas_height = max(height0, height1)
    canvas_width = width0 + width1 + margin

    # White canvas with both images pasted in, then replicated to 3 channels.
    canvas = np.full((canvas_height, canvas_width), 255, dtype=np.uint8)
    canvas[:height0, :width0] = image0
    canvas[:height1, width0+margin:] = image1
    out = np.stack([canvas, canvas, canvas], -1)

    if show_keypoints:
        kpts0 = np.round(kpts0).astype(int)
        kpts1 = np.round(kpts1).astype(int)
        white = (255, 255, 255)
        black = (0, 0, 0)
        # Image-1 keypoints are shifted right by the width of image 0 plus the margin.
        for keypoints, x_shift in ((kpts0, 0), (kpts1, margin + width0)):
            for x, y in keypoints:
                cv2.circle(out, (x + x_shift, y), 2, black, -1, lineType=cv2.LINE_AA)
                cv2.circle(out, (x + x_shift, y), 1, white, -1, lineType=cv2.LINE_AA)

    mkpts0 = np.round(mkpts0).astype(int)
    mkpts1 = np.round(mkpts1).astype(int)
    color = (np.array(color[:, :3])*255).astype(int)[:, ::-1]
    for (x0, y0), (x1, y1), match_color, match_index in zip(mkpts0, mkpts1, color, matches_0):
        if match_index == -1:  # no valid match for this keypoint
            continue
        match_color = match_color.tolist()
        start_point = (x0, y0)
        end_point = (x1 + margin + width0, y1)
        cv2.line(out, start_point, end_point,
                 color=match_color, thickness=1, lineType=cv2.LINE_AA)
        # display line end-points as circles
        cv2.circle(out, start_point, 2, match_color, -1, lineType=cv2.LINE_AA)
        cv2.circle(out, end_point, 2, match_color, -1, lineType=cv2.LINE_AA)

    # Scale factor for consistent visualization across scales.
    sc = min(canvas_height / 640., 2.0)
    txt_color_fg = (255, 255, 255)
    txt_color_bg = (0, 0, 0)

    def draw_outlined_text(label, origin, font_scale):
        """Draw ``label`` twice: a thick dark pass, then a thin light pass on top."""
        cv2.putText(out, label, origin, cv2.FONT_HERSHEY_DUPLEX,
                    font_scale, txt_color_bg, 2, cv2.LINE_AA)
        cv2.putText(out, label, origin, cv2.FONT_HERSHEY_DUPLEX,
                    font_scale, txt_color_fg, 1, cv2.LINE_AA)

    # Big text.
    big_line_height = int(30 * sc)
    for row, label in enumerate(text):
        draw_outlined_text(label, (int(8*sc), big_line_height*(row+1)), 1.0*sc)

    # Small text.
    small_line_height = int(18 * sc)
    for row, label in enumerate(reversed(small_text)):
        draw_outlined_text(
            label,
            (int(8*sc), int(canvas_height-small_line_height*(row+.6))),
            0.5*sc,
        )

    if path is None:
        print("Not writing to Path")
    else:
        print(f"Writing to Path {str(path)}")
        cv2.imwrite(str(path), out)

    if not opencv_display:
        print("Not Displaying")
    else:
        print("Displaying")
        cv2.imshow(opencv_title, out)
        cv2.waitKey(1)

    return out

In [13]:
import time

def run_benchmark(model_to_benchmark, super_glue_input, iterations=10, warm_up=20):
    """Measure the average wall-clock time of one model call.

    Args:
        model_to_benchmark: Callable invoked as ``model_to_benchmark(super_glue_input)``.
        super_glue_input: The single argument passed to every call.
        iterations: Number of timed calls; must be positive.
        warm_up: Number of untimed calls made before timing starts.

    Returns:
        float: Average time per timed call, in seconds.

    Raises:
        ValueError: If ``iterations`` is not positive (the average would be undefined).
    """
    if iterations <= 0:
        raise ValueError(f"iterations must be a positive integer, got {iterations}")

    # Warm-up phase
    print("Warming up...")
    for _ in range(warm_up):
        model_to_benchmark(super_glue_input)

    # Benchmark phase
    print(f"Running benchmark for {iterations} iterations...")
    total_time = 0.0

    for _ in range(iterations):
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        start_time = time.perf_counter()
        model_to_benchmark(super_glue_input)
        total_time += time.perf_counter() - start_time

    average_time = total_time / iterations
    print(f"\nAverage time: {average_time:.6f} seconds")
    print(f"Total time for {iterations} iterations: {total_time:.6f} seconds")

    return average_time

In [26]:
import pickle


def save_data(data, filepath):
    """Pickle ``data`` to ``filepath``, creating the parent directory if needed.

    Args:
        data: Any picklable object.
        filepath: Destination file; may be a bare file name in the current directory.
    """
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filepath, 'wb') as f:
        pickle.dump(data, f)
    print(f"Data saved in {filepath}")

def load_data(filepath):
    """Return the object unpickled from ``filepath``.

    NOTE(review): pickle can execute arbitrary code while loading; only use
    this on files produced by this notebook.
    """
    with open(filepath, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Benchmark

## CPU

In [14]:
# SuperGlue with its default configuration, moved to DEVICE.
super_glue_model = SuperGlue({}).to(DEVICE)

Loaded SuperGlue model ("indoor" weights)


### Single execution

Note: on CPU, running SuperGlue with batch size 2 produces results that do not match the batch-size-1 results for the same image pairs.
For that reason, the CPU runs below use batch size 1 only.

In [15]:
# CPU reference result for pair 1 (img1 / img2).
cpu_matches_bs1_1 = super_glue_model(super_glue_input_bs1_1)

In [16]:
# CPU reference result for pair 2 (img3 / img4).
cpu_matches_bs1_2 = super_glue_model(super_glue_input_bs1_2)

In [20]:
# Persist both CPU results as pickle files.
save_data(cpu_matches_bs1_1, 'pkl_files/matches/cpu_bs1_1.pkl')
save_data(cpu_matches_bs1_2, 'pkl_files/matches/cpu_bs1_2.pkl')

Data saved in pkl_files/matches/cpu_bs1_1.pkl
Data saved in pkl_files/matches/cpu_bs1_2.pkl


In [17]:
# Plot the CPU matches for pair 1 (img1 / img2).
# NOTE(review): scores_0 is the SuperPoint 'scores0' input, so the line colours
# reflect keypoint scores rather than the SuperGlue 'matching_scores0' — confirm
# this is intended.
plot_matches(
        img_0= img1,
        img_1 = img2,
        kpts_0 = super_glue_input_bs1_1['keypoints0'][0],
        kpts_1 = super_glue_input_bs1_1['keypoints1'][0],
        matches_0 = cpu_matches_bs1_1['matches0'][0],
        scores_0 = super_glue_input_bs1_1['scores0'][0],
        num_keypoints = NUM_KEYPOINTS,
        hardware_type = "cpu"
)

Writing to Path assets/matches/IMG_9320_IMG_9321_matches_cpu_2000_1080_1920.png
Not Displaying


In [18]:
# Plot the CPU matches for pair 2 (img3 / img4).
# NOTE(review): scores_0 is the SuperPoint 'scores0' input, so the line colours
# reflect keypoint scores rather than the SuperGlue 'matching_scores0' — confirm
# this is intended.
plot_matches(
        img_0= img3,
        img_1 = img4,
        kpts_0 = super_glue_input_bs1_2['keypoints0'][0],
        kpts_1 = super_glue_input_bs1_2['keypoints1'][0],
        matches_0 = cpu_matches_bs1_2['matches0'][0],
        scores_0 = super_glue_input_bs1_2['scores0'][0],
        num_keypoints = NUM_KEYPOINTS,
        hardware_type = "cpu"
)

Writing to Path assets/matches/IMG_9323_IMG_9324_matches_cpu_2000_1080_1920.png
Not Displaying


### Multiple executions

In [None]:
# CPU baseline on pair 1: 10 warm-up calls followed by 10 timed calls.
average_time = run_benchmark(super_glue_model, super_glue_input_bs1_1, iterations=10, warm_up=10)

## Neuron

In [19]:
# Detect whether torch_neuronx can be imported; the Neuron cells below check
# `neuron_library_available` before using it.
try:
    import torch_neuronx
    neuron_library_available = True
except ImportError:
    neuron_library_available = False

In [20]:
import shutil
from pathlib import Path

def compile_neuron_model(super_glue_model, super_glue_input, neuron_model_filename, flags=None):
    """Return a Neuron-traced model, compiling it only when no saved copy exists.

    The traced model is stored at ``models/<neuron_model_filename>.pt``. When
    that file already exists it is loaded as is; the compiler work directory
    and the Neuron compile cache are left untouched in that case, so artifacts
    of the previous compilation stay available.

    Args:
        super_glue_model: Model passed to ``torch_neuronx.trace``.
        super_glue_input: Example input used for tracing.
        neuron_model_filename: File name (without ``.pt``) of the saved model.
        flags: Compiler arguments; defaults to no extra arguments.

    Returns:
        The model loaded back with ``torch.jit.load``.
    """
    flags = [] if flags is None else flags

    full_model_path = f'models/{neuron_model_filename}.pt'
    os.makedirs('models', exist_ok=True)

    if os.path.isfile(full_model_path):
        print(f'Pre-Saved Neuron Model File exists at: {full_model_path}')
        return torch.jit.load(full_model_path)

    print("Pre-Saved Neuron Model File does not exist. Compiling it")

    # Imported here so that defining this function does not fail on machines
    # without the Neuron SDK; it is only needed when compiling.
    import torch_neuronx

    # Start every compilation from an empty work directory.
    compiler_workdir = 'custom_neuron_workdir'
    compiler_workdir_path = Path.cwd() / compiler_workdir
    if os.path.exists(compiler_workdir_path):
        print(f'Clearing compiler dir: {compiler_workdir_path}')
        shutil.rmtree(compiler_workdir_path)
    print(f'Making compiler dir: {compiler_workdir_path}')
    compiler_workdir_path.mkdir(exist_ok=True)
    print(f'Setting compiler dir: {compiler_workdir_path.absolute()}')

    print("Clearing neuron cache")
    cache_path = "/var/tmp/neuron-compile-cache"
    # Best effort: a cache that cannot be removed must not block compilation.
    try:
        if os.path.exists(cache_path):
            shutil.rmtree(cache_path)
            print(f"Successfully removed {cache_path}")
        else:
            print(f"Folder {cache_path} does not exist")
    except Exception as e:
        print(f"Error removing {cache_path}: {e}")

    print(f'Compiling with flags: {flags}')
    neuron_traced_super_glue_model = torch_neuronx.trace(
        super_glue_model,
        super_glue_input,
        compiler_args=flags,
        compiler_workdir=compiler_workdir_path.absolute(),
    )
    torch.jit.save(neuron_traced_super_glue_model, full_model_path)

    return torch.jit.load(full_model_path)


def create_random_inputs(nb_keypoints=1000, image_size=(1920, 1080)):
    """
    Build a batch-of-one SuperGlue input dictionary filled with uniform random values.

    Args:
        nb_keypoints: Number of keypoints per image
        image_size: Trailing two dimensions of the image tensors (height, width)

    Returns:
        dict: ``keypoints``, ``descriptors``, ``scores`` and ``image`` tensors,
        each present with suffix ``0`` and ``1``
    """
    keypoints_shape = (1, nb_keypoints, 2)
    descriptors_shape = (1, 256, nb_keypoints)
    scores_shape = (1, nb_keypoints)
    image_shape = (1, 1, *image_size)

    random_inputs = {}
    random_inputs['keypoints0'] = torch.rand(keypoints_shape)
    random_inputs['keypoints1'] = torch.rand(keypoints_shape)
    random_inputs['descriptors0'] = torch.rand(descriptors_shape)
    random_inputs['descriptors1'] = torch.rand(descriptors_shape)
    random_inputs['scores0'] = torch.rand(scores_shape)
    random_inputs['scores1'] = torch.rand(scores_shape)
    random_inputs['image0'] = torch.rand(image_shape)
    random_inputs['image1'] = torch.rand(image_shape)
    return random_inputs

In [21]:
# Random example input, shaped like one real image pair.
random_inputs = create_random_inputs(nb_keypoints=NUM_KEYPOINTS, image_size=img1["image"].shape)


In [22]:
def flags_to_filename_string(flags):
    """
    Turn a list of compiler flags into a string that is safe to use in a file name.

    Args:
    flags (list): List of flag strings.

    Returns:
    str: The flags without leading dashes, joined by underscores, with every
    remaining '-' and '=' replaced by '_'.
    """
    pieces = []
    for flag in flags:
        without_leading_dashes = flag.lstrip('-')
        pieces.append(without_leading_dashes.replace('-', '_'))

    # '=' is replaced last, after joining, to avoid issues in some file systems.
    return '_'.join(pieces).replace('=', '_')

In [23]:
# Compiler flags; they are also encoded in the saved model's file name, so a
# different flag set produces a different file.
flags = ['--auto-cast=none', '--model-type=unet-inference']
#flags = ['--auto-cast=none', '--model-type=generic']
if neuron_library_available:
    neuron_model = compile_neuron_model(super_glue_model, random_inputs, f"neuron_model_fp32_{NUM_KEYPOINTS}_{img1['image'].shape[0]}_{img1['image'].shape[1]}_{flags_to_filename_string(flags)}", flags= flags)


Making compiler dir: /home/ubuntu/superglue-neuron/custom_neuron_workdir
Setting compiler dir: /home/ubuntu/superglue-neuron/custom_neuron_workdir
Pre-Saved Neuron Model File does not exist. Compiling it
Clearing neuron cache
Folder /var/tmp/neuron-compile-cache does not exist
Compiling with flags: ['--auto-cast=none', '--model-type=unet-inference']
2025-07-10 10:39:49.000271:  8518  INFO ||NEURON_CC_WRAPPER||: Call compiler with cmd: neuronx-cc compile --framework=XLA /tmp/ubuntu/neuroncc_compile_workdir/bce750bc-8337-4a11-b065-6466159dedd6/model.MODULE_10682217524436566897+e30acd3a.hlo_module.pb --output /tmp/ubuntu/neuroncc_compile_workdir/bce750bc-8337-4a11-b065-6466159dedd6/model.MODULE_10682217524436566897+e30acd3a.neff --target=trn1 --verbose=35




.Completed run_backend_driver.

Compiler status PASS
2025-07-10 10:39:53.000135:  8518  INFO ||NEURON_CC_WRAPPER||: Call compiler with cmd: neuronx-cc compile --framework=XLA /tmp/ubuntu/neuroncc_compile_workdir/a19ce7f0-e366-49de-8636-6a27435ce448/model.MODULE_15164771439026754743+e30acd3a.hlo_module.pb --output /tmp/ubuntu/neuroncc_compile_workdir/a19ce7f0-e366-49de-8636-6a27435ce448/model.MODULE_15164771439026754743+e30acd3a.neff --target=trn1 --verbose=35
.Completed run_backend_driver.

Compiler status PASS
2025-07-10 10:39:56.000508:  8518  INFO ||NEURON_CC_WRAPPER||: Call compiler with cmd: neuronx-cc compile --framework=XLA /tmp/ubuntu/neuroncc_compile_workdir/55fe9259-361e-4784-86bd-a3b0db9d7532/model.MODULE_3625415799110469568+e30acd3a.hlo_module.pb --output /tmp/ubuntu/neuroncc_compile_workdir/55fe9259-361e-4784-86bd-a3b0db9d7532/model.MODULE_3625415799110469568+e30acd3a.neff --target=trn1 --verbose=35
.Completed run_backend_driver.

Compiler status PASS
2025-07-10 10:39:58.0

### Batch size 1

#### Single execution

In [27]:
# Run the compiled model on each pair separately (batch size 1) and persist the results.
if neuron_library_available:
    neuron_matches_bs1_1 = neuron_model(super_glue_input_bs1_1)
    save_data(neuron_matches_bs1_1, 'pkl_files/matches/neuron_bs1_1.pkl')
    neuron_matches_bs1_2 = neuron_model(super_glue_input_bs1_2)
    save_data(neuron_matches_bs1_2, 'pkl_files/matches/neuron_bs1_2.pkl')

Data saved in pkl_files/matches/neuron_bs1_1.pkl
Data saved in pkl_files/matches/neuron_bs1_2.pkl


In [28]:
# The Neuron results only exist when torch_neuronx is available; without this
# guard the cell fails with a NameError on other machines.
if neuron_library_available:
    plot_matches(
        img_0=img1,
        img_1=img2,
        kpts_0=super_glue_input_bs1_1['keypoints0'][0],
        kpts_1=super_glue_input_bs1_1['keypoints1'][0],
        matches_0=neuron_matches_bs1_1['matches0'][0],
        scores_0=super_glue_input_bs1_1['scores0'][0],
        num_keypoints=NUM_KEYPOINTS,
        hardware_type="neuron_bs1_1",
    )

    plot_matches(
        img_0=img3,
        img_1=img4,
        kpts_0=super_glue_input_bs1_2['keypoints0'][0],
        kpts_1=super_glue_input_bs1_2['keypoints1'][0],
        matches_0=neuron_matches_bs1_2['matches0'][0],
        scores_0=super_glue_input_bs1_2['scores0'][0],
        num_keypoints=NUM_KEYPOINTS,
        hardware_type="neuron_bs1_2",
    )

Writing to Path assets/matches/IMG_9320_IMG_9321_matches_neuron_bs1_1_2000_1080_1920.png
Not Displaying
Writing to Path assets/matches/IMG_9323_IMG_9324_matches_neuron_bs1_2_2000_1080_1920.png
Not Displaying


#### Multiple executions - benchmark

In [29]:
# Neuron, batch size 1: 50 warm-up calls followed by 100 timed calls on pair 1.
if neuron_library_available:
    average_time = run_benchmark(neuron_model, super_glue_input_bs1_1, iterations=100, warm_up=50)

Warming up...
Running benchmark for 100 iterations...

Average time: 0.061135 seconds
Total time for 100 iterations: 6.113550 seconds


### Batch size 2

In [30]:
# NOTE(review): this rebinds `neuron_model` to the DataParallel wrapper, so
# re-running the batch-size-1 cells above afterwards would use the wrapper
# instead of the plain traced model.
if neuron_library_available:
    print("Using DataParallel")
    neuron_model = torch_neuronx.DataParallel(neuron_model, set_dynamic_batching=True)

Using DataParallel


#### Single execution

In [31]:
# Run both pairs in a single call (batch size 2).
if neuron_library_available:
    neuron_matches_bs2 = neuron_model(super_glue_input_bs2)

In [32]:
# neuron_matches_bs2 only exists when torch_neuronx is available.
if neuron_library_available:
    save_data(neuron_matches_bs2, 'pkl_files/matches/neuron_bs2.pkl')

Data saved in pkl_files/matches/neuron_bs2.pkl


In [33]:
# Batch entry 0 is pair 1 (img1 / img2); guarded because the Neuron results
# only exist when torch_neuronx is available.
if neuron_library_available:
    plot_matches(
        img_0=img1,
        img_1=img2,
        kpts_0=super_glue_input_bs2['keypoints0'][0],
        kpts_1=super_glue_input_bs2['keypoints1'][0],
        matches_0=neuron_matches_bs2['matches0'][0],
        scores_0=super_glue_input_bs2['scores0'][0],
        num_keypoints=NUM_KEYPOINTS,
        hardware_type="neuron_bs2_1",
    )

Writing to Path assets/matches/IMG_9320_IMG_9321_matches_neuron_bs2_1_2000_1080_1920.png
Not Displaying


In [34]:
# Batch entry 1 is pair 2 (img3 / img4); guarded because the Neuron results
# only exist when torch_neuronx is available.
if neuron_library_available:
    plot_matches(
        img_0=img3,
        img_1=img4,
        kpts_0=super_glue_input_bs2['keypoints0'][1],
        kpts_1=super_glue_input_bs2['keypoints1'][1],
        matches_0=neuron_matches_bs2['matches0'][1],
        scores_0=super_glue_input_bs2['scores0'][1],
        num_keypoints=NUM_KEYPOINTS,
        hardware_type="neuron_bs2_2",
    )

Writing to Path assets/matches/IMG_9323_IMG_9324_matches_neuron_bs2_2_2000_1080_1920.png
Not Displaying


#### Multiple executions - benchmark

In [35]:
# Neuron, batch size 2: 50 warm-up calls followed by 100 timed calls; each call processes both pairs.
if neuron_library_available:
    average_time = run_benchmark(neuron_model, super_glue_input_bs2, iterations=100, warm_up=50)

Warming up...
Running benchmark for 100 iterations...

Average time: 0.074402 seconds
Total time for 100 iterations: 7.440212 seconds


# Sanity check - outputs comparison (CPU vs Neuron)

In [36]:
def compare_tensor_dicts(dict1, dict2, atol=1e-8):
    """
    Compare two dictionaries containing tensors with tolerance for small differences.

    Args:
        dict1, dict2: Dictionaries to compare
        atol: Absolute tolerance (``torch.allclose`` also applies its default
            relative tolerance)

    Returns:
        dict: ``keys_match`` plus, when the keys match, ``all_close`` and a
        per-key ``details`` entry. For tensors that are not close the entry
        holds ``max_diff``, ``mean_diff``, ``max_diff_index`` (index into the
        flattened tensor), ``diff_count`` (elements differing by more than
        ``atol``), ``total_count`` (number of elements) and ``diff_values``
        (the first 10 differing elements).
    """
    if dict1.keys() != dict2.keys():
        return {"keys_match": False, "missing_keys": set(dict1.keys()) ^ set(dict2.keys())}

    max_reported_diffs = 10  # keeps the report readable
    results = {"keys_match": True, "all_close": True, "details": {}}

    for key in dict1:
        value1, value2 = dict1[key], dict2[key]

        if not isinstance(value1, torch.Tensor) or not isinstance(value2, torch.Tensor):
            results["details"][key] = {
                "close": value1 == value2,
                "error": "Not tensors"
            }
            if value1 != value2:
                results["all_close"] = False
            continue

        # Check shapes first
        if value1.shape != value2.shape:
            results["details"][key] = {
                "close": False,
                "error": f"Shape mismatch: {value1.shape} vs {value2.shape}"
            }
            results["all_close"] = False
            continue

        # Check values with tolerance
        if torch.allclose(value1, value2, atol=atol):
            results["details"][key] = {"close": True}
            continue

        # Calculate differences for detailed reporting
        abs_diff = (value1 - value2).abs()
        diff_indices = torch.nonzero(abs_diff > atol, as_tuple=False)

        # Only materialize the entries that are actually reported.
        diff_values = []
        for idx in diff_indices[:max_reported_diffs]:
            # Convert index to tuple for multi-dimensional tensors
            idx_tuple = tuple(idx.tolist())
            val1 = value1[idx_tuple].item()
            val2 = value2[idx_tuple].item()
            diff_values.append({
                "index": idx_tuple,
                "value1": val1,
                "value2": val2,
                "diff": abs(val1 - val2)
            })

        results["details"][key] = {
            "close": False,
            "max_diff": abs_diff.max().item(),
            "mean_diff": abs_diff.mean(dtype=torch.float).item(),  # float so integer tensors work
            "max_diff_index": abs_diff.argmax().item() if abs_diff.numel() > 0 else None,
            "diff_count": len(diff_indices),
            # numel() works for any rank and counts every batch entry, matching diff_count.
            "total_count": abs_diff.numel(),
            "diff_values": diff_values
        }
        results["all_close"] = False

    return results

In [37]:
# Pair 1: CPU result vs. Neuron batch-size-1 result, absolute tolerance 1e-4.
if neuron_library_available:
    comparison_of_original_model_vs_neuron = compare_tensor_dicts(cpu_matches_bs1_1, neuron_matches_bs1_1, 1e-4)
    pprint.pprint(comparison_of_original_model_vs_neuron)

{'all_close': False,
 'details': {'matches0': {'close': True},
             'matches1': {'close': True},
             'matching_scores0': {'close': False,
                                  'diff_count': 1,
                                  'diff_values': [{'diff': 0.0004196465015411377,
                                                   'index': (0, 355),
                                                   'value1': 0.462631493806839,
                                                   'value2': 0.46221184730529785}],
                                  'max_diff': 0.0004196465015411377,
                                  'max_diff_index': 355,
                                  'mean_diff': 2.8205802209413378e-06,
                                  'total_count': 2000},
             'matching_scores1': {'close': False,
                                  'diff_count': 1,
                                  'diff_values': [{'diff': 0.0004196465015411377,
                                          

In [38]:
# Pair 2: CPU result vs. Neuron batch-size-1 result, absolute tolerance 1e-4.
if neuron_library_available:
    comparison_of_original_model_vs_neuron = compare_tensor_dicts(cpu_matches_bs1_2, neuron_matches_bs1_2, 1e-4)
    pprint.pprint(comparison_of_original_model_vs_neuron)

{'all_close': True,
 'details': {'matches0': {'close': True},
             'matches1': {'close': True},
             'matching_scores0': {'close': True},
             'matching_scores1': {'close': True}},
 'keys_match': True}


In [39]:
# Concatenate the two batch-size-1 Neuron results along the batch dimension and
# compare them with the batch-size-2 result. Everything is inside the guard
# because the Neuron results only exist when torch_neuronx is available.
if neuron_library_available:
    contat_tensors_bs1 = {
        key: torch.cat([neuron_matches_bs1_1[key], neuron_matches_bs1_2[key]], dim=0)
        for key in ("matches0", "matches1", "matching_scores0", "matching_scores1")
    }

    comparison_of_original_model_vs_neuron = compare_tensor_dicts(contat_tensors_bs1, neuron_matches_bs2, 1e-4)
    pprint.pprint(comparison_of_original_model_vs_neuron)

{'all_close': True,
 'details': {'matches0': {'close': True},
             'matches1': {'close': True},
             'matching_scores0': {'close': True},
             'matching_scores1': {'close': True}},
 'keys_match': True}


In [None]:
# Profile the neuron model
!neuron-profile capture --enable-dge-notifs -n custom_neuron_workdir/graph.neff -s profile.ntff