<a href="https://colab.research.google.com/github/amitdoda1983/iith_GPU_inference/blob/main/2_pytorch_inference_cuda_copy_from_mem_location.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Simulate by allocating GPU memory for image being written by smartnic.
2. Create a random torch tensor image data in GPU.
3. Associate this tensor data to the above GPU memory.
4. uptill now, we have image tensor in gpu and memory pointer say gpu_memory
5. Now create an zeros torch tensor.
6. copy from  gpu_memory to this tensor address.
7. work on this tensor for rest of the processing and inference.

Note: since pytorch cant access the memory pointer directly, we use pycuda to copy the image tensor from memory pointer (passed by smartnic) to a new torch tensor (in GPU only) which holds zeros.we use this tensor for rest of the flow.


In [33]:
!pip install pycuda



In [34]:
import torch
import ctypes
import torch
import pycuda.driver as cuda
import pycuda.autoinit

class_names = ["person", "bicycle", "car", "motorbike", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
    "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
    "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
    "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "TV", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"]

In [35]:
def load_yolov5_model():
    # Load YOLOv5 model (can be replaced with other variants like yolov5m, yolov5l)
    model = torch.hub.load('ultralytics/yolov5:v6.2', 'yolov5s')
    model = model.cuda()  # Move model to GPU
    return model

# Run inference using PyTorch (YOLOv5)
def infer_on_gpu(model, image_tensor):
    model.eval()
    with torch.no_grad():
        predictions = model(image_tensor)
    return predictions


def postprocess(predictions, conf_threshold=0.5):
    pred = predictions[0]
    boxes = pred[:, :4]  # Bounding box coordinates
    confidences = pred[:, 4]  # Confidence scores
    class_probs = pred[:, 5:]  # Class probabilities

    # Get predicted class ID
    class_ids = torch.argmax(class_probs, dim=1)
    overall_confidences = confidences * class_probs.max(dim=1).values

    keep = overall_confidences > conf_threshold
    boxes = boxes[keep]
    confidences = overall_confidences[keep]
    class_ids = class_ids[keep]

    # Print boxes, confidences, and class ids
    for i in range(len(boxes)):
        print(f"Box {i}: {boxes[i]}, Confidence: {confidences[i].item():.2f}, Class ID: {class_ids[i].item()}")

    return boxes, confidences, class_ids

### Simulating image from smartnic

Assuming the smartnic will pass the pointer to memory

In [None]:
# Step 1: Simulate GPU memory passed from SmartNIC (replace actual GPU memory pointer)

height, width, channels = 720, 1280, 3  # HWC format
image_size = height * width * channels * 4  # Assuming float32 (4 bytes per element)
gpu_memory = cuda.mem_alloc(image_size)

# Simulate data from the SmartNIC (e.g., randomly generated data)
dummy_data = torch.rand(height, width, channels, dtype=torch.float32, device="cuda")


# Step 2: Copy data directly from the PyTorch tensor to the GPU memory
# Use device-to-device memory copy (dtod)
cuda.memcpy_dtod(gpu_memory, dummy_data.data_ptr(), image_size)

# Step 3: Access the GPU memory using PyCUDA
#use the memory pointer to directly access the data in GPU memory
gpu_pointer = int(gpu_memory)  # This is the pointer to GPU memory passed from SmartNIC

### copy smartnic image to torch tensor

In [None]:
# Step 4: Create a PyTorch zeros tensor directly on the GPU
# Allocate a tensor on the GPU with the same shape as the image
image_tensor = torch.zeros(height, width, channels, dtype=torch.float32, device="cuda")

# Step 5: Copy data from the GPU memory (from SmartNIC) into the PyTorch tensor
cuda.memcpy_dtod(image_tensor.data_ptr(), gpu_pointer, image_size)  # Copy data from SmartNIC GPU memory to the tensor

### pre-process and inference

In [36]:
# Step 6: Preprocess the image for YOLOv5
# Convert the image tensor to CHW format (from HWC format)
image_tensor = image_tensor.permute(2, 0, 1)  # HWC -> CHW

# Resize the image (YOLOv5 expects a 640x640 image for inference)
image_tensor = torch.nn.functional.interpolate(image_tensor.unsqueeze(0), size=(640, 640), mode='bilinear', align_corners=False)

# Normalize the image (YOLOv5 uses the following normalization)
mean = torch.tensor([0.485, 0.456, 0.406], device=image_tensor.device).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225], device=image_tensor.device).view(1, 3, 1, 1)
image_tensor = (image_tensor / 255.0 - mean) / std  # Normalize the image to YOLOv5 standards

# Step 7: Load YOLOv5 model and perform inference
model = load_yolov5_model()
predictions = infer_on_gpu(model, image_tensor)

# Step 8: Postprocess results
print(predictions)


Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_v6.2
YOLOv5 🚀 2024-12-8 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla T4, 15102MiB)

Fusing layers... 


torch.Size([3, 720, 1280])
torch.Size([1, 3, 640, 640])
torch.Size([1, 3, 640, 640])


  globals().clear()
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


tensor([[[3.51057e+00, 4.11853e+00, 9.09811e+00,  ..., 6.63133e-04, 5.46850e-04, 7.67264e-04],
         [9.88525e+00, 3.39440e+00, 1.96917e+01,  ..., 4.27912e-04, 5.07814e-04, 9.80709e-04],
         [1.97024e+01, 2.78587e+00, 2.14639e+01,  ..., 4.71157e-04, 6.51654e-04, 2.23510e-03],
         ...,
         [5.63638e+02, 6.12640e+02, 1.82928e+02,  ..., 1.46065e-03, 1.72118e-03, 1.45068e-03],
         [5.87000e+02, 6.07563e+02, 1.37786e+02,  ..., 1.54971e-03, 2.18710e-03, 1.78705e-03],
         [6.14991e+02, 6.20093e+02, 1.51012e+02,  ..., 1.91309e-03, 2.60656e-03, 2.49427e-03]]], device='cuda:0')


  with amp.autocast(autocast):


### post process

In [38]:
# Post-process the results
boxes, confidences, class_ids = postprocess(predictions, conf_threshold=0.01)

for i, class_id in enumerate(class_ids):
    print(f"Class {class_names[class_id.item()]} with confidence {confidences[i].item():.2f}: {boxes[i]}")


Box 0: tensor([287.92484, 285.01569, 651.25946, 581.83923], device='cuda:0'), Confidence: 0.01, Class ID: 6
Box 1: tensor([287.02585, 307.47607, 653.79816, 629.22656], device='cuda:0'), Confidence: 0.02, Class ID: 6
Box 2: tensor([286.52527, 335.21878, 653.52380, 623.64349], device='cuda:0'), Confidence: 0.02, Class ID: 6
Class train with confidence 0.01: tensor([287.92484, 285.01569, 651.25946, 581.83923], device='cuda:0')
Class train with confidence 0.02: tensor([287.02585, 307.47607, 653.79816, 629.22656], device='cuda:0')
Class train with confidence 0.02: tensor([286.52527, 335.21878, 653.52380, 623.64349], device='cuda:0')
