<a href="https://colab.research.google.com/github/amitdoda1983/iith_GPU_inference/blob/main/1_pytorch_inference_cuda_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Simulate by creating a random image in GPU directly using torch
2. preprocess, inference, post process in GPU.


In [1]:
!pip install torch torchvision
!pip install pycuda




In [2]:
import torch
import pycuda.driver as cuda

class_names = ["person", "bicycle", "car", "motorbike", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
    "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase",
    "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
    "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "TV", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"]

In [3]:
def load_yolov5_model():
    # Load YOLOv5 model (can be replaced with other variants like yolov5m, yolov5l)
    model = torch.hub.load('ultralytics/yolov5:v6.2', 'yolov5s')
    model = model.cuda()  # Move model to GPU
    return model

# Simulate DPU image buffer (this would be GPU memory in real scenario)
def simulate_dpu_image_buffer():
    return torch.rand((1, 3, 1280, 720), dtype=torch.float32, device='cuda')


# Preprocess image on GPU
def preprocess_image_gpu(image_tensor):
    """
    Preprocessing for YOLOv5 model, including resizing, normalization, etc.
    """
    # Resize image (if necessary) - YOLOv5 expects 640x640 input images.
    image_tensor = torch.nn.functional.interpolate(image_tensor, size=(640, 640))  # Resize to 640x640 if needed

    # Normalize the image (YOLOv5 uses the following normalization)
    mean = torch.tensor([0.485, 0.456, 0.406], device=image_tensor.device)  # Mean
    std = torch.tensor([0.229, 0.224, 0.225], device=image_tensor.device)   # Std
    image_tensor = (image_tensor / 255.0 - mean[None, :, None, None]) / std[None, :, None, None]  # Normalize image
    return image_tensor


# Run inference using PyTorch (YOLOv5)
def infer_on_gpu(model, image_tensor):
    model.eval()
    with torch.no_grad():
        predictions = model(image_tensor)
    return predictions

def postprocess(predictions, conf_threshold=0.5):
    pred = predictions[0]
    boxes = pred[:, :4]  # Bounding box coordinates
    confidences = pred[:, 4]  # Confidence scores
    class_probs = pred[:, 5:]  # Class probabilities

    # Get predicted class ID
    class_ids = torch.argmax(class_probs, dim=1)
    overall_confidences = confidences * class_probs.max(dim=1).values

    keep = overall_confidences > conf_threshold
    boxes = boxes[keep]
    confidences = overall_confidences[keep]
    class_ids = class_ids[keep]

    # Print boxes, confidences, and class ids
    for i in range(len(boxes)):
        print(f"Box {i}: {boxes[i]}, Confidence: {confidences[i].item():.2f}, Class ID: {class_ids[i].item()}")

    return boxes, confidences, class_ids

In [4]:
# Load YOLOv5 model (PyTorch)
model = load_yolov5_model()

# Simulate image buffer coming from the DPU (already in GPU memory)
image_tensor = simulate_dpu_image_buffer()

print(f'input image : {image_tensor.shape}')

# Preprocess image directly on GPU
image_tensor = preprocess_image_gpu(image_tensor)
print(f'processed image : {image_tensor.shape}')

# Run inference directly on GPU
predictions = infer_on_gpu(model, image_tensor)
print(f'yolo output : {predictions.shape}')

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_v6.2
YOLOv5 🚀 2024-12-8 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla T4, 15102MiB)

  ckpt = torch.load(attempt_download(w), map_location='cpu')  # load
Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 
  with amp.autocast(autocast):


input image : torch.Size([1, 3, 1280, 720])
processed image : torch.Size([1, 3, 640, 640])
yolo output : torch.Size([1, 25200, 85])


yolo output : torch.Size([1, 25200, 85])

25200 represents the number of grid cells (each cell predicts multiple bounding boxes), and

85 is the number of values predicted for each bounding box, where:

4 values for the bounding box coordinates (x, y, width, height),

1 value for the objectness score (confidence),

80 values for the class scores (in the case of COCO dataset with 80 classes).

In [5]:
# Post-process the results
boxes, confidences, class_ids = postprocess(predictions, conf_threshold=0.01)

for i, class_id in enumerate(class_ids):
    print(f"Class {class_names[class_id.item()]} with confidence {confidences[i].item():.2f}: {boxes[i]}")


Box 0: tensor([287.93216, 285.03326, 651.11548, 581.60620], device='cuda:0'), Confidence: 0.01, Class ID: 6
Box 1: tensor([287.06616, 307.46484, 653.78400, 629.20648], device='cuda:0'), Confidence: 0.02, Class ID: 6
Box 2: tensor([286.55090, 335.25571, 653.60400, 623.77753], device='cuda:0'), Confidence: 0.02, Class ID: 6
Class train with confidence 0.01: tensor([287.93216, 285.03326, 651.11548, 581.60620], device='cuda:0')
Class train with confidence 0.02: tensor([287.06616, 307.46484, 653.78400, 629.20648], device='cuda:0')
Class train with confidence 0.02: tensor([286.55090, 335.25571, 653.60400, 623.77753], device='cuda:0')
