### **Fine-tuning YOLO-NAS-POSE on the LARD Dataset**

First, let's build the COCO-style annotation file (JSON) from the YOLO-format label files.

In [None]:
import os
import json
from pathlib import Path

# Convert per-image YOLO-pose label files (labels/<split>/*.txt) into a single
# COCO-style JSON file with "images" and "annotations" lists.
image_base = Path("/home/aws_install/data/yolonas_pose_base/images")
label_base = Path("/home/aws_install/data/yolonas_pose_base/labels")
output_json_path = "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_annotations.json"

# Single "runway" category with four keypoints linked as a closed quadrilateral.
categories = [{
    "id": 0,
    "name": "runway",
    "keypoints": ["A", "B", "C", "D"],
    "skeleton": [
        [0, 1],
        [1, 2],
        [2, 3],
        [3, 0]
    ]
}]

annotation_id = 1
image_id = 1
json_data = {
    "info": {},
    "categories": categories,
    "images": [],
    "annotations": []
}

# Each keypoint contributes (x, y, visibility); anything else cannot be
# reshaped to (num_joints, 3) by a downstream consumer.
expected_keypoint_values = 3 * len(categories[0]["keypoints"])


def _parse_label_line(line):
    """
    Parse one label line of the form ``class xc yc w h [kx ky kv] ...``.

    :param line: Stripped text line from a label file.
    :return: ``(class_id, [x1, y1, width, height], keypoints)`` where ``keypoints`` is a
        flat ``[x, y, v, ...]`` list, or ``None`` when the line has fewer than five fields.
    """
    parts = line.split()
    if len(parts) < 5:
        return None

    class_id = int(parts[0])
    x_center, y_center, width, height = map(float, parts[1:5])

    # Center-based box -> top-left-based box.
    # NOTE(review): values are written exactly as found in the label file. YOLO-style
    # labels are usually normalized to [0, 1]; if that is the case here, they probably
    # need scaling by the image width/height before being used as pixel coordinates
    # — TODO confirm against the label files.
    x1 = x_center - width / 2
    y1 = y_center - height / 2

    keypoints = []
    for i in range(5, len(parts), 3):
        try:
            # int(float(...)) also accepts a visibility flag written as "2.0".
            x, y, v = float(parts[i]), float(parts[i + 1]), int(float(parts[i + 2]))
        except (ValueError, IndexError):
            # Incomplete or unparsable triplet: fall back to "keypoint absent".
            x, y, v = 0, 0, 0
        keypoints.extend([x, y, v])

    return class_id, [x1, y1, width, height], keypoints


for split in ["train", "val", "test"]:
    split_image_folder = image_base / split
    split_label_folder = label_base / split

    # Sorted so that image/annotation IDs are reproducible from run to run.
    for label_path in sorted(split_label_folder.glob("*.txt")):
        # Find the image that shares the label file's stem.
        image_path = None
        for ext in [".jpeg", ".png"]:
            candidate_path = split_image_folder / (label_path.stem + ext)
            if candidate_path.exists():
                image_path = candidate_path
                break

        if image_path is None:
            print(f"⚠️ Image not found for label {label_path.name}")
            continue

        # One object per non-empty line; an image may contain several objects.
        with open(label_path, "r") as f:
            label_lines = [raw.strip() for raw in f if raw.strip()]

        image_annotations = []
        for line in label_lines:
            parsed = _parse_label_line(line)
            if parsed is None:
                print(f"⚠️ Ligne d’annotation trop courte dans {label_path.name}")
                continue

            class_id, bbox, keypoints = parsed
            if len(keypoints) != expected_keypoint_values:
                print(f"⚠️ Unexpected number of keypoint values ({len(keypoints)}) in {label_path.name}")
                continue

            image_annotations.append((class_id, bbox, keypoints))

        if not image_annotations:
            if not label_lines:
                # Empty label file: same warning as for a too-short line.
                print(f"⚠️ Ligne d’annotation trop courte dans {label_path.name}")
            continue

        json_data["images"].append({
            # Stored relative to image_base, e.g. "<split>/<name>.<ext>".
            "file_name": str(image_path.relative_to(image_base)),
            "id": image_id
        })

        for class_id, bbox, keypoints in image_annotations:
            json_data["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": class_id,
                "bbox": bbox,
                "keypoints": keypoints
            })
            annotation_id += 1

        image_id += 1

# saving the JSON data
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
with open(output_json_path, "w") as f:
    json.dump(json_data, f, indent=2)

print(f"✅ JSON sauvegardé dans {output_json_path}")


⚠️ Image not found for label DIAP_21_500_269.txt


✅ JSON sauvegardé dans /home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_annotations.json


Next, we define the dataset class that reads this annotation file.

In [37]:
import os
import json
import cv2
import numpy as np

from typing import List, Tuple, Union

from sklearn.model_selection import train_test_split

from super_gradients.common.decorators.factory_decorator import resolve_param
from super_gradients.common.factories.transforms_factory import TransformsFactory
from super_gradients.training.transforms.keypoint_transforms import AbstractKeypointTransform
from super_gradients.training.samples import PoseEstimationSample
from super_gradients.training.datasets.pose_estimation_datasets.abstract_pose_estimation_dataset import AbstractPoseEstimationDataset

from super_gradients.training.utils.distributed_training_utils import wait_for_the_master
from super_gradients.common.environment.ddp_utils import get_local_rank
from super_gradients.training.datasets.pose_estimation_datasets import YoloNASPoseCollateFN


class RunwayPoseEstimationDataset(AbstractPoseEstimationDataset):
    """
    Pose-estimation dataset backed by a COCO-style JSON file with "images",
    "annotations" and "categories" entries. The number of joints is taken from
    the "keypoints" list of the first category.
    """

    @classmethod
    def split_runway_pose_dataset(cls, annotation_file, train_annotation_file, val_annotation_file, val_fraction):
        """
        Splits the runway pose dataset into training and validation sets.

        The split is made per image, stratified on the category of each image's first
        annotation (-1 for images without annotations), with a fixed random seed.

        :param annotation_file: Path to the original annotation file.
        :param train_annotation_file: Path to save the training annotations.
        :param val_annotation_file: Path to save the validation annotations.
        :param val_fraction: Fraction of the dataset to be used for validation.
        """
        with open(annotation_file, "r") as f:
            annotation = json.load(f)

        image_ids = [img["id"] for img in annotation["images"]]

        # Category of the first annotation of each image, found in one pass
        # instead of rescanning every annotation for every image.
        first_category = {}
        for ann in annotation["annotations"]:
            first_category.setdefault(ann["image_id"], ann["category_id"])
        labels = [first_category.get(img_id, -1) for img_id in image_ids]

        train_ids, val_ids = train_test_split(image_ids, test_size=val_fraction, random_state=42, stratify=labels)

        def _subset(selected_ids):
            # Keep only the images/annotations whose image id is in selected_ids.
            selected = set(selected_ids)  # constant-time membership tests
            return {
                "info": annotation.get("info", {}),
                "categories": annotation["categories"],
                "images": [img for img in annotation["images"] if img["id"] in selected],
                "annotations": [ann for ann in annotation["annotations"] if ann["image_id"] in selected],
            }

        with open(train_annotation_file, "w") as f:
            json.dump(_subset(train_ids), f, indent=2)
        with open(val_annotation_file, "w") as f:
            json.dump(_subset(val_ids), f, indent=2)

    @resolve_param("transforms", TransformsFactory())
    def __init__(
        self,
        data_dir: str,
        images_dir: str,
        json_file: str,
        transforms: List[AbstractKeypointTransform],
        edge_links: Union[List[Tuple[int, int]], np.ndarray],
        edge_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
        keypoint_colors: Union[List[Tuple[int, int, int]], np.ndarray, None],
    ):
        """
        :param data_dir: Root directory of the dataset.
        :param images_dir: Path of the images directory, joined onto data_dir with os.path.join
            (so an absolute path replaces data_dir).
        :param json_file: Path of the annotation JSON file, joined onto data_dir the same way.
        :param transforms: Transforms to be applied to the image & keypoints.
        :param edge_links: Pairs of joint indices, forwarded to the base class.
        :param edge_colors: One color per edge link (or None), forwarded to the base class.
        :param keypoint_colors: One color per joint (or None), forwarded to the base class.
        """
        json_path = os.path.join(data_dir, json_file)
        with open(json_path, "r") as f:
            annotation = json.load(f)

        joints = annotation["categories"][0]["keypoints"]
        num_joints = len(joints)

        super().__init__(
            transforms=transforms,
            num_joints=num_joints,
            edge_links=edge_links,
            edge_colors=edge_colors,
            keypoint_colors=keypoint_colors,
        )

        self.image_id_to_file = {img["id"]: os.path.join(data_dir, images_dir, img["file_name"]) for img in annotation["images"]}
        self.image_ids = list(self.image_id_to_file.keys())
        self.image_files = list(self.image_id_to_file.values())

        # Group annotations by image once (file order preserved) rather than
        # filtering the full annotation list for every image.
        annotations_by_image = {}
        for ann in annotation["annotations"]:
            annotations_by_image.setdefault(ann["image_id"], []).append(ann)

        # NOTE(review): joints and boxes are used exactly as stored in the JSON file; the
        # transforms presumably expect pixel coordinates — TODO confirm the JSON is not in
        # normalized units.
        self.annotations = []
        for image_id in self.image_ids:
            keypoints_list = []
            bboxes_list = []
            for ann in annotations_by_image.get(image_id, []):
                # Flat [x, y, v, ...] list -> (num_joints, 3)
                keypoints_list.append(np.array(ann["keypoints"]).reshape(num_joints, 3))
                x, y, w, h = ann["bbox"]
                bboxes_list.append(np.array([x, y, w, h]))

            if keypoints_list:
                sample_joints = np.array(keypoints_list, dtype=np.float32)
                sample_bboxes = np.array(bboxes_list, dtype=np.float32)
            else:
                # Image without annotations: empty arrays, same dtype as the non-empty case.
                sample_joints = np.zeros((0, num_joints, 3), dtype=np.float32)
                sample_bboxes = np.zeros((0, 4), dtype=np.float32)
            self.annotations.append((sample_joints, sample_bboxes))

    def __len__(self):
        """
        Returns the number of samples in the dataset.
        :return: Number of samples in the dataset.
        """
        return len(self.image_ids)

    def load_sample(self, index) -> PoseEstimationSample:
        """
        Loads a sample from the dataset.
        :param index: Index of the sample to load.
        :return: PoseEstimationSample object containing the image, mask, joints, areas, bounding boxes, and is_crowd.
        :raises FileNotFoundError: If the image file cannot be read.
        """
        image_file = self.image_files[index]
        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread signals a missing or undecodable file by returning None.
            raise FileNotFoundError(f"Could not read image: {image_file}")

        joints, bboxes = self.annotations[index]
        # Box area = width * height (columns 2 and 3 of the xywh boxes).
        areas = (bboxes[:, 2] * bboxes[:, 3]).astype(np.float32)
        iscrowd = np.zeros(len(joints), dtype=bool)
        # All-ones mask over the full image.
        mask = np.ones(image.shape[:2], dtype=np.float32)
        return PoseEstimationSample(image=image, mask=mask, joints=joints, areas=areas, bboxes_xywh=bboxes, is_crowd=iscrowd, additional_samples=None)


Let's split the annotations into training and validation sets.

In [38]:
# Write an 80/20 train/validation split of the full annotation file next to it.
split_paths = {
    "annotation_file": "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_annotations.json",
    "train_annotation_file": "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_train.json",
    "val_annotation_file": "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_val.json",
}
RunwayPoseEstimationDataset.split_runway_pose_dataset(val_fraction=0.2, **split_paths)

In [None]:
import json
import os

# Sanity check: report image / annotation / category counts and the length of the
# flat keypoint list of each annotation, for the full file and both splits.
annotation_files = {
    "Complet": "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_annotations.json",
    "Train": "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_train.json",
    "Val": "/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_val.json"
}

for split_name, json_path in annotation_files.items():
    if os.path.exists(json_path):
        print(f"\n📂 Vérification du fichier : {split_name}")
        with open(json_path, "r") as json_file:
            content = json.load(json_file)

        for caption, key in (
            (" - Nombre d'images :", "images"),
            (" - Nombre d'annotations :", "annotations"),
            (" - Nombre de catégories :", "categories"),
        ):
            print(caption, len(content[key]))

        # Number of values (x, y, v triplets flattened) stored per annotation.
        lengths = [len(entry["keypoints"]) for entry in content["annotations"]]
        if not lengths:
            print(" - Pas de keypoints trouvés.")
        else:
            mean_length = sum(lengths) / len(lengths)
            print(" - Keypoints par annotation :")
            print(f"   > Min : {min(lengths)}")
            print(f"   > Max : {max(lengths)}")
            print(f"   > Moyenne : {mean_length:.2f}")
    else:
        print(f"⚠️ Fichier manquant : {json_path}")



📂 Vérification du fichier : Complet
 - Nombre d'images : 16747
 - Nombre d'annotations : 16747
 - Nombre de catégories : 1
 - Keypoints par annotation :
   > Min : 12
   > Max : 12
   > Moyenne : 12.00

📂 Vérification du fichier : Train


 - Nombre d'images : 13397
 - Nombre d'annotations : 13397
 - Nombre de catégories : 1
 - Keypoints par annotation :
   > Min : 12
   > Max : 12
   > Moyenne : 12.00

📂 Vérification du fichier : Val
 - Nombre d'images : 3350
 - Nombre d'annotations : 3350
 - Nombre de catégories : 1
 - Keypoints par annotation :
   > Min : 12
   > Max : 12
   > Moyenne : 12.00


In [40]:
# Names of the four keypoints, in the order used throughout this notebook.
KEYPOINT_NAMES = ["corner_top_left", "corner_top_right", "corner_bottom_right", "corner_bottom_left"]
NUM_JOINTS = len(KEYPOINT_NAMES)

# Keypoint index permutation for a horizontal flip: the two "top" corners swap
# with each other, and so do the two "bottom" corners.
FLIP_INDEXES = [1, 0, 3, 2]

# Each keypoint is linked to the next one, the last one back to the first (closed outline).
EDGE_LINKS = [[start, (start + 1) % NUM_JOINTS] for start in range(NUM_JOINTS)]

# One color per edge / keypoint and one OKS sigma per keypoint.
EDGE_COLORS = len(EDGE_LINKS) * [[0, 255, 0]]
KEYPOINT_COLORS = NUM_JOINTS * [[255, 0, 0]]
OKS_SIGMAS = NUM_JOINTS * [0.07]

In [41]:
from super_gradients.training.transforms.keypoints import (
    KeypointsRandomHorizontalFlip,
    KeypointsHSV,
    KeypointsBrightnessContrast,
    KeypointsRandomAffineTransform,
    KeypointsLongestMaxSize,
    KeypointsPadIfNeeded,
    KeypointsImageStandardize,
    KeypointsRemoveSmallObjects,
)

IMAGE_SIZE = 640


def _resize_pad_standardize():
    """Return fresh instances of the resize -> pad -> standardize steps shared by both pipelines."""
    return [
        # Resize so that the image fits within IMAGE_SIZE x IMAGE_SIZE
        KeypointsLongestMaxSize(max_height=IMAGE_SIZE, max_width=IMAGE_SIZE),
        # Pad up to IMAGE_SIZE x IMAGE_SIZE with gray pixels, using the "bottom_right" mode
        KeypointsPadIfNeeded(
            min_height=IMAGE_SIZE,
            min_width=IMAGE_SIZE,
            image_pad_value=[127, 127, 127],
            mask_pad_value=1,
            padding_mode="bottom_right",
        ),
        KeypointsImageStandardize(max_value=255),
    ]


# Training pipeline: random augmentations, then the shared steps, then instance filtering.
train_transforms = [
    # Horizontal flip (keypoints re-indexed through FLIP_INDEXES), applied half of the time
    KeypointsRandomHorizontalFlip(flip_index=FLIP_INDEXES, prob=0.5),
    # Color jitter: HSV gains, then brightness / contrast, each applied half of the time
    KeypointsHSV(prob=0.5, hgain=20, sgain=20, vgain=20),
    KeypointsBrightnessContrast(prob=0.5, brightness_range=[0.8, 1.2], contrast_range=[0.8, 1.2]),
    # Random rotation / scale / translation, applied with probability 0.75
    KeypointsRandomAffineTransform(
        max_rotation=15,
        min_scale=0.8,
        max_scale=1.2,
        max_translate=0.1,
        image_pad_value=127,
        mask_pad_value=1,
        prob=0.75,
        interpolation_mode=[0, 1, 2, 3, 4],
    ),
    *_resize_pad_standardize(),
    KeypointsRemoveSmallObjects(min_instance_area=1, min_visible_keypoints=1),
]

# Validation pipeline: only the shared deterministic steps.
val_transforms = _resize_pad_standardize()


In [42]:
# Arguments common to the training and validation datasets; only the
# annotation file and the transform pipeline differ between the two.
shared_dataset_kwargs = {
    "data_dir": "/home/aws_install/data/yolonas_pose_base",  # Root directory of the dataset
    "images_dir": "/home/aws_install/data/yolonas_pose_base/images",
    "edge_links": EDGE_LINKS,
    "edge_colors": EDGE_COLORS,
    "keypoint_colors": KEYPOINT_COLORS,
}

train_dataset = RunwayPoseEstimationDataset(
    json_file="/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_train.json",
    transforms=train_transforms,
    **shared_dataset_kwargs,
)

val_dataset = RunwayPoseEstimationDataset(
    json_file="/home/aws_install/data/yolonas_pose_base/annotations/yolonas_pose_val.json",
    transforms=val_transforms,
    **shared_dataset_kwargs,
)


In [43]:
from torch.utils.data import DataLoader

# Create dataloaders
# Training: reshuffle every epoch and drop the last incomplete batch.
train_dataloader_params = {"shuffle": True, "batch_size": 24, "drop_last": True, "pin_memory": False, "collate_fn": YoloNASPoseCollateFN()}

# Validation: fixed order and no dropped batch, so every validation sample is
# scored exactly once and the metric is comparable from one epoch to the next.
val_dataloader_params = {"shuffle": False, "batch_size": 24, "drop_last": False, "pin_memory": False, "collate_fn": YoloNASPoseCollateFN()}

train_dataloader = DataLoader(train_dataset, **train_dataloader_params)

val_dataloader = DataLoader(val_dataset, **val_dataloader_params)

In [44]:
from super_gradients.training.models.pose_estimation_models.yolo_nas_pose import YoloNASPosePostPredictionCallback
from super_gradients.training.utils.callbacks import ExtremeBatchPoseEstimationVisualizationCallback, Phase
from super_gradients.training.utils.early_stopping import EarlyStop
from super_gradients.training.metrics import PoseEstimationMetrics

# Post-processing applied to raw model outputs before metrics and visualization.
post_prediction_callback = YoloNASPosePostPredictionCallback(
    pose_confidence_threshold=0.01, nms_iou_threshold=0.7, pre_nms_max_predictions=300, post_nms_max_predictions=30
)

# Validation metric, computed on post-processed predictions.
metrics = PoseEstimationMetrics(
    num_joints=NUM_JOINTS, oks_sigmas=OKS_SIGMAS, max_objects_per_image=30, post_prediction_callback=post_prediction_callback
)

# Visualizes the batch that is extreme with respect to the monitored loss
# (max=True), every epoch, for both the training and validation loaders.
visualization_callback = ExtremeBatchPoseEstimationVisualizationCallback(
    keypoint_colors=KEYPOINT_COLORS,
    edge_colors=EDGE_COLORS,
    edge_links=EDGE_LINKS,
    loss_to_monitor="YoloNASPoseLoss/loss",
    max=True,
    freq=1,
    max_images=16,
    enable_on_train_loader=True,
    enable_on_valid_loader=True,
    post_prediction_callback=post_prediction_callback,
)

# NOTE(review): patience (100) is larger than max_epochs (10) below, so this
# callback presumably never triggers with the current settings — confirm intent.
early_stop = EarlyStop(phase=Phase.VALIDATION_EPOCH_END, monitor="AP", mode="max", min_delta=0.0001, patience=100, verbose=True)

# Loss configuration, passed as "criterion_params" below.
pose_loss_params = {
    "oks_sigmas": OKS_SIGMAS,
    # box classification / regression terms
    "classification_loss_weight": 1.0,
    "classification_loss_type": "focal",
    "regression_iou_loss_type": "ciou",
    "iou_loss_weight": 2.5,
    "dfl_loss_weight": 0.01,
    # pose terms
    "pose_cls_loss_weight": 1.0,
    "pose_reg_loss_weight": 34.0,
    "pose_classification_loss_type": "focal",
    "rescale_pose_loss_with_assigned_score": True,
    "assigner_multiply_by_pose_oks": True,
}

train_params = {
    # learning-rate schedule: warm-up followed by cosine decay
    "warmup_mode": "LinearBatchLRWarmup",
    "warmup_initial_lr": 1e-8,
    "lr_warmup_epochs": 2,
    "initial_lr": 5e-4,
    "lr_mode": "cosine",
    "cosine_final_lr_ratio": 0.05,
    "max_epochs": 10,
    "zero_weight_decay_on_bias_and_bn": True,
    "batch_accumulate": 1,
    # checkpointing
    "average_best_models": True,
    "save_ckpt_epoch_list": [],
    # loss
    "loss": "yolo_nas_pose_loss",
    "criterion_params": pose_loss_params,
    # optimizer and weight averaging
    "optimizer": "AdamW",
    "optimizer_params": {"weight_decay": 0.000001},
    "ema": True,
    "ema_params": {"decay": 0.997, "decay_type": "threshold"},
    "mixed_precision": True,
    "sync_bn": False,
    # metrics, callbacks and model selection
    "valid_metrics_list": [metrics],
    "phase_callbacks": [visualization_callback, early_stop],
    "pre_prediction_callback": None,
    "metric_to_watch": "AP",
    "greater_metric_to_watch_is_better": True,
}

In [None]:
from super_gradients.training import models
from super_gradients.common.object_names import Models
from super_gradients.training import Trainer

# Checkpoints and logs of the "lard_ft_1" experiment are written under this directory.
CHECKPOINT_DIR = "checkpoints"
trainer = Trainer(experiment_name="lard_ft_1", ckpt_root_dir=CHECKPOINT_DIR)

# Medium YOLO-NAS-POSE model with one class, started from the "coco_pose"
# pretrained weights and moved to the GPU.
yolo_nas_pose = models.get(
    Models.YOLO_NAS_POSE_M,
    num_classes=1,
    pretrained_weights="coco_pose",
)
yolo_nas_pose = yolo_nas_pose.cuda()

# Short demonstration run: the number of epochs comes from train_params["max_epochs"].
trainer.train(
    model=yolo_nas_pose,
    training_params=train_params,
    train_loader=train_dataloader,
    valid_loader=val_dataloader,
)

 It is your responsibility to determine whether you have permission to use the models for your use case.
 The model you have requested was pre-trained on the coco_pose dataset, published under the following terms: https://cocodataset.org/#termsofuse
[2025-06-24 23:43:49] INFO - checkpoint_utils.py - License Notification: YOLO-NAS-POSE pre-trained weights are subjected to the specific license terms and conditions detailed in 
https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS-POSE.md
By downloading the pre-trained weight files you agree to comply with these terms.
[2025-06-24 23:43:49] INFO - checkpoint_utils.py - Successfully loaded pretrained weights for architecture yolo_nas_pose_m
[2025-06-24 23:43:49] INFO - sg_trainer.py - Starting a new run with `run_id=RUN_20250624_234349_753258`
[2025-06-24 23:43:49] INFO - sg_trainer.py - Checkpoints directory: checkpoints/lard_ft_1/RUN_20250624_234349_753258
[2025-06-24 23:43:49] INFO - sg_trainer.py - Using EMA with param

The console stream is now moved to checkpoints/lard_ft_1/RUN_20250624_234349_753258/console_Jun24_23_43_49.txt


  self.scaler = GradScaler(enabled=mixed_precision_enabled)
[2025-06-24 23:44:00] INFO - sg_trainer_utils.py - TRAINING PARAMETERS:
    - Mode:                         Single GPU
    - Number of GPUs:               1          (1 available on the machine)
    - Full dataset size:            13397      (len(train_set))
    - Batch size per GPU:           24         (batch_size)
    - Batch Accumulate:             1          (batch_accumulate)
    - Total batch size:             24         (num_gpus * batch_size)
    - Effective Batch size:         24         (num_gpus * batch_size * batch_accumulate)
    - Iterations per epoch:         558        (len(train_loader))
    - Gradient updates per epoch:   558        (len(train_loader) / batch_accumulate)
    - Model: YoloNASPose_M  (58.17M parameters, 58.17M optimized)
    - Learning Rates and Weight Decays:
      - default: (58.17M parameters). LR: 0.0005 (58.17M parameters) WD: 0.0, (78.17K parameters), WD: 1e-06, (58.09M parameters)

[202

Train epoch 0:  21%|██        | 117/558 [21:39<1:21:37, 11.11s/it, YoloNASPoseLoss/loss=1.7e+3, YoloNASPoseLoss/loss_cls=1.7e+3, YoloNASPoseLoss/loss_dfl=0, YoloNASPoseLoss/loss_iou=0, YoloNASPoseLoss/loss_pose_cls=0, YoloNASPoseLoss/loss_pose_reg=0, gpu_mem=15.2]
[2025-06-25 00:05:39] INFO - sg_trainer.py - 
[MODEL TRAINING EXECUTION HAS BEEN INTERRUPTED]... Please wait until SOFT-TERMINATION process finishes and saves all of the Model Checkpoints and log files before terminating...
[2025-06-25 00:05:39] INFO - sg_trainer.py - For HARD Termination - Stop the process again
[2025-06-25 00:05:39] INFO - base_sg_logger.py - [CLEANUP] - Successfully stopped system monitoring process


Evaluate the trained model

In [None]:
# Fresh post-processing and metric objects for evaluation (same settings as in training).
post_prediction_callback = YoloNASPosePostPredictionCallback(
    pose_confidence_threshold=0.01,
    nms_iou_threshold=0.7,
    pre_nms_max_predictions=300,
    post_nms_max_predictions=30,
)

metrics = PoseEstimationMetrics(
    num_joints=NUM_JOINTS,
    oks_sigmas=OKS_SIGMAS,
    max_objects_per_image=30,
    post_prediction_callback=post_prediction_callback,
)

# Reload the best checkpoint written by the training run above. This needs the run
# to have completed at least one validation epoch so that "ckpt_best.pth" exists.
best_model = models.get(
    Models.YOLO_NAS_POSE_M,
    num_classes=1,
    checkpoint_path=os.path.join(trainer.checkpoints_dir_path, "ckpt_best.pth"),
)

# This notebook builds no separate held-out test split, so evaluation runs over the
# validation dataset: fixed order, no dropped batch, every sample scored once.
test_dataloader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=24,
    drop_last=False,
    pin_memory=False,
    collate_fn=YoloNASPoseCollateFN(),
)

trainer.test(model=best_model, test_loader=test_dataloader, test_metrics_list=[metrics])

Test: run inference on a single image

In [None]:
# Run the model on one image and display the predicted poses.
# NOTE(review): this path looks like a leftover sample image rather than a runway
# image from the dataset used above — confirm it exists before running.
img_url = "content/images/2007_000783.jpg"
prediction = best_model.predict(img_url, conf=0.20, fuse_model=False)
prediction.show()