In [3]:
import os
import os.path as osp

from mmengine.config import Config, ConfigDict, DictAction
from mmengine.registry import RUNNERS
from mmengine.runner import Runner
from mmengine.utils import digit_version
from mmengine.utils.dl_utils import TORCH_VERSION
from mmengine.registry import MODELS


In [5]:
from typing import Tuple, Union

import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from mmcv.cnn import ConvModule
from mmpretrain.models.backbones.base_backbone import BaseBackbone
from mmengine.runner import CheckpointLoader
from sebpretrain.models.utils import BasicBlock, Bottleneck
from sebpretrain.utils import OptConfigType


In [6]:
cfg = Config.fromfile('test.py')
cfg = Config.fromfile('configs/pretrain01/test.py')
#cfg = Config.fromfile('configs/pretrain01/pretrain01_4xb32_in1k.py')

In [7]:
cfg.work_dir = os.getcwd()
display(cfg, cfg.model, cfg.model.backbone, cfg.work_dir)

Config (path: configs/pretrain01/test.py): {'dataset_type': 'ImageNet', 'data_preprocessor': {'num_classes': 1000, 'mean': [123.675, 116.28, 103.53], 'std': [58.395, 57.12, 57.375], 'to_rgb': True}, 'train_pipeline': [{'type': 'LoadImageFromFile'}, {'type': 'RandomResizedCrop', 'scale': 224}, {'type': 'RandomFlip', 'prob': 0.5, 'direction': 'horizontal'}, {'type': 'PackInputs'}], 'test_pipeline': [{'type': 'LoadImageFromFile'}, {'type': 'ResizeEdge', 'scale': 256, 'edge': 'short'}, {'type': 'CenterCrop', 'crop_size': 224}, {'type': 'PackInputs'}], 'train_dataloader': {'batch_size': 32, 'num_workers': 5, 'dataset': {'type': 'ImageNet', 'data_root': 'data/imagenet/train', 'split': 'train', 'pipeline': [{'type': 'LoadImageFromFile'}, {'type': 'RandomResizedCrop', 'scale': 224}, {'type': 'RandomFlip', 'prob': 0.5, 'direction': 'horizontal'}, {'type': 'PackInputs'}]}, 'sampler': {'type': 'DefaultSampler', 'shuffle': True}}, 'val_dataloader': {'batch_size': 64, 'num_workers': 5, 'dataset': {

{'type': 'ImageClassifier',
 'backbone': {'type': 'SEBNet',
  'in_channels': 3,
  'channels': 64,
  'ppm_channels': 96,
  'num_stem_blocks': 2,
  'num_branch_blocks': 3,
  'align_corners': False},
 'head': {'type': 'DINOHead',
  'in_dim': 1024,
  'out_dim': 1000,
  'use_bn': False,
  'nlayers': 3,
  'hidden_dim': 2048,
  'bottleneck_dim': 256,
  'mlp_bias': True,
  'loss': {'type': 'CrossEntropyLoss', 'loss_weight': 1.0}}}

{'type': 'SEBNet',
 'in_channels': 3,
 'channels': 64,
 'ppm_channels': 96,
 'num_stem_blocks': 2,
 'num_branch_blocks': 3,
 'align_corners': False}

'/home/robert.breslin/alessandro/thesis/exp1'

In [8]:
#print(list(MODELS.module_dict.keys()))
#print(MODELS.module_dict['SEBNet'])

In [9]:
@MODELS.register_module()
class SEBNet(BaseBackbone):
    """SEBNet backbone.

    This backbone is the implementation of `SEBNet: Real-Time Semantic
    Segmentation with Semantic Boundary Detection Conditioning.

    Licensed under the MIT License.

    Args:
        in_channels (int): The number of input channels. Default: 3.
        channels (int): The number of channels in the stem layer. Default: 64.
        ppm_channels (int): The number of channels in the PPM layer.
            Default: 96.
        num_stem_blocks (int): The number of blocks in the stem layer.
            Default: 2.
        num_branch_blocks (int): The number of blocks in the branch layer.
            Default: 3.
        align_corners (bool): The align_corners argument of F.interpolate.
            Default: False.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN').
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='ReLU', inplace=True).
        init_cfg (dict): Config dict for initialization. Default: None.
    """

    def __init__(self,
                 in_channels: int = 3,
                 channels: int = 64,
                 ppm_channels: int = 96,
                 num_stem_blocks: int = 2,
                 num_branch_blocks: int = 3,
                 align_corners: bool = False,
                 norm_cfg: OptConfigType = dict(type='BN'),
                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
                 init_cfg: OptConfigType = None,
                 **kwargs):
        super(SEBNet, self).__init__(init_cfg)
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.align_corners = align_corners

        # stem layer - we need better granularity to integrate the SBD modules
        self.conv1 =  nn.Sequential(
             ConvModule(
                in_channels,
                channels,
                kernel_size=3,
                stride=2,
                padding=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg),
            ConvModule(
                channels,
                channels,
                kernel_size=3,
                stride=2,
                padding=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
        )
        self.stage_1 = self._make_layer(
            block=BasicBlock,
            in_channels=channels,
            channels=channels,
            num_blocks=num_stem_blocks)
        self.stage_2 = self._make_layer(
            block=BasicBlock,
            in_channels=channels,
            channels=channels * 2,
            num_blocks=num_stem_blocks,
            stride=2)
        self.relu = nn.ReLU()

        # I Branch
        self.i_branch_layers = nn.ModuleList()
        for i in range(3):
            self.i_branch_layers.append(
                self._make_layer(
                    block=BasicBlock if i < 2 else Bottleneck,
                    in_channels=channels * 2**(i + 1),
                    channels=channels * 8 if i > 0 else channels * 4,
                    num_blocks=num_branch_blocks if i < 2 else 2,
                    stride=2))
        
    def _make_stem_layer(self, in_channels: int, channels: int,
                         num_blocks: int) -> nn.Sequential:
        """Make stem layer.

        Args:
            in_channels (int): Number of input channels.
            channels (int): Number of output channels.
            num_blocks (int): Number of blocks.

        Returns:
            nn.Sequential: The stem layer.
        """

        layers = [
            ConvModule(
                in_channels,
                channels,
                kernel_size=3,
                stride=2,
                padding=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg),
            ConvModule(
                channels,
                channels,
                kernel_size=3,
                stride=2,
                padding=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
        ]

        layers.append(
            self._make_layer(BasicBlock, channels, channels, num_blocks))
        layers.append(nn.ReLU())
        layers.append(
            self._make_layer(
                BasicBlock, channels, channels * 2, num_blocks, stride=2))
        layers.append(nn.ReLU())

        return nn.Sequential(*layers)

    def _make_layer(self,
                    block: BasicBlock,
                    in_channels: int,
                    channels: int,
                    num_blocks: int,
                    stride: int = 1) -> nn.Sequential:
        """Make layer for PIDNet backbone.
        Args:
            block (BasicBlock): Basic block.
            in_channels (int): Number of input channels.
            channels (int): Number of output channels.
            num_blocks (int): Number of blocks.
            stride (int): Stride of the first block. Default: 1.

        Returns:
            nn.Sequential: The Branch Layer.
        """
        downsample = None
        if stride != 1 or in_channels != channels * block.expansion:
            downsample = ConvModule(
                in_channels,
                channels * block.expansion,
                kernel_size=1,
                stride=stride,
                norm_cfg=self.norm_cfg,
                act_cfg=None)

        layers = [block(in_channels, channels, stride, downsample)]
        in_channels = channels * block.expansion
        for i in range(1, num_blocks):
            layers.append(
                block(
                    in_channels,
                    channels,
                    stride=1,
                    act_cfg_out=None if i == num_blocks - 1 else self.act_cfg))
        return nn.Sequential(*layers)

    def _make_single_layer(self,
                           block: Union[BasicBlock, Bottleneck],
                           in_channels: int,
                           channels: int,
                           stride: int = 1) -> nn.Module:
        """Make single layer for PIDNet backbone.
        Args:
            block (BasicBlock or Bottleneck): Basic block or Bottleneck.
            in_channels (int): Number of input channels.
            channels (int): Number of output channels.
            stride (int): Stride of the first block. Default: 1.

        Returns:
            nn.Module
        """

        downsample = None
        if stride != 1 or in_channels != channels * block.expansion:
            downsample = ConvModule(
                in_channels,
                channels * block.expansion,
                kernel_size=1,
                stride=stride,
                norm_cfg=self.norm_cfg,
                act_cfg=None)
        return block(
            in_channels, channels, stride, downsample, act_cfg_out=None)

    def init_weights(self):
        """Initialize the weights in backbone.

        Since the D branch is not initialized by the pre-trained model, we
        initialize it with the same method as the ResNet.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(
                    m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
        if self.init_cfg is not None:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                                  f'specify `Pretrained` in ' \
                                                  f'`init_cfg` in ' \
                                                  f'{self.__class__.__name__} '
            ckpt = CheckpointLoader.load_checkpoint(
                self.init_cfg['checkpoint'], map_location='cpu')
            self.load_state_dict(ckpt, strict=False)

    def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:
        """Forward function.

        Args:
            x (Tensor): Input tensor with shape (N, C, H, W).

        Returns:
            Tensor or tuple[Tensor]: If self.training is True, return
                tuple[Tensor], else return Tensor.
        """
        w_out = x.shape[-1] // 8
        h_out = x.shape[-2] // 8

        # stage 0
        x = self.conv1(x) # (N, C=64, H/4, W/4)

        # stage 1
        x_1 = self.relu(self.stage_1(x)) # (N, C=64, H/4, W/4)

        # stage 2
        x_2 = self.relu(self.stage_2(x_1)) # (N, C=128, H/8, W/8)

        # stage 3
        x_3 = self.relu(self.i_branch_layers[0](x_2)) # (N, C=256, H/16, W/16)

        # stage 4
        x_4 = self.relu(self.i_branch_layers[1](x_3)) # (N, C=512, H/32, W/32)

        # stage 5
        x_5 = self.i_branch_layers[2](x_4) # (N, C=1024, H/64, W/64)

        return x_5

In [10]:
model = MODELS.build(cfg.model.backbone)

Here 2
obj_cls: <class '__main__.SEBNet'>
**args: {'in_channels': 3, 'channels': 64, 'ppm_channels': 96, 'num_stem_blocks': 2, 'num_branch_blocks': 3, 'align_corners': False}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
**args: {'inplace': True}
Here 2
obj_cls: <class 'torch.nn.modules.activation.ReLU'>
*

In [11]:
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import BaseModule

from mmpretrain.evaluation.metrics import Accuracy
from mmpretrain.registry import MODELS
from mmpretrain.structures import DataSample


#@MODELS.register_module()
class ClsHead(BaseModule):
    """Classification head.

    Args:
        loss (dict): Config of classification loss. Defaults to
            ``dict(type='CrossEntropyLoss', loss_weight=1.0)``.
        topk (int | Tuple[int]): Top-k accuracy. Defaults to ``(1, )``.
        cal_acc (bool): Whether to calculate accuracy during training.
            If you use batch augmentations like Mixup and CutMix during
            training, it is pointless to calculate accuracy.
            Defaults to False.
        init_cfg (dict, optional): the config to control the initialization.
            Defaults to None.
    """

    def __init__(self,
                 loss: dict = dict(type='CrossEntropyLoss', loss_weight=1.0),
                 topk: Union[int, Tuple[int]] = (1, ),
                 cal_acc: bool = False,
                 init_cfg: Optional[dict] = None):
        super(ClsHead, self).__init__(init_cfg=init_cfg)

        self.topk = topk
        if not isinstance(loss, nn.Module):
            loss = MODELS.build(loss)
        self.loss_module = loss
        self.cal_acc = cal_acc

    def pre_logits(self, feats: Tuple[torch.Tensor]) -> torch.Tensor:
        """The process before the final classification head.

        The input ``feats`` is a tuple of tensor, and each tensor is the
        feature of a backbone stage. In ``ClsHead``, we just obtain the feature
        of the last stage.
        """
        # The ClsHead doesn't have other module, just return after unpacking.
        return feats[-1]

    def forward(self, feats: Tuple[torch.Tensor]) -> torch.Tensor:
        """The forward process."""
        pre_logits = self.pre_logits(feats)
        # The ClsHead doesn't have the final classification head,
        # just return the unpacked inputs.
        return pre_logits

    def loss(self, feats: Tuple[torch.Tensor], data_samples: List[DataSample],
             **kwargs) -> dict:
        """Calculate losses from the classification score.

        Args:
            feats (tuple[Tensor]): The features extracted from the backbone.
                Multiple stage inputs are acceptable but only the last stage
                will be used to classify. The shape of every item should be
                ``(num_samples, num_classes)``.
            data_samples (List[DataSample]): The annotation data of
                every samples.
            **kwargs: Other keyword arguments to forward the loss module.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        # The part can be traced by torch.fx
        cls_score = self(feats) # invokes __call__ method of nn.Module(), which invokes the forward() method.

        # The part can not be traced by torch.fx
        losses = self._get_loss(cls_score, data_samples, **kwargs)
        return losses

    def _get_loss(self, cls_score: torch.Tensor,
                  data_samples: List[DataSample], **kwargs):
        """Unpack data samples and compute loss."""
        # Unpack data samples and pack targets
        if 'gt_score' in data_samples[0]:
            # Batch augmentation may convert labels to one-hot format scores.
            target = torch.stack([i.gt_score for i in data_samples])
        else:
            target = torch.cat([i.gt_label for i in data_samples])

        # compute loss
        losses = dict()
        loss = self.loss_module(
            cls_score, target, avg_factor=cls_score.size(0), **kwargs)
        losses['loss'] = loss

        # compute accuracy
        if self.cal_acc:
            assert target.ndim == 1, 'If you enable batch augmentation ' \
                'like mixup during training, `cal_acc` is pointless.'
            acc = Accuracy.calculate(cls_score, target, topk=self.topk)
            losses.update(
                {f'accuracy_top-{k}': a
                 for k, a in zip(self.topk, acc)})

        return losses

    def predict(
        self,
        feats: Tuple[torch.Tensor],
        data_samples: Optional[List[Optional[DataSample]]] = None
    ) -> List[DataSample]:
        """Inference without augmentation.

        Args:
            feats (tuple[Tensor]): The features extracted from the backbone.
                Multiple stage inputs are acceptable but only the last stage
                will be used to classify. The shape of every item should be
                ``(num_samples, num_classes)``.
            data_samples (List[DataSample | None], optional): The annotation
                data of every samples. If not None, set ``pred_label`` of
                the input data samples. Defaults to None.

        Returns:
            List[DataSample]: A list of data samples which contains the
            predicted results.
        """
        # The part can be traced by torch.fx
        cls_score = self(feats)

        # The part can not be traced by torch.fx
        predictions = self._get_predictions(cls_score, data_samples)
        return predictions

    def _get_predictions(self, cls_score, data_samples):
        """Post-process the output of head.

        Including softmax and set ``pred_label`` of data samples.
        """
        pred_scores = F.softmax(cls_score, dim=1)
        pred_labels = pred_scores.argmax(dim=1, keepdim=True).detach()

        out_data_samples = []
        if data_samples is None:
            data_samples = [None for _ in range(pred_scores.size(0))]

        for data_sample, score, label in zip(data_samples, pred_scores,
                                             pred_labels):
            if data_sample is None:
                data_sample = DataSample()

            data_sample.set_pred_score(score).set_pred_label(label)
            out_data_samples.append(data_sample)
        return out_data_samples

In [12]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
from torch.nn.init import trunc_normal_
from torch.nn.utils import weight_norm
from mmpretrain.registry import MODELS

@MODELS.register_module()
class DINOHead(ClsHead): # Changed from nn.Module to ClsHead, which inherits from BaseModule, which inherits from nn.Module
    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
        loss: dict = dict(type='CrossEntropyLoss', loss_weight=1.0),
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
        self.apply(self._init_weights)
        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)
        if not isinstance(loss, nn.Module):
            loss = MODELS.build(loss)
        self.loss_module = loss

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.mlp(x)
        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
        # The final classification head
        x = self.last_layer(x)
        return x

def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
    if nlayers == 1:
        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
    else:
        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
        if use_bn:
            layers.append(nn.BatchNorm1d(hidden_dim))
        layers.append(nn.GELU())
        for _ in range(nlayers - 2):
            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
        return nn.Sequential(*layers)

In [13]:
model = MODELS.build(cfg.model.head)

Here 2
obj_cls: <class '__main__.DINOHead'>
**args: {'in_dim': 1024, 'out_dim': 1000, 'use_bn': False, 'nlayers': 3, 'hidden_dim': 2048, 'bottleneck_dim': 256, 'mlp_bias': True, 'loss': {'type': 'CrossEntropyLoss', 'loss_weight': 1.0}}
Here 2
obj_cls: <class 'mmpretrain.models.losses.cross_entropy_loss.CrossEntropyLoss'>
**args: {'loss_weight': 1.0}
Here 2
obj_cls: <class 'mmpretrain.models.losses.cross_entropy_loss.CrossEntropyLoss'>
**args: {'loss_weight': 1.0}


  WeightNorm.apply(module, name, dim)


In [14]:
runner = Runner.from_cfg(cfg)

05/29 19:00:19 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.8.20 (default, Oct  3 2024, 15:24:27) [GCC 11.2.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 1321360872
    GPU 0,1,2,3: NVIDIA A100-SXM4-40GB
    CUDA_HOME: /usr/local/cuda
    NVCC: Cuda compilation tools, release 12.4, V12.4.131
    GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
    PyTorch: 2.4.1
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.4.2 (Git Hash 1137e04ec0b5251ca2b4400a4fd3c667ce843d67)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.4
  - NVCC architecture flags: -gencode;arch=compute

  WeightNorm.apply(module, name, dim)


05/29 19:00:20 - mmengine - [4m[97mINFO[0m - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
Here 2
obj_cls: <class 'mmengine.hooks.runtime_info_hook.RuntimeInfoHook'>
**args: {}
Here 2
obj_cls: <class 'mmengine.hooks.iter_timer_hook.IterTimerHook'>
**args: {}
Here 2
obj_cls: <class 'mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook'>
**args: {}
Here 2
obj_cls: <class 'mmengine.hooks.logger_hook.LoggerHook'>
**args: {'interval': 100}
Here 2
obj_cls: <class 'mmengine.hooks.param_scheduler_hook.ParamSchedulerHook'>
**args: {}
Here 2
obj_cls: <class 'mmengine.hooks.checkpoint_hook.CheckpointHook'>
**args: {'interval': 1}
Here 2
obj_cls: <class 'mmpretrain.engine.hooks.visualization_hook.VisualizationHook'>
**args: {'enable': False}
05/29 19:00:20 - mmengine - [4m[97mINFO[0m - Hooks will be executed in the following order:
before_run:
(VERY_HIGH   ) RuntimeInfoHook              

In [15]:
runner = RUNNERS.build(cfg)

05/29 19:00:20 - mmengine - [4m[97mINFO[0m - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.8.20 (default, Oct  3 2024, 15:24:27) [GCC 11.2.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 473739851
    GPU 0,1,2,3: NVIDIA A100-SXM4-40GB
    CUDA_HOME: /usr/local/cuda
    NVCC: Cuda compilation tools, release 12.4, V12.4.131
    GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
    PyTorch: 2.4.1
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.4.2 (Git Hash 1137e04ec0b5251ca2b4400a4fd3c667ce843d67)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.4
  - NVCC architecture flags: -gencode;arch=compute_