Skip to content

Commit

Permalink
Merge pull request #3718 from chiragjn/cj_nvidia_gpu_isolation
Browse files Browse the repository at this point in the history
Inject Nvidia GPUs using volume-mounts to isolate them to assigned pods
  • Loading branch information
arnaldo2792 committed Feb 7, 2024
2 parents c325a08 + 911775f commit 0da372a
Show file tree
Hide file tree
Showing 13 changed files with 49 additions and 13 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# NVIDIA container runtime configuration added by this commit.
# NOTE(review): the file name is not shown on this page; presumably one of the
# nvidia-container-toolkit-config-*.toml sources referenced by the spec - confirm.

# Accept the list of visible GPUs when it is passed as volume mounts.
# Presumably the counterpart of the device plugin being started with
# `--device-list-strategy volume-mounts` in the same commit - confirm.
accept-nvidia-visible-devices-as-volume-mounts = true
# Going by the key name, this stops unprivileged containers from selecting GPUs
# through the NVIDIA_VISIBLE_DEVICES environment variable - verify against the
# NVIDIA Container Toolkit documentation.
accept-nvidia-visible-devices-envvar-when-unprivileged = false

# Settings for invoking the nvidia-container-cli binary.
[nvidia-container-cli]
root = "/"
path = "/usr/bin/nvidia-container-cli"
# No environment entries are configured.
environment = []
# NOTE(review): the leading "@" presumably selects the host's ldconfig rather
# than one inside the container - confirm against the NVIDIA documentation.
ldconfig = "@/sbin/ldconfig"
34 changes: 31 additions & 3 deletions packages/nvidia-container-toolkit/nvidia-container-toolkit.spec
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ License: Apache-2.0
URL: https://%{goimport}

Source0: https://%{goimport}/archive/v%{gover}/nvidia-container-toolkit-%{gover}.tar.gz
Source1: nvidia-container-toolkit-config.toml
Source1: nvidia-container-toolkit-config-k8s.toml
Source2: nvidia-container-toolkit-tmpfiles.conf
Source3: nvidia-oci-hooks-json
Source4: nvidia-gpu-devices.rules
Source5: nvidia-container-toolkit-config-ecs.toml

BuildRequires: %{_cross_os}glibc-devel
Requires: %{_cross_os}libnvidia-container
Expand All @@ -25,6 +26,22 @@ Requires: %{_cross_os}shimpei
%description
%{summary}.

%package ecs
Summary: Files specific for the ECS variants
Requires: %{name}
Conflicts: %{name}-k8s

%description ecs
%{summary}.

%package k8s
Summary: Files specific for the Kubernetes variants
Requires: %{name}
Conflicts: %{name}-ecs

%description k8s
%{summary}.

%prep
%autosetup -n %{gorepo}-%{gover} -p1
%cross_go_setup %{gorepo}-%{gover} %{goproject} %{goimport}
Expand All @@ -43,19 +60,30 @@ install -d %{buildroot}%{_cross_datadir}/nvidia-container-toolkit
install -d %{buildroot}%{_cross_factorydir}/etc/nvidia-container-runtime
install -p -m 0755 nvidia-container-runtime-hook %{buildroot}%{_cross_bindir}/
install -p -m 0755 nvidia-ctk %{buildroot}%{_cross_bindir}/
install -m 0644 %{S:1} %{buildroot}%{_cross_factorydir}/etc/nvidia-container-runtime/config.toml
install -m 0644 %{S:1} %{S:5} %{buildroot}%{_cross_factorydir}/etc/nvidia-container-runtime/
install -m 0644 %{S:2} %{buildroot}%{_cross_tmpfilesdir}/nvidia-container-toolkit.conf
install -m 0644 %{S:3} %{buildroot}%{_cross_templatedir}/nvidia-oci-hooks-json
install -p -m 0644 %{S:4} %{buildroot}%{_cross_udevrulesdir}/90-nvidia-gpu-devices.rules
ln -s shimpei %{buildroot}%{_cross_bindir}/nvidia-oci

%post ecs -p <lua>
-- After the ecs subpackage is installed, link its config file to the generic
-- config.toml name inside the factory nvidia-container-runtime directory.
-- NOTE(review): the first argument is a bare relative file name while the second
-- is a full path; confirm the link resolves as intended from the scriptlet's
-- working directory, and whether a hard link or a symlink is wanted here.
posix.link("nvidia-container-toolkit-config-ecs.toml", "%{_cross_factorydir}/etc/nvidia-container-runtime/config.toml")

%post k8s -p <lua>
-- After the k8s subpackage is installed, link its config file to the generic
-- config.toml name inside the factory nvidia-container-runtime directory.
-- NOTE(review): the first argument is a bare relative file name while the second
-- is a full path; confirm the link resolves as intended from the scriptlet's
-- working directory, and whether a hard link or a symlink is wanted here.
posix.link("nvidia-container-toolkit-config-k8s.toml", "%{_cross_factorydir}/etc/nvidia-container-runtime/config.toml")

%files
%license LICENSE
%{_cross_attribution_file}
%{_cross_bindir}/nvidia-container-runtime-hook
%{_cross_bindir}/nvidia-ctk
%{_cross_bindir}/nvidia-oci
%{_cross_templatedir}/nvidia-oci-hooks-json
%{_cross_factorydir}/etc/nvidia-container-runtime/config.toml
%{_cross_tmpfilesdir}/nvidia-container-toolkit.conf
%{_cross_udevrulesdir}/90-nvidia-gpu-devices.rules

%files ecs
%{_cross_factorydir}/etc/nvidia-container-runtime/nvidia-container-toolkit-config-ecs.toml

%files k8s
%{_cross_factorydir}/etc/nvidia-container-runtime/nvidia-container-toolkit-config-k8s.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ After=kubelet.service
Wants=kubelet.service

[Service]
ExecStart=/usr/bin/nvidia-device-plugin --pass-device-specs=true
ExecStart=/usr/bin/nvidia-device-plugin --device-list-strategy volume-mounts --device-id-strategy index --pass-device-specs=true
Type=simple
TimeoutSec=0
RestartSec=2
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-ecs-1-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ included-packages = [
"ecs-agent",
# NVIDIA support
"ecs-gpu-init",
"nvidia-container-toolkit",
"nvidia-container-toolkit-ecs",
"kmod-5.10-nvidia-tesla-470",
]

Expand Down
2 changes: 1 addition & 1 deletion variants/aws-ecs-2-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ included-packages = [
"ecs-agent",
# NVIDIA support
"ecs-gpu-init",
"nvidia-container-toolkit",
"nvidia-container-toolkit-ecs",
"kmod-6.1-nvidia-tesla-535",
]

Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.23-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ included-packages = [
"kernel-5.10",
"kubelet-1.23",
"release",
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-5.10-nvidia-tesla-470",
]
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.24-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ included-packages = [
"kernel-5.15",
"kubelet-1.24",
"release",
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-5.15-nvidia-tesla-535",
]
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.25-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ included-packages = [
"kernel-5.15",
"kubelet-1.25",
"release",
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-5.15-nvidia-tesla-535",
]
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.26-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ included-packages = [
"kernel-5.15",
"kubelet-1.26",
"release",
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-5.15-nvidia-tesla-535",
]
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.27-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ included-packages = [
"kernel-5.15",
"kubelet-1.27",
"release",
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-5.15-nvidia-tesla-535",
]
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.28-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ included-packages = [
"kubelet-1.28",
"aws-iam-authenticator",
# nvidia
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-6.1-nvidia-tesla-535",
]
Expand Down
2 changes: 1 addition & 1 deletion variants/aws-k8s-1.29-nvidia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ included-packages = [
"kubelet-1.29",
"aws-iam-authenticator",
# nvidia
"nvidia-container-toolkit",
"nvidia-container-toolkit-k8s",
"nvidia-k8s-device-plugin",
"kmod-6.1-nvidia-tesla-535",
]
Expand Down

0 comments on commit 0da372a

Please sign in to comment.