Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 54 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[![Version: v0.0.1](https://img.shields.io/badge/Version-v0.0.1-blue)](#)
[![Tests: 101/101 Passing](https://img.shields.io/badge/Tests-101%2F101_Passing-brightgreen)](#)
[![Tests: 113/113 Passing](https://img.shields.io/badge/Tests-113%2F113_Passing-brightgreen)](#)
[![Binary Size: ~8MB](https://img.shields.io/badge/Binary_Size-%7E8MB-success)](#)

# zigCUDA - CUDA Driver API for Zig
Expand Down Expand Up @@ -37,10 +37,10 @@ INFO: cuInit succeeded
## 🎯 Key Features (v0.0.1)

- **Dynamic Driver Loading** – Works on Linux native and WSL2, multiple symbol resolution paths
- **Clean Zig API** – Context, device, memory, streams, events, module loading, kernel launch
- **Clean Zig API** – Raw Driver API access plus low-level ergonomic wrappers for memory, params, modules, and launch
- **Graceful Stubs** – Compiles and runs basic checks without a GPU
- **Zero External Dependencies** – Only needs NVIDIA driver at runtime
- **Test Coverage** – 97 passing tests across core, bindings, and integrations
- **Test Coverage** – 113 passing tests across core, bindings, ergonomics, and integrations
- **Easy Library Usage** – Single `@import("zigcuda")` with init/deinit pattern

## 📊 Status
Expand Down Expand Up @@ -80,21 +80,64 @@ exe.root_module.linkSystemLibrary("c", .{});

### 3. Example usage

The raw Driver API wrappers remain available under `zigcuda.bindings.*`. For lower-boilerplate code, use the ergonomic layer exported from `zigcuda` directly.

#### Low-level ergonomic API

```zig
const std = @import("std");
const zigcuda = @import("zigcuda");

/// Uploads `input` to the device, launches `lm_head_mmq_q6k_kernel`, and copies
/// the device result back into `output`.
///
/// `allocator` is forwarded to `Module.loadFirst`. Any error from allocation,
/// copying, module loading, kernel lookup, parameter packing, or launch is
/// propagated to the caller via `try`.
pub fn runKernel(allocator: std.mem.Allocator, input: []const f16, output: []f16) !void {
// Device buffers are sized to the byte length of the matching host slices;
// each `deinit` runs when this function returns, on success or error.
var input_dev = try zigcuda.DeviceBuffer.alloc(std.mem.sliceAsBytes(input).len);
defer input_dev.deinit();
var output_dev = try zigcuda.DeviceBuffer.alloc(std.mem.sliceAsBytes(output).len);
defer output_dev.deinit();

// Host -> device upload of the input elements.
try input_dev.copyFromTyped(f16, input);

// Two candidate locations for the compiled kernel binary.
// NOTE(review): presumably `loadFirst` tries them in order and uses the first
// that loads — confirm against `Module.loadFirst`.
var module = try zigcuda.Module.loadFirst(allocator, &.{
"build/kernels/lm_head_q6k_mmq.cubin",
"kernels/lm_head_q6k_mmq.cubin",
});
defer module.deinit();

const kernel = try module.kernel("lm_head_mmq_q6k_kernel");

// Arguments are appended in this order: output pointer, input pointer,
// element count (as i32). The order must match the kernel's signature.
var params = zigcuda.Params.init();
try params.devicePtr(output_dev.ptr);
try params.devicePtr(input_dev.ptr);
try params.value(i32, @intCast(input.len));

// 1-D launch: ceil(input.len / 256) in grid.x, 256 in block.x.
// NOTE(review): `sync_after = true` presumably waits for the kernel to finish
// before `launch` returns — confirm against `Kernel.launch`.
try kernel.launch(.{
.grid = .{ .x = @intCast((input.len + 255) / 256) },
.block = .{ .x = 256 },
.sync_after = true,
}, params.slice());

// Device -> host download of the result.
try output_dev.copyToTyped(f16, output);
}
```

Defaults keep common CUDA launch boilerplate out of the call site: `grid.y = 1`, `grid.z = 1`, `block.y = 1`, `block.z = 1`, `shared_mem_bytes = 0`, `stream = null`, and `sync_after = false`.

**Basic device enumeration:**
```zig
const std = @import("std");
const zigcuda = @import("zigcuda");

pub fn main() !void {
try zigcuda.bindings.init();
try zigcuda.bindings.load();
try zigcuda.bindings.init(0);

const device_count = try zigcuda.bindings.getDeviceCount();
std.debug.print("Found {d} CUDA device(s)\n", .{device_count});

for (0..@min(device_count, 3)) |i| {
const props = try zigcuda.bindings.getDeviceProperties(@intCast(i));
const device = try zigcuda.bindings.getDevice(@intCast(i));
const props = try zigcuda.bindings.getDeviceProperties(device);
std.debug.print("Device {d}: {s}\n", .{
i, @as([:0]const u8, @ptrCast(&props.name)),
i, @as([*:0]const u8, @ptrCast(&props.deviceName)),
});
}
}
Expand All @@ -106,14 +149,14 @@ const std = @import("std");
const zigcuda = @import("zigcuda");

pub fn main() !void {
try zigcuda.bindings.init();
try zigcuda.bindings.load();
try zigcuda.bindings.init(0);

// Load compiled CUDA binary (.cubin file)
const filename: [:0]zigcuda.bindings.@"c_char" = "my_kernel.cubin";
const filename: [:0]const zigcuda.bindings.c_char = @ptrCast("my_kernel.cubin");
const module = try zigcuda.bindings.loadModule(filename);

var kernel_name_buf = "my_kernel".*;
const c_kernel_name: [:0]zigcuda.bindings.@"c_char" = @ptrCast(&kernel_name_buf);
const c_kernel_name: [:0]const zigcuda.bindings.c_char = @ptrCast("my_kernel");
const kernel_func = try zigcuda.bindings.getFunctionFromModule(module, c_kernel_name);

// Launch with correct parameter count (grid_dim_z is required!)
Expand Down Expand Up @@ -154,7 +197,7 @@ pub fn main() !void {
## 🛠️ Development

```bash
zig build run test # Run full suite (97 tests)
zig build test # Run full suite
zig build run # Diagnostic tool
```

Expand Down
22 changes: 21 additions & 1 deletion build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,15 @@ pub fn build(b: *std.Build) !void {
.target = target,
});

// Create ergonomic API test module
const ergonomics_test_module = b.createModule(.{
.root_source_file = b.path("test/ergonomics_test.zig"),
.target = target,
});

// Add library import to test
lib_test_module.addImport("zigcuda", lib_module);
ergonomics_test_module.addImport("zigcuda", lib_module);

// Create bindings test module (low-level CUDA API testing)
const bindings_test_module = b.createModule(.{
Expand Down Expand Up @@ -118,6 +125,7 @@ pub fn build(b: *std.Build) !void {
// CREATE ALL TEST EXECUTABLES

const lib_tests = b.addTest(.{ .root_module = lib_test_module });
const ergonomics_tests = b.addTest(.{ .root_module = ergonomics_test_module });

// Create bindings test executable (low-level API tests)
const binding_tests = b.addTest(.{
Expand All @@ -141,6 +149,7 @@ pub fn build(b: *std.Build) !void {

// Link all test executables against system libc
lib_tests.root_module.linkSystemLibrary("c", .{});
ergonomics_tests.root_module.linkSystemLibrary("c", .{});
binding_tests.root_module.linkSystemLibrary("c", .{});
v2_memory_tests.root_module.linkSystemLibrary("c", .{});
runtime_tests.root_module.linkSystemLibrary("c", .{});
Expand All @@ -154,6 +163,12 @@ pub fn build(b: *std.Build) !void {
});
run_lib_tests_cmd.addArtifactArg(lib_tests);

// Use system dynamic linker for ergonomic API tests
const run_ergonomics_tests_cmd = b.addSystemCommand(&.{
"/lib64/ld-linux-x86-64.so.2",
});
run_ergonomics_tests_cmd.addArtifactArg(ergonomics_tests);

// Use system dynamic linker to avoid glibc mismatch issues for bindings tests
const run_bindings_tests = b.addSystemCommand(&.{
"/lib64/ld-linux-x86-64.so.2",
Expand Down Expand Up @@ -183,6 +198,9 @@ pub fn build(b: *std.Build) !void {
const lib_test_step = b.step("test-lib", "Run library API tests");
lib_test_step.dependOn(&run_lib_tests_cmd.step);

const ergonomics_test_step = b.step("test-ergonomics", "Run low-level ergonomic API tests");
ergonomics_test_step.dependOn(&run_ergonomics_tests_cmd.step);

const bindings_test_step = b.step("test-bindings", "Run comprehensive CUDA API binding tests");
bindings_test_step.dependOn(&run_bindings_tests.step);

Expand Down Expand Up @@ -297,7 +315,9 @@ pub fn build(b: *std.Build) !void {
const cublas_simple_test_step = b.step("test-cublas-simple", "Run simplified cuBLAS stub tests");
cublas_simple_test_step.dependOn(&run_cublas_simple_tests.step);

const all_tests_step = b.step("test", "Run all tests (bindings + v2-memory + runtime + kernel integration + simple + cuBLAS integration)");
const all_tests_step = b.step("test", "Run all tests (bindings + ergonomics + v2-memory + runtime + kernel integration + simple + cuBLAS integration)");
all_tests_step.dependOn(lib_test_step);
all_tests_step.dependOn(ergonomics_test_step);
all_tests_step.dependOn(bindings_test_step);
all_tests_step.dependOn(v2_memory_test_step);
all_tests_step.dependOn(runtime_test_step);
Expand Down
74 changes: 74 additions & 0 deletions examples/05_ergonomic_launch.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// examples/05_ergonomic_launch.zig
// Preferred low-level style: owned buffers, typed copies, packed params, and launch defaults.

const std = @import("std");
const zigcuda = @import("zigcuda");
const cuda = zigcuda.bindings;

/// Vector-add example using the ergonomic zigCUDA layer.
///
/// Fills two host arrays of `n` f32 values, uploads them, launches the
/// `vector_add` kernel, downloads the result, and checks every element
/// against `a[i] + b[i]` on the host.
///
/// Returns `error.VerificationFailed` if any element differs; all CUDA and
/// allocation errors are propagated unchanged.
pub fn main() !void {
    try cuda.load();
    try cuda.init(0);

    const device = try cuda.getDevice(0);
    const ctx = try cuda.createContext(0, device);
    // Teardown cannot propagate an error from a `defer`, so report it instead
    // of discarding it silently.
    defer cuda.destroyContext(ctx) catch |err| {
        std.log.warn("failed to destroy CUDA context: {s}", .{@errorName(err)});
    };

    const n: u32 = 1024;
    const allocator = std.heap.page_allocator;

    // Host-side operands and result buffer.
    const a = try allocator.alloc(f32, n);
    defer allocator.free(a);
    const b = try allocator.alloc(f32, n);
    defer allocator.free(b);
    const c = try allocator.alloc(f32, n);
    defer allocator.free(c);

    // a[i] = i, b[i] = 2*i; every value and sum is a small integer, so the
    // exact float comparison during verification is safe.
    for (a, 0..) |*value, i| value.* = @floatFromInt(i);
    for (b, 0..) |*value, i| value.* = @floatFromInt(i * 2);
    @memset(c, 0);

    // Device buffers sized to the byte length of the matching host slices.
    var d_a = try zigcuda.DeviceBuffer.alloc(std.mem.sliceAsBytes(a).len);
    defer d_a.deinit();
    var d_b = try zigcuda.DeviceBuffer.alloc(std.mem.sliceAsBytes(b).len);
    defer d_b.deinit();
    var d_c = try zigcuda.DeviceBuffer.alloc(std.mem.sliceAsBytes(c).len);
    defer d_c.deinit();

    try d_a.copyFromTyped(f32, a);
    try d_b.copyFromTyped(f32, b);

    // Candidate kernel images: compiled cubin first, PTX second.
    // NOTE(review): presumably `loadFirst` uses the first candidate that
    // loads — confirm against `Module.loadFirst`.
    var module = try zigcuda.Module.loadFirst(allocator, &.{
        "examples/kernels/vector_add.cubin",
        "examples/kernels/vector_add.ptx",
    });
    defer module.deinit();

    const kernel = try module.kernel("vector_add");

    // Argument order: a, b, c device pointers, then the element count.
    var params = zigcuda.Params.init();
    try params.devicePtr(d_a.ptr);
    try params.devicePtr(d_b.ptr);
    try params.devicePtr(d_c.ptr);
    try params.value(u32, n);

    // 1-D launch: ceil(n / 256) in grid.x, 256 in block.x.
    try kernel.launch(.{
        .grid = zigcuda.Dim3.init((n + 255) / 256),
        .block = .{ .x = 256 },
        .sync_after = true,
    }, params.slice());

    try d_c.copyToTyped(f32, c);

    // Host-side verification of the downloaded result.
    var mismatches: usize = 0;
    for (a, b, c) |left, right, actual| {
        if (actual != left + right) {
            mismatches += 1;
        }
    }

    if (mismatches != 0) {
        // Surface how many elements were wrong before failing.
        std.debug.print("Verification failed: {d} of {d} elements mismatched.\n", .{ mismatches, n });
        return error.VerificationFailed;
    }

    std.debug.print("Vector add completed with ergonomic zigCUDA API ({d} elements).\n", .{n});
}
16 changes: 16 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ zig build-exe --dep zigcuda -Mroot=examples/04_streams.zig -Mzigcuda=src/lib.zig
./04_streams
```

### 05_ergonomic_launch.zig
**Low-level ergonomic kernel launch**

Demonstrates:
- RAII-style `DeviceBuffer` ownership
- Typed host/device copies
- `Params` kernel argument packing
- `LaunchConfig` defaults
- `Module.loadFirst` and `Kernel.launch`

```bash
zig build-exe --dep zigcuda -Mroot=examples/05_ergonomic_launch.zig -Mzigcuda=src/lib.zig -lc
./05_ergonomic_launch
```

## Building All Examples

You can add these examples to your `build.zig` to make them easy to build:
Expand All @@ -77,6 +92,7 @@ const examples = [_][]const u8{
"02_memory_transfer",
"03_kernel_launch",
"04_streams",
"05_ergonomic_launch",
};

for (examples) |example_name| {
Expand Down
3 changes: 2 additions & 1 deletion src/bindings/cuda.zig
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,8 @@ pub fn loadModuleFromData(image: [:0]const c_char) errors.CUDAError!*CUmodule {

/// Unload a CUDA module
pub fn unloadModule(module: *CUmodule) errors.CUDAError!void {
const result = cuModuleUnload(module);
const cu_module_unload = cuModuleUnload orelse return error.SymbolNotFound;
const result = cu_module_unload(module);
if (result == CUDA_SUCCESS) {
return;
}
Expand Down
Loading