alexnask · jburgy · Nov 2, 2025 · Nov 3, 2025 · Nov 3, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.zig-cache
+zig-out
diff --git a/build.zig b/build.zig
@@ -1,16 +1,30 @@
 const std = @import("std");
 
-pub fn build(b: *std.build.Builder) void {
+pub fn build(b: *std.Build) void {
     // Standard release options allow the person running `zig build` to select
     // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall.
-    const mode = b.standardReleaseOptions();
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
 
-    const lib = b.addStaticLibrary("ctregex", "ctregex.zig");
-    lib.setBuildMode(mode);
-    lib.install();
+    const lib = b.addLibrary(.{
+        .name = "ctregex",
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("ctregex.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+        .linkage = .static,
+    });
 
-    const main_tests = b.addTest("tests.zig");
-    main_tests.setBuildMode(mode);
+    b.installArtifact(lib);
+
+    const main_tests = b.addTest(.{
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("tests.zig"),
+            .target = target,
+            .optimize = optimize,
+        }),
+    });
 
     const test_step = b.step("test", "Run library tests");
     test_step.dependOn(&main_tests.step);

diff --git a/ctregex.zig b/ctregex.zig
@@ -25,50 +25,54 @@ fn utf16leDecode(chars: []const u16) !u21 {
     }
 }
 
-fn ctUtf8EncodeChar(comptime codepoint: u21) []const u8 {
-    var buf: [4]u8 = undefined;
-    return buf[0 .. std.unicode.utf8Encode(codepoint, &buf) catch unreachable];
-}
-
-fn checkAscii(comptime codepoint: u21) void {
-    if (codepoint > 127) @compileError("Cannot match character '" ++ ctUtf8EncodeChar(codepoint) ++ "' in ascii mode.");
+fn checkAscii(comptime codepoint: u21) usize {
+    if (codepoint > 127) @compileError("Cannot match character '" ++ std.unicode.utf8EncodeComptime(codepoint) ++ "' in ascii mode.");
+    return 1;
 }
 
 fn charLenInEncoding(comptime codepoint: u21, comptime encoding: Encoding) usize {
-    switch (encoding) {
-        .ascii => {
-            checkAscii(codepoint);
-            return 1;
-        },
-        .utf8 => return std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable,
-        .utf16le => return if (codepoint < 0x10000) 1 else 2,
-        .codepoint => return 1,
-    }
+    return switch (encoding) {
+        .ascii => checkAscii(codepoint),
+        .utf8 => std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable,
+        .utf16le => if (codepoint < 0x10000) 1 else 2,
+        .codepoint => 1,
+    };
 }
 
-fn ctEncode(comptime str: []const u21, comptime encoding: Encoding) []const encoding.CharT() {
-    if (encoding == .codepoint) return str;
-
+fn ctLenInEncoding(comptime str: []const u21, comptime encoding: Encoding) usize {
     var len: usize = 0;
     for (str) |c| len += charLenInEncoding(c, encoding);
+    return len;
+}
 
-    var result: [len]encoding.CharT() = undefined;
+fn ctEncode(comptime str: []const u21, comptime encoding: Encoding) [ctLenInEncoding(str, encoding)]encoding.CharT() {
+    comptime var result: [ctLenInEncoding(str, encoding)]encoding.CharT() = undefined;
     var idx: usize = 0;
     for (str) |c| {
         switch (encoding) {
             .ascii => {
-                result[idx] = @truncate(u8, c);
+                result[idx] = @truncate(c);
                 idx += 1;
             },
-            .utf8 => idx += std.unicode.utf8Encode(c, result[idx..]) catch unreachable,
+            .utf8 => {
+                for (std.unicode.utf8EncodeComptime(c)) |t| {
+                    result[idx] = t;
+                    idx += 1;
+                }
+            },
             .utf16le => {
-                const utf8_c = ctUtf8EncodeChar(c);
-                idx += std.unicode.utf8ToUtf16Le(result[idx..], utf8_c) catch unreachable;
+                for (std.unicode.utf8ToUtf16LeStringLiteral(&std.unicode.utf8EncodeComptime(c))) |t| {
+                    result[idx] = t;
+                    idx += 1;
+                }
+            },
+            .codepoint => {
+                result[idx] = c;
+                idx += 1;
             },
-            .codepoint => unreachable,
         }
     }
-    return &result;
+    return result;
 }
 
 fn ctIntStr(comptime int: anytype) []const u8 {
@@ -112,7 +116,7 @@ const RegexParser = struct {
     fn skipWhitespace(comptime parser: *RegexParser) void {
         while (parser.iterator.i < parser.iterator.bytes.len and
             (parser.iterator.bytes[parser.iterator.i] == ' ' or
-            parser.iterator.bytes[parser.iterator.i] == '\t')) : (parser.iterator.i += 1)
+                parser.iterator.bytes[parser.iterator.i] == '\t')) : (parser.iterator.i += 1)
         {}
     }
 
@@ -337,15 +341,15 @@ const RegexParser = struct {
     fn parseAsciiIdent(comptime parser: *RegexParser) []const u8 {
         var c = parser.peek() orelse parser.raiseError("Expected ascii identifier", .{});
         if (c > 127) parser.raiseError("Expected ascii character in identifier, got '{}'", .{c});
-        if (c != '_' and !std.ascii.isAlpha(@truncate(u8, c))) {
+        if (c != '_' and !std.ascii.isAlphabetic(@truncate(c))) {
             parser.raiseError("Identifier must start with '_' or a letter, got '{}''", .{c});
         }
-        var res: []const u8 = &[1]u8{@truncate(u8, parser.iterator.nextCodepoint() orelse unreachable)};
+        var res: []const u8 = &[1]u8{@truncate(parser.iterator.nextCodepoint() orelse unreachable)};
         readChars: while (true) {
             c = parser.peek() orelse break :readChars;
-            if (c > 127 or (c != '_' and !std.ascii.isAlNum(@truncate(u8, c))))
+            if (c > 127 or (c != '_' and !std.ascii.isAlphanumeric(@truncate(c))))
                 break :readChars;
-            res = res ++ &[1]u8{@truncate(u8, parser.iterator.nextCodepoint() orelse unreachable)};
+            res = res ++ &[1]u8{@truncate(parser.iterator.nextCodepoint() orelse unreachable)};
         }
         return res;
     }
@@ -356,11 +360,11 @@ const RegexParser = struct {
 
     fn maybeParseNaturalNum(comptime parser: *RegexParser) ?usize {
         var c = parser.peek() orelse return null;
-        if (c > 127 or !std.ascii.isDigit(@truncate(u8, c))) return null;
+        if (c > 127 or !std.ascii.isDigit(@truncate(c))) return null;
         var res: usize = (parser.iterator.nextCodepoint() orelse unreachable) - '0';
         readChars: while (true) {
             c = parser.peek() orelse break :readChars;
-            if (c > 127 or !std.ascii.isDigit(@truncate(u8, c))) break :readChars;
+            if (c > 127 or !std.ascii.isDigit(@truncate(c))) break :readChars;
             res = res * 10 + ((parser.iterator.nextCodepoint() orelse unreachable) - '0');
         }
         return res;
@@ -548,7 +552,7 @@ const RegexParser = struct {
             const lhs_len = self.lhs.minLen(encoding);
             if (self.rhs) |rhs| {
                 const rhs_len = rhs.minLen(encoding);
-                return std.math.min(lhs_len, rhs_len);
+                return @min(lhs_len, rhs_len);
             }
             return lhs_len;
         }
@@ -570,7 +574,7 @@ const RegexParser = struct {
                 .literal => |codepoint_str| block: {
                     var str: []const u8 = "literal<";
                     for (codepoint_str) |codepoint| {
-                        str = str ++ ctUtf8EncodeChar(codepoint);
+                        str = str ++ std.unicode.utf8EncodeComptime(codepoint);
                     }
                     break :block str ++ ">";
                 },
@@ -630,11 +634,11 @@ const RegexParser = struct {
         fn ctStr(comptime self: Brackets) []const u8 {
             var str: []const u8 = "[";
             if (self.is_exclusive) str = str ++ "<not> ";
-            for (self.rules) |rule, idx| {
+            for (self.rules, 0..) |rule, idx| {
                 if (idx > 0) str = str ++ " ";
                 str = str ++ switch (rule) {
-                    .char => |c| ctUtf8EncodeChar(c),
-                    .range => |r| ctUtf8EncodeChar(r.start) ++ "-" ++ ctUtf8EncodeChar(r.end),
+                    .char => |c| std.unicode.utf8EncodeComptime(c),
+                    .range => |r| std.unicode.utf8EncodeComptime(r.start) ++ "-" ++ std.unicode.utf8EncodeComptime(r.end),
                     .char_class => |class| charClassToString(class),
                 };
             }
@@ -646,7 +650,7 @@ const RegexParser = struct {
             if (self.is_exclusive) return 1;
             var min_len: usize = std.math.maxInt(usize);
             for (self.rules) |rule| {
-                var curr_len: usize = switch (rule) {
+                const curr_len: usize = switch (rule) {
                     .char => |c| charLenInEncoding(c, encoding),
                     .range => |range| charLenInEncoding(range.start, encoding),
                     .char_class => |class| charClassMinLen(class, encoding),
@@ -675,38 +679,30 @@ pub const Encoding = enum {
 };
 
 inline fn readOneChar(comptime options: MatchOptions, str: []const options.encoding.CharT()) !@TypeOf(str) {
-    switch (options.encoding) {
-        .ascii, .codepoint => return str[0..1],
-        .utf8 => return str[0..try std.unicode.utf8ByteSequenceLength(str[0])],
-        .utf16le => return str[0..try utf16leCharSequenceLength(str[0])],
-    }
+    return switch (options.encoding) {
+        .ascii, .codepoint => str[0..1],
+        .utf8 => str[0..try std.unicode.utf8ByteSequenceLength(str[0])],
+        .utf16le => str[0..try utf16leCharSequenceLength(str[0])],
+    };
 }
 
 inline fn inCharClass(comptime class: u21, cp: u21) bool {
-    switch (class) {
-        'd' => return cp >= '0' and cp <= '9',
-        's' => {
-            // TODO Include same chars as PCRE
-            return cp == ' ' or cp == '\t';
-        },
+    return switch (class) {
+        'd' => cp >= '0' and cp <= '9',
+        's' => cp == ' ' or cp == '\t', // TODO: match the full PCRE \s set (also \n, \r, \x0B, \x0C)
         else => unreachable,
-    }
+    };
 }
 
 inline fn readCharClass(comptime class: u21, comptime options: MatchOptions, str: []const options.encoding.CharT()) ?@TypeOf(str) {
-    switch (class) {
-        'd' => {
-            switch (options.encoding) {
-                .ascii, .utf8 => return if (std.ascii.isDigit(str[0])) str[0..1] else null,
-                .codepoint, .utf16le => return if (str[0] >= '0' and str[0] <= '9') str[0..1] else null,
-            }
-        },
-        's' => {
-            // TODO Include same chars as PCRE
-            return if (str[0] == ' ' or str[0] == '\t') str[0..1] else null;
+    return switch (class) {
+        'd' => switch (options.encoding) {
+            .ascii, .utf8 => if (std.ascii.isDigit(str[0])) str[0..1] else null,
+            .codepoint, .utf16le => if (str[0] >= '0' and str[0] <= '9') str[0..1] else null,
         },
+        's' => if (str[0] == ' ' or str[0] == '\t') str[0..1] else null, // TODO: match the full PCRE \s set (also \n, \r, \x0B, \x0C)
         else => unreachable,
-    }
+    };
 }
 
 inline fn matchAtom(comptime atom: RegexParser.Atom, comptime options: MatchOptions, str: []const options.encoding.CharT(), result: anytype) !?@TypeOf(str) {
@@ -725,27 +721,22 @@ inline fn matchAtom(comptime atom: RegexParser.Atom, comptime options: MatchOpti
         .char_class => |class| return readCharClass(class, options, str),
         .literal => |lit| {
             const encoded_lit = comptime ctEncode(lit, options.encoding);
-            if (std.mem.eql(options.encoding.CharT(), encoded_lit, str[0..encoded_lit.len])) {
+            if (std.mem.eql(options.encoding.CharT(), encoded_lit[0..], str[0..encoded_lit.len])) {
                 return str[0..encoded_lit.len];
             }
             return null;
         },
         .brackets => |brackets| {
-            var this_slice: @TypeOf(str) = undefined;
+            const this_slice: @TypeOf(str) = switch (options.encoding) {
+                .codepoint, .ascii => str[0..1],
+                .utf8 => str[0..try std.unicode.utf8ByteSequenceLength(str[0])],
+                .utf16le => str[0..try utf16leCharSequenceLength(str[0])],
+            };
 
             const this_cp: u21 = switch (options.encoding) {
-                .codepoint, .ascii => block: {
-                    this_slice = str[0..1];
-                    break :block str[0];
-                },
-                .utf8 => block: {
-                    this_slice = str[0..try std.unicode.utf8ByteSequenceLength(str[0])];
-                    break :block try std.unicode.utf8Decode(this_slice);
-                },
-                .utf16le => block: {
-                    this_slice = str[0..try utf16leCharSequenceLength(str[0])];
-                    break :block try utf16leDecode(this_slice);
-                },
+                .codepoint, .ascii => str[0],
+                .utf8 => try std.unicode.utf8Decode(this_slice),
+                .utf16le => try utf16leDecode(this_slice),
             };
 
             inline for (brackets.rules) |rule| {
@@ -756,8 +747,8 @@ inline fn matchAtom(comptime atom: RegexParser.Atom, comptime options: MatchOpti
                     },
                     .range => |range| {
                         if (options.encoding == .ascii) {
-                            checkAscii(range.start);
-                            checkAscii(range.end);
+                            _ = checkAscii(range.start);
+                            _ = checkAscii(range.end);
                         }
 
                         if (this_cp >= range.start and this_cp <= range.end)
@@ -835,7 +826,7 @@ inline fn matchSubExpr(comptime sub_expr: RegexParser.SubExpr, comptime options:
                         }
                     } else {
                         // TODO Using an inline while here crashes the compiler in codegen
-                        var curr_additional_rep: usize = 0;
+                        const curr_additional_rep: usize = 0;
                         _ = curr_additional_rep;
                         while (curr_rep < range.max) : (curr_rep += 1) {
                             if (try matchAtom(atom.data, options, str[curr_slice.len..], result)) |matched_slice| {
@@ -885,7 +876,7 @@ pub fn MatchResult(comptime regex: []const u8, comptime options: MatchOptions) t
     if (RegexParser.parse(regex)) |parsed| {
         const capture_len = parsed.captures.len;
         var capture_names: [capture_len]?[]const u8 = undefined;
-        for (parsed.captures) |capt, idx| {
+        for (parsed.captures, 0..) |capt, idx| {
             if (capt.capture_info) |info| {
                 capture_names[idx] = info.name;
             }
@@ -903,20 +894,15 @@ pub fn MatchResult(comptime regex: []const u8, comptime options: MatchOptions) t
                 self.captures = [1]?[]const CharT{null} ** capture_len;
             }
 
-            pub usingnamespace if (capture_len != 0)
-                struct {
-                    pub fn capture(self: Self, comptime name: []const u8) ?[]const CharT {
-                        inline for (capture_names2) |maybe_name, curr_idx| {
-                            if (maybe_name) |curr_name| {
-                                if (comptime std.mem.eql(u8, name, curr_name))
-                                    return self.captures[curr_idx];
-                            }
-                        }
-                        @compileError("No capture named '" ++ name ++ "'");
+            pub fn capture(self: Self, comptime name: []const u8) ?[]const CharT {
+                inline for (capture_names2, 0..) |maybe_name, curr_idx| {
+                    if (maybe_name) |curr_name| {
+                        if (comptime std.mem.eql(u8, name, curr_name))
+                            return self.captures[curr_idx];
                     }
                 }
-            else
-                struct {};
+                @compileError("No capture named '" ++ name ++ "'");
+            }
         };
     }
     return void;