Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.zig-cache
zig-out
28 changes: 21 additions & 7 deletions build.zig
Original file line number Diff line number Diff line change
@@ -1,16 +1,30 @@
const std = @import("std");

pub fn build(b: *std.build.Builder) void {
pub fn build(b: *std.Build) void {
// Standard release options allow the person running `zig build` to select
// between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall.
const mode = b.standardReleaseOptions();
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});

const lib = b.addStaticLibrary("ctregex", "ctregex.zig");
lib.setBuildMode(mode);
lib.install();
const lib = b.addLibrary(.{
.name = "ctregex",
.root_module = b.createModule(.{
.root_source_file = b.path("ctregex.zig"),
.target = target,
.optimize = optimize,
}),
.linkage = .static,
});

const main_tests = b.addTest("tests.zig");
main_tests.setBuildMode(mode);
b.installArtifact(lib);

const main_tests = b.addTest(.{
.root_module = b.createModule(.{
.root_source_file = b.path("tests.zig"),
.target = target,
.optimize = optimize,
}),
});

const test_step = b.step("test", "Run library tests");
test_step.dependOn(&main_tests.step);
Expand Down
170 changes: 78 additions & 92 deletions ctregex.zig
Original file line number Diff line number Diff line change
Expand Up @@ -25,50 +25,54 @@ fn utf16leDecode(chars: []const u16) !u21 {
}
}

fn ctUtf8EncodeChar(comptime codepoint: u21) []const u8 {
var buf: [4]u8 = undefined;
return buf[0 .. std.unicode.utf8Encode(codepoint, &buf) catch unreachable];
}

fn checkAscii(comptime codepoint: u21) void {
if (codepoint > 127) @compileError("Cannot match character '" ++ ctUtf8EncodeChar(codepoint) ++ "' in ascii mode.");
fn checkAscii(comptime codepoint: u21) usize {
if (codepoint > 127) @compileError("Cannot match character '" ++ std.unicode.utf8EncodeComptime(codepoint) ++ "' in ascii mode.");
return 1;
}

fn charLenInEncoding(comptime codepoint: u21, comptime encoding: Encoding) usize {
switch (encoding) {
.ascii => {
checkAscii(codepoint);
return 1;
},
.utf8 => return std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable,
.utf16le => return if (codepoint < 0x10000) 1 else 2,
.codepoint => return 1,
}
return switch (encoding) {
.ascii => checkAscii(codepoint),
.utf8 => std.unicode.utf8CodepointSequenceLength(codepoint) catch unreachable,
.utf16le => if (codepoint < 0x10000) 1 else 2,
.codepoint => 1,
};
}

fn ctEncode(comptime str: []const u21, comptime encoding: Encoding) []const encoding.CharT() {
if (encoding == .codepoint) return str;

fn ctLenInEncoding(comptime str: []const u21, comptime encoding: Encoding) usize {
var len: usize = 0;
for (str) |c| len += charLenInEncoding(c, encoding);
return len;
}

var result: [len]encoding.CharT() = undefined;
fn ctEncode(comptime str: []const u21, comptime encoding: Encoding) [ctLenInEncoding(str, encoding)]encoding.CharT() {
comptime var result: [ctLenInEncoding(str, encoding)]encoding.CharT() = undefined;
var idx: usize = 0;
for (str) |c| {
switch (encoding) {
.ascii => {
result[idx] = @truncate(u8, c);
result[idx] = @truncate(c);
idx += 1;
},
.utf8 => idx += std.unicode.utf8Encode(c, result[idx..]) catch unreachable,
.utf8 => {
for (std.unicode.utf8EncodeComptime(c)) |t| {
result[idx] = t;
idx += 1;
}
},
.utf16le => {
const utf8_c = ctUtf8EncodeChar(c);
idx += std.unicode.utf8ToUtf16Le(result[idx..], utf8_c) catch unreachable;
for (std.unicode.utf8ToUtf16LeStringLiteral(&std.unicode.utf8EncodeComptime(c))) |t| {
result[idx] = t;
idx += 1;
}
},
.codepoint => {
result[idx] = c;
idx += 1;
},
.codepoint => unreachable,
}
}
return &result;
return result;
}

fn ctIntStr(comptime int: anytype) []const u8 {
Expand Down Expand Up @@ -112,7 +116,7 @@ const RegexParser = struct {
fn skipWhitespace(comptime parser: *RegexParser) void {
while (parser.iterator.i < parser.iterator.bytes.len and
(parser.iterator.bytes[parser.iterator.i] == ' ' or
parser.iterator.bytes[parser.iterator.i] == '\t')) : (parser.iterator.i += 1)
parser.iterator.bytes[parser.iterator.i] == '\t')) : (parser.iterator.i += 1)
{}
}

Expand Down Expand Up @@ -337,15 +341,15 @@ const RegexParser = struct {
fn parseAsciiIdent(comptime parser: *RegexParser) []const u8 {
var c = parser.peek() orelse parser.raiseError("Expected ascii identifier", .{});
if (c > 127) parser.raiseError("Expected ascii character in identifier, got '{}'", .{c});
if (c != '_' and !std.ascii.isAlpha(@truncate(u8, c))) {
if (c != '_' and !std.ascii.isAlphabetic(@truncate(c))) {
parser.raiseError("Identifier must start with '_' or a letter, got '{}''", .{c});
}
var res: []const u8 = &[1]u8{@truncate(u8, parser.iterator.nextCodepoint() orelse unreachable)};
var res: []const u8 = &[1]u8{@truncate(parser.iterator.nextCodepoint() orelse unreachable)};
readChars: while (true) {
c = parser.peek() orelse break :readChars;
if (c > 127 or (c != '_' and !std.ascii.isAlNum(@truncate(u8, c))))
if (c > 127 or (c != '_' and !std.ascii.isAlphanumeric(@truncate(c))))
break :readChars;
res = res ++ &[1]u8{@truncate(u8, parser.iterator.nextCodepoint() orelse unreachable)};
res = res ++ &[1]u8{@truncate(parser.iterator.nextCodepoint() orelse unreachable)};
}
return res;
}
Expand All @@ -356,11 +360,11 @@ const RegexParser = struct {

fn maybeParseNaturalNum(comptime parser: *RegexParser) ?usize {
var c = parser.peek() orelse return null;
if (c > 127 or !std.ascii.isDigit(@truncate(u8, c))) return null;
if (c > 127 or !std.ascii.isDigit(@truncate(c))) return null;
var res: usize = (parser.iterator.nextCodepoint() orelse unreachable) - '0';
readChars: while (true) {
c = parser.peek() orelse break :readChars;
if (c > 127 or !std.ascii.isDigit(@truncate(u8, c))) break :readChars;
if (c > 127 or !std.ascii.isDigit(@truncate(c))) break :readChars;
res = res * 10 + ((parser.iterator.nextCodepoint() orelse unreachable) - '0');
}
return res;
Expand Down Expand Up @@ -548,7 +552,7 @@ const RegexParser = struct {
const lhs_len = self.lhs.minLen(encoding);
if (self.rhs) |rhs| {
const rhs_len = rhs.minLen(encoding);
return std.math.min(lhs_len, rhs_len);
return @min(lhs_len, rhs_len);
}
return lhs_len;
}
Expand All @@ -570,7 +574,7 @@ const RegexParser = struct {
.literal => |codepoint_str| block: {
var str: []const u8 = "literal<";
for (codepoint_str) |codepoint| {
str = str ++ ctUtf8EncodeChar(codepoint);
str = str ++ std.unicode.utf8EncodeComptime(codepoint);
}
break :block str ++ ">";
},
Expand Down Expand Up @@ -630,11 +634,11 @@ const RegexParser = struct {
fn ctStr(comptime self: Brackets) []const u8 {
var str: []const u8 = "[";
if (self.is_exclusive) str = str ++ "<not> ";
for (self.rules) |rule, idx| {
for (self.rules, 0..) |rule, idx| {
if (idx > 0) str = str ++ " ";
str = str ++ switch (rule) {
.char => |c| ctUtf8EncodeChar(c),
.range => |r| ctUtf8EncodeChar(r.start) ++ "-" ++ ctUtf8EncodeChar(r.end),
.char => |c| std.unicode.utf8EncodeComptime(c),
.range => |r| std.unicode.utf8EncodeComptime(r.start) ++ "-" ++ std.unicode.utf8EncodeComptime(r.end),
.char_class => |class| charClassToString(class),
};
}
Expand All @@ -646,7 +650,7 @@ const RegexParser = struct {
if (self.is_exclusive) return 1;
var min_len: usize = std.math.maxInt(usize);
for (self.rules) |rule| {
var curr_len: usize = switch (rule) {
const curr_len: usize = switch (rule) {
.char => |c| charLenInEncoding(c, encoding),
.range => |range| charLenInEncoding(range.start, encoding),
.char_class => |class| charClassMinLen(class, encoding),
Expand Down Expand Up @@ -675,38 +679,30 @@ pub const Encoding = enum {
};

inline fn readOneChar(comptime options: MatchOptions, str: []const options.encoding.CharT()) !@TypeOf(str) {
switch (options.encoding) {
.ascii, .codepoint => return str[0..1],
.utf8 => return str[0..try std.unicode.utf8ByteSequenceLength(str[0])],
.utf16le => return str[0..try utf16leCharSequenceLength(str[0])],
}
return switch (options.encoding) {
.ascii, .codepoint => str[0..1],
.utf8 => str[0..try std.unicode.utf8ByteSequenceLength(str[0])],
.utf16le => str[0..try utf16leCharSequenceLength(str[0])],
};
}

inline fn inCharClass(comptime class: u21, cp: u21) bool {
switch (class) {
'd' => return cp >= '0' and cp <= '9',
's' => {
// TODO Include same chars as PCRE
return cp == ' ' or cp == '\t';
},
return switch (class) {
'd' => cp >= '0' and cp <= '9',
's' => cp == ' ' or cp == '\t', // TODO Include same chars as PCRE
else => unreachable,
}
};
}

inline fn readCharClass(comptime class: u21, comptime options: MatchOptions, str: []const options.encoding.CharT()) ?@TypeOf(str) {
switch (class) {
'd' => {
switch (options.encoding) {
.ascii, .utf8 => return if (std.ascii.isDigit(str[0])) str[0..1] else null,
.codepoint, .utf16le => return if (str[0] >= '0' and str[0] <= '9') str[0..1] else null,
}
},
's' => {
// TODO Include same chars as PCRE
return if (str[0] == ' ' or str[0] == '\t') str[0..1] else null;
return switch (class) {
'd' => switch (options.encoding) {
.ascii, .utf8 => if (std.ascii.isDigit(str[0])) str[0..1] else null,
.codepoint, .utf16le => if (str[0] >= '0' and str[0] <= '9') str[0..1] else null,
},
's' => if (str[0] == ' ' or str[0] == '\t') str[0..1] else null, // TODO Include same chars as PCRE
else => unreachable,
}
};
}

inline fn matchAtom(comptime atom: RegexParser.Atom, comptime options: MatchOptions, str: []const options.encoding.CharT(), result: anytype) !?@TypeOf(str) {
Expand All @@ -725,27 +721,22 @@ inline fn matchAtom(comptime atom: RegexParser.Atom, comptime options: MatchOpti
.char_class => |class| return readCharClass(class, options, str),
.literal => |lit| {
const encoded_lit = comptime ctEncode(lit, options.encoding);
if (std.mem.eql(options.encoding.CharT(), encoded_lit, str[0..encoded_lit.len])) {
if (std.mem.eql(options.encoding.CharT(), encoded_lit[0..], str[0..encoded_lit.len])) {
return str[0..encoded_lit.len];
}
return null;
},
.brackets => |brackets| {
var this_slice: @TypeOf(str) = undefined;
const this_slice: @TypeOf(str) = switch (options.encoding) {
.codepoint, .ascii => str[0..1],
.utf8 => str[0..try std.unicode.utf8ByteSequenceLength(str[0])],
.utf16le => str[0..try utf16leCharSequenceLength(str[0])],
};

const this_cp: u21 = switch (options.encoding) {
.codepoint, .ascii => block: {
this_slice = str[0..1];
break :block str[0];
},
.utf8 => block: {
this_slice = str[0..try std.unicode.utf8ByteSequenceLength(str[0])];
break :block try std.unicode.utf8Decode(this_slice);
},
.utf16le => block: {
this_slice = str[0..try utf16leCharSequenceLength(str[0])];
break :block try utf16leDecode(this_slice);
},
.codepoint, .ascii => str[0],
.utf8 => try std.unicode.utf8Decode(this_slice),
.utf16le => try utf16leDecode(this_slice),
};

inline for (brackets.rules) |rule| {
Expand All @@ -756,8 +747,8 @@ inline fn matchAtom(comptime atom: RegexParser.Atom, comptime options: MatchOpti
},
.range => |range| {
if (options.encoding == .ascii) {
checkAscii(range.start);
checkAscii(range.end);
_ = checkAscii(range.start);
_ = checkAscii(range.end);
}

if (this_cp >= range.start and this_cp <= range.end)
Expand Down Expand Up @@ -835,7 +826,7 @@ inline fn matchSubExpr(comptime sub_expr: RegexParser.SubExpr, comptime options:
}
} else {
// TODO Using an inline while here crashes the compiler in codegen
var curr_additional_rep: usize = 0;
const curr_additional_rep: usize = 0;
_ = curr_additional_rep;
while (curr_rep < range.max) : (curr_rep += 1) {
if (try matchAtom(atom.data, options, str[curr_slice.len..], result)) |matched_slice| {
Expand Down Expand Up @@ -885,7 +876,7 @@ pub fn MatchResult(comptime regex: []const u8, comptime options: MatchOptions) t
if (RegexParser.parse(regex)) |parsed| {
const capture_len = parsed.captures.len;
var capture_names: [capture_len]?[]const u8 = undefined;
for (parsed.captures) |capt, idx| {
for (parsed.captures, 0..) |capt, idx| {
if (capt.capture_info) |info| {
capture_names[idx] = info.name;
}
Expand All @@ -903,20 +894,15 @@ pub fn MatchResult(comptime regex: []const u8, comptime options: MatchOptions) t
self.captures = [1]?[]const CharT{null} ** capture_len;
}

pub usingnamespace if (capture_len != 0)
struct {
pub fn capture(self: Self, comptime name: []const u8) ?[]const CharT {
inline for (capture_names2) |maybe_name, curr_idx| {
if (maybe_name) |curr_name| {
if (comptime std.mem.eql(u8, name, curr_name))
return self.captures[curr_idx];
}
}
@compileError("No capture named '" ++ name ++ "'");
pub fn capture(self: Self, comptime name: []const u8) ?[]const CharT {
inline for (capture_names2, 0..) |maybe_name, curr_idx| {
if (maybe_name) |curr_name| {
if (comptime std.mem.eql(u8, name, curr_name))
return self.captures[curr_idx];
}
}
else
struct {};
@compileError("No capture named '" ++ name ++ "'");
}
};
}
return void;
Expand Down
Loading