chore: string_parser.zig => StringParser.zig

buzz-language · Sep 27, 2024 · e0587d1 · e0587d1
1 parent 8f36d82
commit e0587d1
Show file tree

Hide file tree

Showing 4 changed files with 196 additions and 210 deletions.
diff --git a/src/Parser.zig b/src/Parser.zig
@@ -12,7 +12,7 @@ const GarbageCollector = @import("memory.zig").GarbageCollector;
 const Scanner = @import("Scanner.zig");
 const RunFlavor = @import("vm.zig").RunFlavor;
 const Reporter = @import("Reporter.zig");
-const StringParser = @import("string_parser.zig").StringParser;
+const StringParser = @import("StringParser.zig");
 const pcre = if (!is_wasm) @import("pcre.zig") else void;
 const buzz_api = @import("lib/buzz_api.zig");
 const io = @import("io.zig");

diff --git a/src/StringParser.zig b/src/StringParser.zig
@@ -0,0 +1,195 @@
+const std = @import("std");
+const Parser = @import("Parser.zig");
+const Scanner = @import("Scanner.zig");
+const obj = @import("obj.zig");
+const Value = @import("value.zig").Value;
+const Token = @import("Token.zig");
+const Ast = @import("Ast.zig");
+
+const Self = @This();
+
+script_name: []const u8,
+source: []const u8,
+
+parser: *Parser,
+current: ?u8 = null,
+// TODO: this memory is never freed: it end up as key of the `strings` hashmap
+//       and since not all of its keys come from here, we don't know which we can
+//       free when we deinit strings.
+current_chunk: std.ArrayList(u8),
+offset: usize = 0,
+previous_interp: ?usize = null,
+chunk_count: usize = 0,
+elements: std.ArrayList(Ast.Node.Index),
+line_offset: usize,
+column_offset: usize,
+
+pub fn init(
+    parser: *Parser,
+    source: []const u8,
+    script_name: []const u8,
+    line_offset: usize,
+    column_offset: usize,
+) Self {
+    return Self{
+        .parser = parser,
+        .source = source,
+        .current_chunk = std.ArrayList(u8).init(parser.gc.allocator),
+        .elements = std.ArrayList(Ast.Node.Index).init(parser.gc.allocator),
+        .line_offset = line_offset,
+        .column_offset = column_offset,
+        .script_name = script_name,
+    };
+}
+
+fn advance(self: *Self) ?u8 {
+    if (self.offset >= self.source.len) {
+        return null;
+    }
+
+    self.current = self.source[self.offset];
+    self.offset += 1;
+
+    if (self.current != null and self.current.? == '\n') {
+        self.line_offset += 1;
+        self.column_offset = 0;
+    } else {
+        self.column_offset += 1;
+    }
+
+    return self.current;
+}
+
+pub fn parse(self: *Self) !Ast.Node.Index {
+    while (self.offset < self.source.len) {
+        const char: ?u8 = self.advance();
+        if (char == null) {
+            break;
+        }
+
+        switch (char.?) {
+            '\\' => try self.escape(),
+            '{' => {
+                if (self.previous_interp == null or self.previous_interp.? < self.offset - 1) {
+                    if (self.current_chunk.items.len > 0) {
+                        try self.push(self.current_chunk.items);
+                        // The previous `current_chunk` memory is owned by the parser
+                        self.current_chunk = std.ArrayList(u8).init(self.parser.gc.allocator);
+
+                        try self.inc();
+                    }
+                }
+
+                try self.interpolation();
+
+                try self.inc();
+            },
+            else => try self.current_chunk.append(char.?),
+        }
+    }
+
+    // Trailing string
+    if ((self.previous_interp == null or self.previous_interp.? < self.offset) and self.current_chunk.items.len > 0) {
+        try self.push(self.current_chunk.items);
+
+        // The previous `current_chunk` memory is owned by the parser
+        self.current_chunk = std.ArrayList(u8).init(self.parser.gc.allocator);
+    }
+
+    self.elements.shrinkAndFree(self.elements.items.len);
+
+    return try self.parser.ast.appendNode(
+        .{
+            .tag = .String,
+            .location = self.parser.ast.nodes.items(.location)[self.elements.items[0]],
+            .end_location = self.parser.ast.nodes.items(.location)[self.elements.getLast()],
+            .type_def = self.parser.gc.type_registry.str_type,
+            .components = .{
+                .String = self.elements.items,
+            },
+        },
+    );
+}
+
+fn push(self: *Self, chars: []const u8) !void {
+    try self.elements.append(
+        try self.parser.ast.appendNode(
+            .{
+                .tag = .StringLiteral,
+                .location = self.parser.current_token.? - 1,
+                .end_location = self.parser.current_token.? - 1,
+                .type_def = self.parser.gc.type_registry.str_type,
+                .components = .{
+                    .StringLiteral = try self.parser.gc.copyString(chars),
+                },
+            },
+        ),
+    );
+}
+
+fn inc(self: *Self) !void {
+    self.chunk_count += 1;
+}
+
+fn interpolation(self: *Self) !void {
+    const expr = self.source[self.offset..];
+
+    var expr_scanner = Scanner.init(
+        self.parser.gc.allocator,
+        self.parser.script_name,
+        expr,
+    );
+    expr_scanner.line_offset = self.line_offset;
+    expr_scanner.column_offset = self.column_offset;
+
+    // Replace parser scanner with one that only looks at that substring
+    const scanner = self.parser.scanner;
+    self.parser.scanner = expr_scanner;
+
+    try self.parser.advance();
+
+    // Parse expression
+    try self.elements.append(try self.parser.expression(false));
+
+    self.offset += self.parser.scanner.?.current.offset - 1;
+    self.previous_interp = self.offset;
+
+    // Put back parser's scanner
+    self.parser.scanner = scanner;
+
+    // Consume closing `}`
+    _ = self.advance();
+}
+
+fn escape(self: *Self) !void {
+    const char: ?u8 = self.advance();
+    if (char == null) {
+        return;
+    }
+    switch (char.?) {
+        'n' => try self.current_chunk.append('\n'),
+        't' => try self.current_chunk.append('\t'),
+        'r' => try self.current_chunk.append('\r'),
+        '"' => try self.current_chunk.append('"'),
+        '\\' => try self.current_chunk.append('\\'),
+        '{' => try self.current_chunk.append('{'),
+        else => try self.rawChar(),
+    }
+}
+
+fn rawChar(self: *Self) !void {
+    const start: usize = self.offset - 1;
+    while (self.offset + 1 < self.source.len and self.source[self.offset + 1] >= '0' and self.source[self.offset + 1] <= '9') {
+        _ = self.advance();
+    }
+
+    const num_str: []const u8 = self.source[start..@min(self.offset + 1, self.source.len)];
+    _ = self.advance();
+    const number: ?u8 = std.fmt.parseInt(u8, num_str, 10) catch null;
+
+    if (number) |unumber| {
+        try self.current_chunk.append(unumber);
+    } else {
+        self.parser.reportError(.raw_char, "Raw char should be between 0 and 255.");
+    }
+}
diff --git a/src/memory.zig b/src/memory.zig
@@ -182,18 +182,6 @@ pub const TypeRegistry = struct {
 //     - Unmark all objects
 //     - Do a mark and sweep
 //
-// Writer barrier on those OpCodes:
-//     * OP_LIST_APPEND
-//     * OP_SET_MAP
-//     * OP_SET_SUBSCRIPT
-//     * OP_INHERIT
-//     * OP_INSTANCE
-//     * OP_METHOD
-//     * OP_PROPERTY
-//     * OP_SET_PROPERTY
-//
-// Can we avoid going through the whole linked list of objects when sweeping?
-//
 // For now we only have either young or old objects but we could improve on it with more categories with each its threshold
 pub const GarbageCollector = struct {
     const Self = @This();