A library to parse CSV files into user-defined structs.
Disclaimer: I haven't used this in production and I don't know how it would fare there. I made this to learn zig.
It is not super clear to me how people install libraries in zig, but these instructions have worked for me:
- Make a `vendor` folder in your project and add `csv` as a git submodule:
$ mkdir vendor
$ git submodule add https://github.com/bensu/csv-zig vendor/csv
$ ls vendor/csv
README.md build.zig data src test
- Add it as a package in your `build.zig`:
const exe = b.addExecutable("your-app", "src/main.zig");
exe.addPackage(.{
    .name = "csv",
    .source = .{ .path = "vendor/csv/src/csv.zig" },
});
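Then import it from your code by package name (a sketch, assuming the setup above; the examples below import `csv.zig` by path instead, because they live inside this repo):

const csv = @import("csv"); // the package added in build.zig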
Consider the following CSV file `test/data/pokemon_example.csv`:
id,name,captured,color,health,
1,squirtle,false,blue,,
2,charmander,false,red,,
3,pikachu,true,yellow,10.0,
You can define a struct with the fields you expect to find in it and then parse the file with an iterator:
const std = @import("std");
const fs = std.fs;
// Import csv
const csv = @import("csv.zig");
const Color = enum { red, blue, green, yellow };
// Define the type of CSV rows as a struct
const Pokemon = struct {
    id: u32,
    name: []const u8,
    captured: bool,
    color: Color,
    health: ?f32,
};
test "parsing pokemon" {
    var file = try fs.cwd().openFile("test/data/pokemon_example.csv", .{});
    defer file.close();
    const reader = file.reader();

    const allocator = std.testing.allocator;
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();

    const config: csv.CsvConfig = .{}; // use the default config
    const PokemonCsvParser = csv.CsvParser(Pokemon, fs.File.Reader, config);
    var parser = try PokemonCsvParser.init(arena.allocator(), reader);

    var number_captured: u32 = 0;
    while (try parser.next()) |pokemon| {
        if (pokemon.captured) {
            number_captured += 1;
        }
    }
    try std.testing.expectEqual(number_captured, 1);
    std.debug.print("You have captured {} Pokemons", .{number_captured});
}
Now, instead of parsing the file, we are going to serialize the same contents from in-memory data into `tmp/pokemon.csv`:
test "serializing pokemon" {
    var file = try fs.cwd().createFile("tmp/pokemon.csv", .{});
    defer file.close();
    const writer = file.writer();

    const config: csv.CsvConfig = .{};
    const PokemonCsvSerializer = csv.CsvSerializer(Pokemon, fs.File.Writer, config);
    var serializer = PokemonCsvSerializer.init(writer);

    const pokemons = [3]Pokemon{
        Pokemon{
            .id = 1,
            .name = "squirtle",
            .captured = false,
            .color = Color.blue,
            .health = null,
        },
        Pokemon{
            .id = 2,
            .name = "charmander",
            .captured = false,
            .color = Color.red,
            .health = null,
        },
        Pokemon{
            .id = 3,
            .name = "pikachu",
            .captured = true,
            .color = Color.yellow,
            .health = 10.0,
        },
    };

    try serializer.writeHeader();
    for (pokemons) |pokemon| {
        try serializer.appendRow(pokemon);
    }
}
`tmp/pokemon.csv` should now have the same contents as `test/data/pokemon_example.csv` above, header included.

The full API consists of a config struct, a parser, and a serializer:
pub const CsvConfig = struct {
    field_end_delimiter: u8 = ',',
    row_end_delimiter: u8 = '\n',
    quote_delimiter: u8 = '"',
    skip_first_row: bool = true,
};
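For example, a config for tab-separated files with no header row might look like this (a sketch; only the overridden fields are shown, the rest keep their defaults):

const tsv_config: csv.CsvConfig = .{
    .field_end_delimiter = '\t',
    .skip_first_row = false,
};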
pub fn CsvParser(
    comptime T: type,
    comptime Reader: type,
    comptime config: CsvConfig,
) type {
    return struct {
        // Creates one CsvParser, valid for one pass over the Reader
        fn init(
            allocator: std.mem.Allocator,
            reader: Reader,
        ) CsvParseError!CsvParser {}

        // Returns the next row T, or null if the iterator is done
        fn next() CsvParseError!?T {}

        // Like next(), but writes the struct into the provided pointer
        fn nextInto(struct_pointer: *T) CsvParseError!?*T {}
    };
}
pub fn CsvSerializer(
    comptime T: type,
    comptime Writer: type,
    comptime config: CsvConfig,
) type {
    return struct {
        fn init(writer: Writer) CsvSerializer {}

        fn writeHeader() WriterError!void {}

        fn appendRow(data: T) WriterError!void {}
    };
}
These functions can fail with the following errors:
// Errors generated by the CsvParser
const CsvParseSpecificError = error{
    BadInput,
    MissingFields,
    ExtraFields,
    OutOfMemory,
};

pub const ReaderError = error{ ... }; // from reader.read(), e.g. fs.File.Reader
pub const WriterError = error{ ... }; // from writer.write(), e.g. fs.File.Writer

pub const CsvParseError = CsvParseSpecificError || ReaderError;
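A sketch of telling these apart when pulling a single row (assumes a `parser` built as in the examples, inside a function that can return its errors):

const maybe_row = parser.next() catch |err| switch (err) {
    // Parser-specific problems: report them with context
    error.BadInput, error.MissingFields, error.ExtraFields => {
        std.debug.print("malformed csv: {}\n", .{err});
        return err;
    },
    // OutOfMemory, or errors bubbled up from the Reader
    else => return err,
};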
Example usage:
const config: csv.CsvConfig = .{
    .field_end_delimiter = ',',
    .row_end_delimiter = '\n',
    .quote_delimiter = '"',
    .skip_first_row = true,
};
const StructType = struct {
    int_field: u32,
    float_field: f64,
    str_field: []const u8,
    enum_field: enum { red, blue, yellow },
    union_field: union { int_case: i32, float_case: f32 },
    bool_field: bool,
    maybe_field: ?f64,
    void_field: void, // Use to skip parsing certain columns
};
var parser = try csv.CsvParser(StructType, fs.File.Reader, config).init(allocator, reader);
var total: u32 = 0;
while (try parser.next()) |row| {
    // do something with the row
    if (std.mem.eql(u8, "special_value", row.str_field)) {
        total += row.int_field;
    }
}
var serializer = csv.CsvSerializer(StructType, fs.File.Writer, config).init(writer);
try serializer.writeHeader();
try serializer.appendRow(StructType{ ... });
try serializer.appendRow(StructType{ ... });
// ...
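For illustration, a fully populated row might look like this (the values are made up):

try serializer.appendRow(StructType{
    .int_field = 1,
    .float_field = 2.5,
    .str_field = "special_value",
    .enum_field = .red,
    .union_field = .{ .int_case = -3 },
    .bool_field = true,
    .maybe_field = null,
    .void_field = {},
});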
From `src/csv/end_to_end.zig`:
const T = struct { id: i64, age: u32 };
const from_path = "data/from_file.csv";
var from_file = try fs.cwd().openFile(from_path, .{});
defer from_file.close();
const reader = from_file.reader();
const to_path = "tmp/to_file.csv";
var to_file = try fs.cwd().createFile(to_path, .{});
defer to_file.close();
const writer = to_file.writer();
const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena.allocator(), reader);
var serializer = csv.CsvSerializer(T, fs.File.Writer, .{}).init(writer);
var rows: usize = 0;
try serializer.writeHeader();
while (try parser.next()) |row| {
rows = rows + 1;
try serializer.appendRow(row);
}
std.debug.print("Wrote {} rows", .{rows});
From `src/csv/parse.zig`:
const T = struct { id: i64, age: u32 };
const file_path = "test/data/simple_end_to_end.csv";
var file = try fs.cwd().openFile(file_path, .{});
defer file.close();
const reader = file.reader();
const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
const arena_allocator = arena.allocator();
// if you know how many rows to expect, you can use an array directly
const expected_rows = 17;
const array: []T = try arena_allocator.alloc(T, expected_rows);
var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena_allocator, reader);
var i: usize = 0;
while (i < expected_rows) {
    _ = try parser.nextInto(&array[i]);
    i += 1;
}
Also from `src/csv/parse.zig`:
const T = struct { id: i64, age: u32 };
const file_path = "test/data/simple_end_to_end.csv";
var file = try fs.cwd().openFile(file_path, .{});
defer file.close();
const reader = file.reader();
const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
const arena_allocator = arena.allocator();
// if you don't know how many rows to expect, you can use an ArrayList
var list = std.ArrayList(T).init(allocator);
defer list.deinit();
var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena_allocator, reader);
while (try parser.next()) |row| {
    try list.append(row);
}
To improve performance, you can:

1. Assign `void` to the fields that you don't need, and the parser will skip them.
2. Re-use the same memory for the strings of every row, provided you don't need to keep those strings after you've processed them.

Both techniques are shown below:
// 1. We mark as void every field we don't need, maintaining their order
const NamelessPokemon = struct {
    id: void,
    name: []const u8,
    captured: bool,
    color: void,
    health: void,
};
var file = try fs.cwd().openFile("test/data/pokemon_example.csv", .{});
defer file.close();
const reader = file.reader();
// 2. We will keep the strings of one row at a time in this buffer
var buffer: [4096]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buffer);
const PokemonCsvParser = csv.CsvParser(NamelessPokemon, fs.File.Reader, .{});
var parser = try PokemonCsvParser.init(fba.allocator(), reader);
var pikachus_captured: u32 = 0;
while (try parser.next()) |pokemon| {
    // 1. We only use pokemon.captured and pokemon.name, everything else is void
    if (pokemon.captured and std.mem.eql(u8, "pikachu", pokemon.name)) {
        pikachus_captured += 1;
    }

    // 2. We already used the allocated strings (pokemon.name) so we can reset
    // the memory. If we didn't, we would get an OutOfMemory error when the
    // FixedBufferAllocator runs out of memory
    fba.reset();
}
std.debug.print("You captured {} Pikachus", .{pikachus_captured});
From `src/csv/end_to_end_test.zig`:
test "buffer end to end" {
    const T = struct { id: u32, name: []const u8 };

    // parse
    const source = "id,name,\n1,none,";
    const n = source.len;
    var parsed_rows: [1]T = undefined;

    var buffer_stream = std.io.fixedBufferStream(source[0..n]);
    const reader = buffer_stream.reader();

    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    const arena_allocator = arena.allocator();

    var parser = try csv.CsvParser(T, @TypeOf(reader), .{}).init(arena_allocator, reader);
    var i: usize = 0;
    while (try parser.next()) |row| {
        parsed_rows[i] = row;
        i += 1;
    }

    // serialize
    var buffer: [n + 1]u8 = undefined;
    var fixed_buffer_stream = std.io.fixedBufferStream(buffer[0..]);
    const writer = fixed_buffer_stream.writer();

    var serializer = csv.CsvSerializer(T, @TypeOf(writer), .{}).init(writer);
    try serializer.writeHeader();
    for (parsed_rows) |row| {
        try serializer.appendRow(row);
    }

    try std.testing.expect(std.mem.eql(u8, source, buffer[0..n]));
}
On my M1, this library can run over a 144MB CSV file in 418ms if it parses every column, and in 301ms if it only extracts a few fields:
$ # get the benchmark data
$ git submodule update --remote benchmark
$ cd benchmark
$ bash unzip_data.bash
$ cd ..
$ # run the benchmark
$ zig build -Drelease-fast=true; zig-out/bin/csv
Starting benchmark
Parsed in 4ms on average -- bench.NFL // 1.3MB all columns, 325 MB/s
Parsed in 418ms on average -- bench.FullPopulation // 144MB all columns, 344 MB/s
Parsed in 301ms on average -- bench.Population // 144MB few columns, 478 MB/s
Parsed in 1ms on average -- bench.MBTA // N/A 1ms might be off by 50%
Parsed in 263ms on average -- bench.Trade // 150MB all columns, 570 MB/s
Parsed in 117ms on average -- bench.StateDepartment // 70MB all columns, 598 MB/s
Number of US-MA population: 5988064 in 420 ms // 144MB all columns, 342 MB/s
Total population: 2289584999 in 291 ms // 144MB few columns, 494 MB/s
I took these benchmark files from other great projects. Thank you to their authors for compiling the benchmarks!
After running those benchmarks on my computer, this library is on par with or slightly faster than rust-csv and cpp/csv-parser, and around 2x faster than the Java libraries (which makes sense, since zig makes it possible to avoid many allocations that Java can't). You can find more details in the benchmarks documentation.