A library to parse CSV files into user-defined structs.
Disclaimer: I haven't used this in production and I don't know how it would fare there. I made this to learn zig.
It is not super clear to me how people install libraries in zig, but these instructions have worked for me:
- Make a `vendor` folder in your project and add `csv` as a git submodule:
$ mkdir vendor
$ git submodule add https://github.com/bensu/csv-zig vendor/csv
$ ls vendor/csv
README.md build.zig data src test
- Add it as a package in your `build.zig`:
const exe = b.addExecutable("your-app", "src/main.zig");
exe.addPackage(.{
    .name = "csv",
    .source = .{ .path = "vendor/csv/src/csv.zig" },
});
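Then import it from your code by package name (a sketch, assuming the setup above; the examples below import `csv.zig` by path instead, because they live inside this repo):

const csv = @import("csv"); // the package added in build.zig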
Consider the following CSV file `test/data/pokemon_example.csv`:
id,name,captured,color,health,
1,squirtle,false,blue,,
2,charmander,false,red,,
3,pikachu,true,yellow,10.0,
You can define a struct with the fields you expect to find in it and then parse the file with an iterator:
const std = @import("std");
const fs = std.fs;
// Import csv
const csv = @import("csv.zig");
const Color = enum { red, blue, green, yellow };
// Define the type of CSV rows as a struct
const Pokemon = struct {
    id: u32,
    name: []const u8,
    captured: bool,
    color: Color,
    health: ?f32,
};
test "parsing pokemon" {
    var file = try fs.cwd().openFile("test/data/pokemon_example.csv", .{});
    defer file.close();
    const reader = file.reader();

    const allocator = std.testing.allocator;
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();

    const config: csv.CsvConfig = .{}; // use the default config
    const PokemonCsvParser = csv.CsvParser(Pokemon, fs.File.Reader, config);
    var parser = try PokemonCsvParser.init(arena.allocator(), reader);

    var number_captured: u32 = 0;
    while (try parser.next()) |pokemon| {
        if (pokemon.captured) {
            number_captured += 1;
        }
    }
    try std.testing.expectEqual(number_captured, 1);
    std.debug.print("You have captured {} Pokemons", .{number_captured});
}
Now, instead of parsing the file, we are going to serialize the same contents from in-memory data into `tmp/pokemon.csv`:
test "serializing pokemon" {
    var file = try fs.cwd().createFile("tmp/pokemon.csv", .{});
    defer file.close();
    const writer = file.writer();

    const config: csv.CsvConfig = .{};
    const PokemonCsvSerializer = csv.CsvSerializer(Pokemon, fs.File.Writer, config);
    var serializer = PokemonCsvSerializer.init(writer);

    const pokemons = [3]Pokemon{
        Pokemon{
            .id = 1,
            .name = "squirtle",
            .captured = false,
            .color = Color.blue,
            .health = null,
        },
        Pokemon{
            .id = 2,
            .name = "charmander",
            .captured = false,
            .color = Color.red,
            .health = null,
        },
        Pokemon{
            .id = 3,
            .name = "pikachu",
            .captured = true,
            .color = Color.yellow,
            .health = 10.0,
        },
    };

    try serializer.writeHeader();
    for (pokemons) |pokemon| {
        try serializer.appendRow(pokemon);
    }
}
`tmp/pokemon.csv` should now have the same contents as `test/data/pokemon_example.csv` above, header included.

The full API consists of a config struct, a parser, and a serializer:
pub const CsvConfig = struct {
    field_end_delimiter: u8 = ',',
    row_end_delimiter: u8 = '\n',
    quote_delimiter: u8 = '"',
    skip_first_row: bool = true,
};
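For example, a config for tab-separated files with no header row might look like this (a sketch; only the overridden fields are shown, the rest keep their defaults):

const tsv_config: csv.CsvConfig = .{
    .field_end_delimiter = '\t',
    .skip_first_row = false,
};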
pub fn CsvParser(
    comptime T: type,
    comptime Reader: type,
    comptime config: CsvConfig,
) type {
    return struct {
        // Creates one CsvParser, valid for one pass over the Reader
        fn init(
            allocator: std.mem.Allocator,
            reader: Reader,
        ) CsvParseError!CsvParser {}

        // Returns the next row T, or null if the iterator is done
        fn next() CsvParseError!?T {}

        // Like next(), but writes the struct into the provided pointer
        fn nextInto(struct_pointer: *T) CsvParseError!?*T {}
    };
}
pub fn CsvSerializer(
    comptime T: type,
    comptime Writer: type,
    comptime config: CsvConfig,
) type {
    return struct {
        fn init(writer: Writer) CsvSerializer {}

        fn writeHeader() WriterError!void {}

        fn appendRow(data: T) WriterError!void {}
    };
}
These functions can fail with the following errors:
// Errors generated by the CsvParser
const CsvParseSpecificError = error{
    BadInput,
    MissingFields,
    ExtraFields,
    OutOfMemory,
};

pub const ReaderError = error{ ... }; // from reader.read(), e.g. fs.File.Reader
pub const WriterError = error{ ... }; // from writer.write(), e.g. fs.File.Writer

pub const CsvParseError = CsvParseSpecificError || ReaderError;
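A sketch of telling these apart when pulling a single row (assumes a `parser` built as in the examples, inside a function that can return its errors):

const maybe_row = parser.next() catch |err| switch (err) {
    // Parser-specific problems: report them with context
    error.BadInput, error.MissingFields, error.ExtraFields => {
        std.debug.print("malformed csv: {}\n", .{err});
        return err;
    },
    // OutOfMemory, or errors bubbled up from the Reader
    else => return err,
};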
Example usage:
const config: csv.CsvConfig = .{
    .field_end_delimiter = ',',
    .row_end_delimiter = '\n',
    .quote_delimiter = '"',
    .skip_first_row = true,
};
const StructType = struct {
    int_field: u32,
    float_field: f64,
    str_field: []const u8,
    enum_field: enum { red, blue, yellow },
    union_field: union { int_case: i32, float_case: f32 },
    bool_field: bool,
    maybe_field: ?f64,
    void_field: void, // Use to skip parsing certain columns
};
var parser = try csv.CsvParser(StructType, fs.File.Reader, config).init(allocator, reader);
var total: u32 = 0;
while (try parser.next()) |row| {
    // do something with the row
    if (std.mem.eql(u8, "special_value", row.str_field)) {
        total += row.int_field;
    }
}
var serializer = csv.CsvSerializer(StructType, fs.File.Writer, config).init(writer);
try serializer.writeHeader();
try serializer.appendRow(StructType{ ... });
try serializer.appendRow(StructType{ ... });
// ...
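For illustration, a fully populated row might look like this (the values are made up):

try serializer.appendRow(StructType{
    .int_field = 1,
    .float_field = 2.5,
    .str_field = "special_value",
    .enum_field = .red,
    .union_field = .{ .int_case = -3 },
    .bool_field = true,
    .maybe_field = null,
    .void_field = {},
});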
From `src/csv/end_to_end.zig`:
const T = struct { id: i64, age: u32 };
const from_path = "data/from_file.csv";
var from_file = try fs.cwd().openFile(from_path, .{});
defer from_file.close();
const reader = from_file.reader();
const to_path = "tmp/to_file.csv";
var to_file = try fs.cwd().createFile(to_path, .{});
defer to_file.close();
const writer = to_file.writer();
const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena.allocator(), reader);
var serializer = csv.CsvSerializer(T, fs.File.Writer, .{}).init(writer);
var rows: usize = 0;
try serializer.writeHeader();
while (try parser.next()) |row| {
rows = rows + 1;
try serializer.appendRow(row);
}
std.debug.print("Wrote {} rows", .{rows});
From `src/csv/parse.zig`:
const T = struct { id: i64, age: u32 };
const file_path = "test/data/simple_end_to_end.csv";
var file = try fs.cwd().openFile(file_path, .{});
defer file.close();
const reader = file.reader();
const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
const arena_allocator = arena.allocator();
// if you know how many rows to expect, you can use an array directly
const expected_rows = 17;
const array: []T = try arena_allocator.alloc(T, expected_rows);
var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena_allocator, reader);
var i: usize = 0;
while (i < expected_rows) {
    _ = try parser.nextInto(&array[i]);
    i += 1;
}
Also from `src/csv/parse.zig`:
const T = struct { id: i64, age: u32 };
const file_path = "test/data/simple_end_to_end.csv";
var file = try fs.cwd().openFile(file_path, .{});
defer file.close();
const reader = file.reader();
const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
const arena_allocator = arena.allocator();
// if you don't know how many rows to expect, you can use an ArrayList
var list = std.ArrayList(T).init(allocator);
defer list.deinit();
var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena_allocator, reader);
while (try parser.next()) |row| {
    try list.append(row);
}
To improve performance, you can:

1. Assign `void` to the fields that you don't need, and the parser will skip them.
2. Re-use the same memory for the strings of every row, provided you don't need to keep those strings after you've processed them.

Both techniques are shown below:
// 1. We mark as void every field we don't need, maintaining their order
const NamelessPokemon = struct {
    id: void,
    name: []const u8,
    captured: bool,
    color: void,
    health: void,
};
var file = try fs.cwd().openFile("test/data/pokemon_example.csv", .{});
defer file.close();
const reader = file.reader();
// 2. We will keep the strings of one row at a time in this buffer
var buffer: [4096]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buffer);
const PokemonCsvParser = csv.CsvParser(NamelessPokemon, fs.File.Reader, .{});
var parser = try PokemonCsvParser.init(fba.allocator(), reader);
var pikachus_captured: u32 = 0;
while (try parser.next()) |pokemon| {
    // 1. We only use pokemon.captured and pokemon.name, everything else is void
    if (pokemon.captured and std.mem.eql(u8, "pikachu", pokemon.name)) {
        pikachus_captured += 1;
    }

    // 2. We already used the allocated strings (pokemon.name) so we can reset
    // the memory. If we didn't, we would get an OutOfMemory error when the
    // FixedBufferAllocator runs out of memory
    fba.reset();
}
std.debug.print("You captured {} Pikachus", .{pikachus_captured});
From `src/csv/end_to_end_test.zig`:
test "buffer end to end" {
    const T = struct { id: u32, name: []const u8 };

    // parse
    const source = "id,name,\n1,none,";
    const n = source.len;
    var parsed_rows: [1]T = undefined;

    var buffer_stream = std.io.fixedBufferStream(source[0..n]);
    const reader = buffer_stream.reader();

    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    const arena_allocator = arena.allocator();

    var parser = try csv.CsvParser(T, @TypeOf(reader), .{}).init(arena_allocator, reader);
    var i: usize = 0;
    while (try parser.next()) |row| {
        parsed_rows[i] = row;
        i += 1;
    }

    // serialize
    var buffer: [n + 1]u8 = undefined;
    var fixed_buffer_stream = std.io.fixedBufferStream(buffer[0..]);
    const writer = fixed_buffer_stream.writer();

    var serializer = csv.CsvSerializer(T, @TypeOf(writer), .{}).init(writer);
    try serializer.writeHeader();
    for (parsed_rows) |row| {
        try serializer.appendRow(row);
    }

    try std.testing.expect(std.mem.eql(u8, source, buffer[0..n]));
}
On my M1, this library can run over a 144MB CSV file in 418ms if it parses every column, and in 301ms if it only extracts a few fields:
$ # get the benchmark data
$ git submodule update --remote benchmark
$ cd benchmark
$ bash unzip_data.bash
$ cd ..
$ # run the benchmark
$ zig build -Drelease-fast=true; zig-out/bin/csv
Starting benchmark
Parsed in 4ms on average -- bench.NFL // 1.3MB all columns, 325 MB/s
Parsed in 418ms on average -- bench.FullPopulation // 144MB all columns, 344 MB/s
Parsed in 301ms on average -- bench.Population // 144MB few columns, 478 MB/s
Parsed in 1ms on average -- bench.MBTA // N/A 1ms might be off by 50%
Parsed in 263ms on average -- bench.Trade // 150MB all columns, 570 MB/s
Parsed in 117ms on average -- bench.StateDepartment // 70MB all columns, 598 MB/s
Number of US-MA population: 5988064 in 420 ms // 144MB all columns, 342 MB/s
Total population: 2289584999 in 291 ms // 144MB few columns, 494 MB/s
I took these benchmark files from other great projects. Thank you to their authors for compiling the benchmarks!
After running those benchmarks on my computer, this library is on par with or slightly faster than rust-csv and cpp/csv-parser, and around 2x faster than the Java libraries (which makes sense, since zig makes it possible to avoid many allocations that Java can't). You can find more details in the benchmarks documentation.