Skip to content
Permalink
Browse files

SimpleSplitConv: strtok replacement

  • Loading branch information
pjotrp committed Oct 22, 2018
1 parent e3e1853 commit 84d14e601409ea62b42fa83969f0b822a8c1c378
Showing with 69 additions and 1 deletion.
  1. +2 −1 Makefile
  2. +67 −0 bio/std/range/splitter.d
@@ -15,7 +15,7 @@ endif
DLIBS = $(LIBRARY_PATH)/libphobos2-ldc.a $(LIBRARY_PATH)/libdruntime-ldc.a
DLIBS_DEBUG = $(LIBRARY_PATH)/libphobos2-ldc-debug.a $(LIBRARY_PATH)/libdruntime-ldc-debug.a

SRC = $(wildcard bio/std/*.d) bio2/bgzf.d bio/core/bgzf/constants.d
SRC = $(wildcard bio/std/*.d) $(wildcard bio/std/*/*.d) bio2/bgzf.d bio/core/bgzf/constants.d
OBJ = $(SRC:.d=.o)
BIN = bin/biod_tests

@@ -38,6 +38,7 @@ $(BIN): $(OBJ)

check: $(BIN)
$(BIN)

clean:
rm -vf $(OBJ)
rm -v $(BIN)
@@ -0,0 +1,67 @@
/*
This file is part of BioD.
Copyright (C) 2018 Pjotr Prins <pjotr.prins@thebird.nl>
*/

module bio.std.range.splitter;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.stdio;

import std.range.primitives;

// Default separator bytes used by SimpleSplitConv when no explicit list is
// given: space (0x20), tab (0x09), newline (0x0A), semi-colon and comma.
immutable ubyte[] SPLIT_ON = [ 0x20, 0x09, 0x0A, ';', ',' ];

/**
SimpleSplitConv takes a range R (typically a text line) and
splits/tokenizes it on a list of characters. Essentially, fields/tokens
are separated by tabs, semi-colons, commas and spaces. This compares
to C's strtok(str, ", \t;").
This operation is common in bioinformatics and this routine is a
replacement for the far less safe C strtok. This edition should also
handle UTF.
The default is to split on space, newline, tab, semi-colon and
comma.
*/

struct SimpleSplitConv(R)
  if (isInputRange!R)
{
  // Input to tokenize and the set of separator elements.
  R list, split_on;

  /**
     Params:
       range     = the input to tokenize (typically one text line)
       splits_on = elements that act as separators; defaults to the
                   module-level SPLIT_ON list cast to R
  */
  this(R range, R splits_on = cast(R)SPLIT_ON) {
    list = range;
    split_on = splits_on;
  }

  /**
     foreach support: calls dg once per token, in input order. Tokens
     are slices of the input, so nothing is copied.

     A run of consecutive separators counts as a single boundary. A
     separator at the very start of the input yields one empty leading
     token. The last token is emitted even when the input does not end
     with a separator.

     The implementation iterates with an index and slices `list`, so R
     has to be indexable and sliceable (e.g. an array), not merely an
     input range.

     Returns: the first non-zero value returned by dg (the foreach body
     executed `break`, `return` or `goto`), otherwise 0.
  */
  int opApply(scope int delegate(R) dg) {
    size_t start = 0;            // index where the current token begins
    bool in_whitespace = false;  // true while inside a run of separators
    foreach (size_t pos, c; list) {
      if (canFind(split_on, c)) { // hit split char
        if (!in_whitespace) {     // first separator after a token: emit
          // A non-zero result means the foreach body wants to stop;
          // it must be handed back to the caller unchanged.
          if (auto result = dg(list[start..pos]))
            return result;
        }
        start = pos + 1;
        in_whitespace = true;
      } else {
        in_whitespace = false;
      }
    }
    // Input ended in the middle of a token: emit the remainder.
    // The length check keeps empty input from producing a token.
    if (!in_whitespace && start < list.length) {
      if (auto result = dg(list[start..$]))
        return result;
    }
    return 0;
  }
}

unittest {
  alias Tokenizer = SimpleSplitConv!(ubyte[]);

  // Runs of separators (space followed by tab) collapse into one boundary.
  auto plain = cast(ubyte[])"hello 1 2 \t3 4 \n";
  auto plain_tokens = array(Tokenizer(plain));
  assert(plain_tokens == ["hello","1","2","3","4"]);

  // A separator at the very start produces an empty first token.
  auto leading = cast(ubyte[])" hello, 1 2 \t3 4 \n";
  auto leading_tokens = array(Tokenizer(leading));
  assert(leading_tokens == ["","hello","1","2","3","4"]);

  // Comma and an embedded newline are separators too.
  auto mixed = cast(ubyte[])"hello, 1 2 \n\t3 4 \n";
  auto mixed_tokens = array(Tokenizer(mixed));
  assert(mixed_tokens == ["hello","1","2","3","4"]);
}

0 comments on commit 84d14e6

Please sign in to comment.
You can’t perform that action at this time.