Skip to content

Commit

Permalink
optimise standard csv
Browse files Browse the repository at this point in the history
  • Loading branch information
arnaudroger committed Jun 20, 2016
1 parent 7280c50 commit 5d1c3f2
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 31 deletions.
28 changes: 28 additions & 0 deletions sfm-jmh/src/main/java/org/sfm/csv/CsvParserBenchmark.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,33 @@ public class CsvParserBenchmark {
CsvParserBenchmark.parseTrim avgt 20 778.768 ± 1.860 ns/op
Mac 2.9.4
Benchmark Mode Cnt Score Error Units
CsvParserBenchmark.parse avgt 20 212.758 ± 2.900 ns/op
CsvParserBenchmark.parseQuote avgt 20 289.970 ± 4.882 ns/op
CsvParserBenchmark.parseQuote2 avgt 20 359.580 ± 4.132 ns/op
CsvParserBenchmark.parseTrim avgt 20 263.594 ± 5.287 ns/op
Mac 2.9.5
Benchmark Mode Cnt Score Error Units
CsvParserBenchmark.parse avgt 20 184.681 ± 4.202 ns/op
CsvParserBenchmark.parseQuote avgt 20 317.010 ± 2.848 ns/op
CsvParserBenchmark.parseQuote2 avgt 20 385.423 ± 8.884 ns/op
CsvParserBenchmark.parseTrim avgt 20 275.656 ± 1.510 ns/op
Perf branch
Benchmark Mode Cnt Score Error Units
CsvParserBenchmark.parse avgt 20 168.031 ± 2.957 ns/op
CsvParserBenchmark.parseQuote avgt 20 286.045 ± 5.008 ns/op
CsvParserBenchmark.parseQuote2 avgt 20 337.570 ± 5.398 ns/op
CsvParserBenchmark.parseTrim avgt 20 256.215 ± 1.940 ns/op
*/
public String csv = "val,val2 sdssddsds,lllll llll,sdkokokokokads<>Sddsdsds, adsdsadsad ,1, 3 ,4";
public String csvQuote = "\"val\",\"val2 sdssddsds\",\"lllll llll\",\"sdkokokokokads<>Sddsdsds\",\"adsdsadsad\",\"1\",\"3\",\"4\"";
public String csvQuote2 = "\"val \"\" \",\"val2 \"\"sdssddsds\",\"lllll llll\",\"sdkokokokokads<>Sddsdsds\",\"adsdsadsad\",\"1\",\"3\",\"4\"";


public static final CsvParser.DSL dsl = CsvParser.dsl();
Expand All @@ -58,6 +82,10 @@ public void parseQuote(Blackhole blackhole) throws IOException {
dsl.parse(csvQuote, new MyCellConsumer(blackhole));
}

/**
 * Benchmark: parses {@code csvQuote2}, the sample row whose quoted cells
 * contain doubled-quote escapes, pushing every cell into a consumer that
 * wraps the supplied blackhole.
 *
 * @param blackhole sink handed to the {@code MyCellConsumer}
 * @throws IOException propagated from the parser
 */
@Benchmark
public void parseQuote2(Blackhole blackhole) throws IOException {
    final MyCellConsumer sink = new MyCellConsumer(blackhole);
    dsl.parse(csvQuote2, sink);
}
/**
 * Runs a single {@code parseQuote} pass outside of the benchmark harness.
 * No blackhole is supplied ({@code null} is passed through as-is).
 *
 * @param args ignored
 * @throws IOException propagated from {@code parseQuote}
 */
public static void main(String[] args) throws IOException {
    final CsvParserBenchmark benchmark = new CsvParserBenchmark();
    benchmark.parseQuote(null);
}
Expand Down
70 changes: 40 additions & 30 deletions sfm/src/main/java/org/sfm/csv/parser/StandardCsvCharConsumer.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
public final class StandardCsvCharConsumer extends CsvCharConsumer {

private static final int NOTHING = 8;
private static final int IN_QUOTE = 4;
private static final int IN_CR = 2;
private static final int QUOTE = 1;
private static final int IN_CR = 4;
private static final int QUOTE = 2;
private static final int IN_QUOTE = 1;
private static final int NONE = 0;
private static final int TURN_OFF_NOTHING = ~NOTHING;
private static final int TURN_OFF_IN_CR_MASK = ~IN_CR;
Expand Down Expand Up @@ -43,15 +43,15 @@ public final void consumeAllBuffer(CellConsumer cellConsumer) {
private int consumeOneChar(CellConsumer cellConsumer, int currentIndex, int currentState, char character) {
switch(character) {
case ',':
return newCellIfNotInQuote(currentIndex, currentState, cellConsumer);
return newCellIfNotInQuote(currentIndex, currentState, cellConsumer);
case '\n':
return handleEndOfLineLF(currentIndex, currentState, cellConsumer);
return handleEndOfLineLF(currentIndex, currentState, cellConsumer);
case '\r':
return handleEndOfLineCR(currentIndex, currentState, cellConsumer);
case '"':
return quote(currentIndex, currentState);
return quote(currentState);
}
return currentState;
return currentState & TURN_OFF_IN_CR_MASK;
}

@Override
Expand Down Expand Up @@ -85,7 +85,7 @@ public boolean consumeToNextRow(CellConsumer cellConsumer) {
currentState &= TURN_OFF_NOTHING;
break;
case '"':
currentState = quote(currentIndex, currentState);
currentState = quote(currentState);
break;
default:
}
Expand All @@ -97,12 +97,12 @@ public boolean consumeToNextRow(CellConsumer cellConsumer) {
}


protected final int newCellIfNotInQuote(int currentIndex, int currentState, CellConsumer cellConsumer) {
private int newCellIfNotInQuote(int currentIndex, int currentState, CellConsumer cellConsumer) {
if ((currentState & IN_QUOTE) != 0) return currentState & TURN_OFF_IN_CR_MASK;
return newCell(currentIndex, cellConsumer);
}

protected final int handleEndOfLineLF(int currentIndex, int currentState, CellConsumer cellConsumer) {
private int handleEndOfLineLF(int currentIndex, int currentState, CellConsumer cellConsumer) {
final int inQuoteAndCr = currentState & (IN_QUOTE | IN_CR);
if (inQuoteAndCr == IN_CR) {
// we had a preceding cr so shift the mark
Expand All @@ -113,56 +113,64 @@ protected final int handleEndOfLineLF(int currentIndex, int currentState, CellCo
return currentState & TURN_OFF_IN_CR_MASK;
}

protected final int handleEndOfLineCR(int currentIndex, int currentState, CellConsumer cellConsumer) {
private int handleEndOfLineCR(int currentIndex, int currentState, CellConsumer cellConsumer) {
if ((currentState & IN_QUOTE) == 0) {
endOfRow(currentIndex, cellConsumer);
return IN_CR;
}
return currentState;
}

private final int endOfRow(int currentIndex, CellConsumer cellConsumer) {
private int endOfRow(int currentIndex, CellConsumer cellConsumer) {
newCell(currentIndex, cellConsumer);
cellConsumer.endOfRow();
return NONE;
}

protected final int quote(int currentIndex, int currentState) {
if (isNotAllConsumedFromMark(currentIndex)) {
return currentState ^ ALL_QUOTES;
private int quote(int currentState) {
if ((currentState & ALL_QUOTES) == 0) {
return (currentState ^ IN_QUOTE) & TURN_OFF_IN_CR_MASK;
} else {
return currentState | IN_QUOTE;
return (currentState ^ ALL_QUOTES) & TURN_OFF_IN_CR_MASK;
}
}

protected final int newCell(int end, final CellConsumer cellConsumer) {
private int newCell(int end, final CellConsumer cellConsumer) {
char[] charBuffer = csvBuffer.getCharBuffer();
int start = csvBuffer.getMark();
if (charBuffer[start] != '"') {
cellConsumer.newCell(charBuffer, start, end - start);
} else {
newEscapedCell(charBuffer, start, end, cellConsumer);
newQuotedCell(charBuffer, start, end, cellConsumer);
}
csvBuffer.mark(end + 1);
return NONE;

}

protected final void newEscapedCell(final char[] chars, final int offset, final int end, CellConsumer cellConsumer) {

private void newQuotedCell(final char[] chars, final int offset, final int end, CellConsumer cellConsumer) {
int start = offset + 1;

boolean notEscaped = true;
boolean escaped = false;
// copy chars apart from escape chars
int realIndex = start;
for(int i = start; i < end; i++) {
notEscaped = !notEscaped || '"' != chars[i];
chars[realIndex] = chars[i];
if (notEscaped) {
realIndex++;
int skipIndex = 0;

int i = start;
for (; i < end - 1 ; i++) {
int correctedIndex = i - skipIndex;
escaped = '"' == chars[correctedIndex] && !escaped;
if (escaped) {
skipIndex ++;
System.arraycopy(chars, correctedIndex + 1, chars, correctedIndex, end - 1 - i);
}
}
cellConsumer.newCell(chars, start, realIndex - start);

// a trailing quote that is not the second half of an escaped pair is the closing quote: leave it out of the cell length
if ('"' == chars[i] && !escaped) {
skipIndex ++;
}

cellConsumer.newCell(chars, start, end - start - skipIndex);
}

@Override
Expand All @@ -184,8 +192,10 @@ public final boolean refillBuffer() throws IOException {
return csvBuffer.fillBuffer();
}

protected final boolean isNotAllConsumedFromMark(int bufferIndex) {
return (bufferIndex) >= (csvBuffer.getMark() + 1) ;
private boolean isNotAllConsumedFromMark(int bufferIndex) {
return (bufferIndex) > (csvBuffer.getMark()) ;
}



}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

public final class StringArrayConsumer<RH extends RowHandler<String[]>> implements CellConsumer {
private final RH handler;
private String[] currentRow = new String[10];
private String[] currentRow = new String[8];
private int currentIndex;


Expand All @@ -28,6 +28,7 @@ public void newCell(char[] chars, int offset, int length) {
public void endOfRow() {
try {
String[] result = Arrays.copyOf(currentRow, currentIndex);
Arrays.fill(currentRow, null);
handler.handle(result);
currentIndex = 0;
} catch (Exception e) {
Expand Down
7 changes: 7 additions & 0 deletions sfm/src/test/java/org/sfm/csv/CsvParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -787,8 +787,15 @@ public void test264() throws IOException {
it = CsvParser.iterator(new StringReader("\"\"\"\""));
strings = it.next();
assertArrayEquals(new String[]{"\""}, strings);

}

/**
 * A single quoted cell containing several doubled quotes must come back
 * with each doubled quote collapsed to one quote character and the
 * enclosing quotes removed.
 */
@Test
public void testQuotedStringShift() throws IOException {
    final String[] expectedRow = {"\"a\"b\"c\"d"};
    final String[] parsedRow = CsvParser.iterator("\"\"\"a\"\"b\"\"c\"\"d\"").next();
    assertArrayEquals(expectedRow, parsedRow);
}

@Test
public void testTrimSpaceToQuote() throws IOException {
Expand Down

0 comments on commit 5d1c3f2

Please sign in to comment.