From 5d1c3f2bae772450de8dca00cdfbac1395ad2bad Mon Sep 17 00:00:00 2001 From: Arnaud Roger Date: Mon, 20 Jun 2016 22:10:51 +0100 Subject: [PATCH] optimise standard csv --- .../java/org/sfm/csv/CsvParserBenchmark.java | 28 ++++++++ .../csv/parser/StandardCsvCharConsumer.java | 70 +++++++++++-------- .../sfm/csv/parser/StringArrayConsumer.java | 3 +- .../test/java/org/sfm/csv/CsvParserTest.java | 7 ++ 4 files changed, 77 insertions(+), 31 deletions(-) diff --git a/sfm-jmh/src/main/java/org/sfm/csv/CsvParserBenchmark.java b/sfm-jmh/src/main/java/org/sfm/csv/CsvParserBenchmark.java index 335a4103d..5a367b19d 100644 --- a/sfm-jmh/src/main/java/org/sfm/csv/CsvParserBenchmark.java +++ b/sfm-jmh/src/main/java/org/sfm/csv/CsvParserBenchmark.java @@ -34,9 +34,33 @@ public class CsvParserBenchmark { CsvParserBenchmark.parseTrim avgt 20 778.768 ± 1.860 ns/op + Mac 2.9.4 + Benchmark Mode Cnt Score Error Units + CsvParserBenchmark.parse avgt 20 212.758 ± 2.900 ns/op + CsvParserBenchmark.parseQuote avgt 20 289.970 ± 4.882 ns/op + CsvParserBenchmark.parseQuote2 avgt 20 359.580 ± 4.132 ns/op + CsvParserBenchmark.parseTrim avgt 20 263.594 ± 5.287 ns/op + + + Mac 2.9.5 + Benchmark Mode Cnt Score Error Units + CsvParserBenchmark.parse avgt 20 184.681 ± 4.202 ns/op + CsvParserBenchmark.parseQuote avgt 20 317.010 ± 2.848 ns/op + CsvParserBenchmark.parseQuote2 avgt 20 385.423 ± 8.884 ns/op + CsvParserBenchmark.parseTrim avgt 20 275.656 ± 1.510 ns/op + + Perf branch + Benchmark Mode Cnt Score Error Units + CsvParserBenchmark.parse avgt 20 168.031 ± 2.957 ns/op + CsvParserBenchmark.parseQuote avgt 20 286.045 ± 5.008 ns/op + CsvParserBenchmark.parseQuote2 avgt 20 337.570 ± 5.398 ns/op + CsvParserBenchmark.parseTrim avgt 20 256.215 ± 1.940 ns/op + + */ public String csv = "val,val2 sdssddsds,lllll llll,sdkokokokokads<>Sddsdsds, adsdsadsad ,1, 3 ,4"; public String csvQuote = "\"val\",\"val2 sdssddsds\",\"lllll llll\",\"sdkokokokokads<>Sddsdsds\",\"adsdsadsad\",\"1\",\"3\",\"4\""; + public String csvQuote2 = "\"val \"\" \",\"val2 \"\"sdssddsds\",\"lllll llll\",\"sdkokokokokads<>Sddsdsds\",\"adsdsadsad\",\"1\",\"3\",\"4\""; public static final CsvParser.DSL dsl = CsvParser.dsl(); @@ -58,6 +82,10 @@ public void parseQuote(Blackhole blackhole) throws IOException { dsl.parse(csvQuote, new MyCellConsumer(blackhole)); } + @Benchmark + public void parseQuote2(Blackhole blackhole) throws IOException { + dsl.parse(csvQuote2, new MyCellConsumer(blackhole)); + } public static void main(String[] args) throws IOException { new CsvParserBenchmark().parseQuote(null); } diff --git a/sfm/src/main/java/org/sfm/csv/parser/StandardCsvCharConsumer.java b/sfm/src/main/java/org/sfm/csv/parser/StandardCsvCharConsumer.java index 44ddca389..860d62080 100644 --- a/sfm/src/main/java/org/sfm/csv/parser/StandardCsvCharConsumer.java +++ b/sfm/src/main/java/org/sfm/csv/parser/StandardCsvCharConsumer.java @@ -9,9 +9,9 @@ public final class StandardCsvCharConsumer extends CsvCharConsumer { private static final int NOTHING = 8; - private static final int IN_QUOTE = 4; - private static final int IN_CR = 2; - private static final int QUOTE = 1; + private static final int IN_CR = 4; + private static final int QUOTE = 2; + private static final int IN_QUOTE = 1; private static final int NONE = 0; private static final int TURN_OFF_NOTHING = ~NOTHING; private static final int TURN_OFF_IN_CR_MASK = ~IN_CR; @@ -43,15 +43,15 @@ public final void consumeAllBuffer(CellConsumer cellConsumer) { private int consumeOneChar(CellConsumer cellConsumer, int currentIndex, int currentState, char character) { switch(character) { case ',': - return newCellIfNotInQuote(currentIndex, currentState, cellConsumer); + return newCellIfNotInQuote(currentIndex, currentState, cellConsumer); case '\n': - return handleEndOfLineLF(currentIndex, currentState, cellConsumer); + return handleEndOfLineLF(currentIndex, currentState, cellConsumer); case '\r': return handleEndOfLineCR(currentIndex, currentState, cellConsumer); case '"': - return quote(currentIndex, currentState); + return quote(currentState); } - return currentState; + return currentState & TURN_OFF_IN_CR_MASK; } @Override @@ -85,7 +85,7 @@ public boolean consumeToNextRow(CellConsumer cellConsumer) { currentState &= TURN_OFF_NOTHING; break; case '"': - currentState = quote(currentIndex, currentState); + currentState = quote(currentState); break; default: } @@ -97,12 +97,12 @@ public boolean consumeToNextRow(CellConsumer cellConsumer) { } - protected final int newCellIfNotInQuote(int currentIndex, int currentState, CellConsumer cellConsumer) { + private int newCellIfNotInQuote(int currentIndex, int currentState, CellConsumer cellConsumer) { if ((currentState & IN_QUOTE) != 0) return currentState & TURN_OFF_IN_CR_MASK; return newCell(currentIndex, cellConsumer); } - protected final int handleEndOfLineLF(int currentIndex, int currentState, CellConsumer cellConsumer) { + private int handleEndOfLineLF(int currentIndex, int currentState, CellConsumer cellConsumer) { final int inQuoteAndCr = currentState & (IN_QUOTE | IN_CR); if (inQuoteAndCr == IN_CR) { // we had a preceding cr so shift the mark @@ -113,7 +113,7 @@ protected final int handleEndOfLineLF(int currentIndex, int currentState, CellCo return currentState & TURN_OFF_IN_CR_MASK; } - protected final int handleEndOfLineCR(int currentIndex, int currentState, CellConsumer cellConsumer) { + private int handleEndOfLineCR(int currentIndex, int currentState, CellConsumer cellConsumer) { if ((currentState & IN_QUOTE) == 0) { endOfRow(currentIndex, cellConsumer); return IN_CR; @@ -121,48 +121,56 @@ protected final int handleEndOfLineCR(int currentIndex, int currentState, CellCo return currentState; } - private final int endOfRow(int currentIndex, CellConsumer cellConsumer) { + private int endOfRow(int currentIndex, CellConsumer cellConsumer) { newCell(currentIndex, cellConsumer); cellConsumer.endOfRow(); return NONE; } - protected final int quote(int currentIndex, int currentState) { - if (isNotAllConsumedFromMark(currentIndex)) { - return currentState ^ ALL_QUOTES; + private int quote(int currentState) { + if ((currentState & ALL_QUOTES) == 0) { + return (currentState ^ IN_QUOTE) & TURN_OFF_IN_CR_MASK; } else { - return currentState | IN_QUOTE; + return (currentState ^ ALL_QUOTES) & TURN_OFF_IN_CR_MASK; } } - protected final int newCell(int end, final CellConsumer cellConsumer) { + private int newCell(int end, final CellConsumer cellConsumer) { char[] charBuffer = csvBuffer.getCharBuffer(); int start = csvBuffer.getMark(); if (charBuffer[start] != '"') { cellConsumer.newCell(charBuffer, start, end - start); } else { - newEscapedCell(charBuffer, start, end, cellConsumer); + newQuotedCell(charBuffer, start, end, cellConsumer); } csvBuffer.mark(end + 1); return NONE; } - protected final void newEscapedCell(final char[] chars, final int offset, final int end, CellConsumer cellConsumer) { - + private void newQuotedCell(final char[] chars, final int offset, final int end, CellConsumer cellConsumer) { int start = offset + 1; - boolean notEscaped = true; + boolean escaped = false; // copy chars apart from escape chars - int realIndex = start; - for(int i = start; i < end; i++) { - notEscaped = !notEscaped || '"' != chars[i]; - chars[realIndex] = chars[i]; - if (notEscaped) { - realIndex++; + int skipIndex = 0; + + int i = start; + for (; i < end - 1 ; i++) { + int correctedIndex = i - skipIndex; + escaped = '"' == chars[correctedIndex] && !escaped; + if (escaped) { + skipIndex ++; + System.arraycopy(chars, correctedIndex + 1, chars, correctedIndex, end - 1 - i); } } - cellConsumer.newCell(chars, start, realIndex - start); + + // if last is not quote add to shifted char + if ('"' == chars[i] && !escaped) { + skipIndex ++; + } + + cellConsumer.newCell(chars, start, end - start - skipIndex); } @Override @@ -184,8 +192,10 @@ public final boolean refillBuffer() throws IOException { return csvBuffer.fillBuffer(); } - protected final boolean isNotAllConsumedFromMark(int bufferIndex) { - return (bufferIndex) >= (csvBuffer.getMark() + 1) ; + private boolean isNotAllConsumedFromMark(int bufferIndex) { + return (bufferIndex) > (csvBuffer.getMark()) ; } + + } diff --git a/sfm/src/main/java/org/sfm/csv/parser/StringArrayConsumer.java b/sfm/src/main/java/org/sfm/csv/parser/StringArrayConsumer.java index a6e0c5953..76d0f9716 100644 --- a/sfm/src/main/java/org/sfm/csv/parser/StringArrayConsumer.java +++ b/sfm/src/main/java/org/sfm/csv/parser/StringArrayConsumer.java @@ -8,7 +8,7 @@ public final class StringArrayConsumer> implements CellConsumer { private final RH handler; - private String[] currentRow = new String[10]; + private String[] currentRow = new String[8]; private int currentIndex; @@ -28,6 +28,7 @@ public void newCell(char[] chars, int offset, int length) { public void endOfRow() { try { String[] result = Arrays.copyOf(currentRow, currentIndex); + Arrays.fill(currentRow, null); handler.handle(result); currentIndex = 0; } catch (Exception e) { diff --git a/sfm/src/test/java/org/sfm/csv/CsvParserTest.java b/sfm/src/test/java/org/sfm/csv/CsvParserTest.java index 38c747628..5f6dc9619 100644 --- a/sfm/src/test/java/org/sfm/csv/CsvParserTest.java +++ b/sfm/src/test/java/org/sfm/csv/CsvParserTest.java @@ -787,8 +787,15 @@ public void test264() throws IOException { it = CsvParser.iterator(new StringReader("\"\"\"\"")); strings = it.next(); assertArrayEquals(new String[]{"\""}, strings); + } + @Test + public void testQuotedStringShift() throws IOException { + Iterator it = CsvParser.iterator("\"\"\"a\"\"b\"\"c\"\"d\""); + String[] strings = it.next(); + assertArrayEquals(new String[]{"\"a\"b\"c\"d"}, strings); + } @Test public void testTrimSpaceToQuote() throws IOException {