From 030fb8e37c4024b24fac2b5404300449a6741699 Mon Sep 17 00:00:00 2001 From: Dave Moten Date: Fri, 24 May 2019 22:11:17 +1000 Subject: [PATCH] [CSV-239] Add CSVRecord.getHeaderNames and allow duplicate headers (#41) * [CSV-239] Cannot get headers in column order from CSVRecord. * getHeaderNames returns all headers in column order including repeats which are allowed as per RFC 4180 * add CSVFormat.withAllowDuplicateHeaderNames() * [CSV-239] Cannot get headers in column order from CSVRecord. * only wrap headerNames with unmodifiableList if non-empty * fix and enhance CSVRecord.toMap javadoc * [CSV-239] Cannot get headers in column order from CSVRecord. * fix exception messages * [CSV-239] Cannot get headers in column order from CSVRecord. * fix whitespace * [CSV-239] Cannot get headers in column order from CSVRecord. * simplify if statement * [CSV-239] Cannot get headers in column order from CSVRecord. * fix indentation * add javadoc to Headers class * rename method to createHeaders * use String.format to build error message * initialize header names List with appropriate size --- .../org/apache/commons/csv/CSVFormat.java | 77 ++++++++++++++----- .../org/apache/commons/csv/CSVParser.java | 63 +++++++++++---- .../org/apache/commons/csv/CSVRecord.java | 2 +- .../org/apache/commons/csv/CSVParserTest.java | 25 +++++- 4 files changed, 128 insertions(+), 39 deletions(-) diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java index 6f77bef53..33130b0b5 100644 --- a/src/main/java/org/apache/commons/csv/CSVFormat.java +++ b/src/main/java/org/apache/commons/csv/CSVFormat.java @@ -260,12 +260,13 @@ public CSVFormat getFormat() { *
<li>{@code withQuote('"')}</li>
 * <li>{@code withRecordSeparator("\r\n")}</li>
 * <li>{@code withIgnoreEmptyLines(true)}</li>
+ * <li>{@code withAllowDuplicateHeaderNames(true)}</li>
  • * * * @see Predefined#Default */ public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, - null, null, null, false, false, false, false, false, false); + null, null, null, false, false, false, false, false, false, true); /** * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is @@ -288,6 +289,7 @@ public CSVFormat getFormat() { *
<li>{@code {@link #withRecordSeparator(String) withRecordSeparator("\r\n")}}</li>
 * <li>{@code {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}}</li>
 * <li>{@code {@link #withAllowMissingColumnNames(boolean) withAllowMissingColumnNames(true)}}</li>
+ * <li>{@code {@link #withAllowDuplicateHeaderNames(boolean) withAllowDuplicateHeaderNames(true)}}</li>
  • * *

    * Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean) @@ -671,7 +673,7 @@ private static boolean isLineBreak(final Character c) { */ public static CSVFormat newFormat(final char delimiter) { return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false, - false, false, false, false); + false, false, false, false, true); } /** @@ -721,6 +723,8 @@ public static CSVFormat valueOf(final String format) { private final boolean trim; private final boolean autoFlush; + + private final boolean allowDuplicateHeaderNames; /** * Creates a customized CSV format. @@ -766,7 +770,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, - final boolean trailingDelimiter, final boolean autoFlush) { + final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) { this.delimiter = delimiter; this.quoteCharacter = quoteChar; this.quoteMode = quoteMode; @@ -785,6 +789,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo this.trim = trim; this.autoFlush = autoFlush; this.quotedNullString = quoteCharacter + nullString + quoteCharacter; + this.allowDuplicateHeaderNames = allowDuplicateHeaderNames; validate(); } @@ -1686,7 +1691,8 @@ public CSVFormat withAllowMissingColumnNames() { public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNames) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, 
autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1701,7 +1707,8 @@ public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNam public CSVFormat withAutoFlush(final boolean autoFlush) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1736,7 +1743,8 @@ public CSVFormat withCommentMarker(final Character commentMarker) { } return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1754,7 +1762,8 @@ public CSVFormat withDelimiter(final char delimiter) { } return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1785,7 +1794,8 @@ public CSVFormat withEscape(final Character escape) { } return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, 
header, skipHeaderRecord, - allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1941,7 +1951,8 @@ public CSVFormat withHeader(final ResultSetMetaData metaData) throws SQLExceptio public CSVFormat withHeader(final String... header) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1962,7 +1973,8 @@ public CSVFormat withHeader(final String... header) { public CSVFormat withHeaderComments(final Object... headerComments) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -1987,7 +1999,8 @@ public CSVFormat withIgnoreEmptyLines() { public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2013,7 +2026,8 @@ public CSVFormat withIgnoreHeaderCase() { 
public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2038,7 +2052,8 @@ public CSVFormat withIgnoreSurroundingSpaces() { public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2057,7 +2072,8 @@ public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpac public CSVFormat withNullString(final String nullString) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2088,7 +2104,8 @@ public CSVFormat withQuote(final Character quoteChar) { } return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord, - allowMissingColumnNames, ignoreHeaderCase, trim, 
trailingDelimiter, autoFlush); + allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2102,7 +2119,8 @@ public CSVFormat withQuote(final Character quoteChar) { public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) { return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2140,7 +2158,8 @@ public CSVFormat withRecordSeparator(final char recordSeparator) { public CSVFormat withRecordSeparator(final String recordSeparator) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2167,7 +2186,8 @@ public CSVFormat withSkipHeaderRecord() { public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2208,7 +2228,8 @@ public CSVFormat withTrailingDelimiter() { public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) { 
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } /** @@ -2233,6 +2254,22 @@ public CSVFormat withTrim() { public CSVFormat withTrim(final boolean trim) { return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, - skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush); + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); } + + public CSVFormat withAllowDuplicateHeaderNames(boolean allowDuplicateHeaderNames) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + public CSVFormat withAllowDuplicateHeaderNames() { + return withAllowDuplicateHeaderNames(true); + } + + public boolean getAllowDuplicateHeaderNames() { + return allowDuplicateHeaderNames; + } } diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index 15357c05f..eff740225 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -410,8 +410,9 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact this.format = format; this.lexer = new Lexer(format, new 
ExtendedBufferedReader(reader)); this.csvRecordIterator = new CSVRecordIterator(); - this.headerMap = createHeaderMap(); // 1st - this.headerNames = createHeaderNames(this.headerMap); // 2nd + Headers headers = createHeaders(); + this.headerMap = headers.headerMap; + this.headerNames = headers.headerNames; this.characterOffset = characterOffset; this.recordNumber = recordNumber - 1; } @@ -445,14 +446,35 @@ private Map createEmptyHeaderMap() { new LinkedHashMap<>(); } + /** + * Header information based on name and position. + */ + private static final class Headers { + /** + * Header column positions (0-based) + */ + final Map headerMap; + + /** + * Header names in column order + */ + final List headerNames; + + Headers(Map headerMap, List headerNames) { + this.headerMap = headerMap; + this.headerNames = headerNames; + } + } + /** * Creates the name to index mapping if the format defines a header. * * @return null if the format has no header. * @throws IOException if there is a problem reading the header or skipping the first record */ - private Map createHeaderMap() throws IOException { + private Headers createHeaders() throws IOException { Map hdrMap = null; + List headerNames = null; final String[] formatHeader = this.format.getHeader(); if (formatHeader != null) { hdrMap = createEmptyHeaderMap(); @@ -476,27 +498,34 @@ private Map createHeaderMap() throws IOException { final String header = headerRecord[i]; final boolean containsHeader = header == null ? 
false : hdrMap.containsKey(header); final boolean emptyHeader = header == null || header.trim().isEmpty(); - if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) { - throw new IllegalArgumentException("The header contains a duplicate name: \"" + header - + "\" in " + Arrays.toString(headerRecord)); + if (containsHeader) { + if (!emptyHeader && !this.format.getAllowDuplicateHeaderNames()) { + throw new IllegalArgumentException( + String.format("The header contains a duplicate name: \"%s\" in %s." + + " If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", + header, Arrays.toString(headerRecord))); + } + if (emptyHeader && !this.format.getAllowMissingColumnNames()) { + throw new IllegalArgumentException( + "A header name is missing in " + Arrays.toString(headerRecord)); + } } if (header != null) { hdrMap.put(header, Integer.valueOf(i)); + if (headerNames == null) { + headerNames = new ArrayList<>(headerRecord.length); + } + headerNames.add(header); } } } + } + if (headerNames == null) { + headerNames = Collections.emptyList(); //immutable + } else { + headerNames = Collections.unmodifiableList(headerNames); } - return hdrMap; - } - - private List createHeaderNames(final Map headerMap) { - // @formatter:off - return headerMap == null ? null - : headerMap.entrySet().stream() - .sorted(Map.Entry.comparingByValue()) - .map(Map.Entry::getKey) - .collect(Collectors.collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); - // @formatter:on + return new Headers(hdrMap, headerNames); } /** diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java index acc4c8df4..99dce26a9 100644 --- a/src/main/java/org/apache/commons/csv/CSVRecord.java +++ b/src/main/java/org/apache/commons/csv/CSVRecord.java @@ -265,7 +265,7 @@ private List toList() { } /** - * Copies this record into a new Map. 
The new map is not connect + * Copies this record into a new Map of header name to record value. * * @return A new Map. The map is empty if the record has no headers. */ diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 57bed48b2..f71e479d7 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -43,6 +43,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -292,10 +293,24 @@ public void testDefaultFormat() throws IOException { } @Test(expected = IllegalArgumentException.class) - public void testDuplicateHeaders() throws Exception { + public void testDuplicateHeadersNotAllowed() throws Exception { + CSVParser.parse("a,b,a\n1,2,3\nx,y,z", + CSVFormat.DEFAULT.withHeader(new String[] {}).withAllowDuplicateHeaderNames(false)); + } + + @Test + public void testDuplicateHeadersAllowedByDefault() throws Exception { CSVParser.parse("a,b,a\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader(new String[] {})); } + @Test + public void testEmptyFileHeaderParsing() throws Exception { + try (final CSVParser parser = CSVParser.parse("", CSVFormat.DEFAULT.withFirstRecordAsHeader())) { + assertNull(parser.nextRecord()); + assertTrue(parser.getHeaderNames().isEmpty()); + } + } + @Test public void testEmptyFile() throws Exception { try (final CSVParser parser = CSVParser.parse("", CSVFormat.DEFAULT)) { @@ -1151,6 +1166,14 @@ public void testTrim() throws Exception { assertEquals("3", record.get("Z")); Assert.assertEquals(3, record.size()); } + + @Test + public void testRepeatedHeadersAreReturnedInCSVRecordHeaderNames() throws IOException { + final Reader in = new StringReader("header1,header2,header1\n1,2,3\n4,5,6"); + final Iterator records = 
CSVFormat.DEFAULT.withFirstRecordAsHeader().withTrim().parse(in).iterator(); + final CSVRecord record = records.next(); + assertEquals(Arrays.asList("header1", "header2", "header1"), record.getParser().getHeaderNames()); + } private void validateLineNumbers(final String lineSeparator) throws IOException { try (final CSVParser parser = CSVParser.parse("a" + lineSeparator + "b" + lineSeparator + "c",