[CSV-239] Add CSVRecord.getHeaderNames and allow duplicate headers (#41)

* [CSV-239] Cannot get headers in column order from CSVRecord. * getHeaderNames returns all headers in column order including repeats which are allowed as per RFC 4180 * add CSVFormat.withAllowDuplicateHeaderNames() * [CSV-239] Cannot get headers in column order from CSVRecord. * only wrap headerNames with unmodifiableList if non-empty * fix and enhance CSVRecord.toMap javadoc * [CSV-239] Cannot get headers in column order from CSVRecord. * fix exception messages * [CSV-239] Cannot get headers in column order from CSVRecord. * fix whitespace * [CSV-239] Cannot get headers in column order from CSVRecord. * simplify if statement * [CSV-239] Cannot get headers in column order from CSVRecord. * fix indentation * add javadoc to Headers class * rename method to createHeaders * use String.format to build error message * initialize header names List with appropriate size
apache · May 24, 2019 · 030fb8e · 030fb8e
1 parent 4d2616b
commit 030fb8e
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 39 deletions.
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -260,12 +260,13 @@ public CSVFormat getFormat() {
      * <li>{@code withQuote('"')}</li>
      * <li>{@code withRecordSeparator("\r\n")}</li>
      * <li>{@code withIgnoreEmptyLines(true)}</li>
+     * <li>{@code withAllowDuplicateHeaderNames(true)}</li>
      * </ul>
      *
      * @see Predefined#Default
      */
     public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF,
-            null, null, null, false, false, false, false, false, false);
+            null, null, null, false, false, false, false, false, false, true);
 
     /**
      * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is
@@ -288,6 +289,7 @@ public CSVFormat getFormat() {
      * <li>{@code {@link #withRecordSeparator(String) withRecordSeparator("\r\n")}}</li>
      * <li>{@code {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}}</li>
      * <li>{@code {@link #withAllowMissingColumnNames(boolean) withAllowMissingColumnNames(true)}}</li>
+     * <li>{@code {@link #withAllowDuplicateHeaderNames(boolean) withAllowDuplicateHeaderNames(true)}}</li>
      * </ul>
      * <p>
      * Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean)
@@ -671,7 +673,7 @@ private static boolean isLineBreak(final Character c) {
      */
     public static CSVFormat newFormat(final char delimiter) {
         return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false,
-                false, false, false, false);
+                false, false, false, false, true);
     }
 
     /**
@@ -721,6 +723,8 @@ public static CSVFormat valueOf(final String format) {
     private final boolean trim;
 
     private final boolean autoFlush;
+
+    private final boolean allowDuplicateHeaderNames;
 
     /**
      * Creates a customized CSV format.
@@ -766,7 +770,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo
             final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
             final Object[] headerComments, final String[] header, final boolean skipHeaderRecord,
             final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim,
-            final boolean trailingDelimiter, final boolean autoFlush) {
+            final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) {
         this.delimiter = delimiter;
         this.quoteCharacter = quoteChar;
         this.quoteMode = quoteMode;
@@ -785,6 +789,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo
         this.trim = trim;
         this.autoFlush = autoFlush;
         this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
+        this.allowDuplicateHeaderNames = allowDuplicateHeaderNames;
         validate();
     }
 
@@ -1686,7 +1691,8 @@ public CSVFormat withAllowMissingColumnNames() {
     public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNames) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, 
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1701,7 +1707,8 @@ public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNam
     public CSVFormat withAutoFlush(final boolean autoFlush) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1736,7 +1743,8 @@ public CSVFormat withCommentMarker(final Character commentMarker) {
         }
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1754,7 +1762,8 @@ public CSVFormat withDelimiter(final char delimiter) {
         }
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1785,7 +1794,8 @@ public CSVFormat withEscape(final Character escape) {
         }
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces,
                 ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
-                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1941,7 +1951,8 @@ public CSVFormat withHeader(final ResultSetMetaData metaData) throws SQLExceptio
     public CSVFormat withHeader(final String... header) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1962,7 +1973,8 @@ public CSVFormat withHeader(final String... header) {
     public CSVFormat withHeaderComments(final Object... headerComments) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -1987,7 +1999,8 @@ public CSVFormat withIgnoreEmptyLines() {
     public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2013,7 +2026,8 @@ public CSVFormat withIgnoreHeaderCase() {
     public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2038,7 +2052,8 @@ public CSVFormat withIgnoreSurroundingSpaces() {
     public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2057,7 +2072,8 @@ public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpac
     public CSVFormat withNullString(final String nullString) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2088,7 +2104,8 @@ public CSVFormat withQuote(final Character quoteChar) {
         }
         return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces,
                 ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
-                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2102,7 +2119,8 @@ public CSVFormat withQuote(final Character quoteChar) {
     public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) {
         return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2140,7 +2158,8 @@ public CSVFormat withRecordSeparator(final char recordSeparator) {
     public CSVFormat withRecordSeparator(final String recordSeparator) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2167,7 +2186,8 @@ public CSVFormat withSkipHeaderRecord() {
     public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2208,7 +2228,8 @@ public CSVFormat withTrailingDelimiter() {
     public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
 
     /**
@@ -2233,6 +2254,22 @@ public CSVFormat withTrim() {
     public CSVFormat withTrim(final boolean trim) {
         return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
                 ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
-                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
     }
+
+    public CSVFormat withAllowDuplicateHeaderNames(boolean allowDuplicateHeaderNames) {
+    	return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
+                ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
+                skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
+                allowDuplicateHeaderNames);
+    }
+
+    public CSVFormat withAllowDuplicateHeaderNames() {
+    	return withAllowDuplicateHeaderNames(true);
+    }
+
+    public boolean getAllowDuplicateHeaderNames() {
+		return allowDuplicateHeaderNames;
+	}
 }
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -410,8 +410,9 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
         this.format = format;
         this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
         this.csvRecordIterator = new CSVRecordIterator();
-        this.headerMap = createHeaderMap(); // 1st
-        this.headerNames = createHeaderNames(this.headerMap); // 2nd
+        Headers headers = createHeaders();
+        this.headerMap = headers.headerMap;
+        this.headerNames = headers.headerNames;
         this.characterOffset = characterOffset;
         this.recordNumber = recordNumber - 1;
     }
@@ -445,14 +446,35 @@ private Map<String, Integer> createEmptyHeaderMap() {
                 new LinkedHashMap<>();
     }
 
+    /**
+     * Header information based on name and position.
+     */
+    private static final class Headers {
+        /**
+         * Header column positions (0-based)
+         */
+        final Map<String, Integer> headerMap;
+
+        /**
+         * Header names in column order
+         */
+        final List<String> headerNames;
+
+        Headers(Map<String, Integer> headerMap, List<String> headerNames) {
+            this.headerMap = headerMap;
+            this.headerNames = headerNames;
+        }
+    }
+
     /**
      * Creates the name to index mapping if the format defines a header.
      *
      * @return null if the format has no header.
      * @throws IOException if there is a problem reading the header or skipping the first record
      */
-    private Map<String, Integer> createHeaderMap() throws IOException {
+    private Headers createHeaders() throws IOException {
         Map<String, Integer> hdrMap = null;
+        List<String> headerNames = null;
         final String[] formatHeader = this.format.getHeader();
         if (formatHeader != null) {
             hdrMap = createEmptyHeaderMap();
@@ -476,27 +498,34 @@ private Map<String, Integer> createHeaderMap() throws IOException {
                     final String header = headerRecord[i];
                     final boolean containsHeader = header == null ? false : hdrMap.containsKey(header);
                     final boolean emptyHeader = header == null || header.trim().isEmpty();
-                    if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
-                        throw new IllegalArgumentException("The header contains a duplicate name: \"" + header
-                                + "\" in " + Arrays.toString(headerRecord));
+                    if (containsHeader) {
+                        if (!emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
+                            throw new IllegalArgumentException(
+                                    String.format("The header contains a duplicate name: \"%s\" in %s."
+                                        + " If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", 
+                                        header, Arrays.toString(headerRecord)));
+                        }
+                        if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
+                            throw new IllegalArgumentException(
+                                    "A header name is missing in " + Arrays.toString(headerRecord));
+                        }
                     }
                     if (header != null) {
                         hdrMap.put(header, Integer.valueOf(i));
+                        if (headerNames == null) {
+                        	headerNames = new ArrayList<>(headerRecord.length);
+                        }
+                        headerNames.add(header);
                     }
                 }
             }
+        } 
+        if (headerNames == null) {
+        	headerNames = Collections.emptyList(); //immutable
+        } else {
+        	headerNames = Collections.unmodifiableList(headerNames);
         }
-        return hdrMap;
-    }
-
-    private List<String> createHeaderNames(final Map<String, Integer> headerMap) {
-        // @formatter:off
-        return headerMap == null ? null
-            : headerMap.entrySet().stream()
-                .sorted(Map.Entry.comparingByValue())
-                .map(Map.Entry::getKey)
-                .collect(Collectors.collectingAndThen(Collectors.toList(), Collections::unmodifiableList));
-        // @formatter:on
+        return new Headers(hdrMap, headerNames);
     }
 
     /**

diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -265,7 +265,7 @@ private List<String> toList() {
     }
 
     /**
-     * Copies this record into a new Map. The new map is not connect
+     * Copies this record into a new Map of header name to record value.
      *
      * @return A new Map. The map is empty if the record has no headers.
      */