Skip to content

Commit

Permalink
[CSV-239] Add CSVRecord.getHeaderNames and allow duplicate headers (#41)
Browse files Browse the repository at this point in the history
* [CSV-239] Cannot get headers in column order from CSVRecord.
* getHeaderNames returns all headers in column order, including repeats, which are allowed as per RFC 4180
* add CSVFormat.withAllowDuplicateHeaderNames()

* [CSV-239] Cannot get headers in column order from CSVRecord.
* only wrap headerNames with unmodifiableList if non-empty
* fix and enhance CSVRecord.toMap javadoc

* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix exception messages

* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix whitespace

* [CSV-239] Cannot get headers in column order from CSVRecord.
* simplify if statement

* [CSV-239] Cannot get headers in column order from CSVRecord.
* fix indentation
* add javadoc to Headers class
* rename method to createHeaders
* use String.format to build error message
* initialize header names List with appropriate size
  • Loading branch information
davidmoten authored and garydgregory committed May 24, 2019
1 parent 4d2616b commit 030fb8e
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 39 deletions.
77 changes: 57 additions & 20 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,13 @@ public CSVFormat getFormat() {
* <li>{@code withQuote('"')}</li>
* <li>{@code withRecordSeparator("\r\n")}</li>
* <li>{@code withIgnoreEmptyLines(true)}</li>
* <li>{@code withAllowDuplicateHeaderNames(true)}</li>
* </ul>
*
* @see Predefined#Default
*/
public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF,
null, null, null, false, false, false, false, false, false);
null, null, null, false, false, false, false, false, false, true);

/**
* Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is
Expand All @@ -288,6 +289,7 @@ public CSVFormat getFormat() {
* <li>{@code {@link #withRecordSeparator(String) withRecordSeparator("\r\n")}}</li>
* <li>{@code {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}}</li>
* <li>{@code {@link #withAllowMissingColumnNames(boolean) withAllowMissingColumnNames(true)}}</li>
* <li>{@code {@link #withAllowDuplicateHeaderNames(boolean) withAllowDuplicateHeaderNames(true)}}</li>
* </ul>
* <p>
* Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean)
Expand Down Expand Up @@ -671,7 +673,7 @@ private static boolean isLineBreak(final Character c) {
*/
public static CSVFormat newFormat(final char delimiter) {
return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false,
false, false, false, false);
false, false, false, false, true);
}

/**
Expand Down Expand Up @@ -721,6 +723,8 @@ public static CSVFormat valueOf(final String format) {
private final boolean trim;

private final boolean autoFlush;

private final boolean allowDuplicateHeaderNames;

/**
* Creates a customized CSV format.
Expand Down Expand Up @@ -766,7 +770,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo
final boolean ignoreEmptyLines, final String recordSeparator, final String nullString,
final Object[] headerComments, final String[] header, final boolean skipHeaderRecord,
final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim,
final boolean trailingDelimiter, final boolean autoFlush) {
final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) {
this.delimiter = delimiter;
this.quoteCharacter = quoteChar;
this.quoteMode = quoteMode;
Expand All @@ -785,6 +789,7 @@ private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMo
this.trim = trim;
this.autoFlush = autoFlush;
this.quotedNullString = quoteCharacter + nullString + quoteCharacter;
this.allowDuplicateHeaderNames = allowDuplicateHeaderNames;
validate();
}

Expand Down Expand Up @@ -1686,7 +1691,8 @@ public CSVFormat withAllowMissingColumnNames() {
public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNames) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -1701,7 +1707,8 @@ public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNam
public CSVFormat withAutoFlush(final boolean autoFlush) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand Down Expand Up @@ -1736,7 +1743,8 @@ public CSVFormat withCommentMarker(final Character commentMarker) {
}
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -1754,7 +1762,8 @@ public CSVFormat withDelimiter(final char delimiter) {
}
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand Down Expand Up @@ -1785,7 +1794,8 @@ public CSVFormat withEscape(final Character escape) {
}
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces,
ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand Down Expand Up @@ -1941,7 +1951,8 @@ public CSVFormat withHeader(final ResultSetMetaData metaData) throws SQLExceptio
public CSVFormat withHeader(final String... header) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -1962,7 +1973,8 @@ public CSVFormat withHeader(final String... header) {
public CSVFormat withHeaderComments(final Object... headerComments) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -1987,7 +1999,8 @@ public CSVFormat withIgnoreEmptyLines() {
public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -2013,7 +2026,8 @@ public CSVFormat withIgnoreHeaderCase() {
public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -2038,7 +2052,8 @@ public CSVFormat withIgnoreSurroundingSpaces() {
public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -2057,7 +2072,8 @@ public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpac
public CSVFormat withNullString(final String nullString) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand Down Expand Up @@ -2088,7 +2104,8 @@ public CSVFormat withQuote(final Character quoteChar) {
}
return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces,
ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord,
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -2102,7 +2119,8 @@ public CSVFormat withQuote(final Character quoteChar) {
public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) {
return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand Down Expand Up @@ -2140,7 +2158,8 @@ public CSVFormat withRecordSeparator(final char recordSeparator) {
public CSVFormat withRecordSeparator(final String recordSeparator) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -2167,7 +2186,8 @@ public CSVFormat withSkipHeaderRecord() {
public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand Down Expand Up @@ -2208,7 +2228,8 @@ public CSVFormat withTrailingDelimiter() {
public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
Expand All @@ -2233,6 +2254,22 @@ public CSVFormat withTrim() {
public CSVFormat withTrim(final boolean trim) {
return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush);
skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
allowDuplicateHeaderNames);
}

/**
 * Returns a new {@code CSVFormat} with the duplicate-header-names behavior set to the given value.
 * <p>
 * All other settings are copied from this format. When the flag is {@code false}, {@code CSVParser} throws an
 * {@link IllegalArgumentException} if the header record it reads repeats a non-blank name.
 * </p>
 *
 * @param allowDuplicateHeaderNames
 *            {@code true} to accept repeated header names, {@code false} to reject them.
 * @return A new CSVFormat that differs from this one only in the duplicate-header-names flag.
 */
public CSVFormat withAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNames) {
    return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter,
            ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header,
            skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush,
            allowDuplicateHeaderNames);
}

/**
 * Returns a new {@code CSVFormat} that allows duplicate header names.
 * <p>
 * Shorthand for {@link #withAllowDuplicateHeaderNames(boolean) withAllowDuplicateHeaderNames(true)}.
 * </p>
 *
 * @return A new CSVFormat with the duplicate-header-names flag set to {@code true}.
 */
public CSVFormat withAllowDuplicateHeaderNames() {
return withAllowDuplicateHeaderNames(true);
}

/**
 * Returns whether this format allows duplicate names in the header record.
 *
 * @return {@code true} if duplicate header names are allowed, {@code false} otherwise.
 */
public boolean getAllowDuplicateHeaderNames() {
return allowDuplicateHeaderNames;
}
}
63 changes: 46 additions & 17 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,9 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact
this.format = format;
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.csvRecordIterator = new CSVRecordIterator();
this.headerMap = createHeaderMap(); // 1st
this.headerNames = createHeaderNames(this.headerMap); // 2nd
Headers headers = createHeaders();
this.headerMap = headers.headerMap;
this.headerNames = headers.headerNames;
this.characterOffset = characterOffset;
this.recordNumber = recordNumber - 1;
}
Expand Down Expand Up @@ -445,14 +446,35 @@ private Map<String, Integer> createEmptyHeaderMap() {
new LinkedHashMap<>();
}

/**
 * Header information based on name and position.
 * <p>
 * Simple immutable-reference holder that lets the header map and the ordered header names be produced by a single
 * method and returned together. The references are stored as given; no copies are made.
 * </p>
 */
private static final class Headers {
    /**
     * Header column positions (0-based), keyed by header name.
     */
    final Map<String, Integer> headerMap;

    /**
     * Header names in column order.
     */
    final List<String> headerNames;

    /**
     * Stores the given header map and header name list.
     *
     * @param headerMap
     *            the mapping from header name to 0-based column position.
     * @param headerNames
     *            the header names in column order.
     */
    Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
        this.headerMap = headerMap;
        this.headerNames = headerNames;
    }
}

/**
* Creates the name to index mapping if the format defines a header.
*
* @return null if the format has no header.
* @throws IOException if there is a problem reading the header or skipping the first record
*/
private Map<String, Integer> createHeaderMap() throws IOException {
private Headers createHeaders() throws IOException {
Map<String, Integer> hdrMap = null;
List<String> headerNames = null;
final String[] formatHeader = this.format.getHeader();
if (formatHeader != null) {
hdrMap = createEmptyHeaderMap();
Expand All @@ -476,27 +498,34 @@ private Map<String, Integer> createHeaderMap() throws IOException {
final String header = headerRecord[i];
final boolean containsHeader = header == null ? false : hdrMap.containsKey(header);
final boolean emptyHeader = header == null || header.trim().isEmpty();
if (containsHeader && (!emptyHeader || !this.format.getAllowMissingColumnNames())) {
throw new IllegalArgumentException("The header contains a duplicate name: \"" + header
+ "\" in " + Arrays.toString(headerRecord));
if (containsHeader) {
if (!emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
throw new IllegalArgumentException(
String.format("The header contains a duplicate name: \"%s\" in %s."
+ " If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
header, Arrays.toString(headerRecord)));
}
if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
throw new IllegalArgumentException(
"A header name is missing in " + Arrays.toString(headerRecord));
}
}
if (header != null) {
hdrMap.put(header, Integer.valueOf(i));
if (headerNames == null) {
headerNames = new ArrayList<>(headerRecord.length);
}
headerNames.add(header);
}
}
}
}
if (headerNames == null) {
headerNames = Collections.emptyList(); //immutable
} else {
headerNames = Collections.unmodifiableList(headerNames);
}
return hdrMap;
}

private List<String> createHeaderNames(final Map<String, Integer> headerMap) {
// @formatter:off
return headerMap == null ? null
: headerMap.entrySet().stream()
.sorted(Map.Entry.comparingByValue())
.map(Map.Entry::getKey)
.collect(Collectors.collectingAndThen(Collectors.toList(), Collections::unmodifiableList));
// @formatter:on
return new Headers(hdrMap, headerNames);
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ private List<String> toList() {
}

/**
* Copies this record into a new Map. The new map is not connect
* Copies this record into a new Map of header name to record value.
*
* @return A new Map. The map is empty if the record has no headers.
*/
Expand Down
Loading

0 comments on commit 030fb8e

Please sign in to comment.