Skip to content

Commit

Permalink
ARROW-5195: [C++] Detect null strings in CSV string columns
Browse files Browse the repository at this point in the history
By default, when converting CSV to a string column, all CSV values are considered valid.

Add an option (strings_can_be_null) so that strings listed in null_values, such as "N/A", are considered null in string / binary columns.

Author: Antoine Pitrou <antoine@python.org>

Closes #4188 from pitrou/ARROW-5195-csv-allow-null-strings and squashes the following commits:

9dfff4d <Antoine Pitrou> ARROW-5195:  Detect null strings in CSV string columns
  • Loading branch information
pitrou committed Apr 23, 2019
1 parent 5f8f08b commit 277307d
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 6 deletions.
24 changes: 24 additions & 0 deletions cpp/src/arrow/csv/converter-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,18 @@ TEST(BinaryConversion, Basics) {
{{"ab", ""}, {"cdé", "\xffgh"}});
}

// Checks how null-like spellings ("N/A", "NULL") are converted to a binary column,
// first with default options and then with strings_can_be_null enabled.
// Each CSV string below is one row; the expected values and validity flags are
// grouped per column (column 0 first, then column 1).
TEST(BinaryConversion, Nulls) {
// Default options: "N/A" and "NULL" are expected to come through as ordinary,
// valid values, and the empty cell as a valid empty value.
AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
{{"ab", "NULL"}, {"N/A", ""}},
{{true, true}, {true, true}});

auto options = ConvertOptions::Defaults();
options.strings_can_be_null = true;
// With strings_can_be_null set: "NULL" and "N/A" are expected to be null
// (validity false, placeholder value ""), while the empty cell in column 1
// is still expected to be a valid empty value.
AssertConversion<BinaryType, std::string>(binary(), {"ab,N/A\n", "NULL,\n"},
{{"ab", ""}, {"", ""}},
{{true, false}, {false, true}}, options);
}

TEST(StringConversion, Basics) {
AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
{{"ab", ""}, {"cdé", "gh"}});
Expand All @@ -131,6 +143,18 @@ TEST(StringConversion, Basics) {
{{"ab", ""}, {"cdé", "\xffgh"}}, options);
}

// Checks how null-like spellings ("N/A", "NULL") are converted to a utf8 string
// column, first with default options and then with strings_can_be_null enabled.
// Each CSV string below is one row; the expected values and validity flags are
// grouped per column (column 0 first, then column 1).
TEST(StringConversion, Nulls) {
// Default options: "N/A" and "NULL" are expected to come through as ordinary,
// valid values, and the empty cell as a valid empty value.
AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
{{"ab", "NULL"}, {"N/A", ""}},
{{true, true}, {true, true}});

auto options = ConvertOptions::Defaults();
options.strings_can_be_null = true;
// With strings_can_be_null set: "NULL" and "N/A" are expected to be null
// (validity false, placeholder value ""), while the empty cell in column 1
// is still expected to be a valid empty value.
AssertConversion<StringType, std::string>(utf8(), {"ab,N/A\n", "NULL,\n"},
{{"ab", ""}, {"", ""}},
{{true, false}, {false, true}}, options);
}

TEST(StringConversion, Errors) {
// Invalid UTF8 in column 0
AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
Expand Down
21 changes: 17 additions & 4 deletions cpp/src/arrow/csv/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,32 @@ class VarSizeBinaryConverter : public ConcreteConverter {
using BuilderType = typename TypeTraits<T>::BuilderType;
BuilderType builder(pool_);

// TODO do we accept nulls here?

auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
auto visit_non_null = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
return Status::Invalid("CSV conversion error to ", type_->ToString(),
": invalid UTF8 data");
}
builder.UnsafeAppend(data, size);
return Status::OK();
};

RETURN_NOT_OK(builder.Resize(parser.num_rows()));
RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
RETURN_NOT_OK(parser.VisitColumn(col_index, visit));

if (options_.strings_can_be_null) {
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
if (size > 0 && IsNull(data, size, false /* quoted */)) {
builder.UnsafeAppendNull();
return Status::OK();
} else {
return visit_non_null(data, size, quoted);
}
};
RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
} else {
RETURN_NOT_OK(parser.VisitColumn(col_index, visit_non_null));
}

RETURN_NOT_OK(builder.Finish(out));

return Status::OK();
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/csv/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ struct ARROW_EXPORT ConvertOptions {
// Recognized spellings for boolean values
std::vector<std::string> true_values;
std::vector<std::string> false_values;
// Whether string / binary columns can have null values.
// If true, then strings in "null_values" are considered null for string columns.
// If false, then all strings are valid string values.
bool strings_can_be_null = false;

static ConvertOptions Defaults();
};
Expand Down
21 changes: 20 additions & 1 deletion python/pyarrow/_csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,11 @@ cdef class ConvertOptions:
false_values: list, optional
A sequence of strings that denote false booleans in the data
(defaults are appropriate in most cases).
strings_can_be_null: bool, optional (default False)
Whether string / binary columns can have null values.
If true, then strings in null_values are considered null for
string columns.
If false, then all strings are valid string values.
"""
cdef:
CCSVConvertOptions options
Expand All @@ -269,7 +274,8 @@ cdef class ConvertOptions:
__slots__ = ()

def __init__(self, check_utf8=None, column_types=None, null_values=None,
true_values=None, false_values=None):
true_values=None, false_values=None,
strings_can_be_null=None):
self.options = CCSVConvertOptions.Defaults()
if check_utf8 is not None:
self.check_utf8 = check_utf8
Expand All @@ -281,6 +287,8 @@ cdef class ConvertOptions:
self.true_values = true_values
if false_values is not None:
self.false_values = false_values
if strings_can_be_null is not None:
self.strings_can_be_null = strings_can_be_null

@property
def check_utf8(self):
Expand All @@ -293,6 +301,17 @@ cdef class ConvertOptions:
def check_utf8(self, value):
self.options.check_utf8 = value

@property
def strings_can_be_null(self):
"""
Whether string / binary columns can have null values.

If true, then strings in null_values are considered null for
string columns.
If false, then all strings are valid string values.
"""
return self.options.strings_can_be_null

@strings_can_be_null.setter
def strings_can_be_null(self, value):
# Stores the flag directly on the wrapped C++ options struct.
self.options.strings_can_be_null = value

@property
def column_types(self):
"""
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
vector[c_string] null_values
vector[c_string] true_values
vector[c_string] false_values
c_bool strings_can_be_null

@staticmethod
CCSVConvertOptions Defaults()
Expand Down
17 changes: 16 additions & 1 deletion python/pyarrow/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ def test_convert_options():
opts.check_utf8 = False
assert opts.check_utf8 is False

assert opts.strings_can_be_null is False
opts.strings_can_be_null = True
assert opts.strings_can_be_null is True

assert opts.column_types == {}
# Pass column_types as mapping
opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
Expand Down Expand Up @@ -167,12 +171,13 @@ def test_convert_options():

opts = cls(check_utf8=False, column_types={'a': pa.null()},
null_values=['N', 'nn'], true_values=['T', 'tt'],
false_values=['F', 'ff'])
false_values=['F', 'ff'], strings_can_be_null=True)
assert opts.check_utf8 is False
assert opts.column_types == {'a': pa.null()}
assert opts.null_values == ['N', 'nn']
assert opts.false_values == ['F', 'ff']
assert opts.true_values == ['T', 'tt']
assert opts.strings_can_be_null is True


class BaseTestCSVRead:
Expand Down Expand Up @@ -284,6 +289,16 @@ def test_custom_nulls(self):
'd': [2, None],
}

opts = ConvertOptions(null_values=['Xxx', 'Zzz'],
strings_can_be_null=True)
table = self.read_bytes(rows, convert_options=opts)
assert table.to_pydict() == {
'a': [None, None],
'b': [None, u"#N/A"],
'c': [u"1", u""],
'd': [2, None],
}

opts = ConvertOptions(null_values=[])
rows = b"a,b\n#N/A,\n"
table = self.read_bytes(rows, convert_options=opts)
Expand Down

0 comments on commit 277307d

Please sign in to comment.