Skip to content

Commit

Permalink
ARROW-1636: [C++][Integration] Implement integration test parsing in …
Browse files Browse the repository at this point in the history
…C++ for null type, add integration test data generation

This enables integration testing of null type for C++ and Java. I was a bit surprised that the cross-compatibility tests passed so I would appreciate a sanity check from a Java developer to see if there are integration tests covering the integration-JSON-format parsing and null type IPC round trip. If there are not, let's ensure there are relevant JIRA issues so it can be tested and kept working

Closes #6368 from wesm/ARROW-1636 and squashes the following commits:

eb3620b <Antoine Pitrou> Add TODOs
d028dc4 <Wes McKinney> Implement integration test parsing in C++ for null type, add integration test data generation

Lead-authored-by: Wes McKinney <wesm+git@apache.org>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
wesm and pitrou committed Feb 26, 2020
1 parent 3a4023a commit da9e0ff
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 26 deletions.
15 changes: 11 additions & 4 deletions cpp/src/arrow/ipc/json_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1434,18 +1434,25 @@ class ArrayReader {
return Status::OK();
}

Status Parse(std::shared_ptr<Array>* out) {
RETURN_NOT_OK(GetObjectInt(obj_, "count", &length_));

Status ParseValidityBitmap() {
const auto& json_valid_iter = obj_.FindMember("VALIDITY");
RETURN_NOT_ARRAY("VALIDITY", json_valid_iter, obj_);

const auto& json_validity = json_valid_iter->value.GetArray();
DCHECK_EQ(static_cast<int>(json_validity.Size()), length_);
for (const rj::Value& val : json_validity) {
DCHECK(val.IsInt());
is_valid_.push_back(val.GetInt() != 0);
}
return Status::OK();
}

Status Parse(std::shared_ptr<Array>* out) {
RETURN_NOT_OK(GetObjectInt(obj_, "count", &length_));

if (type_->id() != Type::NA) {
// Null type is the only type without any buffers
RETURN_NOT_OK(ParseValidityBitmap());
}

RETURN_NOT_OK(VisitTypeInline(*type_, this));

Expand Down
11 changes: 8 additions & 3 deletions cpp/src/arrow/ipc/json_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ void CheckPrimitive(const std::shared_ptr<DataType>& type,

std::shared_ptr<Array> array;
ASSERT_OK(builder.Finish(&array));
TestArrayRoundTrip(*array.get());
TestArrayRoundTrip(*array);
}

TEST(TestJsonSchemaWriter, FlatTypes) {
Expand All @@ -147,6 +147,7 @@ TEST(TestJsonSchemaWriter, FlatTypes) {
field("f18", union_({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))},
{0, 1}, UnionMode::DENSE)),
field("f19", large_list(uint8())),
field("f20", null()),
};

Schema schema(fields);
Expand All @@ -162,6 +163,11 @@ void PrimitiveTypesCheckOne() {
CheckPrimitive<T, c_type>(std::make_shared<T>(), is_valid, values);
}

TEST(TestJsonArrayWriter, NullType) {
auto arr = std::make_shared<NullArray>(10);
TestArrayRoundTrip(*arr);
}

TEST(TestJsonArrayWriter, PrimitiveTypes) {
PrimitiveTypesCheckOne<Int8Type>();
PrimitiveTypesCheckOne<Int16Type>();
Expand Down Expand Up @@ -249,8 +255,7 @@ TEST(TestJsonArrayWriter, Unions) {
ASSERT_OK(MakeUnion(&batch));

for (int i = 0; i < batch->num_columns(); ++i) {
std::shared_ptr<Array> col = batch->column(i);
TestArrayRoundTrip(*col.get());
TestArrayRoundTrip(*batch->column(i));
}
}

Expand Down
3 changes: 3 additions & 0 deletions dev/archery/archery/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,9 @@ def _set_default(opt, default):
help='Stop on first error')
@click.option('--gold_dirs', multiple=True,
help="gold integration test file paths")
@click.option('-k', '--match',
help=("Substring for test names to include in run, "
"e.g. -k primitive"))
def integration(with_all=False, random_seed=12345, **args):
from .integration.runner import write_js_test_json, run_all_tests
import numpy as np
Expand Down
72 changes: 54 additions & 18 deletions dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,23 @@ def _get_buffers(self):
]


class NullColumn(Column):
# This subclass is for readability only
pass


class NullType(PrimitiveType):

def __init__(self, name):
super(NullType, self).__init__(name, nullable=True)

def _get_type(self):
return OrderedDict([('name', 'null')])

def generate_column(self, size, name=None):
return NullColumn(name or self.name, size)


TEST_INT_MAX = 2 ** 31 - 1
TEST_INT_MIN = ~TEST_INT_MAX

Expand Down Expand Up @@ -928,6 +945,19 @@ def generate_primitive_case(batch_sizes, name='primitive'):
return _generate_file(name, fields, batch_sizes)


def generate_null_case(batch_sizes):
# Interleave null with non-null types to ensure the appropriate number of
# buffers (0) is read and written
fields = [
NullType(name='f0'),
get_field('f1', 'int32'),
NullType(name='f2'),
get_field('f3', 'float64'),
NullType(name='f4')
]
return _generate_file('null', fields, batch_sizes)


def generate_decimal_case():
fields = [
DecimalType(name='f{}'.format(i), precision=precision, scale=2)
Expand All @@ -936,10 +966,7 @@ def generate_decimal_case():

possible_batch_sizes = 7, 10
batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))]

skip = set()
skip.add('Go') # TODO(ARROW-3676)
return _generate_file('decimal', fields, batch_sizes, skip=skip)
return _generate_file('decimal', fields, batch_sizes)


def generate_datetime_case():
Expand Down Expand Up @@ -976,9 +1003,7 @@ def generate_interval_case():
]

batch_sizes = [7, 10]
skip = set()
skip.add('JS') # TODO(ARROW-5239)
return _generate_file("interval", fields, batch_sizes, skip=skip)
return _generate_file("interval", fields, batch_sizes)


def generate_map_case():
Expand All @@ -990,9 +1015,7 @@ def generate_map_case():
]

batch_sizes = [7, 10]
skip = set()
skip.add('Go') # TODO(ARROW-3679)
return _generate_file("map", fields, batch_sizes, skip=skip)
return _generate_file("map", fields, batch_sizes)


def generate_nested_case():
Expand Down Expand Up @@ -1028,12 +1051,9 @@ def generate_dictionary_case():
DictionaryType('dict1', get_field('', 'int32'), dict1),
DictionaryType('dict2', get_field('', 'int16'), dict2)
]
skip = set()
skip.add('Go') # TODO(ARROW-3039)
batch_sizes = [7, 10]
return _generate_file("dictionary", fields, batch_sizes,
dictionaries=[dict0, dict1, dict2],
skip=skip)
dictionaries=[dict0, dict1, dict2])


def generate_nested_dictionary_case():
Expand Down Expand Up @@ -1075,12 +1095,28 @@ def _temp_path():
generate_primitive_case([], name='primitive_no_batches'),
generate_primitive_case([17, 20], name='primitive'),
generate_primitive_case([0, 0, 0], name='primitive_zerolength'),
generate_decimal_case(),

generate_null_case([10, 0])
.skip_category('JS') # TODO(ARROW-7900)
.skip_category('Go'), # TODO(ARROW-7901)

# TODO(ARROW-7948): Decimal + Go
generate_decimal_case().skip_category('Go'),

generate_datetime_case(),
generate_interval_case(),
generate_map_case(),

# TODO(ARROW-5239): Intervals + JS
generate_interval_case().skip_category('JS'),

# TODO(ARROW-5620): Map + Go
generate_map_case().skip_category('Go'),

generate_nested_case(),
generate_dictionary_case(),

# TODO(ARROW-3039, ARROW-5267): Dictionaries in GO
generate_dictionary_case().skip_category('Go'),

# TODO(ARROW-7902)
generate_nested_dictionary_case().skip_category(SKIP_ARROW)
.skip_category(SKIP_FLIGHT),
]
Expand Down
9 changes: 8 additions & 1 deletion dev/archery/archery/integration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class IntegrationRunner(object):

def __init__(self, json_files, testers, tempdir=None, debug=False,
stop_on_error=True, gold_dirs=None, serial=False,
**unused_kwargs):
match=None, **unused_kwargs):
self.json_files = json_files
self.testers = testers
self.temp_dir = tempdir or tempfile.mkdtemp()
Expand All @@ -60,6 +60,13 @@ def __init__(self, json_files, testers, tempdir=None, debug=False,
self.serial = serial
self.gold_dirs = gold_dirs
self.failures = []
self.match = match

if self.match is not None:
print("-- Only running tests with {} in their name"
.format(self.match))
self.json_files = [json_file for json_file in self.json_files
if self.match in json_file.name]

def run(self):
"""
Expand Down

0 comments on commit da9e0ff

Please sign in to comment.