Skip to content

Commit 5651f0c

Browse files
fix(parquet/pqarrow): Fix null_count column stats (#489)
### Rationale for this change When dictionary encoding is enabled and and repetitions are set to `required`, the `null_count` statistic is negative because `defLevels` is always 0. ### What changes are included in this PR? This PR sets the null count to 0 if `defLevels - nonNullCount < 0` ### Are these changes tested? Yes. I've added a new test that exposes another bug. I started with the `fullTypeList` variable like [this test](https://github.com/apache/arrow-go/blob/c6ce2ef4e55009a786cf04b3845eba5170c98066/parquet/pqarrow/encode_dictionary_test.go#L43) and discovered that not all types are supported. If an unsupported type is encoded as a dictionary, it results in a nasty panic in the typed dictionary encoder because the types don't line up. ### Are there any user-facing changes? Yes, the stats written to parquet files are currently wrong for these sorts of columns. This PR should fix that!
1 parent c6ce2ef commit 5651f0c

File tree

3 files changed

+87
-1
lines changed

3 files changed

+87
-1
lines changed

parquet/pqarrow/encode_arrow_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,6 +1284,30 @@ func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc *schema.Group
12841284
return buf.Bytes()
12851285
}
12861286

1287+
func (ps *ParquetIOTestSuite) writeDictionaryColumn(mem memory.Allocator, sc *schema.GroupNode, values arrow.Array) []byte {
1288+
var buf bytes.Buffer
1289+
arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
1290+
ps.NoError(err)
1291+
1292+
writer, err := pqarrow.NewFileWriter(
1293+
arrsc,
1294+
&buf,
1295+
parquet.NewWriterProperties(
1296+
parquet.WithDictionaryDefault(true),
1297+
parquet.WithStats(true),
1298+
),
1299+
pqarrow.NewArrowWriterProperties(pqarrow.WithAllocator(mem)),
1300+
)
1301+
ps.NoError(err)
1302+
1303+
writer.NewRowGroup()
1304+
ps.NoError(writer.WriteColumnData(values))
1305+
ps.NoError(writer.Close())
1306+
ps.NoError(writer.Close())
1307+
1308+
return buf.Bytes()
1309+
}
1310+
12871311
func (ps *ParquetIOTestSuite) readAndCheckSingleColumnFile(mem memory.Allocator, data []byte, values arrow.Array) {
12881312
reader := ps.createReader(mem, data)
12891313
cr, err := reader.GetColumn(context.TODO(), 0)
@@ -1327,6 +1351,23 @@ var fullTypeList = []arrow.DataType{
13271351
&arrow.Decimal128Type{Precision: 38, Scale: 37},
13281352
}
13291353

1354+
var dictEncodingSupportedTypeList = []arrow.DataType{
1355+
arrow.PrimitiveTypes.Int32,
1356+
arrow.PrimitiveTypes.Int64,
1357+
arrow.PrimitiveTypes.Float32,
1358+
arrow.PrimitiveTypes.Float64,
1359+
arrow.BinaryTypes.String,
1360+
arrow.BinaryTypes.Binary,
1361+
&arrow.FixedSizeBinaryType{ByteWidth: 10},
1362+
&arrow.Decimal128Type{Precision: 1, Scale: 0},
1363+
&arrow.Decimal128Type{Precision: 5, Scale: 4},
1364+
&arrow.Decimal128Type{Precision: 10, Scale: 9},
1365+
&arrow.Decimal128Type{Precision: 19, Scale: 18},
1366+
&arrow.Decimal128Type{Precision: 23, Scale: 22},
1367+
&arrow.Decimal128Type{Precision: 27, Scale: 26},
1368+
&arrow.Decimal128Type{Precision: 38, Scale: 37},
1369+
}
1370+
13301371
func (ps *ParquetIOTestSuite) TestSingleColumnRequiredWrite() {
13311372
for _, dt := range fullTypeList {
13321373
ps.Run(dt.Name(), func() {

parquet/pqarrow/encode_dict_compute.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,9 @@ func writeDictionaryArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, lea
117117
}
118118

119119
nonNullCount := indices.Len() - indices.NullN()
120-
pageStats.IncNulls(int64(len(defLevels) - nonNullCount))
120+
nullCount := max(int64(len(defLevels)-nonNullCount), 0)
121+
122+
pageStats.IncNulls(nullCount)
121123
pageStats.IncNumValues(int64(nonNullCount))
122124
return pageStats.UpdateFromArrow(referencedDict, false)
123125
}

parquet/pqarrow/encode_dictionary_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,49 @@ func (ps *ParquetIOTestSuite) TestSingleColumnOptionalDictionaryWrite() {
6767
}
6868
}
6969

70+
func (ps *ParquetIOTestSuite) TestSingleColumnRequiredDictionaryWrite() {
71+
for _, dt := range dictEncodingSupportedTypeList {
72+
// skip tests for bool as we don't do dictionaries for it
73+
if dt.ID() == arrow.BOOL {
74+
continue
75+
}
76+
77+
ps.Run(dt.Name(), func() {
78+
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
79+
defer mem.AssertSize(ps.T(), 0)
80+
81+
bldr := array.NewDictionaryBuilder(mem, &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int16, ValueType: dt})
82+
defer bldr.Release()
83+
84+
values := testutils.RandomNonNull(mem, dt, smallSize)
85+
defer values.Release()
86+
ps.Require().NoError(bldr.AppendArray(values))
87+
88+
arr := bldr.NewDictionaryArray()
89+
defer arr.Release()
90+
91+
sc := ps.makeSimpleSchema(arr.DataType(), parquet.Repetitions.Required)
92+
data := ps.writeDictionaryColumn(mem, sc, arr)
93+
94+
rdr, err := file.NewParquetReader(bytes.NewReader(data))
95+
ps.NoError(err)
96+
defer rdr.Close()
97+
98+
metadata := rdr.MetaData()
99+
ps.Len(metadata.RowGroups, 1)
100+
101+
rg := metadata.RowGroup(0)
102+
col, err := rg.ColumnChunk(0)
103+
ps.NoError(err)
104+
105+
stats, err := col.Statistics()
106+
ps.NoError(err)
107+
ps.EqualValues(smallSize, stats.NumValues())
108+
ps.EqualValues(0, stats.NullCount())
109+
})
110+
}
111+
}
112+
70113
func TestPqarrowDictionaries(t *testing.T) {
71114
suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: parquet.DataPageV1})
72115
suite.Run(t, &ArrowWriteDictionarySuite{dataPageVersion: parquet.DataPageV2})

0 commit comments

Comments
 (0)