Skip to content

Commit

Permalink
PARQUET-357: Parquet-thrift generates wrong schema for Thrift binary fields
Browse files Browse the repository at this point in the history

Author: Nandor Kollar <nkollar@cloudera.com>

Closes #439 from nandorKollar/PARQUET-357 and squashes the following commits:

90cfcfb [Nandor Kollar] Address code review feedback
4bf8089 [Nandor Kollar] PARQUET-357: Parquet-thrift generates wrong schema for Thrift binary fields
  • Loading branch information
nkollar authored and zivanfi committed Jan 4, 2018
1 parent 8bfd9b4 commit da3e8eb
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 4 deletions.
Expand Up @@ -331,7 +331,7 @@ public ConvertedField visit(I64Type i64Type, State state) {

/**
 * Converts a Thrift string-typed field into a Parquet BINARY primitive.
 *
 * <p>When the field is flagged as binary ({@code stringType.isBinary()}), the
 * primitive is created without the UTF8 annotation; otherwise it is annotated
 * as UTF8.
 *
 * @param stringType the Thrift string type being visited
 * @param state      the conversion state, passed through to {@code visitPrimitiveType}
 * @return the result of {@code visitPrimitiveType} for this field
 */
@Override
public ConvertedField visit(StringType stringType, State state) {
  // Only one return may exist here: the former unconditional
  // "BINARY, UTF8" return made this statement unreachable and would tag
  // binary fields as UTF8 strings.
  return stringType.isBinary()
      ? visitPrimitiveType(BINARY, state)
      : visitPrimitiveType(BINARY, UTF8, state);
}

private static boolean isUnion(StructOrUnionType s) {
Expand Down
Expand Up @@ -35,6 +35,7 @@
import org.apache.parquet.thrift.struct.ThriftType.*;
import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType;
import org.apache.parquet.thrift.struct.ThriftTypeID;
import org.apache.thrift.meta_data.FieldMetaData;

import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -162,7 +163,12 @@ private static ThriftField toThriftField(String name, Field field, ThriftField.R
type = new I64Type();
break;
case STRING:
type = new StringType();
StringType stringType = new StringType();
FieldMetaData fieldMetaData = field.getFieldMetaData();
if (fieldMetaData != null && fieldMetaData.valueMetaData.isBinary()) {
stringType.setBinary(true);
}
type = stringType;
break;
case STRUCT:
type = toStructType(field.gettStructDescriptor());
Expand Down
Expand Up @@ -641,11 +641,21 @@ public void accept(TypeVisitor visitor) {
}

public static class StringType extends ThriftType {
private boolean binary = false;

/**
 * Creates a string type by passing {@code STRING} to the superclass constructor.
 * The {@code binary} flag keeps its field default of {@code false}.
 * Annotated with {@code @JsonCreator}.
 */
@JsonCreator
public StringType() {
super(STRING);
}

/**
 * Reports the current value of the binary flag.
 *
 * @return the value of the {@code binary} field; {@code false} unless
 *         {@link #setBinary(boolean)} was called with {@code true}
 */
public boolean isBinary() {
  return this.binary;
}

/**
 * Sets the binary flag returned by {@link #isBinary()}.
 *
 * @param binary the new value of the flag
 */
public void setBinary(boolean binary) {
this.binary = binary;
}

@Override
public <R, S> R accept(StateVisitor<R, S> visitor, S state) {
return visitor.visit(this, state);
Expand Down
Expand Up @@ -21,9 +21,17 @@
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
Expand All @@ -33,6 +41,9 @@
import org.apache.parquet.thrift.ThriftParquetWriter;
import org.apache.parquet.thrift.test.binary.StringAndBinary;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

public class TestBinary {
@Rule
public TemporaryFolder tempDir = new TemporaryFolder();
Expand All @@ -57,10 +68,22 @@ public void testBinary() throws IOException {
build(path)
.withThriftClass(StringAndBinary.class)
.build();


StringAndBinary record = reader.read();
reader.close();

Assert.assertEquals("Should match after serialization round trip",
assertSchema(ParquetFileReader.readFooter(new Configuration(), path));
assertEquals("Should match after serialization round trip",
expected, record);
}

/**
 * Checks the schema stored in the given file metadata.
 *
 * <p>Expects exactly two fields: a required BINARY field named "s" whose
 * original type is UTF8, followed by a required BINARY field named "b" that
 * has no original type.
 *
 * @param parquetMetadata the metadata whose file schema is verified
 */
private void assertSchema(ParquetMetadata parquetMetadata) {
  final PrimitiveType.PrimitiveTypeName binary = PrimitiveType.PrimitiveTypeName.BINARY;
  List<Type> columns = parquetMetadata.getFileMetaData().getSchema().getFields();
  assertEquals(2, columns.size());

  // First field: must be annotated as UTF8.
  Type stringColumn = columns.get(0);
  assertEquals(Types.required(binary).named("s"), stringColumn);
  assertEquals(OriginalType.UTF8, stringColumn.getOriginalType());

  // Second field: must carry no original type at all.
  Type binaryColumn = columns.get(1);
  assertEquals(Types.required(binary).named("b"), binaryColumn);
  assertNull(binaryColumn.getOriginalType());
}
}

0 comments on commit da3e8eb

Please sign in to comment.