Skip to content

Commit

Permalink
PARQUET-357: Parquet-thrift generates wrong schema for Thrift binary fields
Browse files Browse the repository at this point in the history
  • Loading branch information
nkollar committed Dec 6, 2017
1 parent 81f4801 commit 4bf8089
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ public ConvertedField visit(I64Type i64Type, State state) {

/**
 * Converts a Thrift {@code StringType} field into a Parquet BINARY primitive.
 *
 * <p>A {@code StringType} flagged as binary is emitted as plain BINARY, while every
 * other {@code StringType} is emitted as BINARY annotated with UTF8, so raw bytes
 * are not labelled as text in the resulting schema.
 *
 * @param stringType the Thrift type being visited
 * @param state the conversion state handed through to {@code visitPrimitiveType}
 * @return the converted field produced by {@code visitPrimitiveType}
 */
@Override
public ConvertedField visit(StringType stringType, State state) {
  if (stringType.isBinary()) {
    // Binary payload: deliberately no UTF8 annotation.
    return visitPrimitiveType(BINARY, state);
  }
  return visitPrimitiveType(BINARY, UTF8, state);
}

private static boolean isUnion(StructOrUnionType s) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ private static ThriftField toThriftField(String name, Field field, ThriftField.R
break;
case STRING:
type = new StringType();
if (field.getFieldMetaData() != null && field.getFieldMetaData().valueMetaData.isBinary()) {
((StringType) type).setBinary(true);
}
break;
case STRUCT:
type = toStructType(field.gettStructDescriptor());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -641,11 +641,21 @@ public void accept(TypeVisitor visitor) {
}

public static class StringType extends ThriftType {
private boolean binary = false;

/**
 * Creates a string type by passing {@code STRING} to the superclass constructor.
 * The {@code binary} flag keeps its initial value of {@code false} until
 * {@link #setBinary(boolean)} is called.
 */
@JsonCreator
public StringType() {
super(STRING);
}

/**
 * Reports whether this type has been flagged as binary.
 *
 * @return the current value of the binary flag; {@code false} unless it was changed
 *     through {@link #setBinary(boolean)}
 */
public boolean isBinary() {
  return this.binary;
}

/**
 * Sets the flag reported by {@link #isBinary()}.
 *
 * @param binary the new value of the binary flag
 */
public void setBinary(boolean binary) {
this.binary = binary;
}

@Override
public <R, S> R accept(StateVisitor<R, S> visitor, S state) {
return visitor.visit(this, state);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,17 @@
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
Expand All @@ -33,6 +41,9 @@
import org.apache.parquet.thrift.ThriftParquetWriter;
import org.apache.parquet.thrift.test.binary.StringAndBinary;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

public class TestBinary {
@Rule
public TemporaryFolder tempDir = new TemporaryFolder();
Expand All @@ -57,10 +68,22 @@ public void testBinary() throws IOException {
build(path)
.withThriftClass(StringAndBinary.class)
.build();


StringAndBinary record = reader.read();
reader.close();

Assert.assertEquals("Should match after serialization round trip",
assertSchema(ParquetFileReader.readFooter(new Configuration(), path));
assertEquals("Should match after serialization round trip",
expected, record);
}

/**
 * Checks the schema recorded in the given footer metadata: exactly two fields, both
 * required BINARY, where "s" carries the UTF8 original type and "b" carries none.
 *
 * @param parquetMetadata footer metadata whose file schema is inspected
 */
private void assertSchema(ParquetMetadata parquetMetadata) {
  final PrimitiveType.PrimitiveTypeName binaryName = PrimitiveType.PrimitiveTypeName.BINARY;
  List<Type> schemaFields = parquetMetadata.getFileMetaData().getSchema().getFields();
  assertEquals(2, schemaFields.size());

  // First field: "s", expected to be annotated as UTF8.
  Type stringField = schemaFields.get(0);
  assertEquals(Types.required(binaryName).named("s"), stringField);
  assertEquals(OriginalType.UTF8, stringField.getOriginalType());

  // Second field: "b", expected to have no original type at all.
  Type binaryField = schemaFields.get(1);
  assertEquals(Types.required(binaryName).named("b"), binaryField);
  assertNull(binaryField.getOriginalType());
}
}

0 comments on commit 4bf8089

Please sign in to comment.