From 4372478aecfa9f6a7e017a830653f88a973642cc Mon Sep 17 00:00:00 2001 From: "piyush.mukati" Date: Wed, 15 Nov 2017 11:00:43 +0530 Subject: [PATCH 1/2] ORC-264 schema evolution columns name comparising is case unaware --- .../org/apache/orc/impl/SchemaEvolution.java | 5 ++-- .../apache/orc/impl/TestSchemaEvolution.java | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index 886d58539e..bd9dba196d 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -400,11 +400,12 @@ void buildConversion(TypeDescription fileType, List fileFieldNames = fileType.getFieldNames(); Map fileTypesIdx = new HashMap<>(); for (int i = 0; i < fileFieldNames.size(); i++) { - fileTypesIdx.put(fileFieldNames.get(i), fileChildren.get(i)); + final String fileFieldName = fileFieldNames.get(i).toLowerCase(); + fileTypesIdx.put(fileFieldName, fileChildren.get(i)); } for (int i = 0; i < readerFieldNames.size(); i++) { - String readerFieldName = readerFieldNames.get(i); + final String readerFieldName = readerFieldNames.get(i).toLowerCase(); TypeDescription readerField = readerChildren.get(i); TypeDescription fileField = fileTypesIdx.get(readerFieldName); diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java index 82f823aff9..63c0d3d07e 100644 --- a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java +++ b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java @@ -718,6 +718,35 @@ public void testAddFieldBeforeEndOfStruct() { original = fileType.getChildren().get(1); assertSame(original, mapped); } + @Test + public void testCaseMismatchInReaderAndWriterSchema() { + TypeDescription fileType = + TypeDescription.fromString("struct,c:string>"); + TypeDescription readerType = + TypeDescription.fromString("struct,c:string>"); + boolean[] included = includeAll(readerType); + options.tolerateMissingSchema(false); + SchemaEvolution transition = + new SchemaEvolution(fileType, readerType, options.include(included)); + + // a -> A + TypeDescription reader = readerType.getChildren().get(0); + TypeDescription mapped = transition.getFileType(reader); + TypeDescription original = fileType.getChildren().get(0); + assertSame(original, mapped); + + // a.b -> a.b + TypeDescription readerChild = reader.getChildren().get(0); + mapped = transition.getFileType(readerChild); + TypeDescription originalChild = original.getChildren().get(0); + assertSame(originalChild, mapped); + + // c -> c + reader = readerType.getChildren().get(1); + mapped = transition.getFileType(reader); + original = fileType.getChildren().get(1); + assertSame(original, mapped); + } /** * Two structs can be equal but in different locations. They can converge to this. From d510acbeaa4b55bf7a3b14a8f35dbb620a0cc7b3 Mon Sep 17 00:00:00 2001 From: "piyush.mukati" Date: Sat, 16 Dec 2017 16:22:50 +0530 Subject: [PATCH 2/2] ORC-264 added isSchemaEvolutionCaseAware configurable --- .../core/src/java/org/apache/orc/OrcConf.java | 4 ++- java/core/src/java/org/apache/orc/Reader.java | 14 ++++++++ .../org/apache/orc/impl/SchemaEvolution.java | 34 +++++++++++++++++-- .../apache/orc/impl/TestSchemaEvolution.java | 3 +- 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java index 319366e309..443da878ea 100644 --- a/java/core/src/java/org/apache/orc/OrcConf.java +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -150,7 +150,9 @@ public enum OrcConf { "added to all of the writers. Valid range is [1,10000] and is primarily meant for" + "n\testing. Setting this too low may negatively affect performance."), OVERWRITE_OUTPUT_FILE("orc.overwrite.output.file", "orc.overwrite.output.file", false, - "A boolean flag to enable overwriting of the output file if it already exists.\n") + "A boolean flag to enable overwriting of the output file if it already exists.\n"), + IS_SCHEMA_EVOLUTION_CASE_SENSITIVE("orc.schema.evolution.case.sensitive", "orc.schema.evolution.case.sensitive", true, + "A boolean flag to determine if the comparision of field names in schema evolution is case sensitive .\n") ; private final String attribute; diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java index 2ef64d72a8..c5721fcd55 100644 --- a/java/core/src/java/org/apache/orc/Reader.java +++ b/java/core/src/java/org/apache/orc/Reader.java @@ -160,6 +160,7 @@ public static class Options implements Cloneable { private DataReader dataReader = null; private Boolean tolerateMissingSchema = null; private boolean forcePositionalEvolution; + private boolean isSchemaEvolutionCaseAware; public Options() { // PASS @@ -170,6 +171,7 @@ public Options(Configuration conf) { skipCorruptRecords = OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf); tolerateMissingSchema = OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf); forcePositionalEvolution = OrcConf.FORCE_POSITIONAL_EVOLUTION.getBoolean(conf); + isSchemaEvolutionCaseAware = OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.getBoolean(conf); } /** @@ -262,6 +264,17 @@ public Options forcePositionalEvolution(boolean value) { return this; } + /** + * Set boolean flag to determine if the comparision of field names in schema evolution is case sensitive + * @param value the flag for schema evolution is case sensitive or not. + * @return + */ + public Options isSchemaEvolutionCaseAware(boolean value) { + this.isSchemaEvolutionCaseAware = value; + return this; + } + + public boolean[] getInclude() { return include; } @@ -309,6 +322,7 @@ public DataReader getDataReader() { public boolean getForcePositionalEvolution() { return forcePositionalEvolution; } + public boolean getIsSchemaEvolutionCaseAware() { return isSchemaEvolutionCaseAware; } public Options clone() { try { diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index bd9dba196d..fd38260b29 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -48,6 +48,7 @@ public class SchemaEvolution { private final TypeDescription readerSchema; private boolean hasConversion = false; private final boolean isAcid; + private final boolean isSchemaEvolutionCaseAware; // indexed by reader column id private final boolean[] ppdSafeConversion; @@ -57,6 +58,7 @@ public class SchemaEvolution { private static final Pattern missingMetadataPattern = Pattern.compile("_col\\d+"); + public static class IllegalEvolutionException extends RuntimeException { public IllegalEvolutionException(String msg) { super(msg); @@ -68,6 +70,7 @@ public SchemaEvolution(TypeDescription fileSchema, Reader.Options options) { boolean allowMissingMetadata = options.getTolerateMissingSchema(); boolean[] includedCols = options.getInclude(); + this.isSchemaEvolutionCaseAware=options.getIsSchemaEvolutionCaseAware(); this.readerIncluded = includedCols == null ? null : Arrays.copyOf(includedCols, includedCols.length); this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1]; @@ -398,14 +401,21 @@ void buildConversion(TypeDescription fileType, if (positionalLevels == 0) { List readerFieldNames = readerType.getFieldNames(); List fileFieldNames = fileType.getFieldNames(); - Map fileTypesIdx = new HashMap<>(); + + final Map fileTypesIdx; + if ( isSchemaEvolutionCaseAware) { + fileTypesIdx = new HashMap<>(); + } + else { + fileTypesIdx = new CaseInsensitiveMap(); + } for (int i = 0; i < fileFieldNames.size(); i++) { - final String fileFieldName = fileFieldNames.get(i).toLowerCase(); + final String fileFieldName = fileFieldNames.get(i); fileTypesIdx.put(fileFieldName, fileChildren.get(i)); } for (int i = 0; i < readerFieldNames.size(); i++) { - final String readerFieldName = readerFieldNames.get(i).toLowerCase(); + final String readerFieldName = readerFieldNames.get(i); TypeDescription readerField = readerChildren.get(i); TypeDescription fileField = fileTypesIdx.get(readerFieldName); @@ -512,4 +522,22 @@ static TypeDescription getBaseRow(TypeDescription typeDescription) { acidEventFieldNames.add("currentTransaction"); acidEventFieldNames.add("row"); } + + + + private static class CaseInsensitiveMap extends HashMap { + @Override + public V put(String key, V value) { + return super.put(key.toLowerCase(), value); + } + + @Override + public V get(Object key) { + return this.get((String) key); + } + // not @Override as key to be of type Object + public V get(String key) { + return super.get(key.toLowerCase()); + } + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java index 63c0d3d07e..510e00c93d 100644 --- a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java +++ b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java @@ -46,6 +46,7 @@ import org.apache.orc.TypeDescription; import org.apache.orc.Writer; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; @@ -727,7 +728,7 @@ public void testCaseMismatchInReaderAndWriterSchema() { boolean[] included = includeAll(readerType); options.tolerateMissingSchema(false); SchemaEvolution transition = - new SchemaEvolution(fileType, readerType, options.include(included)); + new SchemaEvolution(fileType, readerType, options.include(included).isSchemaEvolutionCaseAware(false)); // a -> A TypeDescription reader = readerType.getChildren().get(0);