diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java
index d5deefa6843..be3e57d0e8f 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java
@@ -24,6 +24,7 @@
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.drill.exec.store.parquet.Metadata.ParquetTableMetadata_v1;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
@@ -44,6 +45,10 @@ public class FileSelection {
   public List files;
   public String selectionRoot;
 
+  // This is a temporary location for the reference to the Parquet metadata.
+  // TODO: ideally this should live in a Parquet-specific derived class.
+  private ParquetTableMetadata_v1 parquetMeta = null;
+
   public FileSelection() {
   }
@@ -60,6 +65,13 @@ public FileSelection(List statuses) {
     this(statuses, null);
   }
 
+  public FileSelection(List files, String selectionRoot,
+                       ParquetTableMetadata_v1 meta) {
+    this.files = files;
+    this.selectionRoot = selectionRoot;
+    this.parquetMeta = meta;
+  }
+
   public FileSelection(List statuses, String selectionRoot) {
     this.statuses = statuses;
     this.files = Lists.newArrayList();
@@ -128,6 +140,16 @@ public List getFileStatusList(DrillFileSystem fs) throws IOException
     return statuses;
   }
 
+  /**
+   * Returns the Parquet table metadata that may have been read from a
+   * metadata cache file when this file selection was created. It is null
+   * for non-Parquet selections and for selections where no metadata cache
+   * file was read.
+   */
+  public ParquetTableMetadata_v1 getParquetMetadata() {
+    return parquetMeta;
+  }
+
   private static String commonPath(FileStatus... paths) {
     String commonPath = "";
     String[][] folders = new String[paths.length][];
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java
index ac0d8e335ef..c30806c42de 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java
@@ -230,7 +230,7 @@ private FileSelection expandSelection(DrillFileSystem fs, FileSelection selectio
       // /a/b/c.parquet and the format of the selection root must match that of the file names
       // otherwise downstream operations such as partition pruning can break.
       Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(metaRootDir.getPath());
-      return new FileSelection(fileNames, metaRootPath.toString(), true);
+      return new FileSelection(fileNames, metaRootPath.toString(), metadata /* save metadata for future use */);
     } else {
       // don't expand yet; ParquetGroupScan's metadata gathering operation
       // does that.
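For context (not part of the patch itself), here is a minimal, self-contained sketch of the pattern the changes above and below set up: the selection-expansion step reads the Parquet metadata cache once and stashes it on the selection, and the group scan re-uses that object instead of reading the cache file a second time. All names below (TableMetadata, Selection, GroupScan, readCacheFile) are simplified stand-ins for illustration only, not Drill APIs.

    import java.util.Arrays;
    import java.util.List;

    // Simplified stand-ins for ParquetTableMetadata_v1 / FileSelection / ParquetGroupScan;
    // illustration only, not Drill classes.
    public class MetadataReuseSketch {

      static class TableMetadata {                  // stands in for ParquetTableMetadata_v1
        final List<String> files;
        TableMetadata(List<String> files) { this.files = files; }
      }

      static class Selection {                      // stands in for FileSelection
        final List<String> files;
        final String selectionRoot;
        private final TableMetadata meta;           // null when no cache file was read

        Selection(List<String> files, String selectionRoot, TableMetadata meta) {
          this.files = files;
          this.selectionRoot = selectionRoot;
          this.meta = meta;
        }

        TableMetadata getMetadata() { return meta; }
      }

      static class GroupScan {                      // stands in for ParquetGroupScan
        private TableMetadata metadata;

        GroupScan(Selection selection) {
          this.metadata = selection.getMetadata();  // pick up whatever the expansion step cached
          init();
        }

        private void init() {
          if (metadata == null) {                   // only read the cache file if nothing was passed in
            metadata = readCacheFile();
          }
        }

        private TableMetadata readCacheFile() {
          System.out.println("reading metadata cache file (expensive)");
          return new TableMetadata(Arrays.asList("a.parquet", "b.parquet"));
        }
      }

      public static void main(String[] args) {
        // The expansion step already read the cache once and attached it to the selection,
        // so constructing the scan does not read it again.
        TableMetadata cached = new TableMetadata(Arrays.asList("a.parquet", "b.parquet"));
        Selection selection = new Selection(cached.files, "/data/table", cached);
        new GroupScan(selection);
      }
    }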
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
index 094f00fb388..55eed43fc3f 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java
@@ -119,6 +119,12 @@ public class ParquetGroupScan extends AbstractFileGroupScan {
   private List columns;
   private ListMultimap mappings;
   private List rowGroupInfos;
+  /**
+   * The Parquet table metadata may already have been read from a metadata
+   * cache file when the file selection was created; if so, it is re-used
+   * here so the ParquetGroupScan avoids the extra loading time.
+   */
+  private ParquetTableMetadata_v1 parquetTableMetadata = null;
 
   /*
    * total number of rows (obtained from parquet footer)
@@ -177,6 +183,7 @@ public ParquetGroupScan(
     // }
 
     this.selectionRoot = selectionRoot;
+    this.parquetTableMetadata = selection.getParquetMetadata();
 
     init();
   }
@@ -201,6 +208,7 @@ private ParquetGroupScan(ParquetGroupScan that) {
     this.partitionValueMap = that.partitionValueMap == null ? null : new HashMap(that.partitionValueMap);
     this.fileSet = that.fileSet == null ? null : new HashSet(that.fileSet);
     this.usedMetadataCache = that.usedMetadataCache;
+    this.parquetTableMetadata = that.parquetTableMetadata;
   }
@@ -486,7 +494,6 @@ public void setEndpointByteMap(EndpointByteMap byteMap) {
   }
 
   private void init() throws IOException {
-    ParquetTableMetadata_v1 parquetTableMetadata;
     List fileStatuses = null;
     if (entries.size() == 1) {
       Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
@@ -498,7 +505,9 @@ private void init() throws IOException {
       }
       if (metaPath != null && fs.exists(metaPath)) {
         usedMetadataCache = true;
-        parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString());
+        if (parquetTableMetadata == null) {
+          parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString());
+        }
       } else {
         parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString());
       }
@@ -508,9 +517,15 @@ private void init() throws IOException {
      if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
        usedMetadataCache = true;
        if (fileSet != null) {
-         parquetTableMetadata = removeUnneededRowGroups(Metadata.readBlockMeta(fs, metaPath.toString()));
+         if (parquetTableMetadata == null) {
+           parquetTableMetadata = removeUnneededRowGroups(Metadata.readBlockMeta(fs, metaPath.toString()));
+         } else {
+           parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
+         }
        } else {
-         parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString());
+         if (parquetTableMetadata == null) {
+           parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString());
+         }
        }
      } else {
        fileStatuses = Lists.newArrayList();