From 5d6b3e1ebb97cc11479db6c30b0a1a04986c4967 Mon Sep 17 00:00:00 2001
From: kazu
Date: Fri, 17 Mar 2017 17:24:41 -0700
Subject: [PATCH 1/2] HADOOP-13371. S3A globber to use bulk listObject call over recursive directory scan

---
 .../src/main/java/org/apache/hadoop/fs/Globber.java           | 4 +++-
 .../src/main/java/org/apache/hadoop/fs/Path.java              | 2 ++
 .../src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
index 7c69167c3a125..c9eefb187143b 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
@@ -231,7 +231,9 @@ private FileStatus[] doGlob() throws IOException {
         }
         for (FileStatus candidate : candidates) {
           if (globFilter.hasPattern()) {
-            FileStatus[] children = listStatus(candidate.getPath());
+            Path path = candidate.getPath();
+            path.filter = globFilter;
+            FileStatus[] children = listStatus(path);
             if (children.length == 1) {
               // If we get back only one result, this could be either a listing
               // of a directory with one entry, or it could reflect the fact
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Path.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Path.java
index 252b3cca79a3e..dad043f3ec747 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Path.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Path.java
@@ -63,6 +63,8 @@ public class Path implements Comparable, Serializable, ObjectInputValidation {
   public static final boolean WINDOWS =
     System.getProperty("os.name").startsWith("Windows");
 
+  public PathFilter filter = null;
+
   /**
    * Pre-compiled regular expressions to detect path formats.
    */
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index 1786e68a53a96..0f57653472b12 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -1432,7 +1432,7 @@ public FileStatus[] innerListStatus(Path f) throws FileNotFoundException,
       Listing.FileStatusListingIterator files =
           listing.createFileStatusListingIterator(path,
               request,
-              ACCEPT_ALL,
+              (f.filter == null) ? ACCEPT_ALL : f.filter,
               new Listing.AcceptAllButSelfAndS3nDirs(path));
       result = new ArrayList<>(files.getBatchSize());
       while (files.hasNext()) {

From 2101c282a4def49ce82d3c4c47adc0eb27fd18f2 Mon Sep 17 00:00:00 2001
From: kazu
Date: Fri, 24 Mar 2017 16:36:01 -0700
Subject: [PATCH 2/2] HADOOP-14235. S3A Path does not understand colon (:) when globbing

---
 .../src/main/java/org/apache/hadoop/fs/Globber.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
index c9eefb187143b..519bf39c6f3e2 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java
@@ -256,8 +256,8 @@ private FileStatus[] doGlob() throws IOException {
                 if (!child.isDirectory()) continue;
               }
               // Set the child path based on the parent path.
-              child.setPath(new Path(candidate.getPath(),
-                      child.getPath().getName()));
+              child.setPath(new Path(candidate.getPath().toString() + Path.SEPARATOR +
+                      child.getPath().getName()));
               if (globFilter.accept(child.getPath())) {
                 newCandidates.add(child);
               }