Skip to content

Commit

Permalink
NUTCH-2533 Injector: NullPointerException if seed URL dir contains no…
Browse files Browse the repository at this point in the history
…n-file entries

- read directory explicitly and log all non-file entries
- exit early if no seed URL files are present
  • Loading branch information
sebastian-nagel committed Apr 11, 2018
1 parent 19f2358 commit ecebfd5
Showing 1 changed file with 17 additions and 1 deletion.
18 changes: 17 additions & 1 deletion src/java/org/apache/nutch/crawl/InjectorJob.java
Expand Up @@ -21,6 +21,7 @@
import org.apache.gora.persistency.Persistent;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
Expand Down Expand Up @@ -227,7 +228,22 @@ public Map<String, Object> run(Map<String, Object> args) throws Exception {
numJobs = 1;
currentJobNum = 0;
currentJob = NutchJob.getInstance(getConf(), "inject " + input);
FileInputFormat.addInputPath(currentJob, input);
FileStatus[] seedFiles = input.getFileSystem(getConf()).listStatus(input);
int numSeedFiles = 0;
for (FileStatus seedFile : seedFiles) {
if (seedFile.isFile()) {
FileInputFormat.addInputPath(currentJob, seedFile.getPath());
numSeedFiles++;
LOG.info("Injecting seed URL file {}", seedFile.getPath());
} else {
LOG.warn("Skipped non-file input in {}: {}", input,
seedFile.getPath());
}
}
if (numSeedFiles == 0) {
LOG.error("No seed files to inject found in {}", input);
return results;
}
currentJob.setMapperClass(UrlMapper.class);
currentJob.setMapOutputKeyClass(String.class);
currentJob.setMapOutputValueClass(WebPage.class);
Expand Down

0 comments on commit ecebfd5

Please sign in to comment.