From 1680346a5d8027de9486e374100447398cbd1f0c Mon Sep 17 00:00:00 2001 From: Patrick Mezard Date: Wed, 10 Jun 2020 14:02:49 +0200 Subject: [PATCH] NUTCH-2793 indexer-csv: make it work in distributed mode Before the change, the output file name was hard-coded to "nutch.csv". When running in distributed mode, multiple reducers would clobber each other's output. After the change, the filename is taken from the first open(cfg, name) initialization call, where name is a unique file name generated by IndexerOutputFormat, derived from Hadoop FileOutputFormat. The CSV files are now named like part-r-000xx. --- src/plugin/indexer-csv/README.md | 4 ++-- .../apache/nutch/indexwriter/csv/CSVIndexWriter.java | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md index 1eadea196e..6eb9711465 100644 --- a/src/plugin/indexer-csv/README.md +++ b/src/plugin/indexer-csv/README.md @@ -1,7 +1,7 @@ indexer-csv plugin for Nutch ============================ -**indexer-csv plugin** is used for writing documents to a CSV file. It does not work in distributed mode, the output is written to the local filesystem, not to HDFS, see [NUTCH-1541](https://issues.apache.org/jira/browse/NUTCH-1541). The configuration for the index writers is on **conf/index-writers.xml** file, included in the official Nutch distribution and it's as follow: +**indexer-csv plugin** is used for writing documents to a CSV file. The configuration for the index writers is on **conf/index-writers.xml** file, included in the official Nutch distribution and it's as follow: ```xml @@ -39,4 +39,4 @@ escapechar | Escape character used to escape a quote character | " maxfieldlength | Max. length of a single field value in characters | 4096 maxfieldvalues | Max. 
number of values of one field, useful for, e.g., the anchor texts field | 12 header | Write CSV column headers | true -outpath | Output path / directory (local filesystem path, relative to current working directory) | csvindexwriter \ No newline at end of file +outpath | Output path / directory (local filesystem path, relative to current working directory) | csvindexwriter diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java index 160d03dc11..fcdd31140c 100644 --- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java +++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java @@ -44,17 +44,14 @@ * index as CSV or tab-separated plain text table. Format (encoding, separators, * etc.) is configurable by a couple of options, see output of * {@link #describe()}. - * - *

- * Note: works only in local mode, to be used with index option - * -noCommit. - *

+ * */ public class CSVIndexWriter implements IndexWriter { public static final Logger LOG = LoggerFactory .getLogger(CSVIndexWriter.class); + private String filename = "nutch.csv"; private Configuration config; /** ordered list of fields (columns) in the CSV file */ @@ -192,7 +189,7 @@ protected int find(String value, int start) { @Override public void open(Configuration conf, String name) throws IOException { - + filename = name; } /** @@ -227,7 +224,7 @@ public void open(IndexWriterParams parameters) throws IOException { LOG.info("Writing output to {}", outputPath); Path outputDir = new Path(outputPath); fs = outputDir.getFileSystem(config); - csvLocalOutFile = new Path(outputDir, "nutch.csv"); + csvLocalOutFile = new Path(outputDir, filename); if (!fs.exists(outputDir)) { fs.mkdirs(outputDir); }