Permalink
Browse files

We want to output the parsed text (in the WritableSeqFile) irrespective of whether we specify the use of Boilerpipe.
  • Loading branch information...
1 parent 9ed2a16 commit facf0a1e2935896d1a2f305228bd37baa090dc18 vivek committed Aug 22, 2012
@@ -22,7 +22,7 @@
public static final String CONTENT_SUBDIR_NAME = "content";
public static final String STATUS_SUBDIR_NAME = "status";
public static final String PARSE_SUBDIR_NAME = "parse";
- public static final String BOILERPIPE_SUBDIR_NAME = "boilerpipe";
+ public static final String EXTRACTED_TEXT_SUBDIR_NAME = "extracted-text";
public static final String WEB_ADDRESS = "http://wiki.github.com/bixo/bixo/bixocrawler";
public static final String EMAIL_ADDRESS = "bixo-dev@yahoogroups.com";
@@ -44,7 +44,7 @@
public static final long MILLISECONDS_PER_MINUTE = 60 * 1000L;
// WritableSequenceFile key value field names
- public static final String WRITABLE_SEQ_FILE_KEY_FN = "urlKey";
- public static final String WRITABLE_SEQ_FILE_VALUE_FN = "boilerpipeValue";
+ public static final String WRITABLE_SEQ_FILE_KEY_FN = "url";
+ public static final String WRITABLE_SEQ_FILE_VALUE_FN = "parsedText";
}
@@ -217,19 +217,17 @@ public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherP
parser.setExtractLanguage(false);
ParsePipe parsePipe = new ParsePipe(contentPipe, parser);
-
Tap writableSeqFileSink = null;
Pipe writableSeqFileDataPipe = null;
- if (options.isUseBoilerpipe()) {
- // Let's output a WritableSequenceFile as an example - this file can then be used as input
- // when working with Mahout.
- writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
-
- Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.BOILERPIPE_SUBDIR_NAME);
- writableSeqFileSink = new Hfs(new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class),
- writableSeqFileDataPath.toString());
- }
+ // Let's output a WritableSequenceFile as an example - this file can
+ // then be used as input when working with Mahout.
+ writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));
+
+ Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
+ writableSeqFileSink = new Hfs(new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class),
+ writableSeqFileDataPath.toString());
+
Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
if (urlFilter != null) {
@@ -266,16 +264,10 @@ public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherP
sinkMap.put(contentPipe.getName(), contentSink);
sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);
+ sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
- Flow flow = null;
FlowConnector flowConnector = new FlowConnector(props);
-
- if (writableSeqFileSink != null) {
- sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
- flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe, writableSeqFileDataPipe);
- } else {
- flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe);
- }
+ Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe, writableSeqFileDataPipe);
return flow;
}

0 comments on commit facf0a1

Please sign in to comment.