Skip to content

Commit

Permalink
We want to output the parsed text (in the WritableSeqFile) irrespective of whether we specify the use of Boilerpipe.
Browse files Browse the repository at this point in the history
  • Loading branch information
vivek authored and vivek committed Aug 22, 2012
1 parent 9ed2a16 commit facf0a1
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 21 deletions.
6 changes: 3 additions & 3 deletions examples/src/main/java/bixo/examples/crawl/CrawlConfig.java
Expand Up @@ -22,7 +22,7 @@ public class CrawlConfig {
public static final String CONTENT_SUBDIR_NAME = "content";
public static final String STATUS_SUBDIR_NAME = "status";
public static final String PARSE_SUBDIR_NAME = "parse";
public static final String BOILERPIPE_SUBDIR_NAME = "boilerpipe";
public static final String EXTRACTED_TEXT_SUBDIR_NAME = "extracted-text";

public static final String WEB_ADDRESS = "http://wiki.github.com/bixo/bixo/bixocrawler";
public static final String EMAIL_ADDRESS = "bixo-dev@yahoogroups.com";
Expand All @@ -44,7 +44,7 @@ public class CrawlConfig {
public static final long MILLISECONDS_PER_MINUTE = 60 * 1000L;

// WritableSequenceFile key value field names
public static final String WRITABLE_SEQ_FILE_KEY_FN = "urlKey";
public static final String WRITABLE_SEQ_FILE_VALUE_FN = "boilerpipeValue";
public static final String WRITABLE_SEQ_FILE_KEY_FN = "url";
public static final String WRITABLE_SEQ_FILE_VALUE_FN = "parsedText";

}
28 changes: 10 additions & 18 deletions examples/src/main/java/bixo/examples/crawl/DemoCrawlWorkflow.java
Expand Up @@ -217,19 +217,17 @@ public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherP
parser.setExtractLanguage(false);
ParsePipe parsePipe = new ParsePipe(contentPipe, parser);


Tap writableSeqFileSink = null;
Pipe writableSeqFileDataPipe = null;

if (options.isUseBoilerpipe()) {
// Let's output a WritableSequenceFile as an example - this file can then be used as input
// when working with Mahout.
writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.BOILERPIPE_SUBDIR_NAME);
writableSeqFileSink = new Hfs(new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class),
writableSeqFileDataPath.toString());
}
// Let's output a WritableSequenceFile as an example - this file can
// then be used as input when working with Mahout.
writableSeqFileDataPipe = new Pipe("writable seqfile data", new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
writableSeqFileSink = new Hfs(new WritableSequenceFile(new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN), Text.class, Text.class),
writableSeqFileDataPath.toString());

Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
if (urlFilter != null) {
Expand Down Expand Up @@ -266,16 +264,10 @@ public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherP
sinkMap.put(contentPipe.getName(), contentSink);
sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);
sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);

Flow flow = null;
FlowConnector flowConnector = new FlowConnector(props);

if (writableSeqFileSink != null) {
sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe, writableSeqFileDataPipe);
} else {
flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe);
}
Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(), outputPipe, writableSeqFileDataPipe);

return flow;
}
Expand Down

0 comments on commit facf0a1

Please sign in to comment.