Skip to content

Commit

Permalink
NUTCH-2367 Get single record from HostDB
Browse files Browse the repository at this point in the history
  • Loading branch information
Markus Jelsma committed Mar 16, 2017
1 parent 3926910 commit be3aea1
Showing 1 changed file with 37 additions and 2 deletions.
39 changes: 37 additions & 2 deletions src/java/org/apache/nutch/hostdb/ReadHostDb.java
Expand Up @@ -30,9 +30,11 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
Expand Down Expand Up @@ -200,6 +202,29 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean
long end = System.currentTimeMillis();
LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

private void getHostDbRecord(Path hostDb, String host) throws Exception {
Configuration conf = getConf();
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, hostDb);

Class<?> keyClass = readers[0].getKeyClass();
Class<?> valueClass = readers[0].getValueClass();

if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
throw new IOException("Incompatible key (" + keyClass.getName() + ")");

Text key = (Text) keyClass.newInstance();
HostDatum value = (HostDatum) valueClass.newInstance();

for (int i = 0; i < readers.length; i++) {
while (readers[i].next(key, value)) {
if (host.equals(key.toString())) {
System.out.println(value.toString());
}
}
readers[i].close();
}
}

public static void main(String args[]) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args);
Expand All @@ -208,13 +233,14 @@ public static void main(String args[]) throws Exception {

public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err.println("Usage: ReadHostDb <hostdb> <output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]");
System.err.println("Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]");
return -1;
}

boolean dumpHomepages = false;
boolean dumpHostnames = false;
String expr = null;
String get = null;

for (int i = 0; i < args.length; i++) {
if (args[i].equals("-dumpHomepages")) {
Expand All @@ -225,6 +251,11 @@ public int run(String[] args) throws Exception {
LOG.info("ReadHostDb: dumping hostnames");
dumpHostnames = true;
}
if (args[i].equals("-get")) {
get = args[i + 1];
LOG.info("ReadHostDb: get: "+ get);
i++;
}
if (args[i].equals("-expr")) {
expr = args[i + 1];
LOG.info("ReadHostDb: evaluating expression: " + expr);
Expand All @@ -233,7 +264,11 @@ public int run(String[] args) throws Exception {
}

try {
readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
if (get != null) {
getHostDbRecord(new Path(args[0], "current"), get);
} else {
readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
}
return 0;
} catch (Exception e) {
LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));
Expand Down

0 comments on commit be3aea1

Please sign in to comment.