Skip to content

Commit

Permalink
NUTCH-2095 WARC exporter for the CommonCrawlDataDumper
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1704594 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jorgelbg committed Sep 22, 2015
1 parent 1e63bc1 commit 23c7761
Show file tree
Hide file tree
Showing 13 changed files with 1,266 additions and 590 deletions.
18 changes: 18 additions & 0 deletions conf/nutch-default.xml
Expand Up @@ -1878,4 +1878,22 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<property>
<name>store.http.request</name>
<value>false</value>
<description>
Store the raw request made by Nutch, required to use the CommonCrawlDataDumper
tool for the WARC format.
</description>
</property>

<property>
<name>store.http.headers</name>
<value>false</value>
<description>
Store the raw headers received by Nutch from the server, required to use the
CommonCrawlDataDumper tool for the WARC format.
</description>
</property>

</configuration>
5 changes: 5 additions & 0 deletions ivy/ivy.xml
Expand Up @@ -81,6 +81,11 @@
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/>
<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/>

<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
<exclude module="hadoop-core"/>
</dependency>

<!--artifacts needed for testing -->
<dependency org="junit" name="junit" rev="4.11" conf="test->default" />
<!--dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" conf="test->default" /-->
Expand Down
160 changes: 96 additions & 64 deletions src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
Expand Up @@ -18,12 +18,19 @@
package org.apache.nutch.tools;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.URLEncoder;
import java.net.UnknownHostException;
import java.text.ParseException;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.commons.lang.NotImplementedException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -35,50 +42,66 @@
*
*/
public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
/** Logger for this class; protected so that subclasses can log through it as well. */
protected static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class);

/** URL of the document currently being exported. */
protected String url;

/** Fetched content of the document currently being exported. */
protected Content content;

/** Metadata of the document; the response getters read their values from it. */
protected Metadata metadata;

/** Nutch configuration; the request getters read the {@code http.*} settings from it. */
protected Configuration conf;

/** Prefix prepended to the value written under the {@code "key"} entry. */
protected String keyPrefix;

/** When {@code true}, date values are parsed and emitted as epoch values instead of raw strings. */
protected boolean simpleDateFormat;

/** When {@code true}, headers are written as arrays instead of nested objects. */
protected boolean jsonArray;

/** When {@code true}, {@link #reverseKeyValue} is used as the key instead of the URL. */
protected boolean reverseKey;

/** Key value used when {@link #reverseKey} is enabled. */
protected String reverseKeyValue;

/**
 * Creates a format bound to one document and to the given export options.
 *
 * @param url URL of the document
 * @param content fetched content of the document
 * @param metadata metadata of the document
 * @param nutchConf Nutch configuration, kept for the request-related getters
 * @param config supplies the key prefix, date format, JSON array and reverse key options
 * @throws IOException declared by the signature; kept for compatibility with existing
 *         callers and subclasses
 */
public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata,
    Configuration nutchConf, CommonCrawlConfig config) throws IOException {
  // Document being exported.
  this.url = url;
  this.content = content;
  this.metadata = metadata;
  this.conf = nutchConf;

  // Export options, copied once so that later calls do not depend on the config object.
  this.keyPrefix = config.getKeyPrefix();
  this.simpleDateFormat = config.getSimpleDateFormat();
  this.jsonArray = config.getJsonArray();
  this.reverseKey = config.getReverseKey();
  this.reverseKeyValue = config.getReverseKeyValue();
}

/**
 * Rebinds this format to another document and serializes it.
 *
 * @param url URL of the document
 * @param content fetched content of the document
 * @param metadata metadata of the document
 * @return the result of the no-argument {@code getJsonData()} for the new document
 * @throws IOException if the no-argument variant fails
 */
public String getJsonData(String url, Content content, Metadata metadata)
    throws IOException {
  // Swap in the new document first; the no-argument variant reads these fields.
  this.metadata = metadata;
  this.content = content;
  this.url = url;

  return getJsonData();
}

/**
 * Variant that additionally receives the parse data of the document.
 *
 * <p>This base implementation always throws; formats that support parse data are
 * expected to override it.
 *
 * @throws NotImplementedException always, in this base implementation
 */
public String getJsonData(String url, Content content, Metadata metadata,
    ParseData parseData) throws IOException {
  // Overriding this method is required in the actual formats.
  final NotImplementedException notOverridden = new NotImplementedException();
  throw notOverridden;
}

@Override
public String getJsonData() throws IOException {
try {
startObject(null);

// url
writeKeyValue("url", getUrl());

// timestamp
writeKeyValue("timestamp", getTimestamp());

// request
startObject("request");
writeKeyValue("method", getMethod());
Expand All @@ -102,7 +125,7 @@ public String getJsonData() throws IOException {
closeHeaders("headers", false, true);
writeKeyNull("body");
closeObject("request");

// response
startObject("response");
writeKeyValue("status", getResponseStatus());
Expand All @@ -125,50 +148,56 @@ public String getJsonData() throws IOException {
closeHeaders("headers", false, true);
writeKeyValue("body", getResponseContent());
closeObject("response");

// key
if (!this.keyPrefix.isEmpty()) {
this.keyPrefix += "-";
}
writeKeyValue("key", this.keyPrefix + getKey());

// imported
writeKeyValue("imported", getImported());

closeObject(null);

return generateJson();

} catch (IOException ioe) {
LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
throw new IOException("Error in generating JSON:" + ioe.getMessage());
throw new IOException("Error in generating JSON:" + ioe.getMessage());
}
}

// abstract methods

/** Writes a key/value entry; how it is serialized is up to the concrete format. */
protected abstract void writeKeyValue(String key, String value) throws IOException;

/** Writes an entry for {@code key} that carries no value (used for the request body). */
protected abstract void writeKeyNull(String key) throws IOException;

/**
 * Opens an array. Callers in this class pass {@code null} as key for anonymous arrays.
 * NOTE(review): the exact meaning of {@code nested} and {@code newline} is defined by the
 * implementations and is not visible here.
 */
protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException;

/** Closes an array; counterpart of {@code startArray} and takes the same arguments. */
protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException;

/** Writes a single value into the array that is currently open. */
protected abstract void writeArrayValue(String value) throws IOException;

/** Opens an object; {@code getJsonData()} passes {@code null} for the outermost object. */
protected abstract void startObject(String key) throws IOException;

/** Closes an object; counterpart of {@code startObject}. */
protected abstract void closeObject(String key) throws IOException;

/** Returns the serialized output; {@code getJsonData()} calls it after closing the outermost object. */
protected abstract String generateJson() throws IOException;

// getters

/**
 * Returns the document URL, path-encoded with {@code URIUtil.encodePath}.
 *
 * @return the encoded URL, or the URL unchanged when it cannot be encoded
 */
protected String getUrl() {
  try {
    return URIUtil.encodePath(url);
  } catch (URIException e) {
    // Keep the exception as the cause so the reason for the failure shows up in the log.
    LOG.error("Can't encode URL " + url, e);
  }

  // Best effort: fall back to the URL as it was given.
  return url;
}

protected String getTimestamp() {
if (this.simpleDateFormat) {
String timestamp = null;
Expand All @@ -183,88 +212,88 @@ protected String getTimestamp() {
return ifNullString(metadata.get(Metadata.LAST_MODIFIED));
}
}

/**
 * Returns the HTTP method reported for the request.
 *
 * @return always {@code "GET"}
 */
protected String getMethod() {
  // The literal is returned directly; wrapping it in new String(...) only allocates a copy.
  return "GET";
}

/**
 * Returns the host name of the local machine.
 *
 * @return the local host name, or an empty string when it cannot be resolved
 */
protected String getRequestHostName() {
  try {
    return InetAddress.getLocalHost().getHostName();
  } catch (UnknownHostException ignored) {
    // Best effort: an unresolvable local host yields an empty host name.
    return "";
  }
}

/**
 * Returns the IP address of the local machine in textual form.
 *
 * @return the local host address, or an empty string when it cannot be resolved
 */
protected String getRequestHostAddress() {
  try {
    return InetAddress.getLocalHost().getHostAddress();
  } catch (UnknownHostException ignored) {
    // Best effort: an unresolvable local host yields an empty address.
    return "";
  }
}

/** Returns the {@code http.agent.version} setting, with an empty string as the default. */
protected String getRequestSoftware() {
  final String agentVersion = conf.get("http.agent.version", "");
  return agentVersion;
}

/**
 * Returns the robots value reported for the request.
 *
 * @return always {@code "CLASSIC"}
 */
protected String getRequestRobots() {
  // The literal is returned directly; wrapping it in new String(...) only allocates a copy.
  return "CLASSIC";
}

/** Returns the {@code http.agent.name} setting, with an empty string as the default. */
protected String getRequestContactName() {
  final String agentName = conf.get("http.agent.name", "");
  return agentName;
}

/** Returns the {@code http.agent.email} setting, with an empty string as the default. */
protected String getRequestContactEmail() {
  final String agentEmail = conf.get("http.agent.email", "");
  return agentEmail;
}

/** Returns the {@code http.accept} setting, with an empty string as the default. */
protected String getRequestAccept() {
  final String accept = conf.get("http.accept", "");
  return accept;
}

/**
 * Returns the accept-encoding value reported for the request.
 *
 * @return currently always an empty string
 */
protected String getRequestAcceptEncoding() {
  // The literal is returned directly; wrapping it in new String(...) only allocates a copy.
  return ""; // TODO
}

/** Returns the {@code http.accept.language} setting, with an empty string as the default. */
protected String getRequestAcceptLanguage() {
  final String acceptLanguage = conf.get("http.accept.language", "");
  return acceptLanguage;
}

/**
 * Returns the {@code http.robots.agents} setting, with an empty string as the default.
 * NOTE(review): this reports the robots agent list as the user agent; confirm that is intended.
 */
protected String getRequestUserAgent() {
  final String robotsAgents = conf.get("http.robots.agents", "");
  return robotsAgents;
}

/** Returns the {@code "status"} metadata value, or an empty string when it is absent. */
protected String getResponseStatus() {
  final String status = metadata.get("status");
  return ifNullString(status);
}

/** Returns the host of the document URL, as extracted by {@code URLUtil.getHost}. */
protected String getResponseHostName() {
  final String host = URLUtil.getHost(url);
  return host;
}

/** Returns the {@code "_ip_"} metadata value, or an empty string when it is absent. */
protected String getResponseAddress() {
  final String address = metadata.get("_ip_");
  return ifNullString(address);
}

/** Returns the {@code "Content-Encoding"} metadata value, or an empty string when it is absent. */
protected String getResponseContentEncoding() {
  final String contentEncoding = metadata.get("Content-Encoding");
  return ifNullString(contentEncoding);
}

/** Returns the {@code "Content-Type"} metadata value, or an empty string when it is absent. */
protected String getResponseContentType() {
  final String contentType = metadata.get("Content-Type");
  return ifNullString(contentType);
}

protected String getResponseDate() {
if (this.simpleDateFormat) {
String timestamp = null;
try {
long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
long epoch = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
timestamp = String.valueOf(epoch);
} catch (ParseException pe) {
LOG.warn(pe.getMessage());
Expand All @@ -274,15 +303,15 @@ protected String getResponseDate() {
return ifNullString(metadata.get("Date"));
}
}

/** Returns the {@code "Server"} metadata value, or an empty string when it is absent. */
protected String getResponseServer() {
  final String server = metadata.get("Server");
  return ifNullString(server);
}

/**
 * Returns the fetched document body as a string.
 *
 * <p>The data comes from {@code content.getContent()} and is converted with the
 * single-argument {@code String} constructor, so no explicit charset is applied.
 *
 * @return the document body as text
 */
protected String getResponseContent() {
  // content is a Content object, so its raw data has to be obtained through getContent().
  return new String(content.getContent());
}

protected String getKey() {
if (this.reverseKey) {
return this.reverseKeyValue;
Expand All @@ -291,7 +320,7 @@ protected String getKey() {
return url;
}
}

protected String getImported() {
if (this.simpleDateFormat) {
String timestamp = null;
Expand All @@ -306,11 +335,11 @@ protected String getImported() {
return ifNullString(metadata.get("Date"));
}
}

/**
 * Replaces {@code null} with an empty string.
 *
 * @param value the string to check, may be {@code null}
 * @return {@code value} itself when it is not {@code null}, otherwise {@code ""}
 */
private static String ifNullString(String value) {
  if (value == null) {
    return "";
  }
  return value;
}

private void startHeaders(String key, boolean nested, boolean newline) throws IOException {
if (this.jsonArray) {
startArray(key, nested, newline);
Expand All @@ -319,7 +348,7 @@ private void startHeaders(String key, boolean nested, boolean newline) throws IO
startObject(key);
}
}

private void closeHeaders(String key, boolean nested, boolean newline) throws IOException {
if (this.jsonArray) {
closeArray(key, nested, newline);
Expand All @@ -328,7 +357,7 @@ private void closeHeaders(String key, boolean nested, boolean newline) throws IO
closeObject(key);
}
}

private void writeKeyValueWrapper(String key, String value) throws IOException {
if (this.jsonArray) {
startArray(null, true, false);
Expand All @@ -340,4 +369,7 @@ private void writeKeyValueWrapper(String key, String value) throws IOException {
writeKeyValue(key, value);
}
}

/** Does nothing in this base class. */
@Override
public void close() {
  // Intentionally empty.
}
}

0 comments on commit 23c7761

Please sign in to comment.