NUTCH-1024 Dynamically set fetchInterval by MIME-type

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1349226 13f79535-47bb-0310-9956-ffa450edef68
commit b68e570b89a6299e11bd7c1e648f4a7f80561e4c 1 parent 81beec8
Markus Jelsma authored
2  CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1024 Dynamically set fetchInterval by MIME-type (markus)
+
* NUTCH-1364 Add a counter in Generator for malformed urls (lewismc)
* NUTCH-1360 Support the storing of IP address connected to when web crawling (lewismc)
23 conf/adaptive-mimetypes.txt
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This configuration file is used by the MimeAdaptiveFetchSchedule and
+# allows the user to set the INC and DEC rates of the AdaptiveFetchSchedule
+# per MIME-type. Values are separated by tabs.
+
+# MIME-type inc_rate dec_rate
+text/html 0.2 0.2
+application/xhtml+xml 0.2 0.2
+application/pdf 0.1 0.4
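
The inc_rate and dec_rate columns plug straight into the adaptive formula: after a fetch that found the page unmodified the interval is multiplied by (1 + inc_rate), and after a fetch that found it modified by (1 - dec_rate). A minimal, self-contained sketch of that arithmetic using the application/pdf row above (class name is illustrative):

public class RateDemo {
  public static void main(String[] args) {
    float interval = 3600f * 24f * 30f; // current fetch interval: 30 days, in seconds
    float incRate = 0.1f;               // inc_rate for application/pdf above
    float decRate = 0.4f;               // dec_rate for application/pdf above
    // Page not modified since the last fetch: back off (30 -> 33 days)
    float slower = interval * (1.0f + incRate);
    // Page modified since the last fetch: revisit sooner (30 -> 18 days)
    float faster = interval * (1.0f - decRate);
    System.out.println((slower / 86400) + " days vs " + (faster / 86400) + " days");
  }
}

So a low inc_rate grows the interval slowly when a document keeps coming back unchanged, while a high dec_rate shrinks it sharply as soon as a change is observed.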
7 conf/nutch-default.xml
@@ -409,6 +409,13 @@
</property>
<property>
+ <name>db.fetch.schedule.mime.file</name>
+ <value>adaptive-mimetypes.txt</value>
+ <description>The configuration file for the MimeAdaptiveFetchSchedule.
+ </description>
+</property>
+
+<property>
<name>db.update.additions.allowed</name>
<value>true</value>
<description>If true, updatedb will add newly discovered URLs, if false
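
Note that this property only takes effect when the MIME-aware scheduler is actually selected. A hedged sketch of wiring both properties up programmatically, assuming the standard db.fetch.schedule.class selector property (in a real deployment these overrides would normally live in nutch-site.xml):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class ScheduleSetup {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // Select the MIME-aware scheduler instead of the default FetchSchedule
    conf.set("db.fetch.schedule.class",
        "org.apache.nutch.crawl.MimeAdaptiveFetchSchedule");
    // Point it at the per-MIME rate table (same as the default value above)
    conf.set("db.fetch.schedule.mime.file", "adaptive-mimetypes.txt");
    System.out.println(conf.get("db.fetch.schedule.class"));
  }
}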
75 src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -19,9 +19,14 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* This class implements an adaptive re-fetch algorithm. This works as follows:
* <ul>
@@ -53,9 +58,12 @@
*/
public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
- private float INC_RATE;
+ // Logger
+ public static final Logger LOG = LoggerFactory.getLogger(AdaptiveFetchSchedule.class);
+
+ protected float INC_RATE;
- private float DEC_RATE;
+ protected float DEC_RATE;
private int MAX_INTERVAL;
@@ -82,30 +90,39 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long fetchTime, long modifiedTime, int state) {
super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
- long refTime = fetchTime;
- if (modifiedTime <= 0) modifiedTime = fetchTime;
+
float interval = datum.getFetchInterval();
- switch (state) {
- case FetchSchedule.STATUS_MODIFIED:
- interval *= (1.0f - DEC_RATE);
- break;
- case FetchSchedule.STATUS_NOTMODIFIED:
- interval *= (1.0f + INC_RATE);
- break;
- case FetchSchedule.STATUS_UNKNOWN:
- break;
- }
- if (SYNC_DELTA) {
- // try to synchronize with the time of change
- long delta = (fetchTime - modifiedTime) / 1000L;
- if (delta > interval) interval = delta;
- refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
- }
- if (interval < MIN_INTERVAL) {
- interval = MIN_INTERVAL;
- } else if (interval > MAX_INTERVAL) {
- interval = MAX_INTERVAL;
+ long refTime = fetchTime;
+
+ if (datum.getMetaData().containsKey(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY)) {
+ // A custom fetch interval is preset in the CrawlDatum metadata, so use it as-is
+ FloatWritable customIntervalWritable = (FloatWritable) datum.getMetaData().get(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY);
+ interval = customIntervalWritable.get();
+ } else {
+ if (modifiedTime <= 0) modifiedTime = fetchTime;
+ switch (state) {
+ case FetchSchedule.STATUS_MODIFIED:
+ interval *= (1.0f - DEC_RATE);
+ break;
+ case FetchSchedule.STATUS_NOTMODIFIED:
+ interval *= (1.0f + INC_RATE);
+ break;
+ case FetchSchedule.STATUS_UNKNOWN:
+ break;
+ }
+ if (SYNC_DELTA) {
+ // try to synchronize with the time of change
+ long delta = (fetchTime - modifiedTime) / 1000L;
+ if (delta > interval) interval = delta;
+ refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+ }
+ if (interval < MIN_INTERVAL) {
+ interval = MIN_INTERVAL;
+ } else if (interval > MAX_INTERVAL) {
+ interval = MAX_INTERVAL;
+ }
}
+
datum.setFetchInterval(interval);
datum.setFetchTime(refTime + Math.round(interval * 1000.0));
datum.setModifiedTime(modifiedTime);
@@ -130,7 +147,7 @@ public static void main(String[] args) throws Exception {
// initial fetchInterval is 30 days
CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
p.setFetchTime(0);
- System.out.println(p);
+ LOG.info(p.toString());
// let's move the timeline a couple of deltas
for (int i = 0; i < 10000; i++) {
if (lastModified + update < curTime) {
@@ -139,14 +156,14 @@ public static void main(String[] args) throws Exception {
changeCnt++;
lastModified = curTime;
}
- System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+ LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+ (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
fs.setFetchSchedule(new Text("http://www.example.com"), p,
p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
- System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+ LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+ (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
if (!changed) miss++;
if (miss > maxMiss) maxMiss = miss;
@@ -157,7 +174,7 @@ public static void main(String[] args) throws Exception {
if (changed) miss++;
curTime += delta;
}
- System.out.println("Total missed: " + totalMiss + ", max miss: " + maxMiss);
- System.out.println("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+ LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+ LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
}
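
The new metadata branch above also means a per-URL fetch interval can be pinned from outside the scheduler: when a FloatWritable is stored under Nutch.WRITABLE_CUSTOM_INTERVAL_KEY, the adaptive INC/DEC logic is bypassed for that URL. A minimal sketch of presetting such an interval on a CrawlDatum (class name is illustrative):

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;

public class CustomIntervalDemo {
  public static void main(String[] args) {
    CrawlDatum datum = new CrawlDatum();
    MapWritable meta = new MapWritable();
    // Pin this URL to a fixed 7-day interval (in seconds); setFetchSchedule
    // will use this value verbatim instead of adapting it
    meta.put(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY, new FloatWritable(3600f * 24f * 7f));
    datum.setMetaData(meta);
    System.out.println(datum.getMetaData().get(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY));
  }
}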
223 src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extension of {@link AdaptiveFetchSchedule} that allows for more flexible configuration
+ * of DEC and INC factors for various MIME-types.
+ *
+ * This class is typically used when a recrawl covers many different MIME-types.
+ * MIME-types other than text/html do not usually change frequently, so with this
+ * class you can configure different factors per MIME-type and prefer frequently
+ * changing MIME-types over others.
+ *
+ * For this to work the class relies on the Content-Type metadata key being present
+ * in the CrawlDB. That can be arranged either when injecting new URLs or by adding
+ * "Content-Type" to the db.parsemeta.to.crawldb configuration setting, which forces
+ * the MIME-types of newly discovered URLs to be added to the CrawlDB.
+ *
+ * @author markus
+ */
+public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
+ // Logger
+ public static final Logger LOG = LoggerFactory.getLogger(MimeAdaptiveFetchSchedule.class);
+
+ // Conf directives
+ public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate";
+ public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate";
+ public static final String SCHEDULE_MIME_FILE= "db.fetch.schedule.mime.file";
+
+ // Default values for DEC and INC rate
+ private float defaultIncRate;
+ private float defaultDecRate;
+
+ // Structure to store inc and dec rates per MIME-type
+ private class AdaptiveRate {
+ public float inc;
+ public float dec;
+
+ public AdaptiveRate(Float inc, Float dec) {
+ this.inc = inc;
+ this.dec = dec;
+ }
+ }
+
+ // Here we store the mime's and their delta's
+ private HashMap<String,AdaptiveRate> mimeMap;
+
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf == null) return;
+
+ // Read and set the default INC and DEC rates in case we cannot set values based on MIME-type
+ defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
+ defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
+
+ // Where's the mime/factor file?
+ Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, "adaptive-mimetypes.txt"));
+
+ try {
+ readMimeFile(mimeFile);
+ } catch (IOException e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ }
+
+ @Override
+ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ long prevFetchTime, long prevModifiedTime,
+ long fetchTime, long modifiedTime, int state) {
+
+ // Set defaults
+ INC_RATE = defaultIncRate;
+ DEC_RATE = defaultDecRate;
+
+ // Check if the Content-Type field is available in the CrawlDatum
+ if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
+ // Get the MIME-type of the current URL
+ String currentMime = datum.getMetaData().get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString();
+
+ // Strip an optional charset suffix and normalize case to match the map keys
+ int semicolonIndex = currentMime.indexOf(';');
+ if (semicolonIndex != -1) currentMime = currentMime.substring(0, semicolonIndex);
+ currentMime = StringUtils.lowerCase(currentMime.trim());
+
+ // Check if this MIME-type exists in our map
+ if (mimeMap.containsKey(currentMime)) {
+ // Yes, set the INC and DEC rates for this MIME-type
+ INC_RATE = mimeMap.get(currentMime).inc;
+ DEC_RATE = mimeMap.get(currentMime).dec;
+ }
+ }
+
+ return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ fetchTime, modifiedTime, state);
+ }
+
+ /**
+ * Reads the MIME-types and their associated INC/DEC factors into a HashMap.
+ *
+ * @param mimeFile Reader over the MIME-type configuration file
+ */
+ private void readMimeFile(Reader mimeFile) throws IOException {
+ // Instance of our mime/factor map
+ mimeMap = new HashMap<String,AdaptiveRate>();
+
+ // Open a reader
+ BufferedReader reader = new BufferedReader(mimeFile);
+
+ String line = null;
+ String[] splits = null;
+
+ // Read all lines
+ while ((line = reader.readLine()) != null) {
+ // Skip blank lines and comments
+ if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+ // Split the line by TAB
+ splits = line.split("\t");
+
+ // Sanity check: we need exactly three items (MIME-type, inc rate, dec rate)
+ if (splits.length == 3) {
+ // Add a lower cased MIME-type and the factor to the map
+ mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(new Float(splits[1]), new Float(splits[2])));
+ } else {
+ LOG.warn("Invalid configuration line in: " + line);
+ }
+ }
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ FetchSchedule fs = new MimeAdaptiveFetchSchedule();
+ fs.setConf(NutchConfiguration.create());
+ // we start the time at 0, for simplicity
+ long curTime = 0;
+ long delta = 1000L * 3600L * 24L; // 1 day
+ // we trigger the update of the page every 30 days
+ long update = 1000L * 3600L * 24L * 30L; // 30 days
+ boolean changed = true;
+ long lastModified = 0;
+ int miss = 0;
+ int totalMiss = 0;
+ int maxMiss = 0;
+ int fetchCnt = 0;
+ int changeCnt = 0;
+
+ // initial fetchInterval is 30 days
+ CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+
+ // Set a default MIME-type to test with
+ MapWritable x = new MapWritable();
+ x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
+ p.setMetaData(x);
+
+ p.setFetchTime(0);
+ LOG.info(p.toString());
+
+ // let's move the timeline a couple of deltas
+ for (int i = 0; i < 10000; i++) {
+ if (lastModified + update < curTime) {
+ //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
+ changed = true;
+ changeCnt++;
+ lastModified = curTime;
+ }
+
+ LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
+
+ if (p.getFetchTime() <= curTime) {
+ fetchCnt++;
+ fs.setFetchSchedule(new Text("http://www.example.com"), p,
+ p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+ changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
+
+ LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
+
+ if (!changed) miss++;
+ if (miss > maxMiss) maxMiss = miss;
+ changed = false;
+ totalMiss += miss;
+ miss = 0;
+ }
+
+ if (changed) miss++;
+ curTime += delta;
+ }
+ LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+ LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+ }
+}
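
As the class Javadoc notes, the scheduler can only pick per-MIME rates when the CrawlDatum carries a Content-Type metadata entry. A hedged sketch of the configuration side, assuming the db.parsemeta.to.crawldb property mentioned above (again, this would normally be set in nutch-site.xml rather than in code):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class ParseMetaSetup {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // Ask updatedb to copy the parsed Content-Type into each CrawlDatum's
    // metadata so MimeAdaptiveFetchSchedule can look up its INC/DEC rates
    conf.set("db.parsemeta.to.crawldb", "Content-Type");
    System.out.println(conf.get("db.parsemeta.to.crawldb"));
  }
}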
3  src/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.metadata;
+import org.apache.hadoop.io.Text;
/**
* A collection of HTTP header names.
@@ -41,6 +42,8 @@
public final static String CONTENT_MD5 = "Content-MD5";
public final static String CONTENT_TYPE = "Content-Type";
+
+ public static final Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
public final static String LAST_MODIFIED = "Last-Modified";
5 src/java/org/apache/nutch/metadata/Nutch.java
@@ -66,4 +66,9 @@
public static final String REPR_URL_KEY = "_repr_";
public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+
+ /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
+ public static final String CUSTOM_INTERVAL_KEY = "interval";
+
+ public static final Text WRITABLE_CUSTOM_INTERVAL_KEY = new Text(CUSTOM_INTERVAL_KEY);
}