Skip to content

Commit

Permalink
NUTCH-2748 Fetch status gone (redirect exceeded) not to overwrite exi…
Browse files Browse the repository at this point in the history
…sting items in CrawlDb

(simplified version)
- new configuration property `http.redirect.max.exceeded.skip`:
  * if true skip redirect targets if http.redirect.max is exceeded
  * if false (default): store the redirect targets with status "linked"
- log whether exceeded redirects are "skipped" or "linked"
  • Loading branch information
sebastian-nagel committed Nov 8, 2019
1 parent 1babeb7 commit b31cab3
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 48 deletions.
15 changes: 7 additions & 8 deletions conf/nutch-default.xml
Expand Up @@ -1198,15 +1198,14 @@
</property>

<property>
<name>http.redirect.max.exceeded.status</name>
<value></value>
<name>http.redirect.max.exceeded.skip</name>
<value>false</value>
<description>
Status of items emitted for the last redirect target in a redirect
chain when redirects are followed (http.redirect.max > 0) and the
maximum number of redirects in a chain is exceeded. Possible values:
empty (target skipped), `linked` (target emitted as outlink) or
`gone` (target set to status db_gone, protocol status
`redir_exceeded`). See NUTCH-2748 for further information.
Whether to skip the last URL in a redirect chain when redirects
are followed (http.redirect.max > 0) and the maximum number of redirects
in a chain is exceeded (redirect_count > http.redirect.max).
If not skipped, the redirect target URLs are stored as `linked`
and fetched in one of the following cycles. See also NUTCH-2748.
</description>
</property>

Expand Down
54 changes: 14 additions & 40 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Expand Up @@ -87,7 +87,7 @@ public class FetcherThread extends Thread {
private long maxCrawlDelay;
private String queueMode;
private int maxRedirect;
private byte maxRedirectExceededStatus = -1;
private boolean maxRedirectExceededSkip = false;
private String reprUrl;
private boolean redirecting;
private int redirectCount;
Expand Down Expand Up @@ -200,25 +200,8 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
Thread.currentThread().getId(), queueMode);

this.maxRedirect = conf.getInt("http.redirect.max", 3);
if (this.maxRedirect > 0) {
String maxRedirectExceededVal = conf
.get("http.redirect.max.exceeded.status", "");
switch (maxRedirectExceededVal) {
case "linked":
this.maxRedirectExceededStatus = CrawlDatum.STATUS_LINKED;
break;
case "gone":
this.maxRedirectExceededStatus = CrawlDatum.STATUS_FETCH_GONE;
break;
case "":
this.maxRedirectExceededStatus = -1;
break;
default:
LOG.warn(
"Ignored invalid value for http.redirect.max.exceeded.status: {} - using default (empty)",
maxRedirectExceededVal);
}
}
this.maxRedirectExceededSkip = conf
.getBoolean("http.redirect.max.exceeded.skip", false);

int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
Expand Down Expand Up @@ -470,18 +453,17 @@ public void run() {
if (redirecting && redirectCount > maxRedirect) {
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
LOG.info("{} {} - redirect count exceeded {}", getName(),
Thread.currentThread().getId(), fit.url);
LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
Thread.currentThread().getId(), fit.url,
maxRedirectExceededSkip ? "skipped" : "linked");
}
switch (maxRedirectExceededStatus) {
case -1 : break; // skip item
case CrawlDatum.STATUS_LINKED:
output();
case CrawlDatum.STATUS_FETCH_GONE:
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_REDIR_EXCEEDED,
CrawlDatum.STATUS_FETCH_GONE);

if (maxRedirectExceededSkip) {
// skip redirect target when redirect count is exceeded
} else {
Text newUrl = new Text(status.getMessage());
CrawlDatum newDatum = createRedirDatum(newUrl, fit,
CrawlDatum.STATUS_LINKED);
output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
}
}

Expand Down Expand Up @@ -604,15 +586,7 @@ private CrawlDatum createRedirDatum(Text redirUrl, FetchItem fit, byte status) {

private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
throws ScoringFilterException {
CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
fit.datum.getFetchInterval(), fit.datum.getScore());
// transfer all existing metadata to the redirect
newDatum.getMetaData().putAll(fit.datum.getMetaData());
scfilters.initialScore(redirUrl, newDatum);
if (reprUrl != null) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
new Text(reprUrl));
}
CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
fit = FetchItem.create(redirUrl, newDatum, queueMode);
if (fit != null) {
FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
Expand Down

0 comments on commit b31cab3

Please sign in to comment.