Skip to content

Commit

Permalink
NUTCH-336: differentiate between newly discovered pages (known value …
Browse files Browse the repository at this point in the history
…through

inlink contributions) and newly injected pages (arbitrarily defined initial
value).


git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/branches/branch-0.8@449279 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
sigram committed Sep 23, 2006
1 parent ca0aac5 commit cf1eab7
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 8 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Expand Up @@ -31,6 +31,10 @@ Unreleased changes (0.8.1)

10. NUTCH-332 - Fix doubling score caused by links to self (Stefan
Groschupf via ab)

11. NUTCH-336 - Differentiate between newly discovered pages and newly
injected pages (Chris Schneider via ab) NOTE: this changes the
scoring API, filter implementations need to be updated.

Release 0.8 - 2006-07-25

Expand Down
4 changes: 1 addition & 3 deletions src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Expand Up @@ -36,12 +36,10 @@ public class CrawlDbReducer implements Reducer {
private CrawlDatum result = new CrawlDatum();
private ArrayList linked = new ArrayList();
private ScoringFilters scfilters = null;
private float scoreInjected;

public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
scfilters = new ScoringFilters(job);
scoreInjected = job.getFloat("db.score.injected", 1.0f);
}

public void close() {}
Expand Down Expand Up @@ -112,7 +110,7 @@ public void reduce(WritableComparable key, Iterator values,
LOG.warn("Cannot filter init score for url " + key +
", using default: " + e.getMessage());
}
result.setScore(scoreInjected);
result.setScore(0.0f);
}
}
break;
Expand Down
4 changes: 2 additions & 2 deletions src/java/org/apache/nutch/crawl/Injector.java
Expand Up @@ -78,10 +78,10 @@ public void map(WritableComparable key, Writable val,
CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval);
datum.setScore(scoreInjected);
try {
scfilters.initialScore(value, datum);
scfilters.injectedScore(value, datum);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Cannot filter init score for url " + url +
LOG.warn("Cannot filter injected score for url " + url +
", using default (" + e.getMessage() + ")");
}
datum.setScore(scoreInjected);
Expand Down
16 changes: 15 additions & 1 deletion src/java/org/apache/nutch/scoring/ScoringFilter.java
Expand Up @@ -41,7 +41,21 @@ public interface ScoringFilter extends Configurable, Pluggable {
public final static String X_POINT_ID = ScoringFilter.class.getName();

/**
* Set an initial score for newly injected pages.
* Set an initial score for newly injected pages. Note: newly injected pages
* may have no inlinks, so filter implementations may wish to set this
* score to a non-zero value, to give newly injected pages some initial
* credit.
* @param url url of the page
* @param datum new datum. Filters will modify it in-place.
* @throws ScoringFilterException
*/
public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;

/**
* Set an initial score for newly discovered pages. Note: newly discovered pages
* have at least one inlink with its score contribution, so filter implementations
* may choose to set initial score to zero (unknown value), and then the inlink
* score contribution will set the "real" value of the new page.
* @param url url of the page
* @param datum new datum. Filters will modify it in-place.
* @throws ScoringFilterException
Expand Down
9 changes: 8 additions & 1 deletion src/java/org/apache/nutch/scoring/ScoringFilters.java
Expand Up @@ -92,13 +92,20 @@ public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) thro
return initSort;
}

/** Calculate a new initial score, used when adding new pages. */
/** Calculate a new initial score, used when adding newly discovered pages. */
public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
// Apply every configured filter in array order to the same datum.
// Nothing is caught here, so an exception thrown by one filter
// propagates to the caller and the remaining filters are not invoked.
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].initialScore(url, datum);
}
}

/**
 * Calculate a new initial score, used when injecting new pages.
 * Delegates to <code>injectedScore</code> of every filter in
 * <code>this.filters</code>, in array order, passing the same arguments
 * to each one.
 * @param url url of the page, handed unchanged to each filter
 * @param datum datum handed to each filter in turn
 * @throws ScoringFilterException if any filter throws it; it is not caught
 * here, so the filters after the failing one are not invoked
 */
public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].injectedScore(url, datum);
}
}

/** Calculate updated page score during CrawlDb.update(). */
public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
Expand Down
Expand Up @@ -73,10 +73,16 @@ public void setConf(Configuration conf) {
}

/** Set to the value defined in config, 1.0f by default. */
public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
datum.setScore(scoreInjected);
}

/**
 * Set to 0.0f (unknown value) - inlink contributions will bring it to
 * a correct level. Newly discovered pages have at least one inlink.
 * @param url url of the page; not read by this implementation
 * @param datum new datum, modified in-place: its score is set to 0.0f
 * @throws ScoringFilterException declared by the signature; this body
 * contains no explicit throw
 */
public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
datum.setScore(0.0f);
}

/** Use {@link CrawlDatum#getScore()}. */
public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException {
return datum.getScore();
Expand Down

0 comments on commit cf1eab7

Please sign in to comment.