-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
ScoringFilter.java
132 lines (123 loc) · 4.83 KB
/
ScoringFilter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring;
import java.util.Collection;
import java.util.List;
import org.apache.hadoop.conf.Configurable;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.plugin.FieldPluggable;
import org.apache.nutch.storage.WebPage;
/**
 * A contract defining the behavior of scoring plugins.
 *
 * <p>
 * A scoring filter manipulates the scoring variables of a {@link WebPage} and
 * of the resulting search indexes. Filters can be chained in a specific order
 * to provide multi-stage scoring adjustments.
 * </p>
 *
 * @author Andrzej Bialecki
 */
public interface ScoringFilter extends Configurable, FieldPluggable {

  /** The name of the extension point. */
  String X_POINT_ID = ScoringFilter.class.getName();

  /**
   * Set an initial score for newly injected pages. Note: newly injected pages
   * may have no inlinks, so filter implementations may wish to set this score
   * to a non-zero value, to give newly injected pages some initial credit.
   *
   * @param url
   *          url of the page
   * @param page
   *          new page. Filters will modify it in-place.
   * @throws ScoringFilterException
   *           if the filter fails while scoring the page
   */
  void injectedScore(String url, WebPage page) throws ScoringFilterException;

  /**
   * Set an initial score for newly discovered pages. Note: newly discovered
   * pages have at least one inlink with its score contribution, so filter
   * implementations may choose to set the initial score to zero (unknown
   * value), and then the inlink score contribution will set the "real" value
   * of the new page.
   *
   * @param url
   *          url of the page
   * @param page
   *          {@link WebPage} object relative to the URL
   * @throws ScoringFilterException
   *           if the filter fails while scoring the page
   */
  void initialScore(String url, WebPage page) throws ScoringFilterException;

  /**
   * This method prepares a sort value for the purpose of sorting and selecting
   * top N scoring pages during fetchlist generation.
   *
   * @param url
   *          url of the page
   * @param page
   *          {@link WebPage} object relative to the URL
   * @param initSort
   *          initial sort value, or a value from previous filters in chain
   * @return the sort value for this page; presumably handed to the next filter
   *         in the chain as its <code>initSort</code>
   * @throws ScoringFilterException
   *           if the filter fails while computing the sort value
   */
  float generatorSortValue(String url, WebPage page, float initSort)
      throws ScoringFilterException;

  /**
   * Distribute score value from the current page to all its outlinked pages.
   *
   * @param fromUrl
   *          url of the source page
   * @param page
   *          {@link WebPage} object of the source page (presumably the page
   *          identified by <code>fromUrl</code>)
   * @param scoreData
   *          a collection of {@link ScoreDatum}
   * @param allCount
   *          number of all collected outlinks from the source page
   * @throws ScoringFilterException
   *           if the filter fails while distributing the score
   */
  void distributeScoreToOutlinks(String fromUrl, WebPage page,
      Collection<ScoreDatum> scoreData, int allCount)
      throws ScoringFilterException;

  /**
   * This method calculates a new score during table update, based on the
   * values contributed by inlinked pages.
   *
   * @param url
   *          url of the page
   * @param page
   *          {@link WebPage} object relative to the URL
   * @param inlinkedScoreData
   *          list of {@link ScoreDatum}s for all inlinks pointing to this URL.
   * @throws ScoringFilterException
   *           if the filter fails while updating the score
   */
  void updateScore(String url, WebPage page,
      List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException;

  /**
   * This method calculates a Lucene document boost.
   *
   * @param url
   *          url of the page
   * @param doc
   *          document. NOTE: this already contains all information collected
   *          by indexing filters. Implementations may modify this instance, in
   *          order to store/remove some information.
   * @param page
   *          {@link WebPage} object relative to the URL
   * @param initScore
   *          initial boost value for the Lucene document.
   * @return boost value for the Lucene document. This value is passed as an
   *         argument to the next scoring filter in chain. NOTE:
   *         implementations may also express other scoring strategies by
   *         modifying the Lucene document directly.
   * @throws ScoringFilterException
   *           if the filter fails while computing the boost
   */
  float indexerScore(String url, NutchDocument doc, WebPage page,
      float initScore) throws ScoringFilterException;
}