Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
18737d6
Added support for REST services in IndexingJob
sujen1412 Jun 5, 2015
67678ac
Added IndexingJob in JObFactory
sujen1412 Jun 5, 2015
59d2e1f
Merge remote-tracking branch 'upstream/trunk' into trunk
sujen1412 Jun 9, 2015
7717816
Cosine similarity model scoring plugin
sujen1412 Jun 11, 2015
38aa53f
Added scoring-similarity plugin in build files
sujen1412 Jun 11, 2015
2b712c0
Overriding method calculate similarity
sujen1412 Jun 14, 2015
81ed178
Added support to remove stop words
sujen1412 Jun 14, 2015
5bbd033
Averaging out similarity scores
sujen1412 Jun 14, 2015
07b000c
Added Apache license info
sujen1412 Jun 14, 2015
671c547
Deleted interface files
sujen1412 Jun 15, 2015
d00a64c
Correct stopword.txt path
sujen1412 Jun 15, 2015
5043e58
Removed debugging statements
sujen1412 Jun 15, 2015
d0dc134
Now reading stopword as a confResource
sujen1412 Jun 15, 2015
dbbbaa7
Added configuration variable
sujen1412 Jun 15, 2015
bfcff91
Now parsing gold-standard doc only once in setConf(conf) method
sujen1412 Jun 15, 2015
9fbed02
Augmenting default.properties to include JavaDoc for new plugins
sujen1412 Jun 15, 2015
fab191d
Added a SimilarityModel interface
sujen1412 Jun 16, 2015
78007dd
Modified ScoringFilter to make use of the similarity model interface …
sujen1412 Jun 16, 2015
5a439ae
Implementing the new interface
sujen1412 Jun 16, 2015
f75beef
Moved to package Cosine
sujen1412 Jun 16, 2015
58e272f
Renamed filepath config property in nutch-site.xml
sujen1412 Jun 16, 2015
2102950
Logging error via LOG.error()
sujen1412 Jun 16, 2015
15cf63d
Caching stopwords to prevent it from loading everytime
sujen1412 Jun 16, 2015
23bdc71
Renamed package to cosine
sujen1412 Jun 18, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
<packageset dir="${plugins.dir}/scoring-depth/src/java"/>
<packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
<packageset dir="${plugins.dir}/scoring-link/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
<packageset dir="${plugins.dir}/subcollection/src/java"/>
Expand Down Expand Up @@ -610,6 +611,7 @@
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
<packageset dir="${plugins.dir}/scoring-depth/src/java"/>
<packageset dir="${plugins.dir}/scoring-similarity/src/java"/>
<packageset dir="${plugins.dir}/scoring-link/src/java"/>
<packageset dir="${plugins.dir}/scoring-opic/src/java"/>
<packageset dir="${plugins.dir}/subcollection/src/java"/>
Expand Down Expand Up @@ -1019,6 +1021,7 @@
<source path="${plugins.dir}/protocol-selenium/src/java"/>
<source path="${plugins.dir}/protocol-selenium/src/test"/>
<source path="${plugins.dir}/scoring-depth/src/java/" />
<source path="${plugins.dir}/scoring-similarity/src/java/" />
<source path="${plugins.dir}/scoring-link/src/java/" />
<source path="${plugins.dir}/scoring-opic/src/java/" />
<source path="${plugins.dir}/subcollection/src/java/" />
Expand Down
4 changes: 3 additions & 1 deletion default.properties
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ plugins.protocol=\
org.apache.nutch.protocol.file*:\
org.apache.nutch.protocol.ftp*:\
org.apache.nutch.protocol.http*:\
org.apache.nutch.protocol.httpclient*
org.apache.nutch.protocol.httpclient*:\
org.apache.nutch.protocol.selenium*

#
# URL Filter Plugins
Expand Down Expand Up @@ -118,6 +119,7 @@ plugins.scoring=\
org.apache.nutch.scoring.depth*:\
org.apache.nutch.scoring.link*:\
org.apache.nutch.scoring.opic*:\
org.apache.nutch.scoring.similarity*:\
org.apache.nutch.scoring.tld*:\
org.apache.nutch.scoring.urlmeta*

Expand Down
2 changes: 2 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
<ant dir="scoring-depth" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
<ant dir="scoring-link" target="deploy"/>
<ant dir="scoring-similarity" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
<ant dir="tld" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
Expand Down Expand Up @@ -165,6 +166,7 @@
<ant dir="scoring-depth" target="clean"/>
<ant dir="scoring-opic" target="clean"/>
<ant dir="scoring-link" target="clean"/>
<ant dir="scoring-similarity" target="clean"/>
<ant dir="subcollection" target="clean"/>
<ant dir="tld" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
Expand Down
27 changes: 27 additions & 0 deletions src/plugin/scoring-similarity/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Ant build file for the scoring-similarity plugin. The default target is
"jar-core"; that target is not defined in this file, so it is presumably
provided by the imported ../build-plugin.xml (confirm there).
-->
<project name="scoring-similarity" default="jar-core">

<!-- Pull in the build definitions shared by the plugins. -->
<import file="../build-plugin.xml"/>

<!-- Deploy Unit test dependencies -->
<!-- Runs the "deploy" target of ../nutch-extensionpoints without inheriting this project's properties. -->
<target name="deps-test">
<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
</target>

</project>
41 changes: 41 additions & 0 deletions src/plugin/scoring-similarity/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?xml version="1.0" ?>

<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!--
Ivy module descriptor for the scoring-similarity plugin. The module name is
taken from the Ant project name, and no dependencies are declared here.
-->
<ivy-module version="1.0">
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<!-- Configurations are included from the ivy-configurations.xml file referenced below. -->
<configurations>
<include file="../../..//ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!--get the artifact from our module name-->
<artifact conf="master"/>
</publications>

<!-- Intentionally empty: this descriptor lists no dependencies of its own. -->
<dependencies>
</dependencies>

</ivy-module>
39 changes: 39 additions & 0 deletions src/plugin/scoring-similarity/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Plugin descriptor for the similarity based scoring plugin. It declares the
plugin's runtime library and registers SimilarityScoringFilter as an
implementation of the org.apache.nutch.scoring.ScoringFilter extension point.
-->
<plugin
id="scoring-similarity"
name="Similarity based Scoring Plug-in"
version="1.0.0"
provider-name="nutch.org">


<!-- Runtime library of the plugin; everything in it is exported. -->
<runtime>
<library name="scoring-similarity.jar">
<export name="*"/>
</library>
</runtime>

<!-- Hooks the filter class into the ScoringFilter extension point. -->
<extension id="org.apache.nutch.scoring.similarity"
name="SimilarityScoring"
point="org.apache.nutch.scoring.ScoringFilter">

<implementation id="scoring-similarity"
class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
</extension>

</plugin>
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.similarity;

import java.util.Collection;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;

/**
 * Contract for the similarity model that {@code SimilarityScoringFilter}
 * delegates its scoring work to.
 */
public interface SimilarityModel {

  /**
   * Hands the configuration to the model.
   *
   * @param conf the configuration to use
   */
  void setConf(Configuration conf);

  /**
   * Produces a score for a parsed page. {@code SimilarityScoringFilter}
   * stores the returned value in the parse's content metadata under
   * {@code Nutch.SCORE_KEY}.
   *
   * @param url the URL of the page
   * @param content the content of the page
   * @param parse the parse of the page
   * @return the score for the page
   */
  float setURLScoreAfterParsing(Text url, Content content, Parse parse);

  /**
   * Counterpart of the scoring filter method of the same name; the filter
   * forwards all of its arguments unchanged.
   *
   * @param fromUrl the URL of the source page
   * @param parseData the parse data of the source page
   * @param targets the outlink entries, keyed by target URL
   * @param adjust the adjustment datum passed in by the caller
   * @param allCount forwarded as-is; presumably the total number of
   *          outlinks (confirm against the scoring filter contract)
   * @return a {@code CrawlDatum}; its exact meaning is defined by the
   *         implementation
   */
  CrawlDatum distributeScoreToOutlinks(
      Text fromUrl,
      ParseData parseData,
      Collection<Entry<Text, CrawlDatum>> targets,
      CrawlDatum adjust,
      int allCount);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.similarity;

import java.util.Collection;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;

/**
 * Scoring filter that delegates score computation to a
 * {@link SimilarityModel}. The model is created and configured in
 * {@link #setConf(Configuration)}; currently a {@link CosineSimilarityModel}
 * is always used.
 */
public class SimilarityScoringFilter extends AbstractScoringFilter {

  private Configuration conf;
  private SimilarityModel similarityModel;

  /**
   * @return the configuration last passed to {@link #setConf(Configuration)}
   */
  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration and (re)creates the similarity model with it.
   *
   * @param conf the configuration to use
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    similarityModel = new CosineSimilarityModel();
    similarityModel.setConf(conf);
  }

  /**
   * Asks the model for the page's score and records it in the parse's
   * content metadata under {@link Nutch#SCORE_KEY}.
   *
   * @param url the URL of the page
   * @param content the content of the page
   * @param parse the parse of the page; its content metadata is modified
   * @throws ScoringFilterException declared by the overridden method
   */
  @Override
  public void passScoreAfterParsing(Text url, Content content, Parse parse)
      throws ScoringFilterException {
    float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
    parse.getData().getContentMeta().set(Nutch.SCORE_KEY, Float.toString(score));
  }

  /**
   * Forwards outlink score distribution to the model.
   *
   * @param fromUrl the URL of the source page
   * @param parseData the parse data of the source page
   * @param targets the outlink entries, keyed by target URL
   * @param adjust the adjustment datum passed in by the caller
   * @param allCount forwarded to the model unchanged
   * @return the datum returned by the model, or {@code adjust} when the
   *         model returns {@code null}
   * @throws ScoringFilterException declared by the overridden method
   */
  @Override
  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
      CrawlDatum adjust, int allCount) throws ScoringFilterException {
    CrawlDatum modelAdjust = similarityModel.distributeScoreToOutlinks(
        fromUrl, parseData, targets, adjust, allCount);
    // Honour the model's own adjustment datum instead of discarding it; keep
    // the incoming one when the model has nothing to report.
    return modelAdjust != null ? modelAdjust : adjust;
  }
}
Loading