From b0ce4a157dbd0bfd8ea368f3fa230a90c7117ae2 Mon Sep 17 00:00:00 2001
From: Asitang Mishra
Date: Wed, 17 Jun 2015 09:11:42 -0700
Subject: [PATCH 1/5] patch 1.0 for NUTCH-2038

---
 .classpath                                    | 855 ++++++++++++++++++
 .project                                      |  17 +
 build.xml                                     |   3 +
 conf/nutch-default.xml                        |  22 +
 ivy/ivy.xml                                   |   8 +-
 src/java/org/apache/nutch/net/URLFilters.java |  58 +-
 .../nutch/parse/ModelURLFilterAbstract.java   |  12 +
 .../org/apache/nutch/parse/ParseSegment.java  |  46 +-
 src/plugin/build.xml                          |   2 +
 src/plugin/urlfilter-model/build.xml          |  22 +
 src/plugin/urlfilter-model/ivy.xml            |  41 +
 src/plugin/urlfilter-model/plugin.xml         |  41 +
 .../nutch/urlfilter/model/ModelURLFilter.java | 158 ++++
 .../nutch/urlfilter/model/NBClassifier.java   | 234 +++++
 .../nutch/urlfilter/model/package-info.java   |  25 +
 15 files changed, 1524 insertions(+), 20 deletions(-)
 create mode 100644 .classpath
 create mode 100644 .project
 create mode 100644 src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java
 create mode 100644 src/plugin/urlfilter-model/build.xml
 create mode 100644 src/plugin/urlfilter-model/ivy.xml
 create mode 100644 src/plugin/urlfilter-model/plugin.xml
 create mode 100644 src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/ModelURLFilter.java
 create mode 100644 src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java
 create mode 100644 src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java

diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000000..51cf515586
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,855 @@
[855 added lines of Eclipse .classpath XML; the markup did not survive extraction]

diff --git a/.project b/.project
new file mode 100644
index 0000000000..0978facd4c
--- /dev/null
+++ b/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+  <name>NUTCH-CLONE</name>
+  <comment></comment>
+  <projects>
+  </projects>
+  <buildSpec>
+    <buildCommand>
+      <name>org.eclipse.jdt.core.javabuilder</name>
+      <arguments>
+      </arguments>
+    </buildCommand>
+  </buildSpec>
+  <natures>
+    <nature>org.eclipse.jdt.core.javanature</nature>
+  </natures>
+</projectDescription>

diff --git
a/build.xml b/build.xml index be49b4f520..7cb8e87d4a 100644 --- a/build.xml +++ b/build.xml @@ -211,6 +211,7 @@ + @@ -621,6 +622,7 @@ + @@ -1037,6 +1039,7 @@ + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index e8ccf42d10..92dd165419 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1135,6 +1135,28 @@ + + parser.modelfilter.trainfile + tweets-train.tsv + + + + + + parser.modelfilter.dictionaryfile + wordlist.txt + + + + + + parser.modelfilter + true + + + + + parse.plugin.file parse-plugins.xml diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 3850c0c692..74c305c760 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -78,7 +78,11 @@ - + + + + + @@ -100,6 +104,8 @@ + + diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java index 3deccca8ec..7d793ffb9b 100644 --- a/src/java/org/apache/nutch/net/URLFilters.java +++ b/src/java/org/apache/nutch/net/URLFilters.java @@ -23,22 +23,44 @@ /** Creates and caches {@link URLFilter} implementing plugins. */ public class URLFilters { - public static final String URLFILTER_ORDER = "urlfilter.order"; - private URLFilter[] filters; - - public URLFilters(Configuration conf) { - this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins( - URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER); - } - - /** Run all defined filters. Assume logical AND. */ - public String filter(String urlString) throws URLFilterException { - for (int i = 0; i < this.filters.length; i++) { - if (urlString == null) - return null; - urlString = this.filters[i].filter(urlString); - - } - return urlString; - } + public static final String URLFILTER_ORDER = "urlfilter.order"; + private URLFilter[] filters; + private URLFilter filter = null; + + public URLFilters(Configuration conf) { + this.filters = (URLFilter[]) PluginRepository.get(conf) + .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID, + URLFILTER_ORDER); + } + + /** Run all defined filters. Assume logical AND. 
*/ + public String filter(String urlString) throws URLFilterException { + for (int i = 0; i < this.filters.length; i++) { + if (urlString == null) + return null; + urlString = this.filters[i].filter(urlString); + + } + return urlString; + } + + /**Get a filter with the full classname if only it is activated through the nutchsite.xml*/ + public URLFilter getFilter(String pid) { + + if (filter == null) { + + for (int i = 0; i < this.filters.length; i++) { + + if (filters[i].getClass().getName().equals(pid)) { + + filter = filters[i]; + break; + } + + } + + } + return filter; + + } } diff --git a/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java b/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java new file mode 100644 index 0000000000..6c6bead2cc --- /dev/null +++ b/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java @@ -0,0 +1,12 @@ +package org.apache.nutch.parse; + +import org.apache.nutch.net.URLFilter; + +public abstract class ModelURLFilterAbstract implements URLFilter{ + + + public abstract void filterParse(String text); + public abstract boolean filterUrl(String url) ; + public abstract void configure(String[] args) ; + +} diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index b1ed1092c9..08472b8b48 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -29,11 +29,12 @@ import org.apache.hadoop.conf.*; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.*; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; -import org.apache.hadoop.fs.FileSystem; + import org.apache.nutch.util.*; import org.apache.hadoop.fs.Path; @@ -56,6 +57,14 @@ public class ParseSegment extends NutchTool implements Tool, private ParseUtil parseUtil; private boolean skipTruncated; + + public static final String PARSER_MODELFILTER="parser.modelfilter"; + public static final String TRAINFILE_MODELFILTER="parser.modelfilter.trainfile"; + public static final String DICTFILE_MODELFILTER="parser.modelfilter.dictionaryfile"; + + private boolean filterflag; + private URLFilters filters; + private ModelURLFilterAbstract filter; public ParseSegment() { this(null); @@ -69,6 +78,18 @@ public void configure(JobConf job) { setConf(job); this.scfilters = new ScoringFilters(job); skipTruncated = job.getBoolean(SKIP_TRUNCATED, true); + + filterflag = job.getBoolean(PARSER_MODELFILTER, true); + if(filterflag){ + String[] args=new String[2]; + args[0]=getConf().get(TRAINFILE_MODELFILTER); + args[1]=getConf().get(DICTFILE_MODELFILTER); + + filters = new URLFilters(job); + filter=(ModelURLFilterAbstract) filters.getFilter("org.apache.nutch.urlfilter.model.ModelURLFilter"); + filter.configure(args); + + } } public void close() { @@ -140,6 +161,29 @@ public void map(WritableComparable key, Content content, LOG.warn("Error passing score: " + url + ": " + e.getMessage()); } } + +if(filterflag){ + + + + filter.filterParse(parse.getText()); + + ArrayList tempOutlinks= new ArrayList(); + Outlink[] out=null; + for(int i=0;i + @@ -174,6 +175,7 @@ + diff --git a/src/plugin/urlfilter-model/build.xml b/src/plugin/urlfilter-model/build.xml new file mode 100644 index 0000000000..a7135bf29b --- /dev/null +++ b/src/plugin/urlfilter-model/build.xml @@ -0,0 +1,22 @@ + + + + + + + diff --git 
a/src/plugin/urlfilter-model/ivy.xml b/src/plugin/urlfilter-model/ivy.xml new file mode 100644 index 0000000000..1a86d68030 --- /dev/null +++ b/src/plugin/urlfilter-model/ivy.xml @@ -0,0 +1,41 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-model/plugin.xml b/src/plugin/urlfilter-model/plugin.xml new file mode 100644 index 0000000000..43b41d2132 --- /dev/null +++ b/src/plugin/urlfilter-model/plugin.xml @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/ModelURLFilter.java b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/ModelURLFilter.java new file mode 100644 index 0000000000..50109b44b3 --- /dev/null +++ b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/ModelURLFilter.java @@ -0,0 +1,158 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.model; + + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.ModelURLFilterAbstract; + + +import java.io.Reader; + +import java.io.BufferedReader; + +import java.io.IOException; + +import java.util.ArrayList; + +/** + * Filters URLs based on a file of URL prefixes. The file is named by (1) + * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2) + * attribute "file" in plugin.xml of this plugin Attribute "file" has higher + * precedence if defined. + * + *

+ * The wordlist file format is one word per line. +

+ */ +public class ModelURLFilter extends ModelURLFilterAbstract { + + private static final Logger LOG = LoggerFactory + .getLogger(ModelURLFilter.class); + + private boolean relevent = false; + private Configuration conf; + private String inputFilePath; + private String dictionaryFile; + private ArrayList wordlist = new ArrayList(); + + public ModelURLFilter() throws Exception { + + } + + public void configure(String[] args) { + + inputFilePath = args[0]; + dictionaryFile = args[1]; + BufferedReader br = null; + + try { + + String CurrentLine; + + Reader reader = conf.getConfResourceAsReader(dictionaryFile); + br = new BufferedReader(reader); + while ((CurrentLine = br.readLine()) != null) { + wordlist.add(CurrentLine); + } + + } catch (IOException e) { + + e.printStackTrace(); + } finally { + try { + if (br != null) + br.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } + + try { + + train(); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public void filterParse(String text) { + + try { + relevent = classify(text); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + public boolean filterUrl(String url) { + + if (!relevent) { + if (!containsWord(url, wordlist)) { + return false; + } + } + + return true; + } + + public String filter(String url) { + + return url; + + } + + public boolean classify(String text) throws IOException { + + // if classified as relevent "1" then return true + if (NBClassifier.classify(text).equals("1")) + return true; + return false; + } + + public void train() throws Exception { + + // check if the model file exists, if it does then don't train + NBClassifier.createModel(inputFilePath); + + } + + public boolean containsWord(String url, ArrayList wordlist) { + for (String word : wordlist) { + if (url.contains(word)) { + return true; + } + } + + return false; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + } + + public Configuration getConf() { + return this.conf; + } + +} diff --git a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java new file mode 100644 index 0000000000..714aaa4164 --- /dev/null +++ b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.urlfilter.model; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.SequenceFile.Writer; +import org.apache.hadoop.io.Text; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.Version; +import org.apache.mahout.classifier.naivebayes.BayesUtils; +import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; +import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; +import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob; +import org.apache.mahout.common.Pair; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; +import org.apache.mahout.math.RandomAccessSparseVector; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.Vector.Element; +import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles; +import org.apache.mahout.vectorizer.TFIDF; + +import com.google.common.collect.ConcurrentHashMultiset; +import com.google.common.collect.Multiset; + +public class NBClassifier { + + public static Map readDictionnary(Configuration conf, + Path dictionnaryPath) { + Map dictionnary = new HashMap(); + for (Pair pair : new SequenceFileIterable( + dictionnaryPath, true, conf)) { + dictionnary.put(pair.getFirst().toString(), pair.getSecond().get()); + } + return dictionnary; + } + + public static Map readDocumentFrequency(Configuration conf, + Path documentFrequencyPath) { + Map documentFrequency = new HashMap(); + for (Pair pair : new SequenceFileIterable( + documentFrequencyPath, true, conf)) { + documentFrequency + .put(pair.getFirst().get(), pair.getSecond().get()); + } + return documentFrequency; + } + + public static void createModel(String inputTrainFilePath) throws Exception { + + String[] args1 = new String[4]; + + args1[0] = "-i"; + args1[1] = "outseq"; + args1[2] = "-o"; + args1[3] = "vectors"; + + String[] args2 = new String[9]; + + args2[0] = "-i"; + args2[1] = "vectors/tfidf-vectors"; + args2[2] = "-el"; + args2[3] = "-li"; + args2[4] = "labelindex"; + args2[5] = "-o"; + args2[6] = "model"; + args2[7] = "-ow"; + args2[8] = "-c"; + + convertToSeq(inputTrainFilePath, "outseq"); + + SparseVectorsFromSequenceFiles.main(args1); + + TrainNaiveBayesJob.main(args2); + } + + public static String classify(String text) throws IOException { + return classify(text, "model", "labelindex", + "vectors/dictionary.file-0", "vectors/df-count/part-r-00000"); + } + + public static String classify(String text, String modelPath, + String labelIndexPath, String dictionaryPath, + String documentFrequencyPath) throws IOException { + + Configuration configuration = new Configuration(); + + // model is a matrix (wordId, labelId) => probability score + NaiveBayesModel model = NaiveBayesModel.materialize( + new Path(modelPath), configuration); + + StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier( + model); + + // labels is a map label => classId + Map labels = 
BayesUtils.readLabelIndex(configuration, + new Path(labelIndexPath)); + Map dictionary = readDictionnary(configuration, + new Path(dictionaryPath)); + Map documentFrequency = readDocumentFrequency( + configuration, new Path(documentFrequencyPath)); + + // analyzer used to extract word from text + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + // int labelCount = labels.size(); + int documentCount = documentFrequency.get(-1).intValue(); + + Multiset words = ConcurrentHashMultiset.create(); + + // extract words from text + TokenStream ts = analyzer.tokenStream("text", new StringReader(text)); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + int wordCount = 0; + while (ts.incrementToken()) { + if (termAtt.length() > 0) { + String word = ts.getAttribute(CharTermAttribute.class) + .toString(); + Integer wordId = dictionary.get(word); + // if the word is not in the dictionary, skip it + if (wordId != null) { + words.add(word); + wordCount++; + } + } + } + + ts.end(); + ts.close(); + // create vector wordId => weight using tfidf + Vector vector = new RandomAccessSparseVector(10000); + TFIDF tfidf = new TFIDF(); + for (Multiset.Entry entry : words.entrySet()) { + String word = entry.getElement(); + int count = entry.getCount(); + Integer wordId = dictionary.get(word); + Long freq = documentFrequency.get(wordId); + double tfIdfValue = tfidf.calculate(count, freq.intValue(), + wordCount, documentCount); + vector.setQuick(wordId, tfIdfValue); + } + // one score for each label + + Vector resultVector = classifier.classifyFull(vector); + double bestScore = -Double.MAX_VALUE; + int bestCategoryId = -1; + for (Element element : resultVector.all()) { + int categoryId = element.index(); + double score = element.get(); + if (score > bestScore) { + bestScore = score; + bestCategoryId = categoryId; + } + + } + + analyzer.close(); + return labels.get(bestCategoryId); + + } + + static void convertToSeq(String inputFileName, String outputDirName) + throws IOException { + Configuration configuration = new Configuration(); + FileSystem fs = FileSystem.get(configuration); + Writer writer = new SequenceFile.Writer(fs, configuration, new Path( + outputDirName + "/chunk-0"), Text.class, Text.class); + + BufferedReader reader = new BufferedReader( + new FileReader(inputFileName)); + Text key = new Text(); + Text value = new Text(); + while (true) { + String line = reader.readLine(); + if (line == null) { + break; + } + String[] tokens = line.split("\t", 3); + if (tokens.length != 3) { + // System.out.println("Skip line: " + line); + continue; + } + String category = tokens[0]; + String id = tokens[1]; + String message = tokens[2]; + key.set("/" + category + "/" + id); + value.set(message); + writer.append(key, value); + + } + reader.close(); + writer.close(); + + } + + public static void main(String args[]) throws Exception { + + // createModel("data/tweets-train.tsv"); + + // example + // String result=classify("how are you doing here bro"); + + // System.out.println(result); + } +} diff --git a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java new file mode 100644 index 0000000000..a74d4bebad --- /dev/null +++ b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to include only URLs which match an element in a given list of
+ * domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart
+ * (exclude URLs by host or domain).
+ */
+package org.apache.nutch.urlfilter.model;

From e243cc5e626106a4cd8dfca8d9c2ec93e9648560 Mon Sep 17 00:00:00 2001
From: Asitang Mishra
Date: Wed, 17 Jun 2015 09:14:37 -0700
Subject: [PATCH 2/5] patch 1.0 for NUTCH-2038

---
 .classpath | 855 -----------------------------------------------------
 .project   |  17 --
 2 files changed, 872 deletions(-)
 delete mode 100644 .classpath
 delete mode 100644 .project

diff --git a/.classpath b/.classpath
deleted file mode 100644
index 51cf515586..0000000000
--- a/.classpath
+++ /dev/null
@@ -1,855 +0,0 @@
[855 removed lines of Eclipse .classpath XML; the markup did not survive extraction]

diff --git a/.project b/.project
deleted file mode 100644
index 0978facd4c..0000000000
--- a/.project
+++ /dev/null
@@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-  <name>NUTCH-CLONE</name>
-  <comment></comment>
-  <projects>
-  </projects>
-  <buildSpec>
-    <buildCommand>
-      <name>org.eclipse.jdt.core.javabuilder</name>
-      <arguments>
-      </arguments>
-    </buildCommand>
-  </buildSpec>
-  <natures>
-    <nature>org.eclipse.jdt.core.javanature</nature>
-  </natures>
-</projectDescription>

From 711f44d8d4af51538ff1764145ac743445b6f43b Mon Sep 17 00:00:00 2001
From: Asitang Mishra
Date: Wed, 17 Jun 2015 09:35:28 -0700
Subject: [PATCH 3/5] patch 1.0 for NUTCH-2038

---
 src/java/org/apache/nutch/net/URLFilters.java | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java
index 7d793ffb9b..9ed436978a 100644
--- a/src/java/org/apache/nutch/net/URLFilters.java
+++ b/src/java/org/apache/nutch/net/URLFilters.java
@@ -23,44 +23,42 @@
 /** Creates and caches {@link URLFilter} implementing plugins. */
 public class URLFilters {
-  public static final String URLFILTER_ORDER = "urlfilter.order";
-  private URLFilter[] filters;
-  private URLFilter filter = null;
+  public static final String URLFILTER_ORDER = "urlfilter.order";
+  private URLFilter[] filters;
-  public URLFilters(Configuration conf) {
-    this.filters = (URLFilter[]) PluginRepository.get(conf)
-        .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID,
-            URLFILTER_ORDER);
-  }
+  public URLFilters(Configuration conf) {
+    this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+        URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER);
+  }
-  /** Run all defined filters. Assume logical AND. */
-  public String filter(String urlString) throws URLFilterException {
-    for (int i = 0; i < this.filters.length; i++) {
-      if (urlString == null)
-        return null;
-      urlString = this.filters[i].filter(urlString);
+  /** Run all defined filters. Assume logical AND. */
+  public String filter(String urlString) throws URLFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      if (urlString == null)
+        return null;
+      urlString = this.filters[i].filter(urlString);
-    }
-    return urlString;
-  }
+    }
+    return urlString;
+  }
+/**Get a filter with the full classname if only it is activated through the nutchsite.xml*/
+  public URLFilter getFilter(String pid) {
-  /**Get a filter with the full classname if only it is activated through the nutchsite.xml*/
-  public URLFilter getFilter(String pid) {
+    if (filter == null) {
-    if (filter == null) {
+      for (int i = 0; i < this.filters.length; i++) {
-      for (int i = 0; i < this.filters.length; i++) {
+        if (filters[i].getClass().getName().equals(pid)) {
-        if (filters[i].getClass().getName().equals(pid)) {
+          filter = filters[i];
+          break;
+        }
-          filter = filters[i];
-          break;
-        }
+      }
-      }
+    }
+    return filter;
-    }
-    return filter;
+  }
-  }
 }

From e0e924e15c247d3fa3dd92f387fe53ba7effd78a Mon Sep 17 00:00:00 2001
From: Asitang Mishra
Date: Thu, 18 Jun 2015 08:09:30 -0700
Subject: [PATCH 4/5] final commit for patch 1.0

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 1af3f31c5f..5b3c687303 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ conf/slaves
 build/
 runtime/
 logs/
+/bin/

From cca768bc1c790a976594136433485fe899465cb8 Mon Sep 17 00:00:00 2001
From: Asitang Mishra
Date: Fri, 19 Jun 2015 13:13:34 -0700
Subject: [PATCH 5/5] Patch 2.0 for NUTCH-2038

---
 conf/nutch-default.xml                        |  50 ++--
 src/java/org/apache/nutch/net/URLFilters.java |  13 +-
 .../nutch/parse/ModelURLFilterAbstract.java   |  45 +++-
 .../org/apache/nutch/parse/ParseSegment.java  |  96 ++++---
 .../nutch/urlfilter/model/ModelURLFilter.java | 170 +++++++------
 .../nutch/urlfilter/model/NBClassifier.java   | 234 ------------------
 .../urlfilter/model/NaiveBayesClassifier.java | 229 +++++++++++++++++
 .../nutch/urlfilter/model/package-info.java   |  10 +-
 8 files changed, 455 insertions(+), 392 deletions(-)
 delete mode 100644 src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java
 create mode 100644 src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NaiveBayesClassifier.java

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 92dd165419..867b87b413 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1135,28 +1135,6 @@
-<property>
-  <name>parser.modelfilter.trainfile</name>
-  <value>tweets-train.tsv</value>
-  <description>
-  </description>
-</property>
-
-<property>
-  <name>parser.modelfilter.dictionaryfile</name>
-  <value>wordlist.txt</value>
-  <description>
-  </description>
-</property>
-
-<property>
-  <name>parser.modelfilter</name>
-  <value>true</value>
-  <description>
-  </description>
-</property>
-
 <property>
   <name>parse.plugin.file</name>
   <value>parse-plugins.xml</value>
@@ -1280,6 +1258,34 @@
+<property>
+  <name>urlfilter.model.trainfile</name>
+  <value></value>
+  <description>Set the name of the file to be used for Naive Bayes training.
+The format is: each line contains two tab-separated parts (columns):
+1. "1" or "0": "1" for a relevant and "0" for an irrelevant document.
+2. The text that will be used for training.
+
+Each row is considered a new "document" by the classifier.
+  </description>
+</property>
+
+<property>
+  <name>urlfilter.model.wordlist</name>
+  <value></value>
+  <description>The name of the file to be used as a list of "hot words" to be
+matched in the URL by the model filter. The format is one word per line.
+  </description>
+</property>
+
+<property>
+  <name>urlfilter.model.filter</name>
+  <value>false</value>
+  <description>A boolean. Set it to true to enable the model filter.</description>
+</property>
+
 <property>
   <name>urlfilter.domain.file</name>
   <value>domain-urlfilter.txt</value>

diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java
index 9ed436978a..16051e53d4 100644
--- a/src/java/org/apache/nutch/net/URLFilters.java
+++ b/src/java/org/apache/nutch/net/URLFilters.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -25,6 +25,7 @@ public class URLFilters {
   public static final String URLFILTER_ORDER = "urlfilter.order";
   private URLFilter[] filters;
+  private URLFilter filter = null;
 
   public URLFilters(Configuration conf) {
     this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
@@ -41,14 +42,18 @@ public String filter(String urlString) throws URLFilterException {
     }
     return urlString;
   }
-/**Get a filter with the full classname if only it is activated through the nutchsite.xml*/
-  public URLFilter getFilter(String pid) {
+
+  /**
+   * Get a filter with the full classname, only if it is activated through the
+   * nutch-site.xml
+   */
+  public URLFilter getFilter(String classname) {
 
     if (filter == null) {
 
       for (int i = 0; i < this.filters.length; i++) {
 
-        if (filters[i].getClass().getName().equals(pid)) {
+        if (filters[i].getClass().getName().equals(classname)) {
 
           filter = filters[i];
           break;

diff --git a/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java b/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java
index 6c6bead2cc..58b647ee1d 100644
--- a/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java
+++ b/src/java/org/apache/nutch/parse/ModelURLFilterAbstract.java
@@ -1,12 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 package org.apache.nutch.parse;
 
 import org.apache.nutch.net.URLFilter;
 
-public abstract class ModelURLFilterAbstract implements URLFilter{
+/**
+ * An abstract class that exposes additional methods of a URL filter plugin
+ * to the core Nutch classes
+ */
+public abstract class ModelURLFilterAbstract implements URLFilter {
+
+  /** Classifies the given parse text; returns true if the text is relevant */
+  public abstract boolean filterParse(String text);
+
+  /**
+   * Can be called instead of the generic filter(String url) in any job other
+   * than the generator or injector, so that the generic function can be
+   * short-circuited, i.e. the filter will not run for the generator
+   */
+  public abstract boolean filterUrl(String url);
+
+  /**
+   * Configure the filter once before using the filtering functions, e.g.
+   * train the classifier once
+   */
+  public abstract void configure(String[] args) throws Exception;
-
-  public abstract void filterParse(String text);
-  public abstract boolean filterUrl(String url) ;
-  public abstract void configure(String[] args) ;
-
 }

diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index 08472b8b48..1c3ae552c9 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -34,7 +34,6 @@ import org.apache.nutch.protocol.*; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; - import org.apache.nutch.util.*; import org.apache.hadoop.fs.Path; @@ -57,11 +56,11 @@ public class ParseSegment extends NutchTool implements Tool, private ParseUtil parseUtil; private boolean skipTruncated; - - public static final String PARSER_MODELFILTER="parser.modelfilter"; - public static final String TRAINFILE_MODELFILTER="parser.modelfilter.trainfile"; - public static final String DICTFILE_MODELFILTER="parser.modelfilter.dictionaryfile"; - + + public static final String PARSER_MODELFILTER = "urlfilter.model.filter"; + public static final String TRAINFILE_MODELFILTER = "urlfilter.model.trainfile"; + public static final String DICTFILE_MODELFILTER = "urlfilter.model.wordlist"; + private boolean filterflag; private URLFilters filters; private ModelURLFilterAbstract filter; @@ -80,16 +79,33 @@ public void configure(JobConf job) { skipTruncated = job.getBoolean(SKIP_TRUNCATED, true); filterflag = job.getBoolean(PARSER_MODELFILTER, true); - if(filterflag){ - String[] args=new String[2]; - args[0]=getConf().get(TRAINFILE_MODELFILTER); - args[1]=getConf().get(DICTFILE_MODELFILTER); - - filters = new URLFilters(job); - filter=(ModelURLFilterAbstract) filters.getFilter("org.apache.nutch.urlfilter.model.ModelURLFilter"); - filter.configure(args); - - } + if (filterflag) { + String[] args = new String[2]; + args[0] = getConf().get(TRAINFILE_MODELFILTER); + args[1] = getConf().get(DICTFILE_MODELFILTER); + + if (args[0] == null || args[0].trim().length() == 0 || args[1] == null + || args[1].trim().length() == 0) { + String message = "Model URLFilter: trainfile or wordlist not set in the urlfilter.model.trainfile or urlfilter.model.wordlist"; + if (LOG.isErrorEnabled()) { + filterflag = false; + LOG.error(message); + } + throw new IllegalArgumentException(message); + } else { + try { + filters = new URLFilters(job); + filter = (ModelURLFilterAbstract) filters + .getFilter("org.apache.nutch.urlfilter.model.ModelURLFilter"); + filter.configure(args); + } catch (Exception e) { + // TODO: handle exception + LOG.warn("There was some problem while getting the model filter or training it. 
Not using the filter"); + filterflag = false; + } + + } + } } public void close() { @@ -162,28 +178,36 @@ public void map(WritableComparable key, Content content, } } -if(filterflag){ - - - - filter.filterParse(parse.getText()); - - ArrayList tempOutlinks= new ArrayList(); - Outlink[] out=null; - for(int i=0;i tempOutlinks = new ArrayList(); + Outlink[] out = null; + for (int i = 0; i < parse.getData().getOutlinks().length; i++) { + LOG.info("ModelURLFilter: Outlink to check:: " + + parse.getData().getOutlinks()[i].getToUrl()); + if (filter.filterUrl(parse.getData().getOutlinks()[i].getToUrl())) { + tempOutlinks.add(parse.getData().getOutlinks()[i]); + LOG.info("ModelURLFilter: found relevent"); + + } else { + LOG.info("ModelURLFilter: found irrelevent"); + } } + out = new Outlink[tempOutlinks.size()]; + for (int i = 0; i < tempOutlinks.size(); i++) { + out[i] = tempOutlinks.get(i); } - out=new Outlink[tempOutlinks.size()]; - for(int i=0;i wordlist = new ArrayList(); - - public ModelURLFilter() throws Exception { + private static final Logger LOG = LoggerFactory + .getLogger(ModelURLFilter.class); - } + private Configuration conf; + private String inputFilePath; + private String dictionaryFile; + private ArrayList wordlist = new ArrayList(); - public void configure(String[] args) { + public ModelURLFilter() throws Exception { - inputFilePath = args[0]; - dictionaryFile = args[1]; - BufferedReader br = null; + } - try { + public void configure(String[] args) throws Exception { - String CurrentLine; + inputFilePath = args[0]; + dictionaryFile = args[1]; + BufferedReader br = null; - Reader reader = conf.getConfResourceAsReader(dictionaryFile); - br = new BufferedReader(reader); - while ((CurrentLine = br.readLine()) != null) { - wordlist.add(CurrentLine); - } + try { - } catch (IOException e) { + String CurrentLine; - e.printStackTrace(); - } finally { - try { - if (br != null) - br.close(); - } catch (IOException ex) { - ex.printStackTrace(); - } - } + Reader reader = conf.getConfResourceAsReader(dictionaryFile); + br = new BufferedReader(reader); + while ((CurrentLine = br.readLine()) != null) { + wordlist.add(CurrentLine); + } - try { + } catch (IOException e) { + LOG.error("Error occured while reading the wordlist"); + throw new Exception("Error occured while reading the wordlist"); + } finally { + try { + if (br != null) + br.close(); + } catch (IOException ex) { + ex.printStackTrace(); + } + } - train(); - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } + try { - public void filterParse(String text) { + train(); + } catch (Exception e) { + // TODO Auto-generated catch block + LOG.error("Error occured while training"); + throw new Exception("Error occured while training"); + } + } - try { - relevent = classify(text); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } + public boolean filterParse(String text) { - } + try { + return classify(text); + } catch (IOException e) { + // TODO Auto-generated catch block + LOG.error("Error occured while classifying:: " + text); - public boolean filterUrl(String url) { + } - if (!relevent) { - if (!containsWord(url, wordlist)) { - return false; - } - } + return false; + } - return true; - } + public boolean filterUrl(String url) { - public String filter(String url) { + return containsWord(url, wordlist); - return url; + } - } + public String filter(String url) { - public boolean classify(String text) throws IOException { + return url; - // if classified as relevent 
"1" then return true - if (NBClassifier.classify(text).equals("1")) - return true; - return false; - } + } - public void train() throws Exception { + public boolean classify(String text) throws IOException { - // check if the model file exists, if it does then don't train - NBClassifier.createModel(inputFilePath); + // if classified as relevent "1" then return true + if (NaiveBayesClassifier.classify(text).equals("1")) + return true; + return false; + } - } + public void train() throws Exception { + // check if the model file exists, if it does then don't train + if (!FileSystem.get(conf).exists(new Path("model"))) { + LOG.info("Training the Naive Bayes Model"); + NaiveBayesClassifier.createModel(inputFilePath); + } else { + LOG.info("Model already exists. Skipping training."); + } + } - public boolean containsWord(String url, ArrayList wordlist) { - for (String word : wordlist) { - if (url.contains(word)) { - return true; - } - } + public boolean containsWord(String url, ArrayList wordlist) { + for (String word : wordlist) { + if (url.contains(word)) { + return true; + } + } - return false; - } + return false; + } - public void setConf(Configuration conf) { - this.conf = conf; + public void setConf(Configuration conf) { + this.conf = conf; - } + } - public Configuration getConf() { - return this.conf; - } + public Configuration getConf() { + return this.conf; + } } diff --git a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java deleted file mode 100644 index 714aaa4164..0000000000 --- a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NBClassifier.java +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.urlfilter.model; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.SequenceFile.Writer; -import org.apache.hadoop.io.Text; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.Version; -import org.apache.mahout.classifier.naivebayes.BayesUtils; -import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; -import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; -import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob; -import org.apache.mahout.common.Pair; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; -import org.apache.mahout.math.RandomAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.Vector.Element; -import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles; -import org.apache.mahout.vectorizer.TFIDF; - -import com.google.common.collect.ConcurrentHashMultiset; -import com.google.common.collect.Multiset; - -public class NBClassifier { - - public static Map readDictionnary(Configuration conf, - Path dictionnaryPath) { - Map dictionnary = new HashMap(); - for (Pair pair : new SequenceFileIterable( - dictionnaryPath, true, conf)) { - dictionnary.put(pair.getFirst().toString(), pair.getSecond().get()); - } - return dictionnary; - } - - public static Map readDocumentFrequency(Configuration conf, - Path documentFrequencyPath) { - Map documentFrequency = new HashMap(); - for (Pair pair : new SequenceFileIterable( - documentFrequencyPath, true, conf)) { - documentFrequency - .put(pair.getFirst().get(), pair.getSecond().get()); - } - return documentFrequency; - } - - public static void createModel(String inputTrainFilePath) throws Exception { - - String[] args1 = new String[4]; - - args1[0] = "-i"; - args1[1] = "outseq"; - args1[2] = "-o"; - args1[3] = "vectors"; - - String[] args2 = new String[9]; - - args2[0] = "-i"; - args2[1] = "vectors/tfidf-vectors"; - args2[2] = "-el"; - args2[3] = "-li"; - args2[4] = "labelindex"; - args2[5] = "-o"; - args2[6] = "model"; - args2[7] = "-ow"; - args2[8] = "-c"; - - convertToSeq(inputTrainFilePath, "outseq"); - - SparseVectorsFromSequenceFiles.main(args1); - - TrainNaiveBayesJob.main(args2); - } - - public static String classify(String text) throws IOException { - return classify(text, "model", "labelindex", - "vectors/dictionary.file-0", "vectors/df-count/part-r-00000"); - } - - public static String classify(String text, String modelPath, - String labelIndexPath, String dictionaryPath, - String documentFrequencyPath) throws IOException { - - Configuration configuration = new Configuration(); - - // model is a matrix (wordId, labelId) => probability score - NaiveBayesModel model = NaiveBayesModel.materialize( - new Path(modelPath), configuration); - - StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier( - model); - - // labels is a map label => classId - Map labels = 
BayesUtils.readLabelIndex(configuration, - new Path(labelIndexPath)); - Map dictionary = readDictionnary(configuration, - new Path(dictionaryPath)); - Map documentFrequency = readDocumentFrequency( - configuration, new Path(documentFrequencyPath)); - - // analyzer used to extract word from text - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - // int labelCount = labels.size(); - int documentCount = documentFrequency.get(-1).intValue(); - - Multiset words = ConcurrentHashMultiset.create(); - - // extract words from text - TokenStream ts = analyzer.tokenStream("text", new StringReader(text)); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - int wordCount = 0; - while (ts.incrementToken()) { - if (termAtt.length() > 0) { - String word = ts.getAttribute(CharTermAttribute.class) - .toString(); - Integer wordId = dictionary.get(word); - // if the word is not in the dictionary, skip it - if (wordId != null) { - words.add(word); - wordCount++; - } - } - } - - ts.end(); - ts.close(); - // create vector wordId => weight using tfidf - Vector vector = new RandomAccessSparseVector(10000); - TFIDF tfidf = new TFIDF(); - for (Multiset.Entry entry : words.entrySet()) { - String word = entry.getElement(); - int count = entry.getCount(); - Integer wordId = dictionary.get(word); - Long freq = documentFrequency.get(wordId); - double tfIdfValue = tfidf.calculate(count, freq.intValue(), - wordCount, documentCount); - vector.setQuick(wordId, tfIdfValue); - } - // one score for each label - - Vector resultVector = classifier.classifyFull(vector); - double bestScore = -Double.MAX_VALUE; - int bestCategoryId = -1; - for (Element element : resultVector.all()) { - int categoryId = element.index(); - double score = element.get(); - if (score > bestScore) { - bestScore = score; - bestCategoryId = categoryId; - } - - } - - analyzer.close(); - return labels.get(bestCategoryId); - - } - - static void convertToSeq(String inputFileName, String outputDirName) - throws IOException { - Configuration configuration = new Configuration(); - FileSystem fs = FileSystem.get(configuration); - Writer writer = new SequenceFile.Writer(fs, configuration, new Path( - outputDirName + "/chunk-0"), Text.class, Text.class); - - BufferedReader reader = new BufferedReader( - new FileReader(inputFileName)); - Text key = new Text(); - Text value = new Text(); - while (true) { - String line = reader.readLine(); - if (line == null) { - break; - } - String[] tokens = line.split("\t", 3); - if (tokens.length != 3) { - // System.out.println("Skip line: " + line); - continue; - } - String category = tokens[0]; - String id = tokens[1]; - String message = tokens[2]; - key.set("/" + category + "/" + id); - value.set(message); - writer.append(key, value); - - } - reader.close(); - writer.close(); - - } - - public static void main(String args[]) throws Exception { - - // createModel("data/tweets-train.tsv"); - - // example - // String result=classify("how are you doing here bro"); - - // System.out.println(result); - } -} diff --git a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NaiveBayesClassifier.java b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NaiveBayesClassifier.java new file mode 100644 index 0000000000..dc0f1f6f0e --- /dev/null +++ b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/NaiveBayesClassifier.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.urlfilter.model; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.SequenceFile.Writer; +import org.apache.hadoop.io.Text; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.Version; +import org.apache.mahout.classifier.naivebayes.BayesUtils; +import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; +import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; +import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob; +import org.apache.mahout.common.Pair; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable; +import org.apache.mahout.math.RandomAccessSparseVector; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.Vector.Element; +import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles; +import org.apache.mahout.vectorizer.TFIDF; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.ConcurrentHashMultiset; +import com.google.common.collect.Multiset; + +public class NaiveBayesClassifier { + + private static final Logger LOG = LoggerFactory + .getLogger(NaiveBayesClassifier.class); + + public static Map readDictionnary(Configuration conf, + Path dictionnaryPath) { + Map dictionnary = new HashMap(); + for (Pair pair : new SequenceFileIterable( + dictionnaryPath, true, conf)) { + dictionnary.put(pair.getFirst().toString(), pair.getSecond().get()); + } + return dictionnary; + } + + public static Map readDocumentFrequency(Configuration conf, + Path documentFrequencyPath) { + Map documentFrequency = new HashMap(); + for (Pair pair : new SequenceFileIterable( + documentFrequencyPath, true, conf)) { + documentFrequency.put(pair.getFirst().get(), pair.getSecond().get()); + } + return documentFrequency; + } + + public static void createModel(String inputTrainFilePath) throws Exception { + + String[] args1 = new String[4]; + + args1[0] = "-i"; + args1[1] = "outseq"; + args1[2] = "-o"; + args1[3] = "vectors"; + + String[] args2 = new String[9]; + + args2[0] = "-i"; + args2[1] = "vectors/tfidf-vectors"; + args2[2] = "-el"; + args2[3] = "-li"; + args2[4] = "labelindex"; + args2[5] = "-o"; + args2[6] = "model"; + args2[7] = "-ow"; + args2[8] = "-c"; + + 
convertToSeq(inputTrainFilePath, "outseq"); + + SparseVectorsFromSequenceFiles.main(args1); + + TrainNaiveBayesJob.main(args2); + } + + public static String classify(String text) throws IOException { + return classify(text, "model", "labelindex", "vectors/dictionary.file-0", + "vectors/df-count/part-r-00000"); + } + + public static String classify(String text, String modelPath, + String labelIndexPath, String dictionaryPath, String documentFrequencyPath) + throws IOException { + + Configuration configuration = new Configuration(); + + // model is a matrix (wordId, labelId) => probability score + NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), + configuration); + + StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier( + model); + + // labels is a map label => classId + Map labels = BayesUtils.readLabelIndex(configuration, + new Path(labelIndexPath)); + Map dictionary = readDictionnary(configuration, new Path( + dictionaryPath)); + Map documentFrequency = readDocumentFrequency(configuration, + new Path(documentFrequencyPath)); + + // analyzer used to extract word from text + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + // int labelCount = labels.size(); + int documentCount = documentFrequency.get(-1).intValue(); + + Multiset words = ConcurrentHashMultiset.create(); + + // extract words from text + TokenStream ts = analyzer.tokenStream("text", new StringReader(text)); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + int wordCount = 0; + while (ts.incrementToken()) { + if (termAtt.length() > 0) { + String word = ts.getAttribute(CharTermAttribute.class).toString(); + Integer wordId = dictionary.get(word); + // if the word is not in the dictionary, skip it + if (wordId != null) { + words.add(word); + wordCount++; + } + } + } + + ts.end(); + ts.close(); + // create vector wordId => weight using tfidf + Vector vector = new RandomAccessSparseVector(10000); + TFIDF tfidf = new TFIDF(); + for (Multiset.Entry entry : words.entrySet()) { + String word = entry.getElement(); + int count = entry.getCount(); + Integer wordId = dictionary.get(word); + Long freq = documentFrequency.get(wordId); + double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, + documentCount); + vector.setQuick(wordId, tfIdfValue); + } + // one score for each label + + Vector resultVector = classifier.classifyFull(vector); + double bestScore = -Double.MAX_VALUE; + int bestCategoryId = -1; + for (Element element : resultVector.all()) { + int categoryId = element.index(); + double score = element.get(); + if (score > bestScore) { + bestScore = score; + bestCategoryId = categoryId; + } + + } + + analyzer.close(); + return labels.get(bestCategoryId); + + } + + static void convertToSeq(String inputFileName, String outputDirName) + throws IOException { + Configuration configuration = new Configuration(); + FileSystem fs = FileSystem.get(configuration); + Writer writer = new SequenceFile.Writer(fs, configuration, new Path( + outputDirName + "/chunk-0"), Text.class, Text.class); + BufferedReader reader = null; + reader = new BufferedReader( + configuration.getConfResourceAsReader(inputFileName)); + Text key = new Text(); + Text value = new Text(); + long uniqueid=0; + while (true) { + uniqueid++; + String line = reader.readLine(); + if (line == null) { + break; + } + String[] tokens = line.split("\t", 2); + if (tokens.length != 2) { + continue; + } + String category = tokens[0]; + String id = ""+uniqueid; + String message = 
tokens[1];
+      key.set("/" + category + "/" + id);
+      value.set(message);
+      writer.append(key, value);
+
+    }
+    reader.close();
+    writer.close();
+
+  }
+
+}

diff --git a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java
index a74d4bebad..fb59e507d8 100644
--- a/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java
+++ b/src/plugin/urlfilter-model/src/java/org/apache/nutch/urlfilter/model/package-info.java
@@ -16,10 +16,12 @@
  */
 
 /**
- * URL filter plugin to include only URLs which match an element in a given list of
- * domain suffixes, domain names, and/or host names.
- * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart
- * (exclude URLs by host or domain).
+ * URL filter plugin with a two-tier filtering architecture.
+ * The filter is called from the parser and looks at the page that was just parsed:
+ * it runs a Naive Bayes classification on the page text and decides whether the
+ * page is relevant. If the page is relevant, all of its outlinks pass; if not, a
+ * second check kicks in, which looks for "hot words" (from a user-provided wordlist)
+ * in each outlink URL itself, and only outlinks that match a hot word are let through.
  */
 package org.apache.nutch.urlfilter.model;
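
---

Usage note (illustrative, not part of the diffs above): with patch 2.0 applied, the
filter is configured through the three urlfilter.model.* properties added to
nutch-default.xml, and the plugin must also be enabled through plugin.includes in
nutch-site.xml. A minimal sketch, assuming the plugin id is "urlfilter-model"
(taken from the plugin directory name; the plugin.xml content did not survive in
this copy of the patch, so the exact id is an assumption, as is the rest of the
plugin list shown):

    <!-- nutch-site.xml (sketch) -->
    <property>
      <name>plugin.includes</name>
      <!-- append urlfilter-model to whatever plugin list is already configured -->
      <value>protocol-http|urlfilter-(regex|model)|parse-(html|tika)|index-(basic|anchor)|scoring-opic</value>
    </property>
    <property>
      <name>urlfilter.model.filter</name>
      <value>true</value>
    </property>
    <property>
      <name>urlfilter.model.trainfile</name>
      <value>tweets-train.tsv</value>
    </property>
    <property>
      <name>urlfilter.model.wordlist</name>
      <value>wordlist.txt</value>
    </property>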
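
Per the urlfilter.model.trainfile description, the patch 2.0 training format is one
document per line with two tab-separated columns: the label first ("1" relevant,
"0" irrelevant) and the text second; NaiveBayesClassifier.convertToSeq() splits
each line with split("\t", 2) and generates its own document ids. The file contents
below are invented examples for illustration only:

    tweets-train.tsv (columns are tab-separated):
    1	flooding reported along the river, evacuation routes are open
    1	volunteers needed for hurricane relief at the coastal shelters
    0	huge weekend sale on sneakers and running shoes

    wordlist.txt (one "hot word" per line; ModelURLFilter.containsWord() does a
    substring match of each word against the outlink URL):
    flood
    hurricane
    relief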