Permalink
Browse files

NUTCH-140, parse-plugin.xml can now use extension-id and plugin-id

git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@379403 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 1c21856 commit 1fdab8292b7e0acf2f83adec49ec7b4369a771d7 Jerome Charron committed Feb 21, 2006
Showing with 304 additions and 247 deletions.
  1. +7 −2 conf/parse-plugins.dtd
  2. +29 −0 conf/parse-plugins.xml
  3. +20 −6 src/java/org/apache/nutch/parse/ParsePluginList.java
  4. +56 −18 src/java/org/apache/nutch/parse/ParsePluginsReader.java
  5. +17 −14 src/java/org/apache/nutch/parse/ParseUtil.java
  6. +92 −130 src/java/org/apache/nutch/parse/ParserFactory.java
  7. +1 −1 src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
  8. +2 −5 src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
  9. +3 −7 src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
  10. +2 −2 src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
  11. +2 −2 src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
  12. +1 −1 src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
  13. +4 −2 ...lugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
  14. +1 −1 src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
  15. +1 −1 src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
  16. +1 −1 src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
  17. +1 −1 src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
  18. +1 −1 src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
  19. +11 −18 src/test/org/apache/nutch/parse/TestParserFactory.java
  20. +52 −34 src/test/org/apache/nutch/parse/parse-plugin-test.xml
View
9 conf/parse-plugins.dtd
@@ -1,7 +1,12 @@
-<!ELEMENT parse-plugins (mimeType+)>
+<!ELEMENT parse-plugins (mimeType+,aliases)>
<!ELEMENT mimeType (plugin+)>
<!ATTLIST mimeType name CDATA #REQUIRED>
<!ELEMENT plugin EMPTY>
<!ATTLIST plugin id CDATA #REQUIRED>
-<!ATTLIST plugin order CDATA ''>
+<!ATTLIST plugin order CDATA ''>
+
+<!ELEMENT aliases (alias+)>
+<!ELEMENT alias EMPTY>
+<!ATTLIST alias name CDATA #REQUIRED>
+<!ATTLIST alias extension-id CDATA #REQUIRED>
View
29 conf/parse-plugins.xml
@@ -218,4 +218,33 @@
<plugin id="parse-ext" />
</mimeType>
+ <!-- alias mappings for parse-xxx names to the actual extension implementation
+ ids described in each plugin's plugin.xml file -->
+ <aliases>
+ <alias name="parse-ext" extension-id="ExtParser" />
+ <alias name="parse-html"
+ extension-id="org.apache.nutch.parse.html.HtmlParser" />
+ <alias name="parse-js" extension-id="JSParser" />
+ <alias name="parse-mp3"
+ extension-id="org.apache.nutch.parse.mp3.MP3Parser" />
+ <alias name="parse-msexcel"
+ extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />
+ <alias name="parse-mspowerpoint"
+ extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" />
+ <alias name="parse-msword"
+ extension-id="org.apache.nutch.parse.msword.MSWordParser" />
+ <alias name="parse-pdf"
+ extension-id="org.apache.nutch.parse.pdf.PdfParser" />
+ <alias name="parse-rss"
+ extension-id="org.apache.nutch.parse.rss.RSSParser" />
+ <alias name="parse-rtf"
+ extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" />
+ <alias name="parse-swf"
+ extension-id="org.apache.nutch.parse.swf.SWFParser" />
+ <alias name="parse-text"
+ extension-id="org.apache.nutch.parse.text.TextParser" />
+ <alias name="parse-zip"
+ extension-id="org.apache.nutch.parse.zip.ZipParser" />
+ </aliases>
+
</parse-plugins>
View
26 src/java/org/apache/nutch/parse/ParsePluginList.java
@@ -19,6 +19,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
+import java.util.Map;
/**
@@ -30,27 +31,40 @@
* @author mattmann
* @version 1.0
*/
-public class ParsePluginList {
+class ParsePluginList {
/* a map to link mimeType to an ordered list of parsing plugins */
- private HashMap fMimeTypeToPluginMap = null;
+ private Map fMimeTypeToPluginMap = null;
+
+ /* Maps plugin alias names (e.g. parse-html) to extension ids */
+ private Map aliases = null;
+
/**
* Constructs a new ParsePluginList
*/
- public ParsePluginList() {
+ ParsePluginList() {
fMimeTypeToPluginMap = new HashMap();
+ aliases = new HashMap();
}
- public List getPluginList(String mimeType) {
+ List getPluginList(String mimeType) {
return (List) fMimeTypeToPluginMap.get(mimeType);
}
+
+ void setAliases(Map aliases) {
+ this.aliases = aliases;
+ }
+
+ Map getAliases() {
+ return aliases;
+ }
- public void setPluginList(String mimeType, List l) {
+ void setPluginList(String mimeType, List l) {
fMimeTypeToPluginMap.put(mimeType, l);
}
- public List getSupportedMimeTypes() {
+ List getSupportedMimeTypes() {
return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray(
new String[] {}));
}
View
74 src/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -16,12 +16,14 @@
package org.apache.nutch.parse;
// JDK imports
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Vector;
+import java.util.Map;
import java.util.logging.Logger;
-import java.io.InputStream;
-import java.net.URL;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
@@ -53,7 +55,7 @@
/** The property name of the parse-plugins location */
private static final String PP_FILE_PROP = "parse.plugin.file";
- /* the parse-plugins file */
+ /** the parse-plugins file */
private String fParsePluginsFile = null;
@@ -111,8 +113,12 @@ public ParsePluginList parse(Configuration conf) {
Element parsePlugins = document.getDocumentElement();
+ // build up the alias hash map
+ Map aliases = getAliases(parsePlugins);
+ // And store it on the parse plugin list
+ pList.setAliases(aliases);
+
// get all the mime type nodes
-
NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
// iterate through the mime types
@@ -125,30 +131,29 @@ public ParsePluginList parse(Configuration conf) {
// iterate through the plugins, add them in order read
// OR if they have a special order="" attribute, then hold those in
- // a
- // separate list, and then insert them into the final list at the
- // order
- // specified
-
+ // a separate list, and then insert them into the final list at the
+ // order specified
if (pluginList != null && pluginList.getLength() > 0) {
- List plugList = new Vector(pluginList.getLength());
+ List plugList = new ArrayList(pluginList.getLength());
- for (int j = 0; j < pluginList.getLength(); j++) {
+ for (int j = 0; j<pluginList.getLength(); j++) {
Element plugin = (Element) pluginList.item(j);
String pluginId = plugin.getAttribute("id");
-
+ String extId = (String) aliases.get(pluginId);
+ if (extId == null) {
+ // Assume an extension id is directly specified
+ extId = pluginId;
+ }
String orderStr = plugin.getAttribute("order");
int order = -1;
-
try {
order = Integer.parseInt(orderStr);
} catch (NumberFormatException ignore) {
}
-
if (order != -1) {
- plugList.add(order - 1, pluginId);
+ plugList.add(order - 1, extId);
} else {
- plugList.add(pluginId);
+ plugList.add(extId);
}
}
@@ -202,7 +207,7 @@ public static void main(String[] args) throws Exception {
System.out.println("MIMETYPE: " + mimeType);
List plugList = prefs.getPluginList(mimeType);
- System.out.println("PLUGINS:");
+ System.out.println("EXTENSION IDs:");
for (Iterator j = plugList.iterator(); j.hasNext();) {
System.out.println((String) j.next());
@@ -226,4 +231,37 @@ public void setFParsePluginsFile(String parsePluginsFile) {
fParsePluginsFile = parsePluginsFile;
}
+ private Map getAliases(Element parsePluginsRoot) {
+
+ Map aliases = new HashMap();
+ NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
+
+ if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) {
+ LOG.warning("No aliases defined in parse-plugins.xml!");
+ return aliases;
+ }
+
+ if (aliasRoot.getLength() > 1) {
+ // log a warning, but try and continue processing
+ LOG.warning("There should only be one \"aliases\" tag in parse-plugins.xml");
+ }
+
+ Element aliasRootElem = (Element)aliasRoot.item(0);
+ NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");
+
+ if (aliasElements != null && aliasElements.getLength() > 0) {
+ for (int i=0; i<aliasElements.getLength(); i++) {
+ Element aliasElem = (Element)aliasElements.item(i);
+ String parsePluginId = aliasElem.getAttribute("name");
+ String extensionId = aliasElem.getAttribute("extension-id");
+ LOG.finest("Found alias: plugin-id: " + parsePluginId +
+ ", extension-id: " + extensionId);
+ if (parsePluginId != null && extensionId != null) {
+ aliases.put(parsePluginId, extensionId);
+ }
+ }
+ }
+ return aliases;
+ }
+
}
View
31 src/java/org/apache/nutch/parse/ParseUtil.java
@@ -36,8 +36,8 @@
public class ParseUtil {
/* our log stream */
- public static final Logger LOG = LogFormatter.getLogger(ParseUtil.class
- .getName());
+ public static final Logger LOG =
+ LogFormatter.getLogger(ParseUtil.class.getName());
private Configuration conf;
private ParserFactory parserFactory;
@@ -84,33 +84,36 @@ public Parse parse(Content content) throws ParseException {
" of type " + content.getContentType());
ParseStatus ps = (parse.getData() != null) ? parse.getData().getStatus() : null;
- return (ps == null) ? new ParseStatus().getEmptyParse(this.conf) : ps.getEmptyParse(this.conf);
+ return (ps == null) ? new ParseStatus().getEmptyParse(this.conf)
+ : ps.getEmptyParse(this.conf);
}
-
+
/**
* Method parses a {@link Content} object using the {@link Parser} specified
- * by the parameter <code>parserId</code>. If a suitable {@link Parser} is not
- * found, then a <code>WARNING</code> level message is logged, and a
- * ParseException is thrown.
- * If the parse is uncessful for any other reason, then a <code>WARNING</code>
- * level message is logged, and a <code>ParseStatus.getEmptyParse() is
+ * by the parameter <code>extId</code>, i.e., the Parser's extension ID.
+ * If a suitable {@link Parser} is not found, then a <code>WARNING</code>
+ * level message is logged, and a ParseException is thrown. If the parse is
+ * unsuccessful for any other reason, then a <code>WARNING</code> level
+ * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
* returned.
*
- * @param parserId The ID of the {@link Parser} to use to parse the specified
- * content.
+ * @param extId The extension implementation ID of the {@link Parser} to use
+ * to parse the specified content.
* @param content The content to parse.
+ *
* @return A {@link Parse} object if the parse is successful, otherwise,
* a <code>ParseStatus.getEmptyParse()</code>.
+ *
* @throws ParseException If there is no suitable {@link Parser} found
* to perform the parse.
*/
- public Parse parseByParserId(String parserId, Content content)
+ public Parse parseByExtensionId(String extId, Content content)
throws ParseException {
Parse parse = null;
Parser p = null;
try {
- p = this.parserFactory.getParserById(parserId);
+ p = this.parserFactory.getParserById(extId);
} catch (ParserNotFound e) {
LOG.warning("No suitable parser found when trying to parse content " +
content);
@@ -126,6 +129,6 @@ public Parse parseByParserId(String parserId, Content content)
" of type " + content.getContentType());
return new ParseStatus().getEmptyParse(this.conf);
}
- }
+ }
}
View
222 src/java/org/apache/nutch/parse/ParserFactory.java
@@ -16,9 +16,11 @@
package org.apache.nutch.parse;
// JDK imports
+import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import java.util.Vector;
import java.util.logging.Logger;
@@ -32,6 +34,7 @@
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
+
/** Creates and caches {@link Parser} plugins.*/
public final class ParserFactory {
@@ -63,43 +66,6 @@ public ParserFactory(Configuration conf) {
}
}
-
- /**
- * Returns the appropriate {@link Parser} implementation given a content type
- * and url.
- *
- * @deprecated Since the addition of NUTCH-88, this method is replaced by
- * taking the highest priority {@link Parser} returned from
- * {@link #getParsers(String, String)}.
- *
- * Parser extensions should define the attributes "contentType" and/or
- * "pathSuffix". Content type has priority: the first plugin found whose
- * "contentType" attribute matches the beginning of the content's type is
- * used. If none match, then the first whose "pathSuffix" attribute matches
- * the end of the url's path is used. If neither of these match, then the
- * first plugin whose "pathSuffix" is the empty string is used.
- */
- public Parser getParser(String contentType, String url)
- throws ParserNotFound {
-
- Parser[] parsers = getParsers(contentType, url);
-
- if(parsers != null){
- //give the user the highest priority parser available
- for(int i = 0; i < parsers.length; i++ ){
- Parser p = parsers[i];
- if(p != null){
- return p;
- }
- }
-
- throw new ParserNotFound(url, contentType);
-
- }
- else{
- throw new ParserNotFound(url, contentType);
- }
- }
/**
* Function returns an array of {@link Parser}s for a given content type.
@@ -150,11 +116,11 @@ public Parser getParser(String contentType, String url)
Parser p = null;
try {
//check to see if we've cached this parser instance yet
- p = (Parser) this.conf.getObject(ext.getDescriptor().getPluginId());
+ p = (Parser) this.conf.getObject(ext.getId());
if (p == null) {
// go ahead and instantiate it and then cache it
p = (Parser) ext.getExtensionInstance();
- this.conf.setObject(ext.getDescriptor().getPluginId(),p);
+ this.conf.setObject(ext.getId(),p);
}
parsers.add(p);
} catch (PluginRuntimeException e) {
@@ -168,79 +134,79 @@ public Parser getParser(String contentType, String url)
}
return (Parser[]) parsers.toArray(new Parser[]{});
}
-
+
/**
- * <p>
* Function returns a {@link Parser} instance with the specified
- * <code>parserId</code>. If the Parser instance isn't found, then the
- * function throws a <code>ParserNotFound</code> exception. If the function
- * is able to find the {@link Parser} in the internal
- * <code>PARSER_CACHE</code> then it will return the already instantiated
- * Parser. Otherwise, if it has to instantiate the Parser itself , then this
- * function will cache that Parser in the internal <code>PARSER_CACHE</code>.
+ * <code>id</code>, which is either its extension ID or an alias of it
+ * declared in parse-plugins.xml. If the Parser instance isn't found, then
+ * the function throws a <code>ParserNotFound</code> exception. If the
+ * function is able to find the {@link Parser} cached in the configuration,
+ * then it will return the already instantiated Parser. Otherwise, if it has
+ * to instantiate the Parser itself, then this function will cache that
+ * Parser in the configuration under its extension ID.
*
- * @param parserId
- * The string ID (e.g., "parse-text", "parse-msword") of the
- * {@link Parser} implementation to return.
+ * @param id The string extension ID (e.g.,
+ * "org.apache.nutch.parse.rss.RSSParser",
+ * "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser}
+ * implementation to return, or an alias of it (e.g., "parse-rss").
* @return A {@link Parser} implementation specified by the parameter
- * <code>parserId</code>.
- * @throws ParserNotFound
- * If the Parser is not found (i.e., registered with the extension
- * point), or if the there a {@link PluginRuntimeException}
- * instantiating the {@link Parser}.
+ * <code>extId</code>.
+ * @throws ParserNotFound If the Parser is not found (i.e., registered with
+ * the extension point), or if there is a
+ * {@link PluginRuntimeException} instantiating the {@link Parser}.
*/
- public Parser getParserById(String parserId) throws ParserNotFound {
- // first check the cache
+ public Parser getParserById(String id) throws ParserNotFound {
- if (this.conf.getObject(parserId) != null) {
- return (Parser) this.conf.getObject(parserId);
- } else {
- // get the list of registered parsing extensions
- // then find the right one by Id
+ Extension[] extensions = this.extensionPoint.getExtensions();
+ Extension parserExt = null;
- Extension[] extensions = this.extensionPoint.getExtensions();
- Extension parserExt = getExtensionById(extensions, parserId);
+ if (id != null) {
+ parserExt = getExtension(extensions, id);
+ }
+ if (parserExt == null) {
+ parserExt = getExtensionFromAlias(extensions, id);
+ }
- if (parserExt == null) {
- throw new ParserNotFound("No Parser Found for parserId: " + parserId
- + "!");
- } else {
- // instantiate the Parser
- try {
- Parser p = null;
- p = (Parser) parserExt.getExtensionInstance();
- this.conf.setObject(parserId, p);
- return p;
- } catch (PluginRuntimeException e) {
- LOG.warning("ParserFactory:PluginRuntimeException when "
- + "initializing parser plugin "
- + parserExt.getDescriptor().getPluginId()
- + " instance in getParserById");
- throw new ParserNotFound("No Parser Found for parserId: " + parserId
- + "!");
- }
+ if (parserExt == null) {
+ throw new ParserNotFound("No Parser Found for id [" + id + "]");
+ }
+
+ // first check the cache
+ if (this.conf.getObject(parserExt.getId()) != null) {
+ return (Parser) this.conf.getObject(parserExt.getId());
+
+ // if not found in cache, instantiate the Parser
+ } else {
+ try {
+ Parser p = (Parser) parserExt.getExtensionInstance();
+ this.conf.setObject(parserExt.getId(), p);
+ return p;
+ } catch (PluginRuntimeException e) {
+ LOG.warning("Canno initialize parser " +
+ parserExt.getDescriptor().getPluginId() +
+ " (cause: " + e.toString());
+ throw new ParserNotFound("Cannot init parser for id [" + id + "]");
}
}
}
/**
- * finds the best-suited parse plugin for a given contentType.
+ * Finds the best-suited parse plugin for a given contentType.
*
- * @param contentType
- * Content-Type for which we seek a parse plugin.
- * @return List - List of extensions to be used for this contentType. If none,
- * returns null.
+ * @param contentType Content-Type for which we seek a parse plugin.
+ * @return a list of extensions to be used for this contentType.
+ * If none, returns <code>null</code>.
*/
protected List getExtensions(String contentType) {
// First of all, tries to clean the content-type
String type = null;
try {
- type = MimeType.clean(contentType);
+ type = MimeType.clean(contentType);
} catch (MimeTypeException mte) {
- LOG.info("Could not clean the content-type [" + contentType +
- "], Reason is [" + mte + "]. Using its raw version...");
- type = contentType;
+ LOG.fine("Could not clean the content-type [" + contentType +
+ "], Reason is [" + mte + "]. Using its raw version...");
+ type = contentType;
}
List extensions = (List) this.conf.getObject(type);
@@ -304,19 +270,16 @@ private List findExtensions(String contentType) {
* If none, returns null.
*/
private List matchExtensions(List plugins,
- Extension[] extensions,
- String contentType) {
+ Extension[] extensions,
+ String contentType) {
- List extList = null;
+ List extList = new ArrayList();
if (plugins != null) {
- extList = new Vector(plugins.size());
for (Iterator i = plugins.iterator(); i.hasNext();) {
String parsePluginId = (String) i.next();
- Extension ext = getExtensionByIdAndType(extensions,
- parsePluginId,
- contentType);
+ Extension ext = getExtension(extensions, parsePluginId, contentType);
// the extension returned may be null
// that means that it was not enabled in the plugin.includes
// nutch conf property, but it was mapped in the
@@ -327,8 +290,9 @@ private List matchExtensions(List plugins,
// in either case, LOG the appropriate error message to WARN level
if (ext == null) {
- //try to get it just by its pluginId
- ext = getExtensionById(extensions, parsePluginId);
+ // try to get it just by its extension id, ignoring the content type
+ ext = getExtension(extensions, parsePluginId);
+
if (ext != null) {
// plugin was enabled via plugin.includes
// its plugin.xml just doesn't claim to support that
@@ -338,25 +302,21 @@ private List matchExtensions(List plugins,
" via parse-plugins.xml, but " + "its plugin.xml " +
"file does not claim to support contentType: " +
contentType);
-
- //go ahead and load the extension anyways, though
- extList.add(ext);
-
- } else{
+ } else {
// plugin wasn't enabled via plugin.includes
LOG.warning("ParserFactory: Plugin: " + parsePluginId +
" mapped to contentType " + contentType +
" via parse-plugins.xml, but not enabled via " +
"plugin.includes in nutch-default.xml");
}
-
- } else{
+ }
+
+ if (ext != null) {
// add it to the list
extList.add(ext);
}
}
- return extList;
} else {
// okay, there were no list of plugins defined for
// this mimeType, however, there may be plugins registered
@@ -366,19 +326,16 @@ private List matchExtensions(List plugins,
// any extensions where this is the case, throw a
// NotMappedParserException
- List unmappedPlugins = new Vector();
-
- for (int i = 0; i < extensions.length; i++) {
+ for (int i=0; i<extensions.length; i++) {
if (extensions[i].getAttribute("contentType") != null
&& extensions[i].getAttribute("contentType").equals(
contentType)) {
- unmappedPlugins.add(extensions[i].getDescriptor()
- .getPluginId());
+ extList.add(extensions[i].getId());
}
}
- if (unmappedPlugins.size() > 0) {
- LOG.info("The parsing plugins: " + unmappedPlugins +
+ if (extList.size() > 0) {
+ LOG.info("The parsing plugins: " + extList +
" are enabled via the plugin.includes system " +
"property, and all claim to support the content type " +
contentType + ", but they are not mapped to it in the " +
@@ -387,33 +344,38 @@ private List matchExtensions(List plugins,
LOG.fine("ParserFactory:No parse plugins mapped or enabled for " +
"contentType " + contentType);
}
- return null;
}
+
+ return (extList.size() > 0) ? extList : null;
}
private boolean match(Extension extension, String id, String type) {
- return (id.equals(extension.getDescriptor().getPluginId())) &&
- (type.equals(extension.getAttribute("contentType")) ||
- (type.equals(DEFAULT_PLUGIN)));
+ return ((id.equals(extension.getId())) &&
+ (type.equals(extension.getAttribute("contentType")) ||
+ type.equals(DEFAULT_PLUGIN)));
}
- private Extension getExtensionByIdAndType(Extension[] extList,
- String plugId,
- String contentType) {
- for (int i = 0; i < extList.length; i++) {
- if (match(extList[i], plugId, contentType)) {
- return extList[i];
+ /** Get an extension from its id and supported content-type. */
+ private Extension getExtension(Extension[] list, String id, String type) {
+ for (int i=0; i<list.length; i++) {
+ if (match(list[i], id, type)) {
+ return list[i];
}
}
return null;
}
-
- private Extension getExtensionById(Extension[] extList, String plugId) {
- for(int i = 0; i < extList.length; i++){
- if(plugId.equals(extList[i].getDescriptor().getPluginId())){
- return extList[i];
+
+ private Extension getExtension(Extension[] list, String id) {
+ for (int i=0; i<list.length; i++) {
+ if (id.equals(list[i].getId())) {
+ return list[i];
}
}
return null;
}
+
+ private Extension getExtensionFromAlias(Extension[] list, String id) {
+ return getExtension(list, (String) parsePluginList.getAliases().get(id));
+ }
+
}
View
2 src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
@@ -61,7 +61,7 @@ public void pageTest(File file, String url,
Content content =
new Content(url, url, bytes, contentType, new Metadata(), conf);
- Parse parse = new ParseUtil(conf).parseByParserId("parse-html",content);
+ Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content);
Metadata metadata = parse.getData().getParseMeta();
assertEquals(license, metadata.get("License-Url"));
View
7 ...plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
@@ -37,8 +37,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.hadoop.io.UTF8;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
@@ -341,9 +340,7 @@ private static String getUrlContent(String url, Configuration conf) {
try {
protocol = new ProtocolFactory(conf).getProtocol(url);
Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
- String contentType = content.getContentType();
- Parser parser = new ParserFactory(conf).getParser(contentType, url);
- Parse parse = parser.getParse(content);
+ Parse parse = new ParseUtil(conf).parse(content);
System.out.println("text:" + parse.getText());
return parse.getText();
View
10 ...in/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
@@ -23,8 +23,8 @@
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
@@ -48,16 +48,12 @@
public void testMetaHTMLParsing() {
try {
-
+ ParseUtil parser = new ParseUtil(NutchConfiguration.create());
/* loop through the test documents and validate result */
for (int t = 0; t < docs.length; t++) {
-
Content content = getContent(docs[t]);
- Parser parser = new ParserFactory(NutchConfiguration.create()).getParser("text/html", URL);
- Parse parse = parser.getParse(content);
-
+ Parse parse = parser.parse(content);
assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
-
}
} catch (Exception e) {
e.printStackTrace(System.out);
View
4 src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
@@ -111,13 +111,13 @@ public void testIt() throws ParseException {
// check external parser that does 'cat'
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
- parse = new ParseUtil(conf).parseByParserId("parse-ext", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content);
assertEquals(expectedText,parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
- parse = new ParseUtil(conf).parseByParserId("parse-ext", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content);
assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}
View
4 src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
@@ -73,7 +73,7 @@ public void testId3v2() throws ProtocolException, ParseException {
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
.getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
Metadata metadata = parse.getData().getParseMeta();
assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
@@ -105,7 +105,7 @@ public void testId3v1() throws ProtocolException, ParseException {
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
.getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
Metadata metadata = parse.getData().getParseMeta();
assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
View
2 src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
@@ -63,7 +63,7 @@ public void testIt() throws ProtocolException, ParseException {
protocol = factory.getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString),
new CrawlDatum()).getContent();
- parse = parser.parseByParserId("parse-msexcel", content);
+ parse = parser.parseByExtensionId("parse-msexcel", content);
assertTrue(parse.getText().equals(expectedText));
}
View
6 ...rse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
@@ -126,7 +126,8 @@ protected void tearDown() throws Exception {
*/
public void testContent() throws Exception {
- Parse parse = new ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",this.content);
+ Parse parse = new ParseUtil(NutchConfiguration.create())
+ .parseByExtensionId("parse-mspowerpoint", this.content);
ParseData data = parse.getData();
String text = parse.getText();
@@ -163,7 +164,8 @@ public void testContent() throws Exception {
*/
public void testMeta() throws Exception {
- Parse parse = new ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",content);
+ Parse parse = new ParseUtil(NutchConfiguration.create())
+ .parseByExtensionId("parse-mspowerpoint", content);
ParseData data = parse.getData();
View
2 src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
@@ -69,7 +69,7 @@ public void testIt() throws ProtocolException, ParseException {
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-msword",content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);
assertTrue(parse.getText().startsWith(expectedText));
}
View
2 src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
@@ -69,7 +69,7 @@ public void testIt() throws ProtocolException, ParseException {
Configuration conf = NutchConfiguration.create();
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-pdf",content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
int index = parse.getText().indexOf(expectedText);
assertTrue(index > 0);
View
2 src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
@@ -87,7 +87,7 @@ public void testIt() throws ProtocolException, ParseException {
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-rss",content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content);
//check that there are 3 outlinks:
//http://test.channel.com
View
2 src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
@@ -74,7 +74,7 @@ public void testIt() throws ProtocolException, ParseException {
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
.getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-rtf", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
String text = parse.getText();
assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
View
2 src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
@@ -69,7 +69,7 @@ public void testIt() throws ProtocolException, ParseException {
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByParserId("parse-zip",content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
assertTrue(parse.getText().equals(expectedText));
}
}
View
29 src/test/org/apache/nutch/parse/TestParserFactory.java
@@ -42,18 +42,10 @@ protected void setUp() throws Exception {
conf = NutchConfiguration.create();
conf.set("plugin.includes", ".*");
conf.set("parse.plugin.file",
- "org/apache/nutch/parse/parse-plugin-test.xml");
+ "org/apache/nutch/parse/parse-plugin-test.xml");
parserFactory = new ParserFactory(conf);
}
-
- /** Unit test for <code>getParser(String, String)</code> method. */
- public void testGetParser() throws Exception {
- Parser parser = parserFactory.getParser("text/html", "http://foo.com/");
- assertNotNull(parser);
- parser = parserFactory.getParser("foo/bar", "http://foo.com/");
- assertNotNull(parser);
- }
-
+
/** Unit test for <code>getExtensions(String)</code> method. */
public void testGetExtensions() throws Exception {
Extension ext = (Extension)parserFactory.getExtensions("text/html").get(0);
@@ -70,27 +62,27 @@ public void testGetParsers() throws Exception {
assertNotNull(parsers);
assertEquals(1, parsers.length);
assertEquals("org.apache.nutch.parse.html.HtmlParser",
- parsers[0].getClass().getName());
+ parsers[0].getClass().getName());
- parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", "http://foo.com");
+ parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1",
+ "http://foo.com");
assertNotNull(parsers);
assertEquals(1, parsers.length);
assertEquals("org.apache.nutch.parse.html.HtmlParser",
- parsers[0].getClass().getName());
-
+ parsers[0].getClass().getName());
parsers = parserFactory.getParsers("application/x-javascript",
- "http://foo.com");
+ "http://foo.com");
assertNotNull(parsers);
assertEquals(1, parsers.length);
assertEquals("org.apache.nutch.parse.js.JSParseFilter",
- parsers[0].getClass().getName());
+ parsers[0].getClass().getName());
parsers = parserFactory.getParsers("text/plain", "http://foo.com");
assertNotNull(parsers);
assertEquals(1, parsers.length);
assertEquals("org.apache.nutch.parse.text.TextParser",
- parsers[0].getClass().getName());
+ parsers[0].getClass().getName());
Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0];
Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0];
@@ -102,7 +94,8 @@ public void testGetParsers() throws Exception {
parsers = parserFactory.getParsers("text/rss","http://foo.com");
assertNotNull(parsers);
assertEquals(1,parsers.length);
- assertEquals("org.apache.nutch.parse.rss.RSSParser",parsers[0].getClass().getName());
+ assertEquals("org.apache.nutch.parse.rss.RSSParser",
+ parsers[0].getClass().getName());
}
}
View
86 src/test/org/apache/nutch/parse/parse-plugin-test.xml
@@ -1,46 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
- Copyright 2005 The Apache Software Foundation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- Author : mattmann
- Description: Test parse-plugins.xml file.
+ Copyright 2005 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ Author : mattmann
+ Description: Test parse-plugins.xml file.
-->
<parse-plugins>
- <!-- by default if the mimeType is set to *, or
- can't be determined, use parse-text -->
- <mimeType name="*">
- <plugin id="parse-text" />
- </mimeType>
+ <!-- by default if the mimeType is set to *, or
+ can't be determined, use parse-text -->
+ <mimeType name="*">
+ <plugin id="parse-text" />
+ </mimeType>
- <!-- test these 4 plugins -->
- <mimeType name="text/html">
- <plugin id="parse-html"/>
- </mimeType>
+ <!-- test these 4 plugins -->
+ <mimeType name="text/html">
+ <!--
+ ! Test that if a parser cannot be instantiated,
+ ! it should not block the process and then the next one is used
+ !-->
+ <plugin id="parse-plugin-that-not-exist"/>
+ <plugin id="parse-html"/>
+ </mimeType>
- <mimeType name="text/plain">
- <plugin id="parse-text"/>
- </mimeType>
+ <mimeType name="text/plain">
+ <!-- Test that an extension-id can be directly used here -->
+ <plugin id="org.apache.nutch.parse.text.TextParser"/>
+ </mimeType>
- <mimeType name="application/x-javascript">
- <plugin id="parse-js"/>
- </mimeType>
+ <mimeType name="application/x-javascript">
+ <plugin id="parse-js"/>
+ </mimeType>
- <mimeType name="text/rss">
- <plugin id="parse-rss"/>
- </mimeType>
+ <mimeType name="text/rss">
+ <plugin id="parse-rss"/>
+ </mimeType>
+ <!-- alias mappings for parse-xxx names to the actual extension implementation
+ ids described in each plugin's plugin.xml file -->
+ <aliases>
+ <alias name="parse-html"
+ extension-id="org.apache.nutch.parse.html.HtmlParser" />
+ <alias name="parse-js"
+ extension-id="JSParser" />
+ <alias name="parse-rss"
+ extension-id="org.apache.nutch.parse.rss.RSSParser" />
+ <alias name="parse-text"
+ extension-id="org.apache.nutch.parse.text.TextParser" />
+ </aliases>
</parse-plugins>

0 comments on commit 1fdab82

Please sign in to comment.