Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

NUTCH-888 : Remove parse-rss

git-svn-id: https://svn.apache.org/repos/asf/nutch/branches/branch-1.3@1099483 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
commit 341fba280bf308d704a2c67adf00ecd2a4209f0f 1 parent 2150e90
@jnioche jnioche authored
View
2  CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.3 - 4/21/2011
+* NUTCH-888 Remove parse-rss and add tests for rss to parse-tika (jnioche)
+
* NUTCH-991 SolrDedup must issue a commit (markus)
* NUTCH 986 SolrDedup fails due to date incorrect format (markus)
View
7 conf/parse-plugins.xml
@@ -27,9 +27,9 @@
<mimeType name="*">
<plugin id="parse-tika" />
</mimeType>
-
+
<mimeType name="application/rss+xml">
- <plugin id="parse-rss" />
+ <plugin id="parse-tika" />
<plugin id="feed" />
</mimeType>
@@ -65,7 +65,6 @@
<mimeType name="text/xml">
<plugin id="parse-tika" />
- <plugin id="parse-rss" />
<plugin id="feed" />
</mimeType>
@@ -88,8 +87,6 @@
<alias name="parse-html"
extension-id="org.apache.nutch.parse.html.HtmlParser" />
<alias name="parse-js" extension-id="JSParser" />
- <alias name="parse-rss"
- extension-id="org.apache.nutch.parse.rss.RSSParser" />
<alias name="feed"
extension-id="org.apache.nutch.parse.feed.FeedParser" />
<alias name="parse-swf"
View
3  src/plugin/build.xml
@@ -45,7 +45,6 @@
<ant dir="parse-ext" target="deploy"/>
<ant dir="parse-js" target="deploy"/>
<ant dir="parse-html" target="deploy"/>
- <ant dir="parse-rss" target="deploy"/>
<ant dir="parse-swf" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
<ant dir="parse-zip" target="deploy"/>
@@ -77,7 +76,6 @@
<ant dir="protocol-file" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
<!--ant dir="parse-ext" target="test"/-->
- <ant dir="parse-rss" target="test"/>
<ant dir="feed" target="test"/>
<ant dir="parse-html" target="test"/>
<ant dir="parse-swf" target="test"/>
@@ -119,7 +117,6 @@
<ant dir="parse-ext" target="clean"/>
<ant dir="parse-js" target="clean"/>
<ant dir="parse-html" target="clean"/>
- <ant dir="parse-rss" target="clean"/>
<ant dir="parse-swf" target="clean"/>
<ant dir="parse-tika" target="clean"/>
<ant dir="parse-zip" target="clean"/>
View
46 src/plugin/parse-rss/build.xml
@@ -1,46 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-rss" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Build compilation dependencies -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-xml"/>
- </target>
-
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-xml/*.jar" />
- </fileset>
- </path>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../lib-xml"/>
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <ant target="deploy" inheritall="false" dir="../protocol-file"/>
- </target>
-
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data"/>
- <copy file="sample/rsstest.rss" todir="${build.test}/data"/>
-
-</project>
View
42 src/plugin/parse-rss/ivy.xml
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <dependency org="xmlrpc" name="xmlrpc" rev="1.2" conf="*->master"/>
- </dependencies>
-
-</ivy-module>
View
BIN  src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar
Binary file not shown
View
49 src/plugin/parse-rss/plugin.xml
@@ -1,49 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="parse-rss"
- name="RSS Parse Plug-in"
- version="1.0.0"
- provider-name="edu.usc.cs.cs599">
-
-
- <runtime>
- <library name="parse-rss.jar">
- <export name="*"/>
- </library>
- <library name="commons-feedparser-0.6-fork.jar"/>
- <library name="xml-rpc-1.2.jar"/>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- <import plugin="lib-xml"/>
- </requires>
-
- <extension id="org.apache.nutch.parse.rss"
- name="RssParse"
- point="org.apache.nutch.parse.Parser">
-
- <implementation id="org.apache.nutch.parse.rss.RSSParser"
- class="org.apache.nutch.parse.rss.RSSParser">
- <parameter name="contentType" value="application/rss+xml"/>
- <parameter name="pathSuffix" value="rss"/>
- </implementation>
- </extension>
-
-</plugin>
View
21 src/plugin/parse-rss/sample/rsstest.rss
@@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1" ?>
-<rss version="0.91">
- <channel>
- <title>TestChannel</title>
- <link>http://test.channel.com/</link>
- <description>Sample RSS File for Junit test</description>
- <language>en-us</language>
-
- <item>
- <title>Home Page of Chris Mattmann</title>
- <link>http://www-scf.usc.edu/~mattmann/</link>
- <description>Chris Mattmann's home page</description>
- </item>
-
- <item>
- <title>Awesome Open Source Search Engine</title>
- <link>http://www.nutch.org/</link>
- <description>Yup, that's what it is</description>
- </item>
- </channel>
-</rss>
View
128 src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java
@@ -1,128 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.rss;
-
-import org.apache.commons.feedparser.DefaultFeedParserListener;
-import org.apache.commons.feedparser.FeedParserState;
-import org.apache.commons.feedparser.FeedParserException;
-
-import java.util.List;
-import java.util.Vector;
-
-import org.apache.nutch.parse.rss.structs.RSSChannel;
-import org.apache.nutch.parse.rss.structs.RSSItem;
-
-/**
- *
- * @author mattmann
- * @version 1.0
- *
- * <p>
- * Feed parser listener class which builds up an RSS Channel model that can be
- * iterated through to retrieve the parsed information.
- * </p>
- */
-public class FeedParserListenerImpl extends DefaultFeedParserListener {
-
- private List fRssChannels = null;
-
- private RSSChannel fCurrentChannel = null;
-
- /**
- * <p>
- * Default Constructor
- * </p>
- */
- public FeedParserListenerImpl() {
- fRssChannels = new Vector();
- }
-
- /**
- * <p>
- * Gets a {@link List}of {@link RSSChannel}s that the listener parsed from
- * the RSS document.
- * </p>
- *
- * @return A {@link List}of {@link RSSChannel}s.
- */
- public List getChannels() {
- if (fRssChannels.size() > 0) {
- return fRssChannels;
- } else {
- //there was only one channel found
- //add it here, then return it
- fRssChannels.add(fCurrentChannel);
- return fRssChannels;
- }
- }
-
- /**
- * <p>
- * Callback method when the parser encounters an RSS Channel.
- * </p>
- *
- * @param state
- * The current state of the FeedParser.
- * @param title
- * The title of the RSS Channel.
- * @param link
- * A hyperlink to the RSS Channel.
- * @param description
- * The description of the RSS Channel.
- */
- public void onChannel(FeedParserState state, String title, String link,
- String description) throws FeedParserException {
-
- //capture the old channel if it's not null
- if (fCurrentChannel != null) {
- fRssChannels.add(fCurrentChannel);
- }
-
- //System.out.println("Found a new channel: " + title);
-
- fCurrentChannel = new RSSChannel(title, link, description);
-
- }
-
- /**
- * <p>
- * Callback method when the parser encounters an RSS Item.
- * </p>
- *
- * @param state
- * The current state of the FeedParser.
- * @param title
- * The title of the RSS Item.
- * @param link
- * A hyperlink to the RSS Item.
- * @param description
- * The description of the RSS Item.
- * @param permalink
- * A permanent link to the RSS Item.
- */
- public void onItem(FeedParserState state, String title, String link,
- String description, String permalink) throws FeedParserException {
-
- //System.out.println("Found a new published article: " + permalink);
- if (fCurrentChannel != null) { //should never be null
- fCurrentChannel.getItems().add(
- new RSSItem(title, link, description, permalink));
- }
-
- }
-
-}
View
227 src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
@@ -1,227 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.rss;
-
-// JDK imports
-import java.io.ByteArrayInputStream;
-import java.net.MalformedURLException;
-import java.util.List;
-import java.util.Vector;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-// Hadoop imports
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.rss.structs.RSSItem;
-import org.apache.nutch.parse.rss.structs.RSSChannel;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.LogUtil;
-import org.apache.nutch.util.NutchConfiguration;
-
-// RSS parsing imports
-import org.apache.commons.feedparser.FeedParserListener;
-import org.apache.commons.feedparser.FeedParser;
-import org.apache.commons.feedparser.FeedParserFactory;
-
-
-/**
- *
- * @author mattmann
- * @version 1.0
- *
- * <p>
- * RSS Parser class for nutch
- * </p>
- */
-public class RSSParser implements Parser {
- public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.rss");
- private Configuration conf;
-
- /**
- * <p>
- * Implementation method, parses the RSS content, and then returns a
- * {@link ParseImpl}.
- * </p>
- *
- * @param content
- * The content to parse (hopefully an RSS content stream)
- * @return A {@link ParseImpl}which implements the {@link Parse}interface.
- */
- public ParseResult getParse(Content content) {
-
- List theRSSChannels = null;
-
- try {
- byte[] raw = content.getContent();
-
- // create a new FeedParser...
- FeedParser parser = FeedParserFactory.newFeedParser();
-
- // create a listener for handling our callbacks
- FeedParserListener listener = new FeedParserListenerImpl();
-
- // start parsing our feed and have the onItem methods called
- parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
- null);
-
- theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();
-
- } catch (Exception e) { // run time exception
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
- }
- return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as rss document. " + e).getEmptyParseResult(content.getUrl(), getConf());
- }
-
- StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
- List theOutlinks = new Vector();
-
- // for us, the contentTitle will be a concatenation of the titles of the
- // RSS Channels that we've parsed
- // and the index text will be a concatenation of the RSS Channel
- // descriptions, and descriptions of the RSS Items in the channel
-
- // also get the outlinks
-
- if (theRSSChannels != null) {
- for (int i = 0; i < theRSSChannels.size(); i++) {
- RSSChannel r = (RSSChannel) theRSSChannels.get(i);
- contentTitle.append(r.getTitle());
- contentTitle.append(" ");
-
- // concat the description to the index text
- indexText.append(r.getDescription());
- indexText.append(" ");
-
- if (r.getLink() != null) {
- try {
- // get the outlink
- if (r.getDescription()!= null ) {
- theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
- } else {
- theOutlinks.add(new Outlink(r.getLink(), ""));
- }
- } catch (MalformedURLException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("MalformedURL: " + r.getLink());
- LOG.warn("Attempting to continue processing outlinks");
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- }
- continue;
- }
- }
-
- // now get the descriptions of all the underlying RSS Items and
- // then index them too
- for (int j = 0; j < r.getItems().size(); j++) {
- RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
- indexText.append(theRSSItem.getDescription());
- indexText.append(" ");
-
- String whichLink = null;
-
- if (theRSSItem.getPermalink() != null)
- whichLink = theRSSItem.getPermalink();
- else
- whichLink = theRSSItem.getLink();
-
- if (whichLink != null) {
- try {
- if (theRSSItem.getDescription()!=null) {
- theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
- } else {
- theOutlinks.add(new Outlink(whichLink, ""));
- }
- } catch (MalformedURLException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("MalformedURL: " + whichLink);
- LOG.warn("Attempting to continue processing outlinks");
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- }
- continue;
- }
- }
-
- }
-
- }
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("nutch:parse-rss:getParse:indexText=" + indexText);
- LOG.trace("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
- }
-
- } else if (LOG.isTraceEnabled()) {
- LOG.trace("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
- }
-
- // format the outlinks
- Outlink[] outlinks = (Outlink[]) theOutlinks.toArray(new Outlink[theOutlinks.size()]);
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks");
- }
- // if (LOG.isInfoEnabled()) {
- // LOG.info("Outlinks: "+outlinks);
- // }
-
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
- contentTitle.toString(), outlinks, content.getMetadata());
- return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
- public static void main(String[] args) throws Exception {
- //LOG.setLevel(Level.FINE);
- String url = args[0];
- Configuration conf = NutchConfiguration.create();
- RSSParser parser = new RSSParser();
- parser.setConf(conf);
- Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
- Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
- Parse parse = parser.getParse(content).get(content.getUrl());
- System.out.println("data: "+ parse.getData());
- System.out.println("text: "+parse.getText());
- }
-
-
-}
View
189 src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java
@@ -1,189 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.rss.structs;
-
-import java.util.List;
-import java.util.Vector;
-
-/**
- *
- * <p>
- * Data class for holding RSS Channels to send to Nutch's indexer
- * </p>
- *
- * @author mattmann
- * @version 1.0
- */
-public class RSSChannel {
-
- //description of the channel
- private String fDescription = null;
-
- // link to the channel's page
- private String fLink = null;
-
- // title of the Channel
- private String fTitle = null;
-
- // set of items in the Channel
- private List fItems = null;
-
- /**
- *
- * <p>
- * Default Constructor
- * </p>
- *
- * @param desc
- * The description of the channel.
- * @param link
- * A link to the channel's url.
- * @param title
- * The title of the channel.
- * @param items
- * A list of {@link RSSItem}s for this RSS Channel.
- */
- public RSSChannel(String desc, String link, String title, List items) {
- fDescription = desc;
- fLink = link;
- fTitle = title;
- fItems = items;
-
- }
-
- /**
- *
- * <p>
- * Constructor if you don't have the list of RSS Items ready yet.
- * </p>
- *
- * @param desc
- * The description of the channel.
- * @param link
- * A link to the channel's url.
- * @param title
- * The title of the channel.
- */
- public RSSChannel(String desc, String link, String title) {
- fDescription = desc;
- fLink = link;
- fTitle = title;
- fItems = new Vector();
-
- }
-
- /**
- *
- * <p>
- * Get the list of items for this channel.
- * </p>
- *
- * @return A list of {@link RSSItem}s.
- */
- public List getItems() {
- return fItems;
- }
-
- /**
- *
- * <p>
- * Returns the channel title
- * </p>
- *
- * @return The title of the channel.
- */
-
- public String getTitle() {
- return fTitle;
- }
-
- /**
- *
- * <p>
- * Returns a link to the RSS Channel.
- * </p>
- *
- * @return A {@link String}link to the RSS Channel.
- */
- public String getLink() {
- return fLink;
- }
-
- /**
- *
- * <p>
- * Returns a {@link String}description of the RSS Channel.
- * </p>
- *
- * @return The description of the RSS Channel.
- */
- public String getDescription() {
- return fDescription;
- }
-
- /**
- *
- * <p>
- * Sets the list of RSS items for this channel.
- * </p>
- *
- * @param items
- * A List of {@link RSSItem}s for this RSSChannel.
- */
- public void setItems(List items) {
- fItems = items;
- }
-
- /**
- *
- * <p>
- * Sets the Title for this RSS Channel.
- * </p>
- *
- * @param title
- * The title of this RSSChannel.
- */
- public void setTitle(String title) {
- fTitle = title;
- }
-
- /**
- *
- * <p>
- * Sets the link to this RSSChannel
- * </p>
- *
- * @param link
- * A {@link String}representation of a link to this RSS Channel.
- */
- public void setLink(String link) {
- fLink = link;
- }
-
- /**
- *
- * <p>
- * Sets the description of this RSSChannel
- * </p>
- *
- * @param description
- * A String description of this RSS Channel.
- */
- public void setDescription(String description) {
- fDescription = description;
- }
-}
View
151 src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.rss.structs;
-
-/**
- *
- * <p>
- * Data class for holding RSS Items to send to Nutch's indexer
- * </p>
- *
- * @author mattmann
- * @version 1.0
- */
-public class RSSItem {
-
- //The title of this RSS Item
- private String fTitle = null;
-
- //The link that this RSS Item points to
- private String fLink = null;
-
- //The description of this RSS Item
- private String fDescription = null;
-
- //A permanent link that this RSS Item points to
- private String fPermalink = null;
-
- public RSSItem(String title, String link, String description,
- String permalink) {
- fTitle = title;
- fLink = link;
- fDescription = description;
- fPermalink = permalink;
- }
-
- /**
- *
- * <P>
- * Get the title for this RSS Item
- * </p>
- *
- * @return The title of this RSS Item
- */
- public String getTitle() {
- return fTitle;
- }
-
- /**
- *
- * <p>
- * Gets the link that this RSS Item points to.
- * </p>
- *
- * @return The link that this RSS Items points to.
- */
- public String getLink() {
- return fLink;
- }
-
- /**
- *
- * <p>
- * Gets the Description of this RSS Item
- * </p>
- *
- * @return The description of this RSS Item.
- */
- public String getDescription() {
- return fDescription;
- }
-
- /**
- *
- * <p>
- * If this RSS Item points to a permanent link, then this method returns it.
- * </p>
- *
- * @return The permanent link that this RSS Items points to.
- */
- public String getPermalink() {
- return fPermalink;
- }
-
- /**
- *
- * <p>
- * Sets the title for this RSS Item.
- * </p>
- *
- * @param title
- * The title of this RSS Item
- */
- public void setTitle(String title) {
- fTitle = title;
- }
-
- /**
- *
- * <p>
- * Sets the link that this RSS Item points to.
- * </p>
- *
- * @param link
- * The link that this RSS Item points to.
- */
- public void setLink(String link) {
- fTitle = link;
- }
-
- /**
- *
- * <p>
- * Sets the description of this RSS Item.
- * </p>
- *
- * @param description
- * The description of this RSS Item.
- */
- public void setDescription(String description) {
- fDescription = description;
- }
-
- /**
- *
- * <p>
- * Sets the permanent link that this RSS Item points to.
- * </p>
- *
- * @param permalink
- * The permanent link that this RSS Item points to
- */
- public void setPermalink(String permalink) {
- fPermalink = permalink;
- }
-
-}
View
130 src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
@@ -1,130 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.rss;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.Outlink;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-
-import junit.framework.TestCase;
-
-/**
- * Unit tests for the RSS Parser based on John Xing's TestPdfParser class.
- *
- * @author mattmann
- * @version 1.0
- */
-public class TestRSSParser extends TestCase {
-
- private String fileSeparator = System.getProperty("file.separator");
-
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
-
- // Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-rss/build.xml during plugin compilation.
-
- private String[] sampleFiles = { "rsstest.rss" };
-
- /**
- * <p>
- * Default constructor
- * </p>
- *
- * @param name
- * The name of the RSSParserTest
- */
- public TestRSSParser(String name) {
- super(name);
- }
-
- /**
- * <p>
- * The test method: tests out the following 2 asserts:
- * </p>
- *
- * <ul>
- * <li>There are 3 outlinks read from the sample rss file</li>
- * <li>The 3 outlinks read are in fact the correct outlinks from the sample
- * file</li>
- * </ul>
- */
- public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
- Configuration conf = NutchConfiguration.create();
- for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content).get(content.getUrl());
-
- //check that there are 3 outlinks:
- //http://test.channel.com
- //http://www-scf.usc.edu/~mattmann/
- //http://www.nutch.org
-
- ParseData theParseData = parse.getData();
-
- Outlink[] theOutlinks = theParseData.getOutlinks();
-
- assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);
-
- //now check to make sure that those are the two outlinks
- boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
-
- for (int j = 0; j < theOutlinks.length; j++) {
- //System.out.println("reading "+theOutlinks[j].getToUrl());
- if (theOutlinks[j].getToUrl().equals(
- "http://www-scf.usc.edu/~mattmann/")) {
- hasLink1 = true;
- }
-
- if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
- hasLink2 = true;
- }
-
- if (theOutlinks[j].getToUrl()
- .equals("http://test.channel.com/")) {
- hasLink3 = true;
- }
- }
-
- if (!hasLink1 || !hasLink2 || !hasLink3) {
- fail("Outlinks read from sample rss file are not correct!");
- }
- }
- }
-
-}
View
1  src/plugin/parse-tika/build.xml
@@ -29,6 +29,7 @@
<mkdir dir="${build.test}/data"/>
<copy todir="${build.test}/data">
<fileset dir="sample">
+ <include name="*.rss"/>
<include name="*.rtf"/>
<include name="*.pdf"/>
<include name="ootest.*"/>
View
37 src/plugin/parse-tika/sample/rsstest.rss
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<rss version="0.91">
+ <channel>
+ <title>TestChannel</title>
+ <link>http://test.channel.com/</link>
+ <description>Sample RSS File for Junit test</description>
+ <language>en-us</language>
+
+ <item>
+ <title>Home Page of Chris Mattmann</title>
+ <link>http://www-scf.usc.edu/~mattmann/</link>
+ <description>Chris Mattmann's home page</description>
+ </item>
+
+ <item>
+ <title>Awesome Open Source Search Engine</title>
+ <link>http://www.nutch.org/</link>
+ <description>Yup, that's what it is</description>
+ </item>
+ </channel>
+</rss>
View
130 src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ *
+ * @author mattmann / jnioche
+ *
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
+ *
+ */
+public class TestFeedParser extends TestCase {
+
+ private String fileSeparator = System.getProperty("file.separator");
+
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+
+ private String[] sampleFiles = { "rsstest.rss" };
+
+ public static final Log LOG = LogFactory.getLog(TestFeedParser.class
+ .getName());
+
+ /**
+ * Default Constructor.
+ *
+ * @param name
+ * The name of this {@link TestCase}.
+ */
+ public TestFeedParser(String name) {
+ super(name);
+ }
+
+ /**
+ * <p>
+ * Parses each sample feed with parse-tika and makes the following 2 checks:
+ * </p>
+ *
+ * <ul>
+ * <li>Exactly 2 outlinks are read from the sample rss file</li>
+ * <li>The 2 outlinks read are the two item links from the sample file;
+ * the channel link is not expected</li>
+ * </ul>
+ */
+ public void testIt() throws ProtocolException, ParseException {
+ String urlString;
+ Protocol protocol;
+ Content content;
+ Parse parse;
+
+ Configuration conf = NutchConfiguration.create();
+ for (int i = 0; i < sampleFiles.length; i++) {
+ urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new Text(urlString),
+ new CrawlDatum()).getContent();
+ parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+ content).get(content.getUrl());
+
+ // Expect exactly 2 outlinks (the two item links). Unlike the
+ // original parse-rss, tika ignores the URL and description of the
+ // channel, so of the three links in the sample only the last two count:
+
+ // http://test.channel.com (channel link, not expected as an outlink)
+ // http://www-scf.usc.edu/~mattmann/
+ // http://www.nutch.org
+
+ ParseData theParseData = parse.getData();
+
+ Outlink[] theOutlinks = theParseData.getOutlinks();
+
+ assertTrue("There aren't 2 outlinks read!",
+ theOutlinks.length == 2);
+
+ // now check that the two outlinks are the item links of the sample file
+ boolean hasLink1 = false, hasLink2 = false;
+
+ for (int j = 0; j < theOutlinks.length; j++) {
+ if (theOutlinks[j].getToUrl().equals(
+ "http://www-scf.usc.edu/~mattmann/")) {
+ hasLink1 = true;
+ }
+
+ if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+ hasLink2 = true;
+ }
+ }
+
+ if (!hasLink1 || !hasLink2) {
+ fail("Outlinks read from sample rss file are not correct!");
+ }
+ }
+ }
+
+}
Please sign in to comment.
Something went wrong with that request. Please try again.