diff --git a/CHANGES.txt b/CHANGES.txt
index 9946bc9b93..822bd4acfe 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,11 @@
# Nutch Change Log
+Breaking Changes
+
+ - the plugin parse-swf for parsing Shockwave/Adobe Flash content was removed (NUTCH-2861)
+
+
+
Nutch 1.18 Release 14/01/2021 (dd/mm/yyyy)
Release Report: https://s.apache.org/lqara
diff --git a/LICENSE.txt b/LICENSE.txt
index 9badcdad67..38ba38252c 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -4831,41 +4831,6 @@ src/plugin/ontology/lib/jena-2.1.jar
This product includes software developed by the
Apache Software Foundation (http://www.apache.org/).
-src/plugin/parse-swf/lib/javaswf.jar
-
-
- Copyright (c) 2001-2005, David N. Main, All rights reserved.
-
- Redistribution and use in source and binary forms, with or
- without modification, are permitted provided that the
- following conditions are met:
-
- 1. Redistributions of source code must retain the above
- copyright notice, this list of conditions and the following
- disclaimer.
-
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- 3. The name of the author may not be used to endorse or
- promote products derived from this software without specific
- prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
src/plugin/response-json/lib/json-lib-2.2.2-jdk15.jar
Apache License
diff --git a/NOTICE.txt b/NOTICE.txt
index 1c9efd00a0..0f74198b10 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -6,8 +6,5 @@ Foundation (http://www.apache.org/).
This product includes software developed by the following copyright owners:
-Nutch includes JavaSWF:
-Copyright (c) 2001-2005, David N. Main, All rights reserved.
-
Nutch includes Automaton:
This package is Copyright © 2001-2008 Anders Møller. All rights reserved.
diff --git a/build.xml b/build.xml
index 062e586af6..0fd276c126 100644
--- a/build.xml
+++ b/build.xml
@@ -226,7 +226,6 @@
-
@@ -760,7 +759,6 @@
-
@@ -1261,8 +1259,6 @@
-
-
diff --git a/conf/parse-plugins.xml.template b/conf/parse-plugins.xml.template
index 2507976ec5..cd81053443 100644
--- a/conf/parse-plugins.xml.template
+++ b/conf/parse-plugins.xml.template
@@ -51,10 +51,6 @@
-
-
-
-
@@ -93,8 +89,6 @@
-
diff --git a/default.properties b/default.properties
index cf82c8410d..524a8e8e48 100644
--- a/default.properties
+++ b/default.properties
@@ -146,7 +146,6 @@ plugins.parse=\
org.apache.nutch.parse.html*:\
org.apache.nutch.parse.js:\
org.apache.nutch.parse.replace*:\
- org.apache.nutch.parse.swf*:\
org.apache.nutch.parse.tika:\
org.apache.nutch.parse.zip
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 73780964bb..db7d4d5601 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -64,7 +64,6 @@
-
@@ -135,7 +134,6 @@
-
@@ -213,7 +211,6 @@
-
diff --git a/src/plugin/parse-swf/build.xml b/src/plugin/parse-swf/build.xml
deleted file mode 100644
index f4fb20f42c..0000000000
--- a/src/plugin/parse-swf/build.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/plugin/parse-swf/ivy.xml b/src/plugin/parse-swf/ivy.xml
deleted file mode 100644
index 956fd25efc..0000000000
--- a/src/plugin/parse-swf/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-
-
-
-
-
-
-
-
-
- Apache Nutch
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt b/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
deleted file mode 100644
index 4138a6651d..0000000000
--- a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-
- Copyright (c) 2001-2005, David N. Main, All rights reserved.
-
- Redistribution and use in source and binary forms, with or
- without modification, are permitted provided that the
- following conditions are met:
-
- 1. Redistributions of source code must retain the above
- copyright notice, this list of conditions and the following
- disclaimer.
-
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials
- provided with the distribution.
-
- 3. The name of the author may not be used to endorse or
- promote products derived from this software without specific
- prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
diff --git a/src/plugin/parse-swf/lib/javaswf.jar b/src/plugin/parse-swf/lib/javaswf.jar
deleted file mode 100644
index 78f9b0bd94..0000000000
Binary files a/src/plugin/parse-swf/lib/javaswf.jar and /dev/null differ
diff --git a/src/plugin/parse-swf/plugin.xml b/src/plugin/parse-swf/plugin.xml
deleted file mode 100644
index 8cc72c04fd..0000000000
--- a/src/plugin/parse-swf/plugin.xml
+++ /dev/null
@@ -1,44 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/plugin/parse-swf/sample/test1.swf b/src/plugin/parse-swf/sample/test1.swf
deleted file mode 100644
index cd2019b47e..0000000000
Binary files a/src/plugin/parse-swf/sample/test1.swf and /dev/null differ
diff --git a/src/plugin/parse-swf/sample/test1.txt b/src/plugin/parse-swf/sample/test1.txt
deleted file mode 100644
index 68505d5002..0000000000
--- a/src/plugin/parse-swf/sample/test1.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-
---------
-/go/gnav_cart
-/go/gnav_company
-/go/gnav_devnet
-/go/gnav_downloads
-/go/gnav_fl_minmessage
-/go/gnav_help
-/go/gnav_mm_home
-/go/gnav_products
-/go/gnav_search?loc=en_us
-/go/gnav_showcase
-/go/gnav_solutions
-/go/gnav_store
-/go/gnav_support
-/go/gnav_your_account
-Acquisition Info
-Adobe Home
-AppleGothic
-Array
-Company
-Developers
-Downloads
-Help
-Home
-International
-LocaleManager
-Macromedia Flash Player
-Macromedia Home
-MovieClip
-Products
-Showcase
-Solutions
-Store
-String
-Support
-TextFormat
-To ensure the best possible Internet Experience, please download the latest version of the free
-Verdana
-_sans
-active
-bluePill
-button
-color
-company
-devnet
-downloads
-en_us
-home
-javascript:openCrosslinkWindow('/go/adobeacquisition')
-javascript:openCrosslinkWindow('/go/gnav_adobe_home')
-products
-rollOut
-rollOver
-selected
-showcase
-solutions
-support
-tabHolder
-textColor
diff --git a/src/plugin/parse-swf/sample/test2.swf b/src/plugin/parse-swf/sample/test2.swf
deleted file mode 100644
index eb9b03d7af..0000000000
Binary files a/src/plugin/parse-swf/sample/test2.swf and /dev/null differ
diff --git a/src/plugin/parse-swf/sample/test2.txt b/src/plugin/parse-swf/sample/test2.txt
deleted file mode 100644
index f77b78afb5..0000000000
--- a/src/plugin/parse-swf/sample/test2.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini
---------
-TextFormat
-color
-font
diff --git a/src/plugin/parse-swf/sample/test3.swf b/src/plugin/parse-swf/sample/test3.swf
deleted file mode 100644
index 4df9f1eaae..0000000000
Binary files a/src/plugin/parse-swf/sample/test3.swf and /dev/null differ
diff --git a/src/plugin/parse-swf/sample/test3.txt b/src/plugin/parse-swf/sample/test3.txt
deleted file mode 100644
index 66ae3d8294..0000000000
--- a/src/plugin/parse-swf/sample/test3.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Mix.
- Edit.
- Master.
- Compose.
- Animate.
- With a single suite of powerful tools
- that work together as one.
- World-class video and audio tools that bring
- new power and efficiency to your film, video,
- DVD, and web workflows.
- Learn more.
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
deleted file mode 100644
index 60136d717f..0000000000
--- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.swf;
-
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Stack;
-import java.util.Vector;
-import java.io.FileInputStream;
-import java.io.IOException;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.hadoop.conf.Configuration;
-
-import com.anotherbigidea.flash.interfaces.SWFActionBlock;
-import com.anotherbigidea.flash.interfaces.SWFActions;
-import com.anotherbigidea.flash.interfaces.SWFText;
-import com.anotherbigidea.flash.interfaces.SWFVectors;
-import com.anotherbigidea.flash.readers.SWFReader;
-import com.anotherbigidea.flash.readers.TagParser;
-import com.anotherbigidea.flash.structs.AlphaColor;
-import com.anotherbigidea.flash.structs.Color;
-import com.anotherbigidea.flash.structs.Matrix;
-import com.anotherbigidea.flash.structs.Rect;
-import com.anotherbigidea.flash.writers.SWFActionBlockImpl;
-import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
-import com.anotherbigidea.io.InStream;
-
-/**
- * Parser for Flash SWF files. Loosely based on the sample in JavaSWF
- * distribution.
- */
-public class SWFParser implements Parser {
- private static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
-
- private Configuration conf = null;
-
- public SWFParser() {
- //default constructor
- }
-
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- @Override
- public Configuration getConf() {
- return conf;
- }
-
- @Override
- public ParseResult getParse(Content content) {
-
- String text = null;
- Vector outlinks = new Vector<>();
-
- try {
-
- byte[] raw = content.getContent();
-
- String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
- if (contentLength != null
- && raw.length != Integer.parseInt(contentLength)) {
- return new ParseStatus(ParseStatus.FAILED,
- ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
- + " bytes. Parser can't handle incomplete files.")
- .getEmptyParseResult(content.getUrl(), getConf());
- }
- ExtractText extractor = new ExtractText();
-
- // TagParser implements SWFTags and drives a SWFTagTypes interface
- TagParser parser = new TagParser(extractor);
- // use this instead to debug the file
- // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
-
- // SWFReader reads an input file and drives a SWFTags interface
- SWFReader reader = new SWFReader(parser, new InStream(raw));
-
- // read the input SWF file and pass it through the interface pipeline
- reader.readFile();
- text = extractor.getText();
- String atext = extractor.getActionText();
- if (atext != null && atext.length() > 0)
- text += "\n--------\n" + atext;
- // harvest potential outlinks
- String[] links = extractor.getUrls();
- for (int i = 0; i < links.length; i++) {
- Outlink out = new Outlink(links[i], "");
- outlinks.add(out);
- }
- Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
- if (olinks != null)
- for (int i = 0; i < olinks.length; i++) {
- outlinks.add(olinks[i]);
- }
- } catch (Exception e) { // run time exception
- LOG.error("Error, runtime exception: ", e);
- return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as SWF document. " + e).getEmptyParseResult(
- content.getUrl(), getConf());
- }
- if (text == null)
- text = "";
-
- Outlink[] links = (Outlink[]) outlinks
- .toArray(new Outlink[outlinks.size()]);
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
- content.getMetadata());
- return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
- parseData));
- }
-
- /**
- * @param args arguments are: 0. Name of input SWF file.
- * @throws IOException if there is a fatal error processing the input
- * file
- */
- public static void main(String[] args) throws IOException {
- FileInputStream in = new FileInputStream(args[0]);
-
- byte[] buf = new byte[in.available()];
- in.read(buf);
- in.close();
- SWFParser parser = new SWFParser();
- ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
- "file:" + args[0], buf, "application/x-shockwave-flash",
- new Metadata(), NutchConfiguration.create()));
- Parse p = parseResult.get("file:" + args[0]);
- System.out.println("Parse Text:");
- System.out.println(p.getText());
- System.out.println("Parse Data:");
- System.out.println(p.getData());
- }
-}
-
-/**
- * Shows how to parse a Flash movie and extract all the text in Text symbols and
- * the initial text in Edit Fields. Output is to System.out.
- *
- * A "pipeline" is set up in the main method:
- *
- * SWFReader-->TagParser-->ExtractText
- *
- * SWFReader reads the input SWF file and separates out the header and the tags.
- * The separated contents are passed to TagParser which parses out the
- * individual tag types and passes them to ExtractText.
- *
- * ExtractText extends SWFTagTypesImpl and overrides some methods.
- */
-class ExtractText extends SWFTagTypesImpl {
- /**
- * Store font info keyed by the font symbol id. Each entry is an int[] of
- * character codes for the correspnding font glyphs (An empty array denotes a
- * System Font).
- */
- protected HashMap fontCodes = new HashMap<>();
-
- public ArrayList strings = new ArrayList<>();
-
- public HashSet actionStrings = new HashSet<>();
-
- public ArrayList urls = new ArrayList<>();
-
- public ExtractText() {
- super(null);
- }
-
- public String getText() {
- StringBuffer res = new StringBuffer();
- Iterator it = strings.iterator();
- while (it.hasNext()) {
- if (res.length() > 0)
- res.append(' ');
- res.append(it.next());
- }
- return res.toString();
- }
-
- public String getActionText() {
- StringBuffer res = new StringBuffer();
- String[] strings = (String[]) actionStrings
- .toArray(new String[actionStrings.size()]);
- Arrays.sort(strings);
- for (int i = 0; i < strings.length; i++) {
- if (i > 0)
- res.append('\n');
- res.append(strings[i]);
- }
- return res.toString();
- }
-
- public String[] getUrls() {
- String[] res = new String[urls.size()];
- int i = 0;
- Iterator it = urls.iterator();
- while (it.hasNext()) {
- res[i] = it.next();
- i++;
- }
- return res;
- }
-
- public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
- int arg4) throws IOException {
- tagDefineFontInfo(arg0, arg1, arg2, arg3);
- }
-
- /**
- * SWFTagTypes interface Save the Text Font character code info
- */
- public void tagDefineFontInfo(int fontId, String fontName, int flags,
- int[] codes) throws IOException {
- // System.out.println("-defineFontInfo id=" + fontId + ", name=" +
- // fontName);
- fontCodes.put(Integer.valueOf(fontId), codes);
- }
-
- // XXX too much hassle for too little return ... we cannot guess character
- // XXX codes anyway, so we just give up.
- /*
- * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
- * return null; }
- */
-
- /**
- * SWFTagTypes interface. Save the character code info.
- */
- public SWFVectors tagDefineFont2(int id, int flags, String name,
- int numGlyphs, int ascent, int descent, int leading, int[] codes,
- int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
- int[] kernAdjustments) throws IOException {
- fontCodes.put(Integer.valueOf(id), (codes != null) ? codes : new int[0]);
-
- return null;
- }
-
- /**
- * SWFTagTypes interface. Dump any initial text in the field.
- */
- public void tagDefineTextField(int fieldId, String fieldName,
- String initialText, Rect boundary, int flags, AlphaColor textColor,
- int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
- int rightMargin, int indentation, int lineSpacing) throws IOException {
- if (initialText != null) {
- strings.add(initialText);
- }
- }
-
- /**
- * SWFTagTypes interface
- */
- public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
- throws IOException {
- lastBounds = curBounds;
- curBounds = bounds;
- return new TextDumper();
- }
-
- Rect lastBounds = null;
- Rect curBounds = null;
-
- /**
- * SWFTagTypes interface
- */
- public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
- throws IOException {
- lastBounds = curBounds;
- curBounds = bounds;
- return new TextDumper();
- }
-
- public class TextDumper implements SWFText {
- protected Integer fontId;
-
- protected boolean firstY = true;
-
- @Override
- public void font(int fontId, int textHeight) {
- this.fontId = fontId;
- }
-
- @Override
- public void setY(int y) {
- if (firstY)
- firstY = false;
- else
- strings.add("\n"); // Change in Y - dump a new line
- }
-
- /*
- * There are some issues with this method: sometimes SWF files define their
- * own font, so short of OCR we cannot guess what is the glyph code ->
- * character mapping. Additionally, some files don't use literal space
- * character, instead they adjust glyphAdvances. We don't handle it at all -
- * in such cases the text will be all glued together.
- */
- @Override
- public void text(int[] glyphIndices, int[] glyphAdvances) {
- int[] codes = (int[]) fontCodes.get(fontId);
- if (codes == null) {
- // unknown font, better not guess
- strings.add("\n**** ?????????????? ****\n");
- return;
- }
-
- // --Translate the glyph indices to character codes
- char[] chars = new char[glyphIndices.length];
-
- for (int i = 0; i < chars.length; i++) {
- int index = glyphIndices[i];
-
- if (index >= codes.length) // System Font ?
- {
- chars[i] = (char) index;
- } else {
- chars[i] = (char) (codes[index]);
- }
- }
- strings.add(new String(chars));
- }
-
- @Override
- public void color(Color color) {
- }
-
- @Override
- public void setX(int x) {
- }
-
- @Override
- public void done() {
- strings.add("\n");
- }
- }
-
- @Override
- public SWFActions tagDoAction() throws IOException {
- return new NutchSWFActions(actionStrings, urls);
- }
-
- @Override
- public SWFActions tagDoInitAction(int arg0) throws IOException {
- return new NutchSWFActions(actionStrings, urls);
- }
-
-}
-
-/**
- * ActionScript parser. This parser tries to extract free text embedded inside
- * the script, but without polluting it too much with names of variables,
- * methods, etc. Not ideal, but it works.
- */
-class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {
- private HashSet strings = null;
-
- private ArrayList urls = null;
-
- String[] dict = null;
-
- Stack