diff --git a/CHANGES.txt b/CHANGES.txt index 9946bc9b93..822bd4acfe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,11 @@ # Nutch Change Log +Breaking Changes + + - the plugin parse-swf for parsing Shockwave/Adobe Flash content was removed (NUTCH-2861) + + + Nutch 1.18 Release 14/01/2021 (dd/mm/yyyy) Release Report: https://s.apache.org/lqara diff --git a/LICENSE.txt b/LICENSE.txt index 9badcdad67..38ba38252c 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -4831,41 +4831,6 @@ src/plugin/ontology/lib/jena-2.1.jar This product includes software developed by the Apache Software Foundation (http://www.apache.org/). -src/plugin/parse-swf/lib/javaswf.jar - - - Copyright (c) 2001-2005, David N. Main, All rights reserved. - - Redistribution and use in source and binary forms, with or - without modification, are permitted provided that the - following conditions are met: - - 1. Redistributions of source code must retain the above - copyright notice, this list of conditions and the following - disclaimer. - - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. The name of the author may not be used to endorse or - promote products derived from this software without specific - prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - src/plugin/response-json/lib/json-lib-2.2.2-jdk15.jar Apache License diff --git a/NOTICE.txt b/NOTICE.txt index 1c9efd00a0..0f74198b10 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -6,8 +6,5 @@ Foundation (http://www.apache.org/). This product includes software developed by the following copyright owners: -Nutch includes JavaSWF: -Copyright (c) 2001-2005, David N. Main, All rights reserved. - Nutch includes Automaton: This package is Copyright © 2001-2008 Anders Møller. All rights reserved. diff --git a/build.xml b/build.xml index 062e586af6..0fd276c126 100644 --- a/build.xml +++ b/build.xml @@ -226,7 +226,6 @@ - @@ -760,7 +759,6 @@ - @@ -1261,8 +1259,6 @@ - - diff --git a/conf/parse-plugins.xml.template b/conf/parse-plugins.xml.template index 2507976ec5..cd81053443 100644 --- a/conf/parse-plugins.xml.template +++ b/conf/parse-plugins.xml.template @@ -51,10 +51,6 @@ - - - - @@ -93,8 +89,6 @@ - diff --git a/default.properties b/default.properties index cf82c8410d..524a8e8e48 100644 --- a/default.properties +++ b/default.properties @@ -146,7 +146,6 @@ plugins.parse=\ org.apache.nutch.parse.html*:\ org.apache.nutch.parse.js:\ org.apache.nutch.parse.replace*:\ - org.apache.nutch.parse.swf*:\ org.apache.nutch.parse.tika:\ org.apache.nutch.parse.zip diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 73780964bb..db7d4d5601 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -64,7 +64,6 @@ - @@ -135,7 +134,6 @@ - @@ -213,7 +211,6 @@ - diff --git 
a/src/plugin/parse-swf/build.xml b/src/plugin/parse-swf/build.xml deleted file mode 100644 index f4fb20f42c..0000000000 --- a/src/plugin/parse-swf/build.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/plugin/parse-swf/ivy.xml b/src/plugin/parse-swf/ivy.xml deleted file mode 100644 index 956fd25efc..0000000000 --- a/src/plugin/parse-swf/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - - - - - - Apache Nutch - - - - - - - - - - - - - - - - diff --git a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt b/src/plugin/parse-swf/lib/javaswf-LICENSE.txt deleted file mode 100644 index 4138a6651d..0000000000 --- a/src/plugin/parse-swf/lib/javaswf-LICENSE.txt +++ /dev/null @@ -1,33 +0,0 @@ - - Copyright (c) 2001-2005, David N. Main, All rights reserved. - - Redistribution and use in source and binary forms, with or - without modification, are permitted provided that the - following conditions are met: - - 1. Redistributions of source code must retain the above - copyright notice, this list of conditions and the following - disclaimer. - - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. The name of the author may not be used to endorse or - promote products derived from this software without specific - prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - diff --git a/src/plugin/parse-swf/lib/javaswf.jar b/src/plugin/parse-swf/lib/javaswf.jar deleted file mode 100644 index 78f9b0bd94..0000000000 Binary files a/src/plugin/parse-swf/lib/javaswf.jar and /dev/null differ diff --git a/src/plugin/parse-swf/plugin.xml b/src/plugin/parse-swf/plugin.xml deleted file mode 100644 index 8cc72c04fd..0000000000 --- a/src/plugin/parse-swf/plugin.xml +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - diff --git a/src/plugin/parse-swf/sample/test1.swf b/src/plugin/parse-swf/sample/test1.swf deleted file mode 100644 index cd2019b47e..0000000000 Binary files a/src/plugin/parse-swf/sample/test1.swf and /dev/null differ diff --git a/src/plugin/parse-swf/sample/test1.txt b/src/plugin/parse-swf/sample/test1.txt deleted file mode 100644 index 68505d5002..0000000000 --- a/src/plugin/parse-swf/sample/test1.txt +++ /dev/null @@ -1,60 +0,0 @@ - --------- -/go/gnav_cart -/go/gnav_company -/go/gnav_devnet -/go/gnav_downloads -/go/gnav_fl_minmessage -/go/gnav_help -/go/gnav_mm_home -/go/gnav_products -/go/gnav_search?loc=en_us -/go/gnav_showcase -/go/gnav_solutions -/go/gnav_store -/go/gnav_support -/go/gnav_your_account -Acquisition Info -Adobe Home -AppleGothic -Array -Company -Developers -Downloads -Help -Home -International -LocaleManager -Macromedia Flash Player -Macromedia Home -MovieClip -Products -Showcase -Solutions -Store -String -Support -TextFormat -To ensure the best possible Internet Experience, please download 
the latest version of the free -Verdana -_sans -active -bluePill -button -color -company -devnet -downloads -en_us -home -javascript:openCrosslinkWindow('/go/adobeacquisition') -javascript:openCrosslinkWindow('/go/gnav_adobe_home') -products -rollOut -rollOver -selected -showcase -solutions -support -tabHolder -textColor diff --git a/src/plugin/parse-swf/sample/test2.swf b/src/plugin/parse-swf/sample/test2.swf deleted file mode 100644 index eb9b03d7af..0000000000 Binary files a/src/plugin/parse-swf/sample/test2.swf and /dev/null differ diff --git a/src/plugin/parse-swf/sample/test2.txt b/src/plugin/parse-swf/sample/test2.txt deleted file mode 100644 index f77b78afb5..0000000000 --- a/src/plugin/parse-swf/sample/test2.txt +++ /dev/null @@ -1,5 +0,0 @@ -Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini --------- -TextFormat -color -font diff --git a/src/plugin/parse-swf/sample/test3.swf b/src/plugin/parse-swf/sample/test3.swf deleted file mode 100644 index 4df9f1eaae..0000000000 Binary files a/src/plugin/parse-swf/sample/test3.swf and /dev/null differ diff --git a/src/plugin/parse-swf/sample/test3.txt b/src/plugin/parse-swf/sample/test3.txt deleted file mode 100644 index 66ae3d8294..0000000000 --- a/src/plugin/parse-swf/sample/test3.txt +++ /dev/null @@ -1,11 +0,0 @@ -Mix. - Edit. - Master. - Compose. - Animate. - With a single suite of powerful tools - that work together as one. - World-class video and audio tools that bring - new power and efficiency to your film, video, - DVD, and web workflows. - Learn more. 
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java deleted file mode 100644 index 60136d717f..0000000000 --- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java +++ /dev/null @@ -1,699 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.parse.swf; - -import java.lang.invoke.MethodHandles; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Stack; -import java.util.Vector; -import java.io.FileInputStream; -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.OutlinkExtractor; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.NutchConfiguration; - -import org.apache.hadoop.conf.Configuration; - -import com.anotherbigidea.flash.interfaces.SWFActionBlock; -import com.anotherbigidea.flash.interfaces.SWFActions; -import com.anotherbigidea.flash.interfaces.SWFText; -import com.anotherbigidea.flash.interfaces.SWFVectors; -import com.anotherbigidea.flash.readers.SWFReader; -import com.anotherbigidea.flash.readers.TagParser; -import com.anotherbigidea.flash.structs.AlphaColor; -import com.anotherbigidea.flash.structs.Color; -import com.anotherbigidea.flash.structs.Matrix; -import com.anotherbigidea.flash.structs.Rect; -import com.anotherbigidea.flash.writers.SWFActionBlockImpl; -import com.anotherbigidea.flash.writers.SWFTagTypesImpl; -import com.anotherbigidea.io.InStream; - -/** - * Parser for Flash SWF files. Loosely based on the sample in JavaSWF - * distribution. 
- */ -public class SWFParser implements Parser { - private static final Logger LOG = LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); - - private Configuration conf = null; - - public SWFParser() { - //default constructor - } - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - } - - @Override - public Configuration getConf() { - return conf; - } - - @Override - public ParseResult getParse(Content content) { - - String text = null; - Vector outlinks = new Vector<>(); - - try { - - byte[] raw = content.getContent(); - - String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); - if (contentLength != null - && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, - ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length - + " bytes. Parser can't handle incomplete files.") - .getEmptyParseResult(content.getUrl(), getConf()); - } - ExtractText extractor = new ExtractText(); - - // TagParser implements SWFTags and drives a SWFTagTypes interface - TagParser parser = new TagParser(extractor); - // use this instead to debug the file - // TagParser parser = new TagParser( new SWFTagDumper(true, true) ); - - // SWFReader reads an input file and drives a SWFTags interface - SWFReader reader = new SWFReader(parser, new InStream(raw)); - - // read the input SWF file and pass it through the interface pipeline - reader.readFile(); - text = extractor.getText(); - String atext = extractor.getActionText(); - if (atext != null && atext.length() > 0) - text += "\n--------\n" + atext; - // harvest potential outlinks - String[] links = extractor.getUrls(); - for (int i = 0; i < links.length; i++) { - Outlink out = new Outlink(links[i], ""); - outlinks.add(out); - } - Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf); - if (olinks != null) - for (int i = 0; i < olinks.length; i++) { - outlinks.add(olinks[i]); - } - } catch (Exception e) { // run time exception - 
LOG.error("Error, runtime exception: ", e); - return new ParseStatus(ParseStatus.FAILED, - "Can't be handled as SWF document. " + e).getEmptyParseResult( - content.getUrl(), getConf()); - } - if (text == null) - text = ""; - - Outlink[] links = (Outlink[]) outlinks - .toArray(new Outlink[outlinks.size()]); - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, - content.getMetadata()); - return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, - parseData)); - } - - /** - * @param args arguments are: 0. Name of input SWF file. - * @throws IOException if there is a fatal error processing the input - * file - */ - public static void main(String[] args) throws IOException { - FileInputStream in = new FileInputStream(args[0]); - - byte[] buf = new byte[in.available()]; - in.read(buf); - in.close(); - SWFParser parser = new SWFParser(); - ParseResult parseResult = parser.getParse(new Content("file:" + args[0], - "file:" + args[0], buf, "application/x-shockwave-flash", - new Metadata(), NutchConfiguration.create())); - Parse p = parseResult.get("file:" + args[0]); - System.out.println("Parse Text:"); - System.out.println(p.getText()); - System.out.println("Parse Data:"); - System.out.println(p.getData()); - } -} - -/** - * Shows how to parse a Flash movie and extract all the text in Text symbols and - * the initial text in Edit Fields. Output is to System.out. - * - * A "pipeline" is set up in the main method: - * - * SWFReader-->TagParser-->ExtractText - * - * SWFReader reads the input SWF file and separates out the header and the tags. - * The separated contents are passed to TagParser which parses out the - * individual tag types and passes them to ExtractText. - * - * ExtractText extends SWFTagTypesImpl and overrides some methods. - */ -class ExtractText extends SWFTagTypesImpl { - /** - * Store font info keyed by the font symbol id. 
Each entry is an int[] of - * character codes for the correspnding font glyphs (An empty array denotes a - * System Font). - */ - protected HashMap fontCodes = new HashMap<>(); - - public ArrayList strings = new ArrayList<>(); - - public HashSet actionStrings = new HashSet<>(); - - public ArrayList urls = new ArrayList<>(); - - public ExtractText() { - super(null); - } - - public String getText() { - StringBuffer res = new StringBuffer(); - Iterator it = strings.iterator(); - while (it.hasNext()) { - if (res.length() > 0) - res.append(' '); - res.append(it.next()); - } - return res.toString(); - } - - public String getActionText() { - StringBuffer res = new StringBuffer(); - String[] strings = (String[]) actionStrings - .toArray(new String[actionStrings.size()]); - Arrays.sort(strings); - for (int i = 0; i < strings.length; i++) { - if (i > 0) - res.append('\n'); - res.append(strings[i]); - } - return res.toString(); - } - - public String[] getUrls() { - String[] res = new String[urls.size()]; - int i = 0; - Iterator it = urls.iterator(); - while (it.hasNext()) { - res[i] = it.next(); - i++; - } - return res; - } - - public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, - int arg4) throws IOException { - tagDefineFontInfo(arg0, arg1, arg2, arg3); - } - - /** - * SWFTagTypes interface Save the Text Font character code info - */ - public void tagDefineFontInfo(int fontId, String fontName, int flags, - int[] codes) throws IOException { - // System.out.println("-defineFontInfo id=" + fontId + ", name=" + - // fontName); - fontCodes.put(Integer.valueOf(fontId), codes); - } - - // XXX too much hassle for too little return ... we cannot guess character - // XXX codes anyway, so we just give up. - /* - * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException { - * return null; } - */ - - /** - * SWFTagTypes interface. Save the character code info. 
- */ - public SWFVectors tagDefineFont2(int id, int flags, String name, - int numGlyphs, int ascent, int descent, int leading, int[] codes, - int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, - int[] kernAdjustments) throws IOException { - fontCodes.put(Integer.valueOf(id), (codes != null) ? codes : new int[0]); - - return null; - } - - /** - * SWFTagTypes interface. Dump any initial text in the field. - */ - public void tagDefineTextField(int fieldId, String fieldName, - String initialText, Rect boundary, int flags, AlphaColor textColor, - int alignment, int fontId, int fontSize, int charLimit, int leftMargin, - int rightMargin, int indentation, int lineSpacing) throws IOException { - if (initialText != null) { - strings.add(initialText); - } - } - - /** - * SWFTagTypes interface - */ - public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) - throws IOException { - lastBounds = curBounds; - curBounds = bounds; - return new TextDumper(); - } - - Rect lastBounds = null; - Rect curBounds = null; - - /** - * SWFTagTypes interface - */ - public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) - throws IOException { - lastBounds = curBounds; - curBounds = bounds; - return new TextDumper(); - } - - public class TextDumper implements SWFText { - protected Integer fontId; - - protected boolean firstY = true; - - @Override - public void font(int fontId, int textHeight) { - this.fontId = fontId; - } - - @Override - public void setY(int y) { - if (firstY) - firstY = false; - else - strings.add("\n"); // Change in Y - dump a new line - } - - /* - * There are some issues with this method: sometimes SWF files define their - * own font, so short of OCR we cannot guess what is the glyph code -> - * character mapping. Additionally, some files don't use literal space - * character, instead they adjust glyphAdvances. We don't handle it at all - - * in such cases the text will be all glued together. 
- */ - @Override - public void text(int[] glyphIndices, int[] glyphAdvances) { - int[] codes = (int[]) fontCodes.get(fontId); - if (codes == null) { - // unknown font, better not guess - strings.add("\n**** ?????????????? ****\n"); - return; - } - - // --Translate the glyph indices to character codes - char[] chars = new char[glyphIndices.length]; - - for (int i = 0; i < chars.length; i++) { - int index = glyphIndices[i]; - - if (index >= codes.length) // System Font ? - { - chars[i] = (char) index; - } else { - chars[i] = (char) (codes[index]); - } - } - strings.add(new String(chars)); - } - - @Override - public void color(Color color) { - } - - @Override - public void setX(int x) { - } - - @Override - public void done() { - strings.add("\n"); - } - } - - @Override - public SWFActions tagDoAction() throws IOException { - return new NutchSWFActions(actionStrings, urls); - } - - @Override - public SWFActions tagDoInitAction(int arg0) throws IOException { - return new NutchSWFActions(actionStrings, urls); - } - -} - -/** - * ActionScript parser. This parser tries to extract free text embedded inside - * the script, but without polluting it too much with names of variables, - * methods, etc. Not ideal, but it works. 
- */ -class NutchSWFActions extends SWFActionBlockImpl implements SWFActions { - private HashSet strings = null; - - private ArrayList urls = null; - - String[] dict = null; - - Stack stack = null; - - public NutchSWFActions(HashSet strings, ArrayList urls) { - this.strings = strings; - this.urls = urls; - stack = new SmallStack(100, strings); - } - - @Override - public void lookupTable(String[] values) throws IOException { - for (int i = 0; i < values.length; i++) { - if (!strings.contains(values[i])) - strings.add(values[i]); - } - super.lookupTable(values); - dict = values; - } - - @Override - public void defineLocal() throws IOException { - stack.pop(); - super.defineLocal(); - } - - public void getURL(int vars, int mode) { - } - - @Override - public void getURL(String url, String target) throws IOException { - stack.push(url); - stack.push(target); - strings.remove(url); - strings.remove(target); - urls.add(url); - super.getURL(url, target); - } - - public SWFActionBlock.TryCatchFinally _try(String var) throws IOException { - strings.remove(var); - return super._try(var); - } - - @Override - public void comment(String var) throws IOException { - strings.remove(var); - super.comment(var); - } - - public void goToFrame(String var) throws IOException { - stack.push(var); - strings.remove(var); - super.gotoFrame(var); - } - - public void ifJump(String var) throws IOException { - strings.remove(var); - super.ifJump(var); - } - - public void jump(String var) throws IOException { - strings.remove(var); - super.jump(var); - } - - public void jumpLabel(String var) throws IOException { - strings.remove(var); - super.jumpLabel(var); - } - - public void lookup(int var) throws IOException { - if (dict != null && var >= 0 && var < dict.length) { - stack.push(dict[var]); - } - super.lookup(var); - } - - public void push(String var) throws IOException { - stack.push(var); - strings.remove(var); - super.push(var); - } - - public void setTarget(String var) throws IOException { 
- stack.push(var); - strings.remove(var); - super.setTarget(var); - } - - public SWFActionBlock startFunction(String var, String[] params) - throws IOException { - stack.push(var); - strings.remove(var); - if (params != null) { - for (int i = 0; i < params.length; i++) { - strings.remove(params[i]); - } - } - return this; - } - - public SWFActionBlock startFunction2(String var, int arg1, int arg2, - String[] params, int[] arg3) throws IOException { - stack.push(var); - strings.remove(var); - if (params != null) { - for (int i = 0; i < params.length; i++) { - strings.remove(params[i]); - } - } - return this; - } - - public void waitForFrame(int num, String var) throws IOException { - stack.push(var); - strings.remove(var); - super.waitForFrame(num, var); - } - - public void waitForFrame(String var) throws IOException { - stack.push(var); - strings.remove(var); - super.waitForFrame(var); - } - - public void done() throws IOException { - while (stack.size() > 0) { - strings.remove(stack.pop()); - } - } - - public SWFActionBlock start(int arg0, int arg1) throws IOException { - return this; - } - - public SWFActionBlock start(int arg0) throws IOException { - return this; - } - - public void add() throws IOException { - super.add(); - } - - public void asciiToChar() throws IOException { - super.asciiToChar(); - } - - public void asciiToCharMB() throws IOException { - super.asciiToCharMB(); - } - - public void push(int var) throws IOException { - if (dict != null && var >= 0 && var < dict.length) { - stack.push(dict[var]); - } - super.push(var); - } - - public void callFunction() throws IOException { - strings.remove(stack.pop()); - super.callFunction(); - } - - public void callMethod() throws IOException { - strings.remove(stack.pop()); - super.callMethod(); - } - - public void getMember() throws IOException { - // 0: name - String val = (String) stack.pop(); - strings.remove(val); - super.getMember(); - } - - public void setMember() throws IOException { - // 0: value 
-1: name - stack.pop(); // value - String name = (String) stack.pop(); - strings.remove(name); - super.setMember(); - } - - public void setProperty() throws IOException { - super.setProperty(); - } - - public void setVariable() throws IOException { - super.setVariable(); - } - - public void call() throws IOException { - strings.remove(stack.pop()); - super.call(); - } - - public void setTarget() throws IOException { - strings.remove(stack.pop()); - super.setTarget(); - } - - public void pop() throws IOException { - strings.remove(stack.pop()); - super.pop(); - } - - public void push(boolean arg0) throws IOException { - stack.push("" + arg0); - super.push(arg0); - } - - public void push(double arg0) throws IOException { - stack.push("" + arg0); - super.push(arg0); - } - - public void push(float arg0) throws IOException { - stack.push("" + arg0); - super.push(arg0); - } - - public void pushNull() throws IOException { - stack.push(""); - super.pushNull(); - } - - public void pushRegister(int arg0) throws IOException { - stack.push("" + arg0); - super.pushRegister(arg0); - } - - public void pushUndefined() throws IOException { - stack.push("???"); - super.pushUndefined(); - } - - public void getProperty() throws IOException { - stack.pop(); - super.getProperty(); - } - - public void getVariable() throws IOException { - strings.remove(stack.pop()); - super.getVariable(); - } - - public void gotoFrame(boolean arg0) throws IOException { - stack.push("" + arg0); - super.gotoFrame(arg0); - } - - public void gotoFrame(int arg0) throws IOException { - stack.push("" + arg0); - super.gotoFrame(arg0); - } - - public void gotoFrame(String arg0) throws IOException { - stack.push("" + arg0); - strings.remove(arg0); - super.gotoFrame(arg0); - } - - public void newObject() throws IOException { - stack.pop(); - super.newObject(); - } - - public SWFActionBlock startWith() throws IOException { - return this; - } - -} - -/* - * Small bottom-less stack. 
- */ -class SmallStack extends Stack { - - private static final long serialVersionUID = 1L; - - private int maxSize; - - private HashSet strings = null; - - public SmallStack(int maxSize, HashSet strings) { - this.maxSize = maxSize; - this.strings = strings; - } - - public Object push(Object o) { - // limit max size - if (this.size() > maxSize) { - String val = (String) remove(0); - strings.remove(val); - } - return super.push(o); - } - - public Object pop() { - // tolerate underruns - if (this.size() == 0) - return null; - else - return super.pop(); - } -} diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java deleted file mode 100644 index 5942e64189..0000000000 --- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse Flash SWF files. 
- */ -package org.apache.nutch.parse.swf; - diff --git a/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java b/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java deleted file mode 100644 index 688e9b9cf6..0000000000 --- a/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.parse.swf; - -import java.io.FileInputStream; -import java.io.InputStreamReader; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.hadoop.io.Text; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.junit.Assert; -import org.junit.Test; - -/** - * Unit tests for SWFParser. 
- */ -public class TestSWFParser { - - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - - private String[] sampleFiles = new String[] { "test1.swf", "test2.swf", - "test3.swf" }; - private String[] sampleTexts = new String[] { "test1.txt", "test2.txt", - "test3.txt" }; - - @Test - public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - Configuration conf = NutchConfiguration.create(); - - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - - parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - - String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); - Assert.assertTrue(sampleTexts[i].equals(text)); - } - } - - public TestSWFParser() { - for (int i = 0; i < sampleFiles.length; i++) { - try { - // read the test string - FileInputStream fis = new FileInputStream(sampleDir + fileSeparator - + sampleTexts[i]); - StringBuffer sb = new StringBuffer(); - int len = 0; - InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); - char[] buf = new char[1024]; - while ((len = isr.read(buf)) > 0) { - sb.append(buf, 0, len); - } - isr.close(); - sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim(); - } catch (Exception e) { - e.printStackTrace(); - } - } - } - -}