From bd3ecfcddeaf13262e477ba29c5256ebd44e32db Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 26 May 2016 11:15:02 -0700 Subject: [PATCH] TIKA-1978 Invocation of java.net.URL.equals(Object), which blocks to do domain name resolution, in org.apache.tika.parser.geo.topic.GeoParser.initialize(URL) 2.x branch --- .../tika/parser/geo/topic/GeoParser.java | 43 ++++++++++--------- .../parser/geo/topic/GeoParserConfig.java | 4 +- .../apache/tika/parser/geo/topic/GeoTag.java | 33 +++++++------- .../parser/geo/topic/NameEntityExtractor.java | 11 ++--- 4 files changed, 48 insertions(+), 43 deletions(-) diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java index eaef6ad735d..303f87846a4 100644 --- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java +++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java @@ -20,19 +20,21 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.exec.CommandLine; import org.apache.commons.exec.DefaultExecutor; -import org.apache.commons.exec.ExecuteException; import org.apache.commons.exec.ExecuteWatchdog; import org.apache.commons.exec.PumpStreamHandler; -import org.apache.commons.exec.environment.EnvironmentUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -57,7 +59,7 @@ public class GeoParser extends AbstractParser { private boolean initialized; private URL modelUrl; - private NameEntityExtractor extractor; + private transient NameEntityExtractor extractor; private boolean available; @Override @@ -70,9 +72,12 @@ public Set getSupportedTypes(ParseContext parseContext) { * @param modelUrl the URL to NER model */ public void initialize(URL modelUrl) { - if (this.modelUrl != null && this.modelUrl.equals(modelUrl)) { - // Previously initialized for the same URL, no initialization needed - return; + try { + if (this.modelUrl != null && this.modelUrl.toURI().equals(modelUrl.toURI())) { + return; + } + } catch (URISyntaxException e1) { + LOG.log(Level.SEVERE, e1.getMessage(), e1); } this.modelUrl = modelUrl; @@ -112,7 +117,7 @@ public void parse(InputStream stream, ContentHandler handler, String bestner = extractor.bestNameEntity; /*------------------------resolve geonames for each ner, store results in a hashmap---------------------*/ - HashMap> resolvedGeonames = searchGeoNames(locationNameEntities); + HashMap> resolvedGeonames = (HashMap>) searchGeoNames(locationNameEntities); /*----------------store locationNameEntities and their geonames in a geotag, each input has one geotag---------------------*/ GeoTag geotag = new GeoTag(); @@ -120,22 +125,21 @@ public void parse(InputStream stream, ContentHandler handler, /* add resolved entities in metadata */ - metadata.add("Geographic_NAME", geotag.Geographic_NAME); - metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE); - metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE); + metadata.add("Geographic_NAME", geotag.geoNAME); + metadata.add("Geographic_LONGITUDE", geotag.geoLONGTITUDE); + metadata.add("Geographic_LATITUDE", geotag.geoLATITUDE); for (int i = 0; i < geotag.alternatives.size(); ++i) { - GeoTag alter = (GeoTag) geotag.alternatives.get(i); - metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME); + GeoTag alter = geotag.alternatives.get(i); + metadata.add("Optional_NAME" + (i + 1), alter.geoNAME); metadata.add("Optional_LONGITUDE" + (i + 1), - alter.Geographic_LONGTITUDE); + alter.geoLONGTITUDE); metadata.add("Optional_LATITUDE" + (i + 1), - alter.Geographic_LATITUDE); + alter.geoLATITUDE); } } - public HashMap> searchGeoNames( - ArrayList locationNameEntities) throws ExecuteException, - IOException { + public Map> searchGeoNames( + List locationNameEntities) throws IOException { CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer"); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); cmdLine.addArgument("-s"); @@ -150,17 +154,16 @@ public HashMap> searchGeoNames( exec.setWatchdog(watchdog); PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream); exec.setStreamHandler(streamHandler); - int exitValue = exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment()); String outputJson = outputStream.toString("UTF-8"); JSONArray json = (JSONArray) JSONValue.parse(outputJson); - HashMap> returnHash = new HashMap>(); + HashMap> returnHash = new HashMap<>(); for (int i = 0; i < json.size(); i++) { JSONObject obj = (JSONObject) json.get(i); for (Object key : obj.keySet()) { String theKey = (String) key; JSONArray vals = (JSONArray) obj.get(theKey); - ArrayList stringVals = new ArrayList( + ArrayList stringVals = new ArrayList<>( vals.size()); for (int j = 0; j < vals.size(); j++) { String val = (String) vals.get(j); diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java index 305e66305bf..56272e1c0e0 100644 --- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java +++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java @@ -30,7 +30,7 @@ public GeoParserConfig() { this.nerModelUrl = GeoParserConfig.class.getResource("en-ner-location.bin"); } - public void setNERModelPath(String path) { + public void setNERModelPath(String path) throws MalformedURLException { if (path == null) return; File file = new File(path); @@ -40,7 +40,7 @@ public void setNERModelPath(String path) { try { this.nerModelUrl = file.toURI().toURL(); } catch (MalformedURLException e) { - throw new RuntimeException(e); + throw new MalformedURLException(e.getMessage()); } } diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java index bccaef1ba73..fe4b9c69024 100644 --- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java +++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java @@ -18,18 +18,19 @@ package org.apache.tika.parser.geo.topic; import java.util.ArrayList; -import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; public class GeoTag { - String Geographic_NAME; - String Geographic_LONGTITUDE; - String Geographic_LATITUDE; - ArrayList alternatives = new ArrayList(); + String geoNAME; + String geoLONGTITUDE; + String geoLATITUDE; + ArrayList alternatives = new ArrayList<>(); public void setMain(String name, String longitude, String latitude) { - Geographic_NAME = name; - Geographic_LONGTITUDE = longitude; - Geographic_LATITUDE = latitude; + geoNAME = name; + geoLONGTITUDE = longitude; + geoLATITUDE = latitude; } public void addAlternative(GeoTag geotag) { @@ -44,20 +45,20 @@ public void addAlternative(GeoTag geotag) { * @param bestNER best name entity among all the extracted entities for the * input stream */ - public void toGeoTag(HashMap> resolvedGeonames, + public void toGeoTag(Map> resolvedGeonames, String bestNER) { - for (String key : resolvedGeonames.keySet()) { + for (Entry> key : resolvedGeonames.entrySet()) { ArrayList cur = resolvedGeonames.get(key); if (key.equals(bestNER)) { - this.Geographic_NAME = cur.get(0); - this.Geographic_LONGTITUDE = cur.get(1); - this.Geographic_LATITUDE = cur.get(2); + this.geoNAME = cur.get(0); + this.geoLONGTITUDE = cur.get(1); + this.geoLATITUDE = cur.get(2); } else { GeoTag alter = new GeoTag(); - alter.Geographic_NAME = cur.get(0); - alter.Geographic_LONGTITUDE = cur.get(1); - alter.Geographic_LATITUDE = cur.get(2); + alter.geoNAME = cur.get(0); + alter.geoLONGTITUDE = cur.get(1); + alter.geoLATITUDE = cur.get(2); this.addAlternative(alter); } } diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java index 3c6f0e84c55..822d3434d35 100644 --- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java +++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java @@ -43,11 +43,11 @@ public class NameEntityExtractor { private final NameFinderME nameFinder; public NameEntityExtractor(URL modelUrl) throws IOException { - this.locationNameEntities = new ArrayList(); + this.locationNameEntities = new ArrayList<>(); this.bestNameEntity = null; TokenNameFinderModel model = new TokenNameFinderModel(modelUrl); this.nameFinder = new NameFinderME(model); - this.tf = new HashMap(); + this.tf = new HashMap<>(); } /* @@ -59,7 +59,7 @@ public NameEntityExtractor(URL modelUrl) throws IOException { */ public void getAllNameEntitiesfromInput(InputStream stream) throws IOException { String[] in = IOUtils.toString(stream, UTF_8).split(" "); - Span nameE[]; + Span[] nameE; //name finder is not thread safe https://opennlp.apache.org/documentation/1.5.2-incubating/manual/opennlp.html#tools.namefind synchronized (nameFinder) { @@ -89,7 +89,7 @@ public void getAllNameEntitiesfromInput(InputStream stream) throws IOException { * ArrayList */ public void getBestNameEntity() { - if (this.locationNameEntities.size() == 0) + if (this.locationNameEntities.isEmpty()) return; for (int i = 0; i < this.locationNameEntities.size(); ++i) { @@ -100,10 +100,11 @@ public void getBestNameEntity() { tf.put(this.locationNameEntities.get(i), 1); } int max = 0; - List> list = new ArrayList>( + List> list = new ArrayList<>( tf.entrySet()); Collections.shuffle(list); Collections.sort(list, new Comparator>() { + @Override public int compare(Map.Entry o1, Map.Entry o2) { // Descending Order