Skip to content

Commit

Permalink
OPENNLP-1124: Optimize XML parser configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
kottmann committed Aug 31, 2017
1 parent 52573ea commit c2c14e9
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;

import opennlp.tools.parser.Parse;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.XmlUtil;

public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse> {

Expand All @@ -40,13 +39,7 @@ public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse>

protected ConstitParseSampleStream(ObjectStream<byte[]> samples) {
super(samples);

SAXParserFactory factory = SAXParserFactory.newInstance();
try {
saxParser = factory.newSAXParser();
} catch (ParserConfigurationException | SAXException e) {
throw new IllegalStateException(e);
}
saxParser = XmlUtil.createSaxParser();
}

public Parse read() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@
import java.util.Map.Entry;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
Expand All @@ -40,6 +38,7 @@

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.Span;
import opennlp.tools.util.XmlUtil;

/**
* A structure to hold an Irish Sentence Bank document, which is a collection
Expand Down Expand Up @@ -154,8 +153,7 @@ public static IrishSentenceBankDocument parse(InputStream is) throws IOException
IrishSentenceBankDocument document = new IrishSentenceBankDocument();

try {
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
Document doc = docBuilder.parse(is);

String root = doc.getDocumentElement().getNodeName();
Expand Down Expand Up @@ -262,8 +260,6 @@ public static IrishSentenceBankDocument parse(InputStream is) throws IOException
}
}
return document;
} catch (ParserConfigurationException e) {
throw new IllegalStateException(e);
} catch (SAXException e) {
throw new IOException("Failed to parse IrishSentenceBank document", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@
import java.util.Collections;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import opennlp.tools.util.XmlUtil;

/**
* A structure to hold the letsmt document. The document contains sentences and depending on the
* source it either contains tokenized text (words) or an un-tokenized sentence string.
Expand Down Expand Up @@ -118,18 +118,14 @@ public List<LetsmtSentence> getSentences() {
}

static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException {
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser saxParser = XmlUtil.createSaxParser();

try {
SAXParser saxParser = spf.newSAXParser();

XMLReader xmlReader = saxParser.getXMLReader();
LetsmtDocumentHandler docHandler = new LetsmtDocumentHandler();
xmlReader.setContentHandler(docHandler);
xmlReader.parse(new InputSource(letsmtXmlIn));
return new LetsmtDocument(docHandler.sentences);
} catch (ParserConfigurationException e) {
throw new IllegalStateException(e);
} catch (SAXException e) {
throw new IOException("Failed to parse letsmt xml!", e);
}
Expand Down
60 changes: 60 additions & 0 deletions opennlp-tools/src/main/java/opennlp/tools/util/XmlUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.util;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;

/**
 * Static factory methods for JAXP parsers which have
 * {@link XMLConstants#FEATURE_SECURE_PROCESSING} enabled.
 *
 * <p>The class only has static members and is not meant to be instantiated.
 */
public final class XmlUtil {

  private XmlUtil() {
    // static utility class, no instances
  }

  /**
   * Create a new DocumentBuilder which processes XML securely.
   *
   * <p>A fresh {@link DocumentBuilderFactory} is obtained on every call and
   * {@link XMLConstants#FEATURE_SECURE_PROCESSING} is switched on before the
   * builder is created.
   *
   * @return a DocumentBuilder
   *
   * @throws IllegalStateException if the factory rejects the secure processing
   *     feature or is unable to create a builder
   */
  public static DocumentBuilder createDocumentBuilder() {
    try {
      DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
      documentBuilderFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
      return documentBuilderFactory.newDocumentBuilder();
    } catch (ParserConfigurationException e) {
      throw new IllegalStateException("Failed to create a secure DocumentBuilder", e);
    }
  }

  /**
   * Create a new SAXParser which processes XML securely.
   *
   * <p>A fresh {@link SAXParserFactory} is obtained on every call and
   * {@link XMLConstants#FEATURE_SECURE_PROCESSING} is switched on before the
   * parser is created.
   *
   * @return a SAXParser
   *
   * @throws IllegalStateException if the factory rejects the secure processing
   *     feature or is unable to create a parser
   */
  public static SAXParser createSaxParser() {
    try {
      SAXParserFactory spf = SAXParserFactory.newInstance();
      spf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
      return spf.newSAXParser();
    } catch (ParserConfigurationException | SAXException e) {
      throw new IllegalStateException("Failed to create a secure SAXParser", e);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@
import java.util.Objects;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
Expand All @@ -45,6 +43,7 @@
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.postag.POSModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.XmlUtil;
import opennlp.tools.util.ext.ExtensionLoader;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.DictionarySerializer;
Expand Down Expand Up @@ -735,15 +734,8 @@ static AdaptiveFeatureGenerator createGenerator(Element generatorElement,

private static org.w3c.dom.Document createDOM(InputStream xmlDescriptorIn)
throws IOException {
DocumentBuilderFactory documentBuilderFacoty = DocumentBuilderFactory.newInstance();

DocumentBuilder documentBuilder;

try {
documentBuilder = documentBuilderFacoty.newDocumentBuilder();
} catch (ParserConfigurationException e) {
throw new IllegalStateException(e);
}
DocumentBuilder documentBuilder = XmlUtil.createDocumentBuilder();

org.w3c.dom.Document xmlDescriptorDOM;

Expand Down

0 comments on commit c2c14e9

Please sign in to comment.