Skip to content
Permalink
Browse files
Merge into master
  • Loading branch information
lewismc committed Feb 23, 2018
2 parents 778d05e + 63ffc9e commit 073190bd7cb948ce1faf5df7dae61eb8257416ce
Show file tree
Hide file tree
Showing 20 changed files with 293 additions and 89 deletions.
@@ -61,6 +61,7 @@ public ExtractorGroup filterByMIMEType(MIMEType mimeType) {
return new ExtractorGroup(matching);
}

/**
 * Returns the iterator of the underlying {@code factories} collection.
 *
 * @return an iterator over the {@link ExtractorFactory} entries.
 */
@Override
public Iterator<ExtractorFactory<?>> iterator() {
    final Iterator<ExtractorFactory<?>> factoryIterator = factories.iterator();
    return factoryIterator;
}
@@ -116,7 +116,7 @@ public synchronized boolean loadJAR(File jar) {
* @return list of exceptions raised during the loading.
*/
public synchronized Throwable[] loadJARs(File... jars) {
final List<Throwable> result = new ArrayList<Throwable>();
final List<Throwable> result = new ArrayList<>();
for (File jar : jars) {
try {
loadJAR(jar);
@@ -158,7 +158,7 @@ public synchronized boolean loadClassDir(File classDir) {
* @return list of exceptions raised during the loading.
*/
public synchronized Throwable[] loadClassDirs(File... classDirs) {
final List<Throwable> result = new ArrayList<Throwable>();
final List<Throwable> result = new ArrayList<>();
for (File classDir : classDirs) {
try {
loadClassDir(classDir);
@@ -178,14 +178,15 @@ public synchronized Throwable[] loadClassDirs(File... classDirs) {
* Loads all the JARs detected in a given directory.
*
* @param jarDir directory containing the JARs to be loaded.
* Example '/usr/local/apache-tomcat-7.0.72/webapps/apache-any23-service-2.2-SNAPSHOT/WEB-INF/lib/apache-any23-openie'
* @return <code>true</code> if all JARs in dir are loaded.
*/
public synchronized boolean loadJARDir(File jarDir) {
if(jarDir == null)
throw new NullPointerException("JAR dir must be not null.");
if( ! jarDir.exists() )
if(!jarDir.exists() )
throw new IllegalArgumentException("Given directory doesn't exist:" + jarDir.getAbsolutePath());
if(! jarDir.isDirectory() )
if(!jarDir.isDirectory() )
throw new IllegalArgumentException(
"given file exists and it is not a directory: " + jarDir.getAbsolutePath()
);
@@ -210,7 +211,7 @@ public boolean accept(File dir, String name) {
* @return list of errors that occurred during loading.
*/
public synchronized Throwable[] loadFiles(File... files) {
final List<Throwable> errors = new ArrayList<Throwable>();
final List<Throwable> errors = new ArrayList<>();
for(File file : files) {
try {
if (file.isFile() && file.getName().endsWith(".jar")) {
@@ -263,6 +264,7 @@ public synchronized Iterator<Tool> getTools() throws IOException {
* @return a non-<code>null</code> iterator over the plugin classes.
* @throws IOException if there is an error obtaining Extractors.
*/
@SuppressWarnings("rawtypes")
public synchronized Iterator<ExtractorFactory> getExtractors() throws IOException {
    // ExtractorFactory is used as a raw type here, hence the rawtypes suppression.
    final Iterator<ExtractorFactory> extractorPlugins = getPlugins(ExtractorFactory.class);
    return extractorPlugins;
}
@@ -312,7 +314,8 @@ public synchronized ExtractorGroup configureExtractors(

final StringBuilder report = new StringBuilder();
try {
final List<ExtractorFactory<?>> newFactoryList = new ArrayList<ExtractorFactory<?>>();
final List<ExtractorFactory<?>> newFactoryList = new ArrayList<>();
@SuppressWarnings("rawtypes")
Iterator<ExtractorFactory> extractors = getExtractors();
while (extractors.hasNext()) {
ExtractorFactory<?> factory = extractors.next();
@@ -386,7 +389,7 @@ public synchronized Iterator<Tool> getApplicableTools(File... pluginLocations) t
*/
private File[] getPluginLocations(String pluginDirsList) {
final String[] locationsStr = pluginDirsList.split(PLUGIN_DIRS_LIST_SEPARATOR);
final List<File> locations = new ArrayList<File>();
final List<File> locations = new ArrayList<>();
for(String locationStr : locationsStr) {
final File location = new File(locationStr);
if( ! location.exists()) {
@@ -404,16 +407,16 @@ private File[] getPluginLocations(String pluginDirsList) {
*/
private static final class DynamicClassLoader extends URLClassLoader {

private final Set<String> addedURLs = new HashSet<String>();
private final Set<String> addedURLs = new HashSet<>();

private final List<File> jars;

private final List<File> dirs;

public DynamicClassLoader(URL[] urls) {
super(urls, Any23PluginManager.class.getClassLoader());
jars = new ArrayList<File>();
dirs = new ArrayList<File>();
jars = new ArrayList<>();
dirs = new ArrayList<>();
}

public DynamicClassLoader() {
@@ -98,7 +98,8 @@ public class Any23 {
* @param extractorGroup the group of extractors to be applied.
*/
public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
if(configuration == null) throw new NullPointerException("configuration must be not null.");
if(configuration == null)
throw new NullPointerException("configuration must be not null.");
this.configuration = configuration;
logger.debug( configuration.getConfigurationDump() );

@@ -259,7 +260,8 @@ public void setMIMETypeDetector(MIMETypeDetector detector) {
* @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
*/
public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
if(documentIRI == null) throw new NullPointerException("documentIRI cannot be null.");
if(documentIRI == null)
throw new NullPointerException("documentIRI cannot be null.");
if (documentIRI.toLowerCase().startsWith("file:")) {
return new FileDocumentSource( new File(new URI(documentIRI)) );
}
@@ -453,7 +455,7 @@ public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, Tri
}

private String getAcceptHeader() {
Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
Collection<MIMEType> mimeTypes = new ArrayList<>();
for (ExtractorFactory<?> factory : factories) {
mimeTypes.addAll(factory.getSupportedMIMETypes());
}
@@ -30,20 +30,21 @@
* Singleton class acting as a registry for all the various
* {@link Extractor}s.
*/
@SuppressWarnings("rawtypes")
public class ExtractorRegistryImpl extends org.eclipse.rdf4j.common.lang.service.ServiceRegistry<String, ExtractorFactory> implements ExtractorRegistry {

/**
* The instance.
*/
private static ExtractorRegistry instance = null;

/**
* Public constructor for ExtractorRegistryImpl. Passes
* <code>ExtractorFactory.class</code> to the ServiceRegistry superclass
* constructor. Callers should normally call getInstance instead of
* constructing a new registry directly.
*/
public ExtractorRegistryImpl() {
super(ExtractorFactory.class);
}

/**
* The instance.
*/
private static ExtractorRegistry instance = null;

/**
* @return the {@link ExtractorRegistry} instance.
*/

This file was deleted.

@@ -46,16 +46,16 @@
*/
public class HTMLScraperExtractor implements Extractor.ContentExtractor {

public final static IRI PAGE_CONTENT_DE_PROPERTY =
public static final IRI PAGE_CONTENT_DE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/de");
public final static IRI PAGE_CONTENT_AE_PROPERTY =
public static final IRI PAGE_CONTENT_AE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ae");
public final static IRI PAGE_CONTENT_LCE_PROPERTY =
public static final IRI PAGE_CONTENT_LCE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/lce");
public final static IRI PAGE_CONTENT_CE_PROPERTY =
public static final IRI PAGE_CONTENT_CE_PROPERTY =
SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ce");

private final List<ExtractionRule> extractionRules = new ArrayList<ExtractionRule>();
private final List<ExtractionRule> extractionRules = new ArrayList<>();

public HTMLScraperExtractor() {
loadDefaultRules();
@@ -66,7 +66,7 @@ public void addTextExtractor(String name, IRI property, BoilerpipeExtractor extr
}

public String[] getTextExtractors() {
final List<String> extractors = new ArrayList<String>();
final List<String> extractors = new ArrayList<>();
for(ExtractionRule er : extractionRules) {
extractors.add(er.name);
}
@@ -57,6 +57,11 @@
<artifactId>apache-any23-basic-crawler</artifactId>
<version>1.0.6-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.any23.plugins</groupId>
<artifactId>apache-any23-openie</artifactId>
<version>${project.parent.version}</version>
</dependency>

<!-- BEGIN: Test Dependencies -->
<dependency>
@@ -56,6 +56,9 @@ public class PluginIT {
private static final File CRAWLER_TARGET_DIR = new File(PLUGIN_DIR + "basic-crawler/target/classes");
private static final File CRAWLER_DEPENDENCY_DIR = new File(PLUGIN_DIR + "basic-crawler/target/dependency");

private static final File OPENIE_TARGET_DIR = new File(PLUGIN_DIR + "openie/target/classes");
private static final File OPENIE_DEPENDENCY_DIR = new File(PLUGIN_DIR + "openie/target/dependency");

private Any23PluginManager manager;

@Before
@@ -79,13 +82,15 @@ public void after() {
public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException {
final ExtractorGroup extractorGroup = manager.getApplicableExtractors(
new ExtractorRegistryImpl(),
HTML_SCRAPER_TARGET_DIR, // Required to satisfy class dependencies.
HTML_SCRAPER_TARGET_DIR,
HTML_SCRAPER_DEPENDENCY_DIR,
OFFICE_SCRAPER_TARGET_DIR,
OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies.
OFFICE_SCRAPER_DEPENDENCY_DIR,
OPENIE_TARGET_DIR,
OPENIE_DEPENDENCY_DIR
);
try {
Class.forName("org.apache.any23.extractor.openie.OpenIEExtractor", false, this.getClass().getClassLoader());
Class.forName("org.apache.any23.plugin.extractor.openie.OpenIEExtractor", false, this.getClass().getClassLoader());
assertEquals("Did not find the number of expected extractors", NUM_OF_EXTRACTORS_INCL_OPENIE ,
extractorGroup.getNumOfExtractors()
);
@@ -15,16 +15,22 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<artifactId>apache-any23</artifactId>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23</artifactId>
<version>2.2-SNAPSHOT</version>
<relativePath>../</relativePath>
<relativePath>../../pom.xml</relativePath>
</parent>

<groupId>org.apache.any23.plugins</groupId>
<artifactId>apache-any23-openie</artifactId>

<name>Apache Any23 :: Plugins :: OpenIE</name>
<description>Open Information Extraction module.</description>

<repositories>
<repository>
<snapshots>
@@ -46,19 +52,15 @@
</pluginRepository>
</pluginRepositories>

<artifactId>apache-any23-openie</artifactId>

<name>Apache Any23 :: OpenIE</name>
<description>Open Information Extraction module.</description>

<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-core</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<groupId>org.apache.any23</groupId>
<artifactId>apache-any23-test-resources</artifactId>
<version>${project.version}</version>
<scope>test</scope>
@@ -67,20 +69,20 @@
<dependency>
<groupId>org.allenai.openie</groupId>
<artifactId>openie_2.11</artifactId>
<version>4.2.6</version>
<version>${openie_2.11.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.allenai.openie</groupId>
<artifactId>openie_2.11</artifactId>
<version>4.2.6</version>
<version>${openie_2.11.version}</version>
<scope>compile</scope>
<type>pom</type>
</dependency>
<dependency>
<groupId>edu.washington.cs.knowitall</groupId>
<artifactId>openregex</artifactId>
<version>1.1.1</version>
<version>${openregex.version}</version>
<scope>runtime</scope>
</dependency>
<dependency>
@@ -104,6 +106,17 @@
<skipTests>true</skipTests>
</configuration>
</plugin>
<!-- Generates the distribution package -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptors>
<descriptor>${basedir}/src/main/assembly/bin.xml</descriptor>
</descriptors>
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.openie;
package org.apache.any23.plugin.extractor.openie;

import java.io.IOException;
import java.util.List;
@@ -27,6 +27,9 @@
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.plugin.Author;
import org.apache.any23.plugin.ExtractorPlugin;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.util.StreamUtils;
import org.apache.tika.Tika;
@@ -59,12 +62,11 @@
* extractor able to generate <i>RDF</i> statements from
* sentences representing relations in the text.
*/
public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
@Author(name="Lewis John McGibbney (lewismc@apache.org)")
public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor, ExtractorPlugin {

private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);

private IRI documentRoot;

/**
* Default constructor.
*/
@@ -86,7 +88,7 @@ public void run(ExtractionParameters extractionParameters,
throws IOException, ExtractionException {

IRI documentIRI = context.getDocumentIRI();
documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
RDFUtils.iri(documentIRI.toString() + "root");
out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
LOG.debug("Processing: {}", documentIRI.toString());
@@ -127,4 +129,9 @@ public void run(ExtractionParameters extractionParameters,
}
}
}

/**
 * Returns the value of {@code OpenIEExtractorFactory.getDescriptionInstance()}
 * cast to an {@link ExtractorFactory}.
 *
 * @return the extractor factory exposed by this plugin.
 */
@Override
public ExtractorFactory<?> getExtractorFactory() {
    // NOTE(review): the cast assumes getDescriptionInstance() yields an
    // ExtractorFactory instance — confirm against OpenIEExtractorFactory.
    final ExtractorFactory<?> factory =
            (ExtractorFactory<?>) OpenIEExtractorFactory.getDescriptionInstance();
    return factory;
}
}
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.openie;
package org.apache.any23.plugin.extractor.openie;

import java.util.Arrays;

@@ -28,7 +28,7 @@
*
*/
public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
implements ExtractorFactory<OpenIEExtractor> {
implements ExtractorFactory<OpenIEExtractor> {

public static final String NAME = "openie";

@@ -0,0 +1 @@
org.apache.any23.plugin.extractor.openie.OpenIEExtractorFactory

0 comments on commit 073190b

Please sign in to comment.