diff --git a/Dockerfile b/Dockerfile index 4e01422..2a88f73 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,4 +28,4 @@ FROM openjdk:17-slim RUN mkdir /usr/local/jar -COPY --from=builder /usr/src/dogeared-extruder/target/extruder-1.1.jar /usr/local/jar/dogeared-extruder.jar \ No newline at end of file +COPY --from=builder /usr/src/dogeared-extruder/target/extruder-2.0.jar /usr/local/jar/dogeared-extruder.jar diff --git a/Makefile b/Makefile index a18aa13..6791dc7 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ build: todo mvn install run: - java -jar target/extruder-1.0.jar server + java -jar target/extruder-2.0.jar server todo: echo "# Generated automatically at" `date` > TODO.txt diff --git a/README.md b/README.md index e35865f..b11d540 100644 --- a/README.md +++ b/README.md @@ -5,16 +5,12 @@ This is a meant to be a simple HTTP Pony to wrap the `boilerpipe` and `Tika` and clones of the `readability` text extraction libraries using the `dropwizard` framework. -Important +Version "2" -- This package was not updated between May 2014 and February 2022. -There is a [v2 branch](https://github.com/aaronland/dogeared-extruder/tree/v2) for this package with up-to-date dependencies. -Unfortunately, some of those dependencies contain changes that need to be accounted for in this package's code. That -work is underway. Any help or suggestions would be appreciated. - -In the meantime, known security vulnerabilities for older dependencies have been addressed. +In February 2022 "version 2" was released. It introduces no new user-facing features, but the internal code has been updated, where necessary, to account for updated dependencies and to address known security vulnerabilities. Quick start -- @@ -24,7 +20,7 @@ To start the server: $> cd dogeared-extruder $> make build ... JAVA STUFF ... - $> java -jar target/extruder-1.1.jar server + $> java -jar target/extruder-2.0.jar server ... MOAR JAVA STUFF ... 
INFO [2013-08-30 12:49:12,184] org.eclipse.jetty.server.AbstractConnector: Started InstrumentedBlockingChannelConnector@0.0.0.0:8080 INFO [2013-08-30 12:49:12,189] org.eclipse.jetty.server.AbstractConnector: Started SocketConnector@0.0.0.0:8081 diff --git a/TODO.txt b/TODO.txt index b044809..81b95fe 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,11 +1,10 @@ -# Generated automatically at Tue Feb 15 22:45:38 PST 2022 +# Generated automatically at Fri Feb 18 17:32:35 PST 2022 -./src/main/java/info/aaronland/extruder/ExtruderService.java:30: // TODO: put me in the config file... (20130908/straup) ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) -./src/main/java/info/aaronland/extruder/ExtruderApplication.java~:30: // TODO: put me in the config file... (20130908/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) +./src/main/java/info/aaronland/extruder/ExtruderApplication.java:40: // TODO: put me in the config file... (20130908/straup) ./src/main/java/com/basistech/readability/Readability.java:93: // TODO: reset the results. ./src/main/java/com/basistech/readability/Readability.java:368: * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse ./src/main/java/com/basistech/readability/Readability.java:686: * at the same time without effecting the traversal. TODO: Consider taking into account original -./src/main/java/info/aaronland/extruder/TikaResource.java:140: // TO DO: figure out how to make this return HTML instead of text +./src/main/java/info/aaronland/extruder/TikaResource.java:139: // TO DO: figure out how to make this return HTML instead of text ./src/main/java/info/aaronland/extruder/Upload.java:20: // TO DO: sort out file extensions etc. 
diff --git a/configuration.yml b/configuration.yml index ecfd447..743e4ec 100644 --- a/configuration.yml +++ b/configuration.yml @@ -1,2 +1,8 @@ logging: level: INFO + loggers: + info.aaronland.extruder: DEBUG +viewRendererConfiguration: + freemarker: + strict_syntax: yes + whitespace_stripping: yes \ No newline at end of file diff --git a/pom.xml b/pom.xml index f9699cd..44bd755 100644 --- a/pom.xml +++ b/pom.xml @@ -8,38 +8,53 @@ info.aaronland.extruder extruder - 1.1 + 2.0 - com.yammer.dropwizard + io.dropwizard dropwizard-core - 0.6.2 + 2.0.28 - com.yammer.dropwizard + io.dropwizard dropwizard-views - 0.6.2 + 2.0.28 - com.yammer.metrics - metrics-core - 2.2.0 + io.dropwizard + dropwizard-forms + 2.0.28 - com.sun.jersey.contribs - jersey-multipart - - - 1.17.1 - + io.dropwizard + dropwizard-views-freemarker + 2.0.28 + + + + io.dropwizard.metrics + metrics-core + 4.2.8 + + + + + org.glassfish.jersey.media + jersey-media-multipart + 2.29.1 + + @@ -53,28 +68,41 @@ org.apache.tika tika-core - 1.22 + 2.3.0 org.apache.tika tika-parsers - 1.22 + 2.3.0 + pom + + org.apache.tika + tika-parsers-standard-package + 2.3.0 + + - + javax.xml.bind jaxb-api 2.3.1 + + javax.mail + javax.mail-api + 1.6.2 + + org.apache.commons commons-lang3 - 3.1 + 3.12.0 @@ -86,7 +114,7 @@ net.sourceforge.nekohtml nekohtml - 1.9.18 + 1.9.22 @@ -102,7 +130,7 @@ commons-io commons-io - 2.7 + 2.11.0 jar compile @@ -120,7 +148,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.0 + 3.10.0 1.7 1.7 @@ -132,7 +160,7 @@ org.apache.maven.plugins maven-shade-plugin - 1.6 + 3.2.4 package @@ -144,7 +172,7 @@ - info.aaronland.extruder.ExtruderService + info.aaronland.extruder.ExtruderApplication @@ -166,7 +194,7 @@ org.codehaus.mojo exec-maven-plugin - 1.2.1 + 3.0.0 - info.aaronland.extruder.ExtruderService + info.aaronland.extruder.ExtruderApplication diff --git a/src/main/java/com/basistech/readability/FilePageReader.java b/src/main/java/com/basistech/readability/FilePageReader.java index 51a665a..73b3230 100644 --- 
a/src/main/java/com/basistech/readability/FilePageReader.java +++ b/src/main/java/com/basistech/readability/FilePageReader.java @@ -22,7 +22,9 @@ import java.io.FileInputStream; import java.io.IOException; -import org.apache.tika.io.IOUtils; +import org.apache.commons.io.IOUtils ; + +// import org.apache.tika.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/main/java/info/aaronland/extruder/BoilerpipeResource.java b/src/main/java/info/aaronland/extruder/BoilerpipeResource.java index 2969870..d168321 100644 --- a/src/main/java/info/aaronland/extruder/BoilerpipeResource.java +++ b/src/main/java/info/aaronland/extruder/BoilerpipeResource.java @@ -7,9 +7,9 @@ import java.io.InputStream; import java.io.File; -import com.sun.jersey.core.header.FormDataContentDisposition; -import com.sun.jersey.multipart.FormDataMultiPart; -import com.sun.jersey.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +import org.glassfish.jersey.media.multipart.FormDataMultiPart; +import org.glassfish.jersey.media.multipart.FormDataBodyPart; import javax.ws.rs.core.MediaType; import javax.ws.rs.GET; diff --git a/src/main/java/info/aaronland/extruder/DocumentView.java b/src/main/java/info/aaronland/extruder/DocumentView.java index 604db12..6588393 100644 --- a/src/main/java/info/aaronland/extruder/DocumentView.java +++ b/src/main/java/info/aaronland/extruder/DocumentView.java @@ -1,11 +1,7 @@ package info.aaronland.extruder; import info.aaronland.extruder.Document; -import com.yammer.dropwizard.views.View; - -import com.google.common.base.Charsets; -import com.google.common.base.Optional; -import java.nio.charset.Charset; +import io.dropwizard.views.View; public class DocumentView extends View { private final Document document; @@ -19,13 +15,4 @@ public Document getDocument(){ return document; } - // Because in com/codahale/dropwizard/views/freemarker/FreemarkerViewRenderer.java this: - // final Charset 
charset = view.getCharset().or(Charset.forName(configuration.getEncoding(locale))); - // And since the default encoding for en-us is ISO-8859-1... good times - // (20130908/straup) - - public Optional getCharset(){ - return Optional.of(Charsets.UTF_8); - } - } diff --git a/src/main/java/info/aaronland/extruder/ExtruderApplication.java b/src/main/java/info/aaronland/extruder/ExtruderApplication.java new file mode 100644 index 0000000..a21e1a8 --- /dev/null +++ b/src/main/java/info/aaronland/extruder/ExtruderApplication.java @@ -0,0 +1,46 @@ +package info.aaronland.extruder; + +import io.dropwizard.Application; +import io.dropwizard.setup.Bootstrap; +import io.dropwizard.setup.Environment; +import io.dropwizard.views.ViewBundle; +import io.dropwizard.forms.MultiPartBundle; + +import info.aaronland.extruder.ExtruderConfiguration; + +import java.net.URL; +import java.util.Map; + +public class ExtruderApplication extends Application { + + public static void main(String[] args) throws Exception { + new ExtruderApplication().run(args); + } + + public void initialize(Bootstrap bootstrap) { + + bootstrap.addBundle(new MultiPartBundle()); + + bootstrap.addBundle(new ViewBundle(){ + + @Override + public Map> getViewConfiguration(ExtruderConfiguration config) { + return config.getViewRendererConfiguration(); + } + }); + } + + @Override + public void run(ExtruderConfiguration conf, Environment env) throws Exception { + + env.jersey().register(new BoilerpipeResource()); + env.jersey().register(new TikaResource()); + env.jersey().register(new JavaReadabilityResource()); + + // TODO: put me in the config file... 
(20130908/straup) + URL healthcheck_url = new URL("https://github.com/aaronland/dogeared-extruder/"); + + env.healthChecks().register("internets", new InternetsHealthCheck(healthcheck_url)); + } + +} diff --git a/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java b/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java index 3fa35fb..8b39822 100644 --- a/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java +++ b/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java @@ -1,9 +1,27 @@ package info.aaronland.extruder; -import com.yammer.dropwizard.config.Configuration; +import io.dropwizard.Configuration; import javax.validation.Valid; +import javax.validation.constraints.NotNull; + +import java.util.Collections; +import java.util.Map; + +import com.fasterxml.jackson.annotation.JsonProperty; public class ExtruderConfiguration extends Configuration { + @NotNull + private Map> viewRendererConfiguration = Collections.emptyMap(); + + @JsonProperty("viewRendererConfiguration") + public Map> getViewRendererConfiguration() { + return viewRendererConfiguration; + } + + @JsonProperty("viewRendererConfiguration") + public void setViewRendererConfiguration(Map> viewRendererConfiguration) { + this.viewRendererConfiguration = viewRendererConfiguration; + } } diff --git a/src/main/java/info/aaronland/extruder/ExtruderService.java b/src/main/java/info/aaronland/extruder/ExtruderService.java deleted file mode 100644 index b6cb9c0..0000000 --- a/src/main/java/info/aaronland/extruder/ExtruderService.java +++ /dev/null @@ -1,35 +0,0 @@ -package info.aaronland.extruder; - -import com.yammer.dropwizard.Service; -import com.yammer.dropwizard.config.Bootstrap; -import com.yammer.dropwizard.config.Environment; -import com.yammer.dropwizard.views.ViewBundle; - -import info.aaronland.extruder.ExtruderConfiguration; - -import java.net.URL; - -public class ExtruderService extends Service { - - public static void main(String[] args) throws Exception { 
- new ExtruderService().run(args); - } - - @Override - public void initialize(Bootstrap bootstrap) { - bootstrap.setName("extruder"); - bootstrap.addBundle(new ViewBundle()); - } - - @Override - public void run(ExtruderConfiguration conf, Environment env) throws Exception { - env.addResource(new BoilerpipeResource()); - env.addResource(new TikaResource()); - env.addResource(new JavaReadabilityResource()); - - // TODO: put me in the config file... (20130908/straup) - URL healthcheck_url = new URL("http://collection.cooperhewitt.org/objects/random/"); - env.addHealthCheck(new InternetsHealthCheck(healthcheck_url)); - } - -} diff --git a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java index 0e2d5cf..7e608b2 100644 --- a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java +++ b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java @@ -1,6 +1,6 @@ package info.aaronland.extruder; -import com.yammer.metrics.core.HealthCheck; +import com.codahale.metrics.health.HealthCheck; import java.net.URL; import java.net.HttpURLConnection; @@ -10,7 +10,7 @@ public class InternetsHealthCheck extends HealthCheck { private URL url; public InternetsHealthCheck(URL url) { - super("InternetsHealthCheck"); + super(); this.url = url; } diff --git a/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java b/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java index 5a2473e..fe24b11 100644 --- a/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java +++ b/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java @@ -15,9 +15,9 @@ import javax.ws.rs.core.Response; import javax.ws.rs.core.Response.Status; -import com.sun.jersey.core.header.FormDataContentDisposition; -import com.sun.jersey.multipart.FormDataMultiPart; -import com.sun.jersey.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +import 
org.glassfish.jersey.media.multipart.FormDataMultiPart; +import org.glassfish.jersey.media.multipart.FormDataBodyPart; import java.io.InputStream; import java.io.File; diff --git a/src/main/java/info/aaronland/extruder/TikaResource.java b/src/main/java/info/aaronland/extruder/TikaResource.java index 17d7298..ee9bc2d 100644 --- a/src/main/java/info/aaronland/extruder/TikaResource.java +++ b/src/main/java/info/aaronland/extruder/TikaResource.java @@ -12,9 +12,10 @@ import java.io.ByteArrayOutputStream; import java.io.ByteArrayInputStream; -import com.sun.jersey.core.header.FormDataContentDisposition; -import com.sun.jersey.multipart.FormDataMultiPart; -import com.sun.jersey.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +// import org.glassfish.jersey.media.multipart.FormDataMultiPart; +import org.glassfish.jersey.media.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataParam; import javax.ws.rs.GET; import javax.ws.rs.POST; @@ -35,6 +36,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -49,7 +51,7 @@ public class TikaResource { private static final Logger LOGGER = LoggerFactory.getLogger(TikaResource.class); - + @GET public Response extrudeThisUrl(@QueryParam("url") String uri){ @@ -87,14 +89,11 @@ public Response extrudeThisUrl(@QueryParam("url") String uri){ return Response.status(Response.Status.OK).entity(view).build(); } - + @POST @Consumes(MediaType.MULTIPART_FORM_DATA) - public Response extrudeThisFile(FormDataMultiPart formParams){ - - FormDataBodyPart stream = formParams.getField("file"); - InputStream upload = stream.getValueAs(InputStream.class); - + public Response extrudeThisFile(@FormDataParam("file") InputStream upload) { + // MOON LANGUAGE – if 
there's a better way to make it so that // Tika doesn't complain that the stream (upload) is already // closed I would love to hear about it... (20130831/straup) @@ -144,7 +143,7 @@ public Response extrudeThisFile(FormDataMultiPart formParams){ // (20130901/straup) private Document extrudeThis(InputStream buffer){ - + String text; String title; @@ -165,15 +164,15 @@ private Document extrudeThis(InputStream buffer){ text = handler.toString(); text = unwrapText(text); - // http://www.celinio.net/techblog/?p=1295 - title = metadata.get(Metadata.TITLE); + title = metadata.get(TikaCoreProperties.TITLE); if (title == null){ String type = "mystery"; try { + // Read the parsed Content-Type, not CONTENT_TYPE_HINT (an input hint for detection). String content_type = metadata.get(Metadata.CONTENT_TYPE); String[] parts = content_type.split("/"); type = parts[1]; } diff --git a/src/main/resources/info/aaronland/extruder/document.ftl b/src/main/resources/info/aaronland/extruder/document.ftl index 4894c00..f88758a 100644 --- a/src/main/resources/info/aaronland/extruder/document.ftl +++ b/src/main/resources/info/aaronland/extruder/document.ftl @@ -2,7 +2,7 @@ - ${document.getTitle()?html} + ${document.getTitle()}