diff --git a/Dockerfile b/Dockerfile
index 4e01422..2a88f73 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,4 +28,4 @@ FROM openjdk:17-slim
RUN mkdir /usr/local/jar
-COPY --from=builder /usr/src/dogeared-extruder/target/extruder-1.1.jar /usr/local/jar/dogeared-extruder.jar
\ No newline at end of file
+COPY --from=builder /usr/src/dogeared-extruder/target/extruder-2.0.jar /usr/local/jar/dogeared-extruder.jar
diff --git a/Makefile b/Makefile
index a18aa13..6791dc7 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ build: todo
mvn install
run:
- java -jar target/extruder-1.0.jar server
+ java -jar target/extruder-2.0.jar server
todo:
echo "# Generated automatically at" `date` > TODO.txt
diff --git a/README.md b/README.md
index e35865f..b11d540 100644
--- a/README.md
+++ b/README.md
@@ -5,16 +5,12 @@ This is a meant to be a simple HTTP Pony to wrap the `boilerpipe` and `Tika` and
clones of the `readability` text extraction libraries using the `dropwizard`
framework.
-Important
+Version "2"
--
This package was not updated between May 2014 and February 2022.
-There is a [v2 branch](https://github.com/aaronland/dogeared-extruder/tree/v2) for this package with up-to-date dependencies.
-Unfortunately, some of those dependencies contain changes that need to be accounted for in this package's code. That
-work is underway. Any help or suggestions would be appreciated.
-
-In the meantime, known security vulnerabilities for older dependencies have been addressed.
+In February 2022, "version 2" was released. It introduces no new user-facing features but updates the internal code, where necessary, to account for updated dependencies and to address known security vulnerabilities.
Quick start
--
@@ -24,7 +20,7 @@ To start the server:
$> cd dogeared-extruder
$> make build
... JAVA STUFF ...
- $> java -jar target/extruder-1.1.jar server
+ $> java -jar target/extruder-2.0.jar server
... MOAR JAVA STUFF ...
INFO [2013-08-30 12:49:12,184] org.eclipse.jetty.server.AbstractConnector: Started InstrumentedBlockingChannelConnector@0.0.0.0:8080
INFO [2013-08-30 12:49:12,189] org.eclipse.jetty.server.AbstractConnector: Started SocketConnector@0.0.0.0:8081
diff --git a/TODO.txt b/TODO.txt
index b044809..81b95fe 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,11 +1,10 @@
-# Generated automatically at Tue Feb 15 22:45:38 PST 2022
+# Generated automatically at Fri Feb 18 17:32:35 PST 2022
-./src/main/java/info/aaronland/extruder/ExtruderService.java:30: // TODO: put me in the config file... (20130908/straup)
./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup)
-./src/main/java/info/aaronland/extruder/ExtruderApplication.java~:30: // TODO: put me in the config file... (20130908/straup)
./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup)
+./src/main/java/info/aaronland/extruder/ExtruderApplication.java:40: // TODO: put me in the config file... (20130908/straup)
./src/main/java/com/basistech/readability/Readability.java:93: // TODO: reset the results.
./src/main/java/com/basistech/readability/Readability.java:368: * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse
./src/main/java/com/basistech/readability/Readability.java:686: * at the same time without effecting the traversal. TODO: Consider taking into account original
-./src/main/java/info/aaronland/extruder/TikaResource.java:140: // TO DO: figure out how to make this return HTML instead of text
+./src/main/java/info/aaronland/extruder/TikaResource.java:139: // TO DO: figure out how to make this return HTML instead of text
./src/main/java/info/aaronland/extruder/Upload.java:20: // TO DO: sort out file extensions etc.
diff --git a/configuration.yml b/configuration.yml
index ecfd447..743e4ec 100644
--- a/configuration.yml
+++ b/configuration.yml
@@ -1,2 +1,8 @@
logging:
level: INFO
+ loggers:
+ info.aaronland.extruder: DEBUG
+viewRendererConfiguration:
+ freemarker:
+ strict_syntax: yes
+ whitespace_stripping: yes
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index f9699cd..44bd755 100644
--- a/pom.xml
+++ b/pom.xml
@@ -8,38 +8,53 @@
info.aaronland.extruder
extruder
- 1.1
+ 2.0
- com.yammer.dropwizard
+ io.dropwizard
dropwizard-core
- 0.6.2
+ 2.0.28
- com.yammer.dropwizard
+ io.dropwizard
dropwizard-views
- 0.6.2
+ 2.0.28
- com.yammer.metrics
- metrics-core
- 2.2.0
+ io.dropwizard
+ dropwizard-forms
+ 2.0.28
- com.sun.jersey.contribs
- jersey-multipart
-
-
- 1.17.1
-
+ io.dropwizard
+ dropwizard-views-freemarker
+ 2.0.28
+
+
+
+ io.dropwizard.metrics
+ metrics-core
+ 4.2.8
+
+
+
+
+ org.glassfish.jersey.media
+ jersey-media-multipart
+ 2.29.1
+
+
@@ -53,28 +68,41 @@
org.apache.tika
tika-core
- 1.22
+ 2.3.0
org.apache.tika
tika-parsers
- 1.22
+ 2.3.0
+ pom
+
+ org.apache.tika
+ tika-parsers-standard-package
+ 2.3.0
+
+
-
+
javax.xml.bind
jaxb-api
2.3.1
+
+ javax.mail
+ javax.mail-api
+ 1.6.2
+
+
org.apache.commons
commons-lang3
- 3.1
+ 3.12.0
@@ -86,7 +114,7 @@
net.sourceforge.nekohtml
nekohtml
- 1.9.18
+ 1.9.22
@@ -102,7 +130,7 @@
commons-io
commons-io
- 2.7
+ 2.11.0
jar
compile
@@ -120,7 +148,7 @@
org.apache.maven.plugins
maven-compiler-plugin
- 3.0
+ 3.10.0
- 1.7
- 1.7
+ 1.8
+ 1.8
@@ -132,7 +160,7 @@
org.apache.maven.plugins
maven-shade-plugin
- 1.6
+ 3.2.4
package
@@ -144,7 +172,7 @@
- info.aaronland.extruder.ExtruderService
+ info.aaronland.extruder.ExtruderApplication
@@ -166,7 +194,7 @@
org.codehaus.mojo
exec-maven-plugin
- 1.2.1
+ 3.0.0
- info.aaronland.extruder.ExtruderService
+ info.aaronland.extruder.ExtruderApplication
diff --git a/src/main/java/com/basistech/readability/FilePageReader.java b/src/main/java/com/basistech/readability/FilePageReader.java
index 51a665a..73b3230 100644
--- a/src/main/java/com/basistech/readability/FilePageReader.java
+++ b/src/main/java/com/basistech/readability/FilePageReader.java
@@ -22,7 +22,9 @@
import java.io.FileInputStream;
import java.io.IOException;
-import org.apache.tika.io.IOUtils;
+import org.apache.commons.io.IOUtils ;
+
+// import org.apache.tika.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/src/main/java/info/aaronland/extruder/BoilerpipeResource.java b/src/main/java/info/aaronland/extruder/BoilerpipeResource.java
index 2969870..d168321 100644
--- a/src/main/java/info/aaronland/extruder/BoilerpipeResource.java
+++ b/src/main/java/info/aaronland/extruder/BoilerpipeResource.java
@@ -7,9 +7,9 @@
import java.io.InputStream;
import java.io.File;
-import com.sun.jersey.core.header.FormDataContentDisposition;
-import com.sun.jersey.multipart.FormDataMultiPart;
-import com.sun.jersey.multipart.FormDataBodyPart;
+import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
+import org.glassfish.jersey.media.multipart.FormDataMultiPart;
+import org.glassfish.jersey.media.multipart.FormDataBodyPart;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.GET;
diff --git a/src/main/java/info/aaronland/extruder/DocumentView.java b/src/main/java/info/aaronland/extruder/DocumentView.java
index 604db12..6588393 100644
--- a/src/main/java/info/aaronland/extruder/DocumentView.java
+++ b/src/main/java/info/aaronland/extruder/DocumentView.java
@@ -1,11 +1,7 @@
package info.aaronland.extruder;
import info.aaronland.extruder.Document;
-import com.yammer.dropwizard.views.View;
-
-import com.google.common.base.Charsets;
-import com.google.common.base.Optional;
-import java.nio.charset.Charset;
+import io.dropwizard.views.View;
public class DocumentView extends View {
private final Document document;
@@ -19,13 +15,4 @@ public Document getDocument(){
return document;
}
- // Because in com/codahale/dropwizard/views/freemarker/FreemarkerViewRenderer.java this:
- // final Charset charset = view.getCharset().or(Charset.forName(configuration.getEncoding(locale)));
- // And since the default encoding for en-us is ISO-8859-1... good times
- // (20130908/straup)
-
- public Optional getCharset(){
- return Optional.of(Charsets.UTF_8);
- }
-
}
diff --git a/src/main/java/info/aaronland/extruder/ExtruderApplication.java b/src/main/java/info/aaronland/extruder/ExtruderApplication.java
new file mode 100644
index 0000000..a21e1a8
--- /dev/null
+++ b/src/main/java/info/aaronland/extruder/ExtruderApplication.java
@@ -0,0 +1,73 @@
+package info.aaronland.extruder;
+
+import io.dropwizard.Application;
+import io.dropwizard.setup.Bootstrap;
+import io.dropwizard.setup.Environment;
+import io.dropwizard.views.ViewBundle;
+import io.dropwizard.forms.MultiPartBundle;
+
+import info.aaronland.extruder.ExtruderConfiguration;
+
+import java.net.URL;
+import java.util.Map;
+
+/**
+ * Dropwizard entry point for the extruder HTTP service.
+ *
+ * Adds the multipart and view bundles, then registers the three extraction
+ * resources and a health check built from a fixed URL.
+ */
+public class ExtruderApplication extends Application<ExtruderConfiguration> {
+
+    /**
+     * Starts the application by passing the command-line arguments to run().
+     *
+     * @param args command-line arguments (e.g. "server")
+     * @throws Exception if the application fails to start
+     */
+    public static void main(String[] args) throws Exception {
+        new ExtruderApplication().run(args);
+    }
+
+    /**
+     * Adds the bundles this application depends on.
+     *
+     * @param bootstrap the bootstrap the bundles are added to
+     */
+    @Override
+    public void initialize(Bootstrap<ExtruderConfiguration> bootstrap) {
+
+        bootstrap.addBundle(new MultiPartBundle());
+
+        // The view bundle takes its renderer settings from the
+        // viewRendererConfiguration section of the application configuration.
+        bootstrap.addBundle(new ViewBundle<ExtruderConfiguration>(){
+
+            @Override
+            public Map<String, Map<String, String>> getViewConfiguration(ExtruderConfiguration config) {
+                return config.getViewRendererConfiguration();
+            }
+        });
+    }
+
+    /**
+     * Registers the HTTP resources and the "internets" health check.
+     *
+     * @param conf the parsed application configuration (not read here)
+     * @param env the environment the resources and health check are registered with
+     * @throws Exception if the hard-coded health check URL cannot be parsed
+     */
+    @Override
+    public void run(ExtruderConfiguration conf, Environment env) throws Exception {
+
+        env.jersey().register(new BoilerpipeResource());
+        env.jersey().register(new TikaResource());
+        env.jersey().register(new JavaReadabilityResource());
+
+        // TODO: put me in the config file... (20130908/straup)
+        URL healthcheck_url = new URL("https://github.com/aaronland/dogeared-extruder/");
+
+        env.healthChecks().register("internets", new InternetsHealthCheck(healthcheck_url));
+    }
+
+}
diff --git a/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java b/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java
index 3fa35fb..8b39822 100644
--- a/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java
+++ b/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java
@@ -1,9 +1,45 @@
package info.aaronland.extruder;
-import com.yammer.dropwizard.config.Configuration;
+import io.dropwizard.Configuration;
import javax.validation.Valid;
+import javax.validation.constraints.NotNull;
+
+import java.util.Collections;
+import java.util.Map;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+/**
+ * Application configuration for the extruder service.
+ *
+ * Extends the base Dropwizard configuration with a
+ * viewRendererConfiguration section.
+ */
public class ExtruderConfiguration extends Configuration {
+    // View renderer settings as a map of maps; defaults to an empty map
+    // rather than null.
+    @NotNull
+    private Map<String, Map<String, String>> viewRendererConfiguration = Collections.emptyMap();
+
+    /**
+     * Returns the view renderer settings.
+     *
+     * @return the map bound to the "viewRendererConfiguration" property
+     */
+    @JsonProperty("viewRendererConfiguration")
+    public Map<String, Map<String, String>> getViewRendererConfiguration() {
+        return viewRendererConfiguration;
+    }
+
+    /**
+     * Replaces the view renderer settings.
+     *
+     * @param viewRendererConfiguration the new settings map
+     */
+    @JsonProperty("viewRendererConfiguration")
+    public void setViewRendererConfiguration(Map<String, Map<String, String>> viewRendererConfiguration) {
+        this.viewRendererConfiguration = viewRendererConfiguration;
+    }
}
diff --git a/src/main/java/info/aaronland/extruder/ExtruderService.java b/src/main/java/info/aaronland/extruder/ExtruderService.java
deleted file mode 100644
index b6cb9c0..0000000
--- a/src/main/java/info/aaronland/extruder/ExtruderService.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package info.aaronland.extruder;
-
-import com.yammer.dropwizard.Service;
-import com.yammer.dropwizard.config.Bootstrap;
-import com.yammer.dropwizard.config.Environment;
-import com.yammer.dropwizard.views.ViewBundle;
-
-import info.aaronland.extruder.ExtruderConfiguration;
-
-import java.net.URL;
-
-public class ExtruderService extends Service {
-
- public static void main(String[] args) throws Exception {
- new ExtruderService().run(args);
- }
-
- @Override
- public void initialize(Bootstrap bootstrap) {
- bootstrap.setName("extruder");
- bootstrap.addBundle(new ViewBundle());
- }
-
- @Override
- public void run(ExtruderConfiguration conf, Environment env) throws Exception {
- env.addResource(new BoilerpipeResource());
- env.addResource(new TikaResource());
- env.addResource(new JavaReadabilityResource());
-
- // TODO: put me in the config file... (20130908/straup)
- URL healthcheck_url = new URL("http://collection.cooperhewitt.org/objects/random/");
- env.addHealthCheck(new InternetsHealthCheck(healthcheck_url));
- }
-
-}
diff --git a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java
index 0e2d5cf..7e608b2 100644
--- a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java
+++ b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java
@@ -1,6 +1,6 @@
package info.aaronland.extruder;
-import com.yammer.metrics.core.HealthCheck;
+import com.codahale.metrics.health.HealthCheck;
import java.net.URL;
import java.net.HttpURLConnection;
@@ -10,7 +10,7 @@ public class InternetsHealthCheck extends HealthCheck {
private URL url;
public InternetsHealthCheck(URL url) {
- super("InternetsHealthCheck");
+ super();
this.url = url;
}
diff --git a/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java b/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java
index 5a2473e..fe24b11 100644
--- a/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java
+++ b/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java
@@ -15,9 +15,9 @@
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
-import com.sun.jersey.core.header.FormDataContentDisposition;
-import com.sun.jersey.multipart.FormDataMultiPart;
-import com.sun.jersey.multipart.FormDataBodyPart;
+import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
+import org.glassfish.jersey.media.multipart.FormDataMultiPart;
+import org.glassfish.jersey.media.multipart.FormDataBodyPart;
import java.io.InputStream;
import java.io.File;
diff --git a/src/main/java/info/aaronland/extruder/TikaResource.java b/src/main/java/info/aaronland/extruder/TikaResource.java
index 17d7298..ee9bc2d 100644
--- a/src/main/java/info/aaronland/extruder/TikaResource.java
+++ b/src/main/java/info/aaronland/extruder/TikaResource.java
@@ -12,9 +12,10 @@
import java.io.ByteArrayOutputStream;
import java.io.ByteArrayInputStream;
-import com.sun.jersey.core.header.FormDataContentDisposition;
-import com.sun.jersey.multipart.FormDataMultiPart;
-import com.sun.jersey.multipart.FormDataBodyPart;
+import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
+// import org.glassfish.jersey.media.multipart.FormDataMultiPart;
+import org.glassfish.jersey.media.multipart.FormDataBodyPart;
+import org.glassfish.jersey.media.multipart.FormDataParam;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
@@ -35,6 +36,7 @@
import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -49,7 +51,7 @@
public class TikaResource {
private static final Logger LOGGER = LoggerFactory.getLogger(TikaResource.class);
-
+
@GET
public Response extrudeThisUrl(@QueryParam("url") String uri){
@@ -87,14 +89,11 @@ public Response extrudeThisUrl(@QueryParam("url") String uri){
return Response.status(Response.Status.OK).entity(view).build();
}
-
+
@POST
@Consumes(MediaType.MULTIPART_FORM_DATA)
- public Response extrudeThisFile(FormDataMultiPart formParams){
-
- FormDataBodyPart stream = formParams.getField("file");
- InputStream upload = stream.getValueAs(InputStream.class);
-
+ public Response extrudeThisFile(@FormDataParam("file") InputStream upload) {
+
// MOON LANGUAGE – if there's a better way to make it so that
// Tika doesn't complain that the stream (upload) is already
// closed I would love to hear about it... (20130831/straup)
@@ -144,7 +143,7 @@ public Response extrudeThisFile(FormDataMultiPart formParams){
// (20130901/straup)
private Document extrudeThis(InputStream buffer){
-
+
String text;
String title;
@@ -165,15 +164,14 @@ private Document extrudeThis(InputStream buffer){
text = handler.toString();
text = unwrapText(text);
- // http://www.celinio.net/techblog/?p=1295
- title = metadata.get(Metadata.TITLE);
+ title = metadata.get(TikaCoreProperties.TITLE);
if (title == null){
String type = "mystery";
try {
- String content_type = metadata.get(Metadata.CONTENT_TYPE);
+ String content_type = metadata.get(Metadata.CONTENT_TYPE); // the detected type is stored under Content-Type, not the Content-Type hint property
String[] parts = content_type.split("/");
type = parts[1];
}
diff --git a/src/main/resources/info/aaronland/extruder/document.ftl b/src/main/resources/info/aaronland/extruder/document.ftl
index 4894c00..f88758a 100644
--- a/src/main/resources/info/aaronland/extruder/document.ftl
+++ b/src/main/resources/info/aaronland/extruder/document.ftl
@@ -2,7 +2,7 @@
- ${document.getTitle()?html}
+ ${document.getTitle()}