From e704c32848a78caf417561e086287e70e4e33599 Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Tue, 15 Feb 2022 17:44:01 -0800 Subject: [PATCH 01/15] snapshot: start making it work with all the package updates --- Dockerfile | 34 +++++++++++++++ TODO.txt | 6 +-- pom.xml | 41 ++++++++++--------- .../info/aaronland/extruder/DocumentView.java | 2 +- ...rService.java => ExtruderApplication.java} | 12 +++--- .../extruder/ExtruderConfiguration.java | 2 +- .../extruder/InternetsHealthCheck.java | 3 +- 7 files changed, 68 insertions(+), 32 deletions(-) create mode 100644 Dockerfile rename src/main/java/info/aaronland/extruder/{ExtruderService.java => ExtruderApplication.java} (73%) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2084b23 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,34 @@ +# Copied from: https://github.com/carlossg/docker-maven/blob/master/openjdk-17-slim/mvn-entrypoint.sh + +FROM openjdk:17-jdk-slim AS builder + +RUN apt-get update \ + && apt-get install -y curl procps \ + && rm -rf /var/lib/apt/lists/* + +ARG MAVEN_VERSION=3.8.4 +ARG USER_HOME_DIR="/root" +ARG SHA=a9b2d825eacf2e771ed5d6b0e01398589ac1bfa4171f36154d1b5787879605507802f699da6f7cfc80732a5282fd31b28e4cd6052338cbef0fa1358b48a5e3c8 +ARG BASE_URL=https://apache.osuosl.org/maven/maven-3/${MAVEN_VERSION}/binaries + +RUN mkdir -p /usr/share/maven /usr/share/maven/ref \ + && curl -fsSL -o /tmp/apache-maven.tar.gz ${BASE_URL}/apache-maven-${MAVEN_VERSION}-bin.tar.gz \ + && echo "${SHA} /tmp/apache-maven.tar.gz" | sha512sum -c - \ + && tar -xzf /tmp/apache-maven.tar.gz -C /usr/share/maven --strip-components=1 \ + && rm -f /tmp/apache-maven.tar.gz \ + && ln -s /usr/share/maven/bin/mvn /usr/bin/mvn + +ENV MAVEN_HOME /usr/share/maven +ENV MAVEN_CONFIG "$USER_HOME_DIR/.m2" + +# COPY mvn-entrypoint.sh /usr/local/bin/mvn-entrypoint.sh +# COPY settings-docker.xml /usr/share/maven/ref/ + +COPY . 
/usr/src/dogeared-extruder + +WORKDIR /usr/src/dogeared-extruder + +RUN mvn clean && mvn install + +# FROM openjdk:17-slim + diff --git a/TODO.txt b/TODO.txt index b769efa..3b1917f 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,10 +1,10 @@ -# Generated automatically at Tue Feb 25 13:58:20 CET 2014 +# Generated automatically at Tue Feb 15 17:01:30 PST 2022 ./src/main/java/info/aaronland/extruder/ExtruderService.java:30: // TODO: put me in the config file... (20130908/straup) -./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) +./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/com/basistech/readability/Readability.java:93: // TODO: reset the results. ./src/main/java/com/basistech/readability/Readability.java:368: * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse ./src/main/java/com/basistech/readability/Readability.java:686: * at the same time without effecting the traversal. TODO: Consider taking into account original -./src/main/java/info/aaronland/extruder/Upload.java:20: // TO DO: sort out file extensions etc. ./src/main/java/info/aaronland/extruder/TikaResource.java:140: // TO DO: figure out how to make this return HTML instead of text +./src/main/java/info/aaronland/extruder/Upload.java:20: // TO DO: sort out file extensions etc. 
diff --git a/pom.xml b/pom.xml index 823e58f..e578ec2 100644 --- a/pom.xml +++ b/pom.xml @@ -8,28 +8,28 @@ info.aaronland.extruder extruder - 1.0 + 2.0 - com.yammer.dropwizard + io.dropwizard dropwizard-core - 0.6.2 + 2.0.28 - com.yammer.dropwizard + io.dropwizard dropwizard-views - 0.6.2 + 2.0.28 - com.yammer.metrics + io.dropwizard.metrics metrics-core - 2.2.0 + 4.2.8 @@ -37,7 +37,7 @@ jersey-multipart - 1.17.1 + 1.19.4 @@ -53,13 +53,14 @@ org.apache.tika tika-core - 1.4 + 2.3.0 org.apache.tika tika-parsers - 1.4 + 2.3.0 + pom @@ -67,7 +68,7 @@ org.apache.commons commons-lang3 - 3.1 + 3.12.0 @@ -79,7 +80,7 @@ net.sourceforge.nekohtml nekohtml - 1.9.18 + 1.9.22 @@ -87,7 +88,7 @@ org.jsoup jsoup - 1.4.1 + 1.14.3 jar compile @@ -95,7 +96,7 @@ commons-io commons-io - 2.0.1 + 2.11.0 jar compile @@ -103,7 +104,7 @@ org.apache.httpcomponents httpclient - 4.0.3 + 4.5.13 @@ -113,10 +114,10 @@ org.apache.maven.plugins maven-compiler-plugin - 3.0 + 3.10.0 - 1.6 - 1.6 + 1.7 + 1.7 @@ -125,7 +126,7 @@ org.apache.maven.plugins maven-shade-plugin - 1.6 + 3.2.4 package @@ -159,7 +160,7 @@ org.codehaus.mojo exec-maven-plugin - 1.2.1 + 3.0.0 info.aaronland.extruder.ExtruderService diff --git a/src/main/java/info/aaronland/extruder/DocumentView.java b/src/main/java/info/aaronland/extruder/DocumentView.java index 604db12..e342fcc 100644 --- a/src/main/java/info/aaronland/extruder/DocumentView.java +++ b/src/main/java/info/aaronland/extruder/DocumentView.java @@ -1,7 +1,7 @@ package info.aaronland.extruder; import info.aaronland.extruder.Document; -import com.yammer.dropwizard.views.View; +import io.dropwizard.views.View; import com.google.common.base.Charsets; import com.google.common.base.Optional; diff --git a/src/main/java/info/aaronland/extruder/ExtruderService.java b/src/main/java/info/aaronland/extruder/ExtruderApplication.java similarity index 73% rename from src/main/java/info/aaronland/extruder/ExtruderService.java rename to 
src/main/java/info/aaronland/extruder/ExtruderApplication.java index b6cb9c0..0b9099e 100644 --- a/src/main/java/info/aaronland/extruder/ExtruderService.java +++ b/src/main/java/info/aaronland/extruder/ExtruderApplication.java @@ -1,18 +1,18 @@ package info.aaronland.extruder; -import com.yammer.dropwizard.Service; -import com.yammer.dropwizard.config.Bootstrap; -import com.yammer.dropwizard.config.Environment; -import com.yammer.dropwizard.views.ViewBundle; +import io.dropwizard.Application; +import io.dropwizard.setup.Bootstrap; +import io.dropwizard.setup.Environment; +import io.dropwizard.views.ViewBundle; import info.aaronland.extruder.ExtruderConfiguration; import java.net.URL; -public class ExtruderService extends Service { +public class ExtruderApplication extends Application { public static void main(String[] args) throws Exception { - new ExtruderService().run(args); + new ExtruderApplication().run(args); } @Override diff --git a/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java b/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java index 3fa35fb..3aa94e2 100644 --- a/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java +++ b/src/main/java/info/aaronland/extruder/ExtruderConfiguration.java @@ -1,6 +1,6 @@ package info.aaronland.extruder; -import com.yammer.dropwizard.config.Configuration; +import io.dropwizard.Configuration; import javax.validation.Valid; diff --git a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java index 0e2d5cf..48b8ada 100644 --- a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java +++ b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java @@ -1,6 +1,7 @@ package info.aaronland.extruder; -import com.yammer.metrics.core.HealthCheck; +// import io.dropwizard.metrics.health.HealthCheck; +import com.codahale.metrics.health.HealthCheck; import java.net.URL; import java.net.HttpURLConnection; From 
b9b71f847f5f41f6686bfdb9062f0449182b7b66 Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Tue, 15 Feb 2022 18:40:20 -0800 Subject: [PATCH 02/15] update deps --- Dockerfile | 12 ++++++------ pom.xml | 6 ++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2084b23..3f419ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,11 +18,6 @@ RUN mkdir -p /usr/share/maven /usr/share/maven/ref \ && rm -f /tmp/apache-maven.tar.gz \ && ln -s /usr/share/maven/bin/mvn /usr/bin/mvn -ENV MAVEN_HOME /usr/share/maven -ENV MAVEN_CONFIG "$USER_HOME_DIR/.m2" - -# COPY mvn-entrypoint.sh /usr/local/bin/mvn-entrypoint.sh -# COPY settings-docker.xml /usr/share/maven/ref/ COPY . /usr/src/dogeared-extruder @@ -30,5 +25,10 @@ WORKDIR /usr/src/dogeared-extruder RUN mvn clean && mvn install -# FROM openjdk:17-slim +FROM openjdk:17-slim + +FROM openjdk:17-slim + +RUN mkdir /usr/local/jar +COPY --from=builder /usr/src/dogeared-extruder/target/extruder-2.0.jar /usr/local/jar/dogeared-extruder.jar diff --git a/pom.xml b/pom.xml index e578ec2..ded56bf 100644 --- a/pom.xml +++ b/pom.xml @@ -65,6 +65,12 @@ + + javax.xml.bind + jaxb-api + 2.3.1 + + org.apache.commons commons-lang3 From e1a303f5255f3a9db33b74b58f63208ebde416c5 Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Tue, 15 Feb 2022 23:08:48 -0800 Subject: [PATCH 03/15] snapshot: fewer errors --- TODO.txt | 2 +- pom.xml | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/TODO.txt b/TODO.txt index e1b7c04..12e65b0 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Tue Feb 15 23:04:49 PST 2022 +# Generated automatically at Tue Feb 15 23:07:58 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/ExtruderApplication.java~:30: // TODO: put me in the config file... 
(20130908/straup) diff --git a/pom.xml b/pom.xml index 5c52a99..74a8054 100644 --- a/pom.xml +++ b/pom.xml @@ -54,24 +54,18 @@ org.apache.tika tika-core 2.3.0 - + org.apache.tika tika-parsers 2.3.0 + pom - - - javax.xml.bind - jaxb-api - 2.3.1 - - javax.xml.bind jaxb-api From 2d7f87e60191883bb7dcc2a412c4fe6b985f3aac Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Tue, 15 Feb 2022 23:23:07 -0800 Subject: [PATCH 04/15] snapshot: still fewer errors --- TODO.txt | 2 +- .../java/info/aaronland/extruder/DocumentView.java | 12 ++++++------ .../info/aaronland/extruder/ExtruderApplication.java | 8 ++++---- .../aaronland/extruder/InternetsHealthCheck.java | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/TODO.txt b/TODO.txt index 12e65b0..647f918 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Tue Feb 15 23:07:58 PST 2022 +# Generated automatically at Tue Feb 15 23:22:32 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/ExtruderApplication.java~:30: // TODO: put me in the config file... 
(20130908/straup) diff --git a/src/main/java/info/aaronland/extruder/DocumentView.java b/src/main/java/info/aaronland/extruder/DocumentView.java index e342fcc..92f1079 100644 --- a/src/main/java/info/aaronland/extruder/DocumentView.java +++ b/src/main/java/info/aaronland/extruder/DocumentView.java @@ -3,9 +3,9 @@ import info.aaronland.extruder.Document; import io.dropwizard.views.View; -import com.google.common.base.Charsets; -import com.google.common.base.Optional; -import java.nio.charset.Charset; +//import com.google.common.base.Charsets; +//import com.google.common.base.Optional; +//import java.nio.charset.Charset; public class DocumentView extends View { private final Document document; @@ -24,8 +24,8 @@ public Document getDocument(){ // And since the default encoding for en-us is ISO-8859-1... good times // (20130908/straup) - public Optional getCharset(){ - return Optional.of(Charsets.UTF_8); - } + //public Optional getCharset(){ + //return Optional.of(Charsets.UTF_8); + // } } diff --git a/src/main/java/info/aaronland/extruder/ExtruderApplication.java b/src/main/java/info/aaronland/extruder/ExtruderApplication.java index 0b9099e..f0062b6 100644 --- a/src/main/java/info/aaronland/extruder/ExtruderApplication.java +++ b/src/main/java/info/aaronland/extruder/ExtruderApplication.java @@ -17,15 +17,15 @@ public static void main(String[] args) throws Exception { @Override public void initialize(Bootstrap bootstrap) { - bootstrap.setName("extruder"); + // bootstrap.setName("extruder"); bootstrap.addBundle(new ViewBundle()); } @Override public void run(ExtruderConfiguration conf, Environment env) throws Exception { - env.addResource(new BoilerpipeResource()); - env.addResource(new TikaResource()); - env.addResource(new JavaReadabilityResource()); + env.jersey().register(new BoilerpipeResource()); + env.jersey().register(new TikaResource()); + env.jersey().register(new JavaReadabilityResource()); // TODO: put me in the config file... 
(20130908/straup) URL healthcheck_url = new URL("http://collection.cooperhewitt.org/objects/random/"); diff --git a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java index 48b8ada..324730b 100644 --- a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java +++ b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java @@ -11,7 +11,7 @@ public class InternetsHealthCheck extends HealthCheck { private URL url; public InternetsHealthCheck(URL url) { - super("InternetsHealthCheck"); + super(); this.url = url; } From c8b225ff10fd087c2340bfd20decd3d4b14a126f Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Wed, 16 Feb 2022 09:00:34 -0800 Subject: [PATCH 05/15] snapshot: just basistech/readability Tika errors now --- TODO.txt | 5 ++--- configuration.yml | 2 +- .../java/info/aaronland/extruder/ExtruderApplication.java | 5 +++-- .../java/info/aaronland/extruder/InternetsHealthCheck.java | 1 - src/main/java/info/aaronland/extruder/TikaResource.java | 6 +++--- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/TODO.txt b/TODO.txt index 647f918..6686ab0 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,11 +1,10 @@ -# Generated automatically at Tue Feb 15 23:22:32 PST 2022 +# Generated automatically at Wed Feb 16 09:00:09 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) -./src/main/java/info/aaronland/extruder/ExtruderApplication.java~:30: // TODO: put me in the config file... (20130908/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/ExtruderApplication.java:30: // TODO: put me in the config file... (20130908/straup) ./src/main/java/com/basistech/readability/Readability.java:93: // TODO: reset the results. 
./src/main/java/com/basistech/readability/Readability.java:368: * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse ./src/main/java/com/basistech/readability/Readability.java:686: * at the same time without effecting the traversal. TODO: Consider taking into account original -./src/main/java/info/aaronland/extruder/TikaResource.java:140: // TO DO: figure out how to make this return HTML instead of text +./src/main/java/info/aaronland/extruder/TikaResource.java:141: // TO DO: figure out how to make this return HTML instead of text ./src/main/java/info/aaronland/extruder/Upload.java:20: // TO DO: sort out file extensions etc. diff --git a/configuration.yml b/configuration.yml index ecfd447..dc1fe17 100644 --- a/configuration.yml +++ b/configuration.yml @@ -1,2 +1,2 @@ logging: - level: INFO + level: INFO \ No newline at end of file diff --git a/src/main/java/info/aaronland/extruder/ExtruderApplication.java b/src/main/java/info/aaronland/extruder/ExtruderApplication.java index f0062b6..ae9a5ac 100644 --- a/src/main/java/info/aaronland/extruder/ExtruderApplication.java +++ b/src/main/java/info/aaronland/extruder/ExtruderApplication.java @@ -28,8 +28,9 @@ public void run(ExtruderConfiguration conf, Environment env) throws Exception { env.jersey().register(new JavaReadabilityResource()); // TODO: put me in the config file... 
(20130908/straup) - URL healthcheck_url = new URL("http://collection.cooperhewitt.org/objects/random/"); - env.addHealthCheck(new InternetsHealthCheck(healthcheck_url)); + URL healthcheck_url = new URL("https://github.com/aaronland/dogeared-extruder/"); + + env.healthChecks().register("internets", new InternetsHealthCheck(healthcheck_url)); } } diff --git a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java index 324730b..7e608b2 100644 --- a/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java +++ b/src/main/java/info/aaronland/extruder/InternetsHealthCheck.java @@ -1,6 +1,5 @@ package info.aaronland.extruder; -// import io.dropwizard.metrics.health.HealthCheck; import com.codahale.metrics.health.HealthCheck; import java.net.URL; diff --git a/src/main/java/info/aaronland/extruder/TikaResource.java b/src/main/java/info/aaronland/extruder/TikaResource.java index 17d7298..cf1dbb5 100644 --- a/src/main/java/info/aaronland/extruder/TikaResource.java +++ b/src/main/java/info/aaronland/extruder/TikaResource.java @@ -35,6 +35,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -165,15 +166,14 @@ private Document extrudeThis(InputStream buffer){ text = handler.toString(); text = unwrapText(text); - // http://www.celinio.net/techblog/?p=1295 - title = metadata.get(Metadata.TITLE); + title = metadata.get(TikaCoreProperties.TITLE); if (title == null){ String type = "mystery"; try { - String content_type = metadata.get(Metadata.CONTENT_TYPE); + String content_type = metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT); String[] parts = content_type.split("/"); type = parts[1]; } From 1607ce84e633b46ec21cc9a13cc3e7af156e3fb9 Mon Sep 17 00:00:00 2001 From: thisisaaronland 
Date: Wed, 16 Feb 2022 09:16:19 -0800 Subject: [PATCH 06/15] snapshot: add tika standard-parsers package --- TODO.txt | 4 ++-- pom.xml | 7 ++++++- .../java/info/aaronland/extruder/ExtruderApplication.java | 1 - 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/TODO.txt b/TODO.txt index 6686ab0..7dcd2ed 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,8 +1,8 @@ -# Generated automatically at Wed Feb 16 09:00:09 PST 2022 +# Generated automatically at Wed Feb 16 09:14:45 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) -./src/main/java/info/aaronland/extruder/ExtruderApplication.java:30: // TODO: put me in the config file... (20130908/straup) +./src/main/java/info/aaronland/extruder/ExtruderApplication.java:29: // TODO: put me in the config file... (20130908/straup) ./src/main/java/com/basistech/readability/Readability.java:93: // TODO: reset the results. ./src/main/java/com/basistech/readability/Readability.java:368: * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse ./src/main/java/com/basistech/readability/Readability.java:686: * at the same time without effecting the traversal. 
TODO: Consider taking into account original diff --git a/pom.xml b/pom.xml index 74a8054..8156caa 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,6 @@ org.apache.tika tika-core 2.3.0 - @@ -64,6 +63,12 @@ pom + + org.apache.tika + tika-parsers-standard-package + 2.3.0 + + diff --git a/src/main/java/info/aaronland/extruder/ExtruderApplication.java b/src/main/java/info/aaronland/extruder/ExtruderApplication.java index ae9a5ac..414f76b 100644 --- a/src/main/java/info/aaronland/extruder/ExtruderApplication.java +++ b/src/main/java/info/aaronland/extruder/ExtruderApplication.java @@ -17,7 +17,6 @@ public static void main(String[] args) throws Exception { @Override public void initialize(Bootstrap bootstrap) { - // bootstrap.setName("extruder"); bootstrap.addBundle(new ViewBundle()); } From 6802052d36220d4f8d560ad31335eb737b65e315 Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Wed, 16 Feb 2022 09:20:30 -0800 Subject: [PATCH 07/15] snapshot: builds --- TODO.txt | 2 +- src/main/java/com/basistech/readability/FilePageReader.java | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/TODO.txt b/TODO.txt index 7dcd2ed..f4654c6 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Wed Feb 16 09:14:45 PST 2022 +# Generated automatically at Wed Feb 16 09:20:02 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) diff --git a/src/main/java/com/basistech/readability/FilePageReader.java b/src/main/java/com/basistech/readability/FilePageReader.java index 51a665a..73b3230 100644 --- a/src/main/java/com/basistech/readability/FilePageReader.java +++ b/src/main/java/com/basistech/readability/FilePageReader.java @@ -22,7 +22,9 @@ import java.io.FileInputStream; import 
java.io.IOException; -import org.apache.tika.io.IOUtils; +import org.apache.commons.io.IOUtils ; + +// import org.apache.tika.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; From 62642fe396161da6a7aa535c3a693f9676f25d92 Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Wed, 16 Feb 2022 09:29:01 -0800 Subject: [PATCH 08/15] snapshot: add java.mail dependency; builds but does not start yet --- Makefile | 2 +- TODO.txt | 2 +- pom.xml | 8 +++++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a18aa13..6791dc7 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ build: todo mvn install run: - java -jar target/extruder-1.0.jar server + java -jar target/extruder-2.0.jar server todo: echo "# Generated automatically at" `date` > TODO.txt diff --git a/TODO.txt b/TODO.txt index f4654c6..7e41158 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Wed Feb 16 09:20:02 PST 2022 +# Generated automatically at Wed Feb 16 09:26:11 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) diff --git a/pom.xml b/pom.xml index 8156caa..0458833 100644 --- a/pom.xml +++ b/pom.xml @@ -76,6 +76,12 @@ jaxb-api 2.3.1 + + + javax.mail + javax.mail-api + 1.6.2 + org.apache.commons @@ -150,7 +156,7 @@ - info.aaronland.extruder.ExtruderService + info.aaronland.extruder.ExtruderApplication From c2d17ebf10bbf3e2a350cb6c9eb125549428603d Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Wed, 16 Feb 2022 09:54:17 -0800 Subject: [PATCH 09/15] snapshot: sort out build and basic run dependencies; fails on actual reading/extruding --- TODO.txt | 2 +- pom.xml | 14 ++++++-------- .../aaronland/extruder/BoilerpipeResource.java | 6 +++--- 
.../extruder/JavaReadabilityResource.java | 6 +++--- .../java/info/aaronland/extruder/TikaResource.java | 6 +++--- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/TODO.txt b/TODO.txt index 7e41158..dac0850 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Wed Feb 16 09:26:11 PST 2022 +# Generated automatically at Wed Feb 16 09:52:03 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) diff --git a/pom.xml b/pom.xml index 0458833..34afa0f 100644 --- a/pom.xml +++ b/pom.xml @@ -31,15 +31,13 @@ metrics-core 4.2.8 - + - com.sun.jersey.contribs - jersey-multipart - - - 1.19.4 - - + org.glassfish.jersey.media + jersey-media-multipart + 3.0.4 + + diff --git a/src/main/java/info/aaronland/extruder/BoilerpipeResource.java b/src/main/java/info/aaronland/extruder/BoilerpipeResource.java index 2969870..d168321 100644 --- a/src/main/java/info/aaronland/extruder/BoilerpipeResource.java +++ b/src/main/java/info/aaronland/extruder/BoilerpipeResource.java @@ -7,9 +7,9 @@ import java.io.InputStream; import java.io.File; -import com.sun.jersey.core.header.FormDataContentDisposition; -import com.sun.jersey.multipart.FormDataMultiPart; -import com.sun.jersey.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +import org.glassfish.jersey.media.multipart.FormDataMultiPart; +import org.glassfish.jersey.media.multipart.FormDataBodyPart; import javax.ws.rs.core.MediaType; import javax.ws.rs.GET; diff --git a/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java b/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java index 5a2473e..fe24b11 100644 --- 
a/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java +++ b/src/main/java/info/aaronland/extruder/JavaReadabilityResource.java @@ -15,9 +15,9 @@ import javax.ws.rs.core.Response; import javax.ws.rs.core.Response.Status; -import com.sun.jersey.core.header.FormDataContentDisposition; -import com.sun.jersey.multipart.FormDataMultiPart; -import com.sun.jersey.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +import org.glassfish.jersey.media.multipart.FormDataMultiPart; +import org.glassfish.jersey.media.multipart.FormDataBodyPart; import java.io.InputStream; import java.io.File; diff --git a/src/main/java/info/aaronland/extruder/TikaResource.java b/src/main/java/info/aaronland/extruder/TikaResource.java index cf1dbb5..f676fed 100644 --- a/src/main/java/info/aaronland/extruder/TikaResource.java +++ b/src/main/java/info/aaronland/extruder/TikaResource.java @@ -12,9 +12,9 @@ import java.io.ByteArrayOutputStream; import java.io.ByteArrayInputStream; -import com.sun.jersey.core.header.FormDataContentDisposition; -import com.sun.jersey.multipart.FormDataMultiPart; -import com.sun.jersey.multipart.FormDataBodyPart; +import org.glassfish.jersey.media.multipart.FormDataContentDisposition; +import org.glassfish.jersey.media.multipart.FormDataMultiPart; +import org.glassfish.jersey.media.multipart.FormDataBodyPart; import javax.ws.rs.GET; import javax.ws.rs.POST; From 13d88d9f2e97708b60370c101e314ccdc82f1c5f Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Wed, 16 Feb 2022 14:52:24 -0800 Subject: [PATCH 10/15] snapshot: load freemarker stuff; new HTML errors --- TODO.txt | 2 +- pom.xml | 6 ++++++ .../java/info/aaronland/extruder/DocumentView.java | 13 ------------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/TODO.txt b/TODO.txt index dac0850..ef172cf 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Wed Feb 16 09:52:03 PST 2022 +# Generated automatically at 
Wed Feb 16 14:47:00 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) diff --git a/pom.xml b/pom.xml index 34afa0f..e82d2cd 100644 --- a/pom.xml +++ b/pom.xml @@ -26,6 +26,12 @@ 2.0.28 + + io.dropwizard + dropwizard-views-freemarker + 2.0.28 + + io.dropwizard.metrics metrics-core diff --git a/src/main/java/info/aaronland/extruder/DocumentView.java b/src/main/java/info/aaronland/extruder/DocumentView.java index 92f1079..6588393 100644 --- a/src/main/java/info/aaronland/extruder/DocumentView.java +++ b/src/main/java/info/aaronland/extruder/DocumentView.java @@ -3,10 +3,6 @@ import info.aaronland.extruder.Document; import io.dropwizard.views.View; -//import com.google.common.base.Charsets; -//import com.google.common.base.Optional; -//import java.nio.charset.Charset; - public class DocumentView extends View { private final Document document; @@ -19,13 +15,4 @@ public Document getDocument(){ return document; } - // Because in com/codahale/dropwizard/views/freemarker/FreemarkerViewRenderer.java this: - // final Charset charset = view.getCharset().or(Charset.forName(configuration.getEncoding(locale))); - // And since the default encoding for en-us is ISO-8859-1... 
good times - // (20130908/straup) - - //public Optional getCharset(){ - //return Optional.of(Charsets.UTF_8); - // } - } From f906ba3364e13bf77bc109ea7f4b742a042dde7a Mon Sep 17 00:00:00 2001 From: thisisaaronland Date: Fri, 18 Feb 2022 14:00:54 -0800 Subject: [PATCH 11/15] fix HTML escaping for document.ftl --- TODO.txt | 2 +- src/main/resources/info/aaronland/extruder/document.ftl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TODO.txt b/TODO.txt index ef172cf..d871033 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,4 +1,4 @@ -# Generated automatically at Wed Feb 16 14:47:00 PST 2022 +# Generated automatically at Fri Feb 18 13:58:24 PST 2022 ./src/main/java/info/aaronland/extruder/JavaReadabilityResource.java:54: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) ./src/main/java/info/aaronland/extruder/BoilerpipeResource.java:52: // TODO: trap MalformedURLExceptions and return NOT_ACCEPTABLE here (20130901/straup) diff --git a/src/main/resources/info/aaronland/extruder/document.ftl b/src/main/resources/info/aaronland/extruder/document.ftl index 4894c00..f88758a 100644 --- a/src/main/resources/info/aaronland/extruder/document.ftl +++ b/src/main/resources/info/aaronland/extruder/document.ftl @@ -2,7 +2,7 @@ - ${document.getTitle()?html} + ${document.getTitle()}