Skip to content

Commit

Permalink
Merge pull request #336 from sebastian-nagel/NUTCH-2583-upgrade-depen…
Browse files Browse the repository at this point in the history
…dencies

NUTCH-2583 Upgrading Nutch's dependencies (contributed by Ralf)
NUTCH-2584 Upgrade parse-tika to use Tika 1.18
NUTCH-2589 HTML redirections are not followed when using parse-tika
  • Loading branch information
sebastian-nagel committed Jun 2, 2018
2 parents 0cec7b5 + 107b364 commit 2544fad
Show file tree
Hide file tree
Showing 17 changed files with 276 additions and 200 deletions.
67 changes: 34 additions & 33 deletions ivy/ivy.xml
Expand Up @@ -34,23 +34,23 @@
</publications>

<dependencies>
<dependency org="org.slf4j" name="slf4j-api" rev="1.6.1" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-api" rev="1.7.25" conf="*->master" />
<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.7.25" conf="*->master" />

<!--dependency org="log4j" name="log4j" rev="1.2.15" conf="*->default">
<exclude org="javax.jms" name="jms" />
<exclude org="com.sun.jdmk" name="jmxtools" />
<exclude org="com.sun.jmx" name="jmxri" />
</dependency-->

<dependency org="commons-lang" name="commons-lang" rev="2.6" conf="*->default" />
<dependency org="commons-collections" name="commons-collections" rev="3.2.1" conf="*->master" />
<dependency org="commons-httpclient" name="commons-httpclient" rev="3.1" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.10" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.14" conf="*->default" />
<dependency org="org.apache.commons" name="commons-lang3" rev="3.7" conf="*->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->master" />
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.11" conf="*->default" />
<dependency org="org.apache.commons" name="commons-compress" rev="1.16.1" conf="*->default" />
<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
<dependency org="com.tdunning" name="t-digest" rev="3.2" />

<!-- Hadoop Dependencies -->
<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.4" conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
Expand All @@ -65,14 +65,14 @@
<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
<!-- End of Hadoop Dependencies -->

<dependency org="org.apache.tika" name="tika-core" rev="1.17" />
<dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
<dependency org="org.apache.tika" name="tika-core" rev="1.18" />
<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />

<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
<dependency org="oro" name="oro" rev="2.0.8" />

<dependency org="com.google.guava" name="guava" rev="18.0" />
<dependency org="com.google.guava" name="guava" rev="25.0-jre" />

<dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.9">
<exclude org="org.apache.tika"/>
Expand All @@ -81,14 +81,14 @@
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />

<!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.0.4" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.0.4" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.0.4" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.0.4" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.0.4" conf="test->default"/>
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" conf="*->default"/>
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/>
<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.1.15" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.1.15" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.1.15" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.1.15" conf="*->default"/>
<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.1.15" conf="test->default"/>
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.5" conf="*->default"/>
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.5" conf="*->default"/>
<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.5" conf="*->default"/>

<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
Expand All @@ -115,33 +115,34 @@

<!-- web app dependencies -->

<dependency org="org.apache.commons" name="commons-collections4" rev="4.0" conf="*->default" />
<dependency org="org.springframework" name="spring-core" rev="4.0.4.RELEASE" conf="*->default" />
<dependency org="org.springframework" name="spring-context" rev="4.0.4.RELEASE" conf="*->default" />
<dependency org="org.springframework" name="spring-web" rev="4.0.4.RELEASE" conf="*->default" />
<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->default" />
<dependency org="org.springframework" name="spring-core" rev="4.0.9.RELEASE" conf="*->default" />
<dependency org="org.springframework" name="spring-context" rev="4.0.9.RELEASE" conf="*->default" />
<dependency org="org.springframework" name="spring-web" rev="4.0.9.RELEASE" conf="*->default" />

<dependency org="com.sun.jersey" name="jersey-client" rev="1.8" conf="*->default" />
<dependency org="com.sun.jersey" name="jersey-client" rev="1.19.4" conf="*->default" />

<dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="4.48" conf="*->default" />
<dependency org="com.h2database" name="h2" rev="1.4.180" conf="*->default" />
<dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.0.0" conf="*->default" />
<dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="5.1" conf="*->default" />
<dependency org="com.h2database" name="h2" rev="1.4.197" conf="*->default" />
<dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.2.0" conf="*->default" />

<dependency org="org.apache.wicket" name="wicket-core" rev="6.16.0" conf="*->default" />
<dependency org="org.apache.wicket" name="wicket-spring" rev="6.16.0" conf="*->default" />
<dependency org="org.apache.wicket" name="wicket-core" rev="6.17.0" conf="*->default" />
<dependency org="org.apache.wicket" name="wicket-spring" rev="6.17.0" conf="*->default" />
<dependency org="de.agilecoders.wicket" name="wicket-bootstrap-core" rev="0.9.2" conf="*->default" />
<dependency org="de.agilecoders.wicket" name="wicket-bootstrap-extensions" rev="0.9.2" conf="*->default">
<exclude org="org.json"/>
</dependency>


<!-- RabbitMQ dependencies -->
<dependency org="com.rabbitmq" name="amqp-client" rev="3.6.5" conf="*->default" />
<dependency org="com.rabbitmq" name="amqp-client" rev="5.2.0" conf="*->default" />


<!-- Added because of the Elasticsearch Jest client -->
<!-- TODO: refactor these into the indexer-elastic-rest plugin somehow; currently that doesn't resolve correctly -->
<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.4"/>
<dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.4"/>
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.2"/>
<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.9"/>
<dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.9"/>
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5"/>

<!--global exclusion -->
<exclude module="jmxtools" />
Expand Down
4 changes: 4 additions & 0 deletions src/plugin/build-plugin.xml
Expand Up @@ -235,6 +235,10 @@
<ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/>
</target>

<!-- Writes an Ivy dependency report into ${build.dir}; the resolve-test target runs first. -->
<target name="report" depends="resolve-test" description="--> generates a report of dependencies">
<ivy:report todir="${build.dir}"/>
</target>

<!-- ================================================================== -->
<!-- Clean. Delete the build files, and their directories -->
<!-- ================================================================== -->
Expand Down
15 changes: 1 addition & 14 deletions src/plugin/parse-tika/build.xml
Expand Up @@ -19,23 +19,10 @@

<import file="../build-plugin.xml"/>

<!-- Build compilation dependencies -->
<target name="deps-jar">
<ant target="jar" inheritall="false" dir="../lib-nekohtml" />
</target>

<!-- Add compilation dependencies to classpath -->
<path id="plugin.deps">
<fileset dir="${nutch.root}/build">
<include name="**/lib-nekohtml/*.jar" />
</fileset>
</path>

<!-- Deploy Unit test dependencies -->
<!-- Deploy Unit test dependencies -->
<target name="deps-test">
<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
<ant target="deploy" inheritall="false" dir="../protocol-file"/>
<ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
</target>

<!-- for junit test -->
Expand Down
16 changes: 13 additions & 3 deletions src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -1,8 +1,18 @@
1. Upgrade Tika dependency in trunk/ivy/ivy.xml
1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml

2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml

3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml

To get the list of dependencies and their versions execute:
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g'
$ cd src/plugin/parse-tika/
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g'

In plugin.xml, replace all lines between
<!-- dependencies of Tika (tika-parsers) -->
and
<!-- end of dependencies of Tika (tika-parsers) -->
with the output of the command above.


2 changes: 1 addition & 1 deletion src/plugin/parse-tika/ivy.xml
Expand Up @@ -36,7 +36,7 @@
</publications>

<dependencies>
<dependency org="org.apache.tika" name="tika-parsers" rev="1.17" conf="*->default">
<dependency org="org.apache.tika" name="tika-parsers" rev="1.18" conf="*->default">
<exclude org="org.apache.tika" name="tika-core" />
<exclude org="org.apache.httpcomponents" name="httpclient" />
<exclude org="org.apache.httpcomponents" name="httpcore" />
Expand Down
65 changes: 42 additions & 23 deletions src/plugin/parse-tika/plugin.xml
Expand Up @@ -25,6 +25,8 @@
<library name="parse-tika.jar">
<export name="*"/>
</library>
<!-- dependencies of Tika (tika-parsers) -->
<library name="aopalliance-1.0.jar"/>
<library name="apache-mime4j-core-0.8.1.jar"/>
<library name="apache-mime4j-dom-0.8.1.jar"/>
<library name="asm-5.0.4.jar"/>
Expand All @@ -35,53 +37,61 @@
<library name="bzip2-0.9.1.jar"/>
<library name="c3p0-0.9.1.1.jar"/>
<library name="cdm-4.5.5.jar"/>
<library name="commons-codec-1.6.jar"/>
<library name="commons-codec-1.10.jar"/>
<library name="commons-collections4-4.1.jar"/>
<library name="commons-compress-1.14.jar"/>
<library name="commons-compress-1.16.1.jar"/>
<library name="commons-csv-1.0.jar"/>
<library name="commons-exec-1.3.jar"/>
<library name="commons-io-2.5.jar"/>
<library name="commons-io-2.6.jar"/>
<library name="commons-logging-1.1.3.jar"/>
<library name="commons-logging-1.2.jar"/>
<library name="commons-logging-api-1.1.jar"/>
<library name="curvesapi-1.04.jar"/>
<library name="cxf-core-3.0.16.jar"/>
<library name="cxf-rt-frontend-jaxrs-3.0.16.jar"/>
<library name="cxf-rt-rs-client-3.0.16.jar"/>
<library name="cxf-rt-transports-http-3.0.16.jar"/>
<library name="dec-0.1.2.jar"/>
<library name="ehcache-core-2.6.2.jar"/>
<library name="fontbox-2.0.8.jar"/>
<library name="geoapi-3.0.0.jar"/>
<library name="fontbox-2.0.9.jar"/>
<library name="geoapi-3.0.1.jar"/>
<library name="grib-4.5.5.jar"/>
<library name="gson-2.8.1.jar"/>
<library name="guava-17.0.jar"/>
<library name="httpmime-4.5.4.jar"/>
<library name="httpservices-4.5.5.jar"/>
<library name="isoparser-1.1.18.jar"/>
<library name="jackcess-2.1.8.jar"/>
<library name="jackcess-encrypt-2.1.2.jar"/>
<library name="jackson-core-2.9.2.jar"/>
<library name="jackcess-2.1.10.jar"/>
<library name="jackcess-encrypt-2.1.4.jar"/>
<library name="jackson-annotations-2.9.5.jar"/>
<library name="jackson-core-2.9.5.jar"/>
<library name="jackson-databind-2.9.5.jar"/>
<library name="jai-imageio-core-1.3.1.jar"/>
<library name="java-libpst-0.8.1.jar"/>
<library name="javax.annotation-api-1.2.jar"/>
<library name="javax.ws.rs-api-2.0.1.jar"/>
<library name="jbig2-imageio-3.0.0.jar"/>
<library name="jcip-annotations-1.0.jar"/>
<library name="jcl-over-slf4j-1.7.24.jar"/>
<library name="jcommander-1.35.jar"/>
<library name="jdom2-2.0.4.jar"/>
<library name="jdom2-2.0.6.jar"/>
<library name="jempbox-1.8.13.jar"/>
<library name="jhighlight-1.0.2.jar"/>
<library name="jmatio-1.2.jar"/>
<library name="jna-4.1.0.jar"/>
<library name="joda-time-2.2.jar"/>
<library name="json-1.8.jar"/>
<library name="json-simple-1.1.1.jar"/>
<library name="jsoup-1.7.2.jar"/>
<library name="jsr-275-0.9.3.jar"/>
<library name="jsoup-1.11.2.jar"/>
<library name="jul-to-slf4j-1.7.24.jar"/>
<library name="juniversalchardet-1.0.3.jar"/>
<library name="junrar-0.7.jar"/>
<library name="metadata-extractor-2.10.1.jar"/>
<library name="netcdf4-4.5.5.jar"/>
<library name="opennlp-tools-1.8.3.jar"/>
<library name="pdfbox-2.0.8.jar"/>
<library name="pdfbox-tools-2.0.8.jar"/>
<library name="objenesis-2.6.jar"/>
<library name="openjson-1.0.10.jar"/>
<library name="opennlp-tools-1.8.4.jar"/>
<library name="pdfbox-2.0.9.jar"/>
<library name="pdfbox-tools-2.0.9.jar"/>
<library name="poi-3.17.jar"/>
<library name="poi-ooxml-3.17.jar"/>
<library name="poi-ooxml-schemas-3.17.jar"/>
Expand All @@ -90,27 +100,36 @@
<library name="rome-1.5.1.jar"/>
<library name="rome-utils-1.5.1.jar"/>
<library name="sentiment-analysis-parser-0.1.jar"/>
<library name="sis-metadata-0.6.jar"/>
<library name="sis-netcdf-0.6.jar"/>
<library name="sis-referencing-0.6.jar"/>
<library name="sis-storage-0.6.jar"/>
<library name="sis-utility-0.6.jar"/>
<library name="sis-feature-0.8.jar"/>
<library name="sis-metadata-0.8.jar"/>
<library name="sis-netcdf-0.8.jar"/>
<library name="sis-referencing-0.8.jar"/>
<library name="sis-storage-0.8.jar"/>
<library name="sis-utility-0.8.jar"/>
<library name="spring-aop-3.2.16.RELEASE.jar"/>
<library name="spring-beans-3.2.16.RELEASE.jar"/>
<library name="spring-context-3.2.16.RELEASE.jar"/>
<library name="spring-core-3.2.16.RELEASE.jar"/>
<library name="spring-expression-3.2.16.RELEASE.jar"/>
<library name="stax2-api-3.1.4.jar"/>
<library name="tagsoup-1.2.1.jar"/>
<library name="tika-parsers-1.17.jar"/>
<library name="tika-parsers-1.18.jar"/>
<library name="udunits-4.5.5.jar"/>
<library name="uimafit-core-2.2.0.jar"/>
<library name="uimaj-core-2.9.0.jar"/>
<library name="unit-api-1.0.jar"/>
<library name="vorbis-java-core-0.8.jar"/>
<library name="vorbis-java-tika-0.8.jar"/>
<library name="woodstox-core-asl-4.4.1.jar"/>
<library name="xmlbeans-2.6.0.jar"/>
<library name="xmlschema-core-2.2.2.jar"/>
<library name="xmpcore-5.1.3.jar"/>
<library name="xz-1.6.jar"/>
<library name="xz-1.8.jar"/>
<!-- end of dependencies of Tika (tika-parsers) -->
</runtime>

<requires>
<import plugin="nutch-extensionpoints"/>
<import plugin="lib-nekohtml"/>
</requires>

<extension point="org.apache.nutch.parse.Parser"
Expand Down

0 comments on commit 2544fad

Please sign in to comment.