Skip to content
This repository has been archived by the owner on Jul 3, 2023. It is now read-only.

Commit

Permalink
OODT-630 et al. Upgrade Tika to version 1.6.
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/oodt/trunk@1631859 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
tbpalsulich committed Oct 14, 2014
1 parent 4be6b08 commit 260c233
Show file tree
Hide file tree
Showing 35 changed files with 11,955 additions and 1,396 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -3,6 +3,8 @@ Apache OODT Change Log

Release 0.8 - Current Development

* OODT-385, OODT-630, OODT-631, OODT-632. Upgraded Tika to version 1.6.

* OODT-757 Fixed PGETaskInstance bug that prevented instantiation of AutoDetectProductCrawler (luca)

* OODT-756 HttpClient NoClassDefFoundError For the url-downloader Script (Mengying Wang via mattmann)
Expand Down
14 changes: 14 additions & 0 deletions core/pom.xml
Expand Up @@ -323,6 +323,20 @@ the License.
</plugin>
</plugins>
</build>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.6</version>
</dependency>
</dependencies>
</dependencyManagement>
<profiles>
<profile>
<id>audit</id>
Expand Down
1 change: 0 additions & 1 deletion curator/pom.xml
Expand Up @@ -23,7 +23,6 @@ the License.
<version>0.8-SNAPSHOT</version>
<relativePath>../core/pom.xml</relativePath>
</parent>
<groupId>org.apache.oodt</groupId>
<artifactId>cas-curator</artifactId>
<packaging>war</packaging>
<name>CAS Curation Interface</name>
Expand Down
Expand Up @@ -20,11 +20,13 @@
//JDK imports
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

//OODT imports
import org.apache.oodt.cas.metadata.util.PathUtils;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
Expand Down Expand Up @@ -112,9 +114,8 @@ public Reference(String origRef, String dataRef, long size) {
// ourselves to determine which MimeType class to associate
// with this reference.
try {
this.mimeType = mimeTypeRepository
.getMimeType(new URL(origRef));
} catch (MalformedURLException e) {
this.mimeType = mimeTypeRepository.forName(new Tika().detect(origRef));
} catch (MimeTypeException e) {
e.printStackTrace();
}

Expand Down
8 changes: 5 additions & 3 deletions grid/web-grid.iml
Expand Up @@ -25,6 +25,8 @@
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: org.apache.tika:tika-core:0.8" level="project" />
<orderEntry type="library" name="Maven: joda-time:joda-time:2.3" level="project" />
<orderEntry type="module" module-name="oodt-commons" />
<orderEntry type="library" name="Maven: commons-dbcp:commons-dbcp:1.2.1" level="project" />
<orderEntry type="library" name="Maven: commons-collections:commons-collections:2.1" level="project" />
Expand Down Expand Up @@ -56,10 +58,10 @@
<orderEntry type="module" module-name="pcs-input" />
<orderEntry type="library" name="Maven: commons-io:commons-io:1.4" level="project" />
<orderEntry type="library" name="Maven: commons-codec:commons-codec:1.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.tika:tika-core:0.8" level="project" />
<orderEntry type="module" module-name="cas-cli" />
<orderEntry type="library" name="Maven: org.apache.tika:tika-core:1.6" level="project" />
<orderEntry type="library" name="Maven: com.google.guava:guava:10.0.1" level="project" />
<orderEntry type="library" name="Maven: com.google.code.findbugs:jsr305:1.3.9" level="project" />
<orderEntry type="module" module-name="cas-cli" />
<orderEntry type="library" name="Maven: org.springframework:spring-expression:3.0.5.RELEASE" level="project" />
<orderEntry type="library" name="Maven: commons-httpclient:commons-httpclient:3.0-alpha1" level="project" />
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.2" level="project" />
Expand Down Expand Up @@ -87,7 +89,7 @@
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-core:2.1.1" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.1.1" level="project" />
<orderEntry type="library" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.1.1" level="project" />
<orderEntry type="library" name="Maven: joda-time:joda-time:2.3" level="project" />
<orderEntry type="library" name="Maven: joda-time:joda-time:2.5" level="project" />
<orderEntry type="library" name="Maven: com.hp.hpl.jena:jena:2.6.3" level="project" />
<orderEntry type="library" name="Maven: com.hp.hpl.jena:iri:0.8" level="project" />
<orderEntry type="library" name="Maven: com.ibm.icu:icu4j:3.4.4" level="project" />
Expand Down
20 changes: 0 additions & 20 deletions metadata/pom.xml
Expand Up @@ -51,25 +51,6 @@ the License.
</includes>
</resource>
</resources>
<testResources>
<testResource>
<targetPath>org/apache/oodt/cas/metadata</targetPath>
<directory>${basedir}/src/testdata</directory>
<includes>
<include>copyandrewrite.test.conf</include>
<include>extern-config.xml</include>
<include>met_extr_preconditions.xml</include>
<include>product-type-patterns.xml</include>
<include>product-type-patterns-2.xml</include>
<include>samplemet.xml</include>
<include>testExtractor</include>
<include>testfile.txt</include>
<include>testfile2.txt.met</include>
<include>testfile2.txt</include>
<include>tika-mimetypes.xml</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
Expand Down Expand Up @@ -162,7 +143,6 @@ the License.
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>0.8</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
Expand Down
Expand Up @@ -29,6 +29,7 @@
import java.util.logging.Logger;

//APACHE imports
import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
Expand All @@ -53,6 +54,8 @@ public final class MimeTypeUtils {
/* our Tika mime type registry */
private MimeTypes mimeTypes;

private Tika tika = new Tika();

/* whether or not mime magic should be employed */
private boolean mimeMagic;

Expand Down Expand Up @@ -171,11 +174,14 @@ public String autoResolveContentType(String typeName, String url,

// if returned null, or if it's the default type then try url resolution
if (type == null
|| (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
|| (type.getName().equals(MimeTypes.OCTET_STREAM))) {
// If no mime-type header, or cannot find a corresponding registered
// mime-type, then guess a mime-type from the url pattern
type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
.getMimeType(url) : type;
try {
type = mimeTypes.forName(tika.detect(url)) != null ? mimeTypes.forName(tika.detect(url)) : type;
} catch (Exception e) {
// MimeTypeException or IOException from tika.detect. Ignore.
}
}

// if magic is enabled use mime magic to guess if the mime type returned
Expand All @@ -185,7 +191,12 @@ public String autoResolveContentType(String typeName, String url,
// type
// returned by the magic
if (this.mimeMagic) {
MimeType magicType = this.mimeTypes.getMimeType(data);
MimeType magicType;
try {
magicType = mimeTypes.forName(tika.detect(data));
} catch (Exception e) {
magicType = null;
}
if (magicType != null
&& !magicType.getName().equals(MimeTypes.OCTET_STREAM)
&& type != null
Expand All @@ -212,73 +223,76 @@ public String autoResolveContentType(String typeName, String url,

/**
* Facade interface to Tika's underlying
* {@link MimeTypes#getMimeType(String)} method.
*
* {@link org.apache.tika.Tika#detect(URL)} method.
*
* @param url
* A string representation of the document {@link URL} to sense
* the {@link MimeType} for.
* @return An appropriate {@link MimeType}, identified from the given
* Document url in string form.
*/
public String getMimeType(URL url) {
MimeType mimeType = this.mimeTypes.getMimeType(url);
if (mimeType != null)
return mimeType.getName();
else
return null;
try {
return tika.detect(url);
} catch (Exception e) {
return null;
}
}

/**
* A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
* A facade interface to Tika's underlying {@link org.apache.tika.Tika#detect(String)}
* method.
*
*
* @param name
* The name of the document (for example, its file name) to
* detect the media type for.
* @return The name of the detected media type, or null if detection
* fails.
*/
public String getMimeType(String name) {
MimeType mimeType = this.mimeTypes.getMimeType(name);
if (mimeType != null)
return mimeType.getName();
else
return null;
try {
return tika.detect(name);
} catch (Exception e) {
e.printStackTrace();
return null;
}
}

/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
* Facade interface to Tika's underlying {@link org.apache.tika.Tika#detect(File)}
* method.
*
*
* @param f
* The {@link File} to sense the {@link MimeType} for.
* @return The {@link MimeType} of the given {@link File}, or null if it
* cannot be determined.
*/
public String getMimeType(File f) {
MimeType mimeType = this.mimeTypes.getMimeType(f);
if (mimeType != null)
return mimeType.getName();
else
return null;
try {
return tika.detect(f);
} catch (Exception e) {
System.err.println("\n\n\n");
e.printStackTrace();
System.err.println("\n\n\n");
return null;
}
}

/**
* Utility method to act as a facade to
* {@link org.apache.tika.Tika#detect(byte[])}.
*
*
* @param data
* The byte data to get the {@link MimeType} for.
* @return The String representation of the resolved {@link MimeType}, or
* null if a suitable {@link MimeType} is not found.
*/
public String getMimeTypeByMagic(byte[] data) {
MimeType type = this.mimeTypes.getMimeType(data);
if (type != null) {
return type.getName();
} else
try {
return tika.detect(data);
} catch (Exception e) {
return null;

}
}

public String getDescriptionForMimeType(String mimeType) {
Expand Down

0 comments on commit 260c233

Please sign in to comment.