Skip to content

Commit

Permalink
[TIKA-4119]: Return media type "text/javascript" instead of "applicat…
Browse files Browse the repository at this point in the history
…ion/javascript" to follow RFC-9239 (#1556)

* [TIKA-4119]: Return media type "text/javascript" instead of "application/javascript"

Following RFC 9239. This also adds support for ".mjs" (as documented in the RFC).
  • Loading branch information
marcospereira committed Jan 24, 2024
1 parent 589d02a commit ae737cd
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 30 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ Release 3.0.0-BETA - 12/01/2023
* Tika will look for "custom-mimetypes.xml" directly on the classpath, NOT
under "/org/apache/tika/mime/". (TIKA-4147).

* Return media type "text/javascript" instead of "application/javascript"
to follow RFC-9239. (TIKA-4119).

Other Changes/Updates

* Upgrade PDFBox to 3.0.1 (TIKA-3347)
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Tika jars can be fetched from Maven Central or your favourite Maven mirror.

**Tika 1.X reached End of Life (EOL) on September 30, 2022.**

Tika is based on **Java 8** and uses the [Maven 3](https://maven.apache.org) build system.
Tika is based on **Java 11** and uses the [Maven 3](https://maven.apache.org) build system.
**N.B.** [Docker](https://www.docker.com/products/personal) is used for tests in tika-integration-tests.
As of Tika 2.5.1, if Docker is not installed, those tests are skipped. Docker is required for a successful
build on earlier 2.x versions.
Expand Down Expand Up @@ -50,7 +50,7 @@ Maven Dependencies

Apache Tika provides a *Bill of Materials* (BOM) artifact to align Tika module versions and simplify version management.
To avoid convergence errors in your own project, import this
bom or Tika's parent pom.xml in your dependencey management section.
bom or Tika's parent pom.xml in your dependency management section.

If you use Apache Maven:

Expand Down Expand Up @@ -170,7 +170,7 @@ Notification on all code changes are sent to the following mailing list:
The mailing lists are open to anyone and publicly archived.

You can subscribe to the mailing lists by sending a message to
[LIST]-subscribe@tika.apache.org (for example user-subscribe@...).
[LIST]-subscribe@tika.apache.org (for example, user-subscribe@...).
To unsubscribe, send a message to [LIST]-unsubscribe@tika.apache.org.
For more instructions, send a message to [LIST]-help@tika.apache.org.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,15 +366,18 @@
<glob pattern="*.ser"/>
</mime-type>

<mime-type type="application/javascript">
<mime-type type="text/javascript">
<alias type="application/javascript"/>
<alias type="application/x-javascript"/>
<alias type="text/javascript"/>
<sub-class-of type="text/plain"/>
<_comment>JavaScript Source Code</_comment>
<!-- From RFC 9239: https://www.rfc-editor.org/rfc/rfc9239.html#name-text-javascript -->
<!-- File extension(s): .js, .mjs -->
<glob pattern="*.js"/>
<glob pattern="*.mjs"/>

<!-- Note - there is no Unique Magic for JavaScript files! -->
<!-- Generally you can only detect JS with the filename -->
<!-- Generally, you can only detect JS with the filename -->
<!-- However... A few common JS libraries accidentally trigger -->
<!-- the HTML priority=20 magic incorrectly. So, for those only, -->
<!-- we list "magic" for those specific files -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ public void testHttpServerFileExtensions() {
assertEquals("application/java-archive", tika.detect("x.jar"));
assertEquals("application/java-serialized-object", tika.detect("x.ser"));
assertEquals("application/java-vm", tika.detect("x.class"));
assertEquals("application/javascript", tika.detect("x.js"));
assertEquals("text/javascript", tika.detect("x.js"));
assertEquals("text/javascript", tika.detect("x.mjs"));
assertEquals("application/json", tika.detect("x.json"));
assertEquals("application/lost+xml", tika.detect("x.lostxml"));
assertEquals("application/mac-binhex40", tika.detect("x.hqx"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.List;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
Expand Down Expand Up @@ -279,6 +280,24 @@ public void testGetExtensionForPowerPoint() throws Exception {
assertEquals(".ppt", mt.getExtensions().get(0));
}

@Test
public void testGetExtensionForJavaScript() throws Exception {
MimeType mt = this.mimeTypes.forName("text/javascript");
assertEquals(".js", mt.getExtension());
assertEquals(List.of(".js", ".mjs"), mt.getExtensions());
}

@Test
public void testGetAliasForJavaScript() throws Exception {
MimeType mt = this.mimeTypes.forName("text/javascript");
Set<String> aliases = mimeTypes.getMediaTypeRegistry()
.getAliases(mt.getType())
.stream()
.map(MediaType::toString)
.collect(Collectors.toSet());
assertEquals(Set.of("application/javascript", "application/x-javascript"), aliases);
}

@Test
public void testGetRegisteredMimesWithParameters() throws Exception {
//TIKA-1692
Expand Down Expand Up @@ -351,40 +370,32 @@ public void testMinShouldMatch() throws Exception {
}

@Test
public void testBadMinShouldMatch1() throws Exception {
public void testBadMinShouldMatch1() {
System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch1.xml");

assertThrows(IllegalArgumentException.class, () -> {
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
});
assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}

@Test
public void testBadMinShouldMatch2() throws Exception {
public void testBadMinShouldMatch2() {
System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch2.xml");
assertThrows(IllegalArgumentException.class, () -> {
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
});
assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}

@Test
public void testBadMinShouldMatch3() throws Exception {
public void testBadMinShouldMatch3() {
System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch3.xml");
assertThrows(IllegalArgumentException.class, () -> {
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
});
assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}

@Test
public void testBadMinShouldMatch4() throws Exception {
public void testBadMinShouldMatch4() {
System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch4.xml");
assertThrows(IllegalArgumentException.class, () -> {
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
});
assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}

private static class CustomClassLoader extends ClassLoader {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,6 @@ public void testTIKA2237() throws IOException {
"} catch (e) {\n" + " console.log(e);\n" + "}")
.getBytes(StandardCharsets.UTF_8));
MediaType detect = new ProbabilisticMimeDetectionSelector().detect(input, metadata);
assertEquals(MediaType.application("javascript"), detect);
assertEquals(MediaType.text("javascript"), detect);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -718,8 +718,8 @@ public void testTextBasedFormatsDetection() throws Exception {
assertTypeByName("text/html", "testHTML.html");
assertType("text/html", "testHTML.html");

assertTypeByName("application/javascript", "testJS.js");
assertType("application/javascript", "testJS.js");
assertTypeByName("text/javascript", "testJS.js");
assertType("text/javascript", "testJS.js");

assertType("text/vnd.graphviz", "testGRAPHVIZd.dot");
assertType("text/vnd.graphviz", "testGRAPHVIZg.dot");
Expand Down Expand Up @@ -1148,10 +1148,10 @@ public void testCodeFormats() throws Exception {
assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");

// By name, or by name+data, gets it as JS
assertTypeByName("application/javascript", "testJS.js");
assertTypeByName("application/javascript", "testJS_HTML.js");
assertType("application/javascript", "testJS.js");
assertType("application/javascript", "testJS_HTML.js");
assertTypeByName("text/javascript", "testJS.js");
assertTypeByName("text/javascript", "testJS_HTML.js");
assertType("text/javascript", "testJS.js");
assertType("text/javascript", "testJS_HTML.js");

// With data only, because we have no JS file magic, can't be
// detected. One will come through as plain text, the other
Expand Down

0 comments on commit ae737cd

Please sign in to comment.