Skip to content

Commit

Permalink
Switched to Tika to determine document type
Browse files Browse the repository at this point in the history
  • Loading branch information
Claudenw committed Apr 18, 2024
1 parent 1fea3ea commit 6b09f8e
Show file tree
Hide file tree
Showing 12 changed files with 542 additions and 44 deletions.
5 changes: 5 additions & 0 deletions apache-rat-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -126,5 +126,10 @@
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>
<!-- Version intentionally omitted: tika-core's version is declared once in the
     root pom.xml dependencyManagement section (assumes this module inherits
     from that POM, as the version-less assertj-core entry suggests). -->
<dependency>
  <groupId>org.apache.tika</groupId>
  <artifactId>tika-core</artifactId>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,25 @@
*/
package org.apache.rat.analysis;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;

import org.apache.rat.ConfigurationException;
import org.apache.rat.api.Document;
import org.apache.rat.api.Document.Type;
import org.apache.rat.document.IDocumentAnalyser;
import org.apache.rat.document.RatDocumentAnalysisException;
import org.apache.rat.document.impl.guesser.ArchiveGuesser;
import org.apache.rat.document.impl.guesser.BinaryGuesser;
import org.apache.rat.document.impl.guesser.NoteGuesser;
import org.apache.rat.license.ILicense;
import org.apache.rat.utils.Log;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

/**
* Creates default analysers.
Expand Down Expand Up @@ -71,17 +79,52 @@ public DefaultAnalyser(final Log log, final Collection<ILicense> licenses) {
this.log = log;
}

// @Override
// public void analyse(Document document) throws RatDocumentAnalysisException {
// if (NoteGuesser.isNote(document)) {
// document.getMetaData().setDocumentType(Document.Type.NOTICE);
// } else if (ArchiveGuesser.isArchive(document)) {
// document.getMetaData().setDocumentType(Document.Type.ARCHIVE);
// } else if (BinaryGuesser.isBinary(document)) {
// document.getMetaData().setDocumentType(Document.Type.BINARY);
// } else {
// document.getMetaData().setDocumentType(Document.Type.STANDARD);
// new DocumentHeaderAnalyser(log, licenses).analyse(document);
// }
// }



/**
 * Classifies the document from the content type reported by Tika and, when
 * the document is a STANDARD one, runs the license header analysis on it.
 *
 * A document Tika classifies as STANDARD is re-labelled NOTICE when
 * {@link NoteGuesser#isNote(Document)} recognises it. Header analysis runs
 * exactly once, and only for documents that are still STANDARD afterwards.
 *
 * @param document the document to classify and analyse.
 * @throws RatDocumentAnalysisException if the document cannot be read or
 * Tika fails while parsing it; the original exception is kept as the cause.
 */
@Override
public void analyse(Document document) throws RatDocumentAnalysisException {
    // The extracted body text is never read here; -1 disables the handler's
    // default write limit so a large document does not abort the parse.
    BodyContentHandler handler = new BodyContentHandler(-1);
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();

    try (InputStream stream = document.inputStream()) {
        parser.parse(stream, handler, metadata);
        document.getMetaData()
                .setDocumentType(Document.Type.fromContentType(metadata.get(Metadata.CONTENT_TYPE), log));

        // The content type alone cannot tell a NOTICE from other text files,
        // so the name-based guesser refines the STANDARD classification.
        if (Type.STANDARD == document.getMetaData().getDocumentType() && NoteGuesser.isNote(document)) {
            document.getMetaData().setDocumentType(Document.Type.NOTICE);
        }

        // Only STANDARD documents carry license headers worth analysing;
        // NOTICE, ARCHIVE, BINARY and UNKNOWN documents need no further work.
        if (Type.STANDARD == document.getMetaData().getDocumentType()) {
            new DocumentHeaderAnalyser(log, licenses).analyse(document);
        }
    } catch (IOException | SAXException | TikaException e) {
        throw new RatDocumentAnalysisException(e);
    }
}
Expand Down
392 changes: 382 additions & 10 deletions apache-rat-core/src/main/java/org/apache/rat/api/Document.java

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions apache-rat-core/src/test/java/org/apache/rat/ReportTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public void testOutputOption() throws Exception {
new Reporter(config).output();
assertTrue(output.exists());
String content = FileUtils.readFileToString(output, StandardCharsets.UTF_8);
assertTrue(content.contains("2 Unknown Licenses"));
assertTrue(content.contains("3 Unknown Licenses"));
assertTrue(content.contains("target/test-classes/elements/Source.java"));
assertTrue(content.contains("target/test-classes/elements/sub/Empty.txt"));
}
Expand All @@ -91,13 +91,13 @@ public void testDefaultOutput() throws Exception {
}
assertTrue(output.exists());
String content = FileUtils.readFileToString(output, StandardCharsets.UTF_8);
TextUtils.isMatching("Notes: 2$", content);
TextUtils.isMatching("Binaries: 2$", content);
TextUtils.isMatching("Archives: 1$", content);
TextUtils.isMatching("Standards: 8$", content);
TextUtils.isMatching("Apache Licensed: 5$", content);
TextUtils.isMatching("Generated Documents 1$", content);
TextUtils.isMatching("^2 Unknown licenses", content);
TextUtils.assertPatternInOutput("Notes: 2$", content);
TextUtils.assertPatternInOutput("Binaries: 1$", content);
TextUtils.assertPatternInOutput("Archives: 1$", content);
TextUtils.assertPatternInOutput("Standards: 9$", content);
TextUtils.assertPatternInOutput("Apache Licensed: 5$", content);
TextUtils.assertPatternInOutput("Generated Documents: 1$", content);
TextUtils.assertPatternInOutput("^3 Unknown Licenses", content);
assertTrue(content.contains(" S target/test-classes/elements/ILoggerFactory.java"));
assertTrue(content.contains(" B target/test-classes/elements/Image.png"));
assertTrue(content.contains(" N target/test-classes/elements/LICENSE"));
Expand All @@ -108,7 +108,7 @@ public void testDefaultOutput() throws Exception {
assertTrue(content.contains(" S target/test-classes/elements/Xml.xml"));
assertTrue(content.contains(" S target/test-classes/elements/buildr.rb"));
assertTrue(content.contains(" A target/test-classes/elements/dummy.jar"));
assertTrue(content.contains(" B target/test-classes/elements/plain.json"));
assertTrue(content.contains("!S target/test-classes/elements/plain.json"));
assertTrue(content.contains("!S target/test-classes/elements/sub/Empty.txt"));
assertTrue(content.contains(" S target/test-classes/elements/tri.txt"));
assertTrue(content.contains(" G target/test-classes/elements/generated.txt"));
Expand All @@ -132,7 +132,7 @@ public void testXMLOutput() throws Exception {
XPath xPath = XPathFactory.newInstance().newXPath();

NodeList nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource/license[@approval='false']");
assertEquals(2, nodeList.getLength());
assertEquals(3, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource/license[@id='AL']");
assertEquals(5, nodeList.getLength());
Expand All @@ -147,17 +147,17 @@ public void testXMLOutput() throws Exception {
assertEquals(1, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource/license[@id='?????']");
assertEquals(2, nodeList.getLength());
assertEquals(3, nodeList.getLength());

// GENERATED, UNKNOWN, ARCHIVE, NOTICE, BINARY, STANDARD
nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource[@type='STANDARD']");
assertEquals(8, nodeList.getLength());
assertEquals(9, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource[@type='ARCHIVE']");
assertEquals(1, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource[@type='BINARY']");
assertEquals(2, nodeList.getLength());
assertEquals(1, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource[@type='GENERATED']");
assertEquals(1, nodeList.getLength());
Expand All @@ -169,7 +169,7 @@ public void testXMLOutput() throws Exception {
assertEquals(2, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource/sample");
assertEquals(1, nodeList.getLength());
assertEquals(2, nodeList.getLength());

nodeList = XmlUtils.getNodeList(doc, xPath, "/rat-report/resource[@type='GENERATED']/license/notes");
assertEquals(1, nodeList.getLength());
Expand Down
14 changes: 8 additions & 6 deletions apache-rat-core/src/test/java/org/apache/rat/ReporterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ public void xmlReportTest() throws Exception {
checkNode(doc, xPath, "src/test/resources/elements/Xml.xml", apacheLic, "STANDARD", false);
checkNode(doc, xPath, "src/test/resources/elements/buildr.rb", apacheLic, "STANDARD", false);
checkNode(doc, xPath, "src/test/resources/elements/dummy.jar", null, "ARCHIVE", false);
checkNode(doc, xPath, "src/test/resources/elements/plain.json", null, "BINARY", false);
checkNode(doc, xPath, "src/test/resources/elements/plain.json", null, "STANDARD", false);
checkNode(doc, xPath, "src/test/resources/elements/sub/Empty.txt", new LicenseInfo("?????", false, false),
"STANDARD", false);
checkNode(doc, xPath, "src/test/resources/elements/tri.txt", apacheLic, "STANDARD", false);
Expand Down Expand Up @@ -162,14 +162,16 @@ public void plainReportTest() throws Exception {
assertTrue(document.startsWith(HEADER), "'Generated at' is not present in " + document);

TextUtils.assertPatternInOutput("^Notes: 2$", document);
TextUtils.assertPatternInOutput("^Binaries: 2$", document);
TextUtils.assertPatternInOutput("^Binaries: 1$", document);
TextUtils.assertPatternInOutput("^Archives: 1$", document);
TextUtils.assertPatternInOutput("^Standards: 8$", document);
TextUtils.assertPatternInOutput("^Standards: 9$", document);
TextUtils.assertPatternInOutput("^Apache Licensed: 5$", document);
TextUtils.assertPatternInOutput("^Generated Documents: 1$", document);
TextUtils.assertPatternInOutput("^2 Unknown Licenses$", document);
TextUtils.assertPatternInOutput("^3 Unknown Licenses$", document);
TextUtils.assertPatternInOutput(
"^Files with unapproved licenses:\\s+" + "\\Qsrc/test/resources/elements/Source.java\\E\\s+"
"^Files with unapproved licenses:\\s+" //
+ "\\Qsrc/test/resources/elements/Source.java\\E\\s+" //
+ "\\Qsrc/test/resources/elements/plain.json\\E\\s+" //
+ "\\Qsrc/test/resources/elements/sub/Empty.txt\\E\\s",
document);
TextUtils.assertPatternInOutput(documentOut(true, Type.ARCHIVE, "src/test/resources/elements/dummy.jar"),
Expand All @@ -193,7 +195,7 @@ public void plainReportTest() throws Exception {
+ licenseOut("AL", "Apache License Version 2.0"), document);
TextUtils.assertPatternInOutput(documentOut(true, Type.STANDARD, "src/test/resources/elements/TextHttps.txt")
+ licenseOut("AL", "Apache License Version 2.0"), document);
TextUtils.assertPatternInOutput(documentOut(true, Type.BINARY, "src/test/resources/elements/plain.json"),
TextUtils.assertPatternInOutput(documentOut(false, Type.STANDARD, "src/test/resources/elements/plain.json"),
document);
TextUtils.assertPatternInOutput(documentOut(true, Type.STANDARD, "src/test/resources/elements/tri.txt")
+ licenseOut("AL", "Apache License Version 2.0") + licenseOut("BSD-3", "BSD 3 clause")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
*/
package org.apache.rat.analysis;

import static org.junit.Assert.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

Expand All @@ -31,6 +30,7 @@
import org.apache.rat.report.claim.impl.xml.SimpleXmlClaimReporter;
import org.apache.rat.report.xml.writer.impl.base.XmlWriter;
import org.apache.rat.test.utils.Resources;
import org.apache.rat.testhelpers.TextUtils;
import org.apache.rat.utils.DefaultLog;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand Down Expand Up @@ -72,13 +72,13 @@ public void standardTypeAnalyser() throws Exception {
" * specific language governing permissions and limitations", //
" * under the License.", //
" ]]></sample></resource>" };

final MonolithicFileDocument document = new MonolithicFileDocument(
Resources.getResourceFile("/elements/Text.txt"));
analyser.analyse(document);
reporter.report(document);
String result = out.toString();
for (String exp : expected ) {
for (String exp : expected) {
assertTrue(result.contains(exp), () -> exp);
}
}
Expand Down Expand Up @@ -122,4 +122,57 @@ public void archiveTypeAnalyserIntelliJ() throws Exception {
assertEquals("<resource name='src/test/resources/elements/dummy.jar' type='ARCHIVE'/>", out.toString(),
"Open archive element");
}

/**
 * Checks that the RAT211 bitmap sample is reported as a BINARY resource.
 */
@Test
public void RAT211_bmp_Test() throws Exception {
    MonolithicFileDocument document = new MonolithicFileDocument(
            Resources.getResourceFile("/jira/RAT211/side_left.bmp"));
    analyser.analyse(document);
    reporter.report(document);
    // The failure message names the element type actually under test (BINARY),
    // not the archive wording used by the archive tests.
    assertEquals("<resource name='src/test/resources/jira/RAT211/side_left.bmp' type='BINARY'/>", out.toString(),
            "Open binary element");
}

/**
 * Checks that the RAT211 Dia diagram sample is reported as an ARCHIVE resource.
 */
@Test
public void RAT211_dia_Test() throws Exception {
    final String expectedXml =
            "<resource name='src/test/resources/jira/RAT211/leader-election-message-arrives.dia' type='ARCHIVE'/>";
    final MonolithicFileDocument diaFile = new MonolithicFileDocument(
            Resources.getResourceFile("/jira/RAT211/leader-election-message-arrives.dia"));

    // Classify the file, then serialise the resulting claim through the reporter.
    analyser.analyse(diaFile);
    reporter.report(diaFile);

    final String actualXml = out.toString();
    assertEquals(expectedXml, actualXml, "Open archive element");
}

/**
 * Checks that the RAT147 sample with Unix line endings is reported as a
 * STANDARD resource and that each of its four sentences appears in the output.
 */
@Test
public void RAT147_unix_Test() throws Exception {
    MonolithicFileDocument document = new MonolithicFileDocument(
            Resources.getResourceFile("/jira/RAT147/unix-newlines.txt.bin"));
    analyser.analyse(document);
    reporter.report(document);
    String result = out.toString();

    // The expected text is wrapped in \Q...\E so that '.' and other regex
    // metacharacters are matched literally rather than as wildcards.
    TextUtils.assertPatternInOutput(
            "\\Q<resource name='src/test/resources/jira/RAT147/unix-newlines.txt.bin' type='STANDARD'\\E",
            result);
    // The first sentence is only anchored at line end, as in the original
    // assertion (presumably it shares a line with the sample's opening markup).
    TextUtils.assertPatternInOutput("\\Qsentence 1.\\E$", result);
    // The remaining sentences must each occupy a complete line.
    for (int sentence = 2; sentence <= 4; sentence++) {
        TextUtils.assertPatternInOutput("^\\Qsentence " + sentence + ".\\E$", result);
    }
}

/**
 * Checks that the RAT147 sample with Windows line endings is reported as a
 * STANDARD resource and that each of its four sentences appears in the output.
 */
@Test
public void RAT147_windows_Test() throws Exception {
    MonolithicFileDocument document = new MonolithicFileDocument(
            Resources.getResourceFile("/jira/RAT147/windows-newlines.txt.bin"));
    analyser.analyse(document);
    reporter.report(document);
    String result = out.toString();

    // The expected text is wrapped in \Q...\E so that '.' and other regex
    // metacharacters are matched literally rather than as wildcards.
    TextUtils.assertPatternInOutput(
            "\\Q<resource name='src/test/resources/jira/RAT147/windows-newlines.txt.bin' type='STANDARD'\\E",
            result);
    // The first sentence is only anchored at line end, as in the original
    // assertion (presumably it shares a line with the sample's opening markup).
    TextUtils.assertPatternInOutput("\\Qsentence 1.\\E$", result);
    // The remaining sentences must each occupy a complete line.
    for (int sentence = 2; sentence <= 4; sentence++) {
        TextUtils.assertPatternInOutput("^\\Qsentence " + sentence + ".\\E$", result);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ public void standardReport() throws Exception {
"Preamble and document element are OK");

assertTrue(XmlUtils.isWellFormedXml(output), "Is well formed");
assertEquals(2, statistic.getDocumentCategoryMap().get(Document.Type.BINARY)[0], "Binary files");
assertEquals(1, statistic.getDocumentCategoryMap().get(Document.Type.BINARY)[0], "Binary files");
assertEquals(2, statistic.getDocumentCategoryMap().get(Document.Type.NOTICE)[0], "Notice files");
assertEquals(8, statistic.getDocumentCategoryMap().get(Document.Type.STANDARD)[0], "Standard files");
assertEquals(9, statistic.getDocumentCategoryMap().get(Document.Type.STANDARD)[0], "Standard files");
assertEquals(1, statistic.getDocumentCategoryMap().get(Document.Type.ARCHIVE)[0], "Archives");
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
sentence 1.
sentence 2.


sentence 3.

sentence 4.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
sentence 1.
sentence 2.


sentence 3.

sentence 4.


Binary file not shown.
Binary file not shown.
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,12 @@ agnostic home for software distribution comprehension and audit tools.
<version>3.25.3</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.2</version>
</dependency>
</dependencies>
</dependencyManagement>
<reporting>
Expand Down

0 comments on commit 6b09f8e

Please sign in to comment.