Skip to content

Commit

Permalink
TIKA-2302 -- make extraction of macros optional in OfficeParsers and …
Browse files Browse the repository at this point in the history
…set default to false
  • Loading branch information
tballison committed Mar 27, 2017
1 parent 7ce58d6 commit 19c0e91
Show file tree
Hide file tree
Showing 15 changed files with 347 additions and 20 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15 - ??

* Change the default behavior of the Office Parsers to _not_ extract
macros. Users who want macros must now call setExtractMacros(true) (TIKA-2302).

* Unified logging across Tika: SLF4J as logging API, Apache Log4j as
implementation with JCL and JUL bridges in standalone tools like
tika-app, tika-batch and tika-server (TIKA-2245).
Expand Down
Expand Up @@ -50,6 +50,13 @@ public boolean getUseSAXDocxExtractor() {
return defaultOfficeParserConfig.getUseSAXDocxExtractor();
}

/**
 * Reports the macro-extraction flag held by the default
 * {@link OfficeParserConfig}.
 *
 * @return whether or not to extract macros
 * @see OfficeParserConfig#getExtractMacros()
 */
public boolean getExtractMacros() {
    final boolean extractMacros = defaultOfficeParserConfig.getExtractMacros();
    return extractMacros;
}

@Field
public void setIncludeDeletedContent(boolean includeDeletedConent) {
Expand All @@ -70,4 +77,9 @@ public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
defaultOfficeParserConfig.setUseSAXPptxExtractor(useSAXPptxExtractor);
}

/**
 * Forwards the macro-extraction flag to the default
 * {@link OfficeParserConfig}.
 *
 * @param extractMacros value handed to
 *                      {@link OfficeParserConfig#setExtractMacros(boolean)}
 */
@Field
public void setExtractMacros(boolean extractMacros) {
    this.defaultOfficeParserConfig.setExtractMacros(extractMacros);
}
}
Expand Up @@ -40,6 +40,7 @@
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.IOUtils;
import org.apache.tika.config.Initializable;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
Expand Down Expand Up @@ -129,10 +130,13 @@ public void parse(
}
}
parse(root, context, metadata, xhtml);
OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);

//now try to get macros
extractMacros(root.getNFileSystem(), xhtml,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
if (officeParserConfig.getExtractMacros()) {
//now try to get macros
extractMacros(root.getNFileSystem(), xhtml,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
}
} finally {
IOUtils.closeQuietly(mustCloseFs);
}
Expand Down
Expand Up @@ -21,12 +21,31 @@

public class OfficeParserConfig implements Serializable {

private boolean extractMacros = false;

private boolean includeDeletedContent = false;
private boolean includeMoveFromContent = false;

private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;

/**
* Sets whether or not MSOffice parsers should extract macros.
* As of Tika 1.15, the default is <code>false</code>.
*
* @param extractMacros <code>true</code> to extract macros,
*                      <code>false</code> to skip them
*/
public void setExtractMacros(boolean extractMacros) {
this.extractMacros = extractMacros;
}

/**
 * Returns the current macro-extraction setting.
 *
 * @return <code>true</code> if macros should be extracted,
 *         <code>false</code> otherwise
 */
public boolean getExtractMacros() {
    return this.extractMacros;
}
/**
* Sets whether or not the parser should include deleted content.
* <p/>
Expand Down
Expand Up @@ -54,6 +54,7 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
Expand Down Expand Up @@ -91,9 +92,11 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {


private final EmbeddedDocumentExtractor embeddedExtractor;
private final ParseContext context;
protected POIXMLTextExtractor extractor;

/**
 * Builds an extractor around the given POI text extractor.
 * <p>
 * The parse context is kept so that per-parse settings can be looked up
 * later, and it is also used here to obtain the embedded document extractor.
 *
 * @param context   parse context for the current parse
 * @param extractor POI text extractor this instance wraps
 */
public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
    this.extractor = extractor;
    this.context = context;
    this.embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
}
Expand Down Expand Up @@ -382,14 +385,17 @@ protected abstract List<PackagePart> getMainDocumentParts()


void handleMacros(PackagePart macroPart, ContentHandler handler) throws TikaException, SAXException {
OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);

try (InputStream is = macroPart.getInputStream()) {
try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
//Macro reading exceptions are already swallowed here
OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
if (officeParserConfig.getExtractMacros()) {
try (InputStream is = macroPart.getInputStream()) {
try (NPOIFSFileSystem npoifs = new NPOIFSFileSystem(is)) {
//Macro reading exceptions are already swallowed here
OfficeParser.extractMacros(npoifs, handler, embeddedExtractor);
}
} catch (IOException e) {
throw new TikaException("Broken OOXML file", e);
}
} catch (IOException e) {
throw new TikaException("Broken OOXML file", e);
}
}

Expand Down
Expand Up @@ -90,7 +90,9 @@ public static void parse(

// Have the appropriate OOXML text extractor picked
POIXMLTextExtractor poiExtractor = null;
OfficeParserConfig config = context.get(OfficeParserConfig.class, new OfficeParserConfig());
//This has already been set by OOXMLParser's call to configure()
//We can rely on this being non-null.
OfficeParserConfig config = context.get(OfficeParserConfig.class);
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
Expand All @@ -109,11 +111,11 @@ public static void parse(
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
extractor = new SXWPFWordExtractorDecorator(metadata, context,
(XWPFEventBasedWordExtractor) poiExtractor);
metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getSimpleName());
metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
} else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
extractor = new SXSLFPowerPointExtractorDecorator(metadata, context,
(XSLFEventBasedPowerPointExtractor) poiExtractor);
metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getSimpleName());
metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
} else if (document == null) {
throw new TikaException(
"Expecting UserModel based POI OOXML extractor with a document, but none found. " +
Expand Down
Expand Up @@ -27,6 +27,7 @@

import org.apache.poi.util.LocaleUtil;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
Expand Down Expand Up @@ -480,14 +481,33 @@ public void testBigIntegersWGeneralFormat() throws Exception {

/**
 * Checks macro handling for <code>testEXCEL_macro.xls</code> in three steps:
 * nothing of type <code>text/x-vbasic</code> is produced with the default
 * configuration; the macro is produced when an {@link OfficeParserConfig}
 * with <code>setExtractMacros(true)</code> is placed in the parse context;
 * and it is produced when the parser is built from
 * <code>tika-config-macros.xml</code> (expected to enable macro extraction).
 */
@Test
public void testMacros() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xls")) {
        // constant-first equals: an entry without a content type must not NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extract macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    // Only the explicitly configured parse is expected to contain the macro;
    // asserting it on the default parse would contradict the check above.
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", context));

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xls", parser));
}

}
Expand Up @@ -261,7 +261,13 @@ public void testMacros() throws Exception {
minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt");
ParseContext parseContext = new ParseContext();
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
officeParserConfig.setExtractMacros(true);
parseContext.set(OfficeParserConfig.class, officeParserConfig);


List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.ppt", parseContext);
assertContainsAtLeast(minExpected, metadataList);
}

Expand Down
Expand Up @@ -29,11 +29,13 @@
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
Expand Down Expand Up @@ -525,14 +527,36 @@ public void testBoldHyperlink() throws Exception {

/**
 * Checks macro handling for <code>testWORD_macros.doc</code> in three steps:
 * nothing of type <code>text/x-vbasic</code> is produced with the default
 * configuration; the macros are produced when an {@link OfficeParserConfig}
 * with <code>setExtractMacros(true)</code> is placed in the parse context;
 * and they are produced when the parser is built from
 * <code>tika-config-macros.xml</code> (expected to enable macro extraction).
 */
@Test
public void testMacros() throws Exception {

    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) {
        // constant-first equals: an entry without a content type must not NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extract macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    // Single declaration: the list comes from the explicitly configured parse.
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc", context);
    assertContainsAtLeast(minExpected, metadataList);

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);

    metadataList = getRecursiveMetadata("testWORD_macros.doc", parser);
    assertContainsAtLeast(minExpected, metadataList);
}

Expand Down
Expand Up @@ -20,6 +20,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
Expand Down Expand Up @@ -1288,38 +1289,100 @@ public void testLongForIntExceptionInSummaryDetails() throws Exception {

/**
 * Checks macro handling for <code>testWORD_macros.docm</code> in three steps:
 * nothing of type <code>text/x-vbasic</code> is produced with the default
 * configuration; the macros are produced when an {@link OfficeParserConfig}
 * with <code>setExtractMacros(true)</code> is placed in the parse context;
 * and they are produced when the parser is built from
 * <code>tika-config-dom-macros.xml</code> (expected to enable macro extraction).
 */
@Test
public void testMacrosInDocm() throws Exception {

    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) {
        // constant-first equals: an entry without a content type must not NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extract macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    // Only the explicitly configured parse is expected to contain the macros;
    // asserting them on the default parse would contradict the check above.
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context));

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parser));
}

/**
 * Checks macro handling for <code>testPPT_macros.pptm</code> in three steps:
 * nothing of type <code>text/x-vbasic</code> is produced with the default
 * configuration; the macros are produced when an {@link OfficeParserConfig}
 * with <code>setExtractMacros(true)</code> is placed in the parse context;
 * and they are produced when the parser is built from
 * <code>tika-config-dom-macros.xml</code> (expected to enable macro extraction).
 */
@Test
public void testMacrosInPptm() throws Exception {

    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
        // constant-first equals: an entry without a content type must not NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extract macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    // Only the explicitly configured parse is expected to contain the macros;
    // asserting them on the default parse would contradict the check above.
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context));

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser));
}

/**
 * Checks macro handling for <code>testEXCEL_macro.xlsm</code> in three steps:
 * nothing of type <code>text/x-vbasic</code> is produced with the default
 * configuration; the macro is produced when an {@link OfficeParserConfig}
 * with <code>setExtractMacros(true)</code> is placed in the parse context;
 * and it is produced when the parser is built from
 * <code>tika-config-dom-macros.xml</code> (expected to enable macro extraction).
 */
@Test
public void testMacroinXlsm() throws Exception {

    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm")) {
        // constant-first equals: an entry without a content type must not NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extract macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    // Only the explicitly configured parse is expected to contain the macro;
    // asserting it on the default parse would contradict the check above.
    assertContainsAtLeast(minExpected,
            getRecursiveMetadata("testEXCEL_macro.xlsm", context));

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", parser));
}

//@Test //use this for lightweight benchmarking to compare xwpf options
Expand Down

0 comments on commit 19c0e91

Please sign in to comment.