Skip to content

Commit

Permalink
TIKA-1195 and TIKA-2329
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Apr 19, 2017
1 parent a970303 commit 67612b8
Show file tree
Hide file tree
Showing 8 changed files with 347 additions and 74 deletions.
14 changes: 8 additions & 6 deletions CHANGES.txt
@@ -1,5 +1,12 @@
Release 1.15 - ??

* Change default behavior to parse embedded documents even if the user
forgets to specify a Parser.class in the ParseContext (TIKA-2096).
Users who wish to parse only the container document should set
an EmptyParser as the Parser.class in the ParseContext.

* Add support for the XLSB format (TIKA-1195).

* Change default behavior of Office Parsers to _not_ extract
Macros. To extract macros, users need to call setExtractMacros(true) (TIKA-2302).

Expand Down Expand Up @@ -64,14 +71,9 @@ Release 1.15 - ??
* Added experimental SAX parser for .docx files. To select this parser,
set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321, TIKA-2191).

* Change default behavior to parse embedded documents even if the user
forgets to specify a Parser.class in the ParseContext (TIKA-2096).
Users who wish to parse only the container document should set
an EmptyParser as the Parser.class in the ParseContext.

* Add mime detection and parser for Word 2006ML format (TIKA-2179).

* Upgrade to POI 3.16-beta2 (TIKA-2116, TIKA-2181).
* Upgrade to POI 3.16 (TIKA-2116, TIKA-2181, TIKA-2329).

* Allow configuration of timeout for ForkParser (TIKA-2170).

Expand Down
2 changes: 1 addition & 1 deletion tika-parsers/pom.xml
Expand Up @@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>

<properties>
<poi.version>3.16-beta2</poi.version>
<poi.version>3.16</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
Expand Down
Expand Up @@ -32,6 +32,7 @@
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
Expand Down Expand Up @@ -104,8 +105,10 @@ public static void parse(
}

POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);

if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
} else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(
context, poiExtractor, locale);
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
Expand Down
Expand Up @@ -74,6 +74,8 @@ public class OOXMLParser extends AbstractOfficeParser {
MediaType.application("vnd.ms-visio.drawing"),
MediaType.application("vnd.ms-xpsdocument"),
MediaType.parse("model/vnd.dwfx+xps")
// MediaType.application("x-tika-ooxml")

)));
/**
* We claim to support all OOXML files, but we actually don't support a small
Expand All @@ -82,10 +84,9 @@ public class OOXMLParser extends AbstractOfficeParser {
* by Tika and/or POI.
*/
protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("vnd.ms-xpsdocument"),
MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12")
)));
Collections.singleton(
MediaType.application("vnd.ms-xpsdocument")
);
/**
* Serial version UID
*/
Expand Down
@@ -0,0 +1,282 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xssf.binary.XSSFBCommentsTable;
import org.apache.poi.xssf.binary.XSSFBSharedStringsTable;
import org.apache.poi.xssf.binary.XSSFBSheetHandler;
import org.apache.poi.xssf.binary.XSSFBStylesTable;
import org.apache.poi.xssf.eventusermodel.XSSFBReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Variant of {@link XSSFExcelExtractorDecorator} that renders workbooks read
 * through POI's XSSFB (binary) event API: {@link XSSFBReader},
 * {@link XSSFBSheetHandler} and {@link XSSFBEventBasedExcelExtractor}.
 * <p>
 * For every sheet it emits a {@code div} containing the sheet name, a table
 * with the cell contents, any headers and footers, text and hyperlinks of
 * simple shapes, and finally the sheet-level hyperlinks.
 */
public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {

    /**
     * @param context   parse context handed to the superclass
     * @param extractor POI text extractor; {@link #configureExtractor} casts it
     *                  to {@link XSSFBEventBasedExcelExtractor}
     * @param locale    locale handed to the superclass and to the extractor
     */
    public XSSFBExcelExtractorDecorator(
            ParseContext context, POIXMLTextExtractor extractor, Locale locale) {
        super(context, extractor, locale);
    }

    /**
     * Configures the underlying extractor. Only the locale is set here:
     * {@code setFormulasNotResults} is not yet available for xlsb, so the
     * superclass version of this method cannot be used.
     */
    @Override
    protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
        ((XSSFBEventBasedExcelExtractor) extractor).setLocale(locale);
    }

    /**
     * Stores the metadata and parse context on this instance, sets
     * {@link TikaMetadataKeys#PROTECTED} to {@code "false"} and then delegates
     * to the superclass.
     */
    @Override
    public void getXHTML(
            ContentHandler handler, Metadata metadata, ParseContext context)
            throws SAXException, XmlException, IOException, TikaException {

        this.metadata = metadata;
        this.parseContext = context;
        metadata.set(TikaMetadataKeys.PROTECTED, "false");

        super.getXHTML(handler, metadata, context);
    }

    /**
     * Writes one {@code div} per sheet to {@code xhtml}.
     *
     * @throws XmlException wrapping any {@link OpenXML4JException} raised
     *                      while the workbook readers are being set up
     * @see org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor#getText()
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
            XmlException, IOException {
        OPCPackage container = extractor.getPackage();

        XSSFBSharedStringsTable strings;
        XSSFBReader.SheetIterator iter;
        XSSFBStylesTable styles;
        try {
            XSSFBReader xssfReader = new XSSFBReader(container);
            styles = xssfReader.getXSSFBStylesTable();
            iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData();
            strings = new XSSFBSharedStringsTable(container);
        } catch (OpenXML4JException e) {
            // InvalidFormatException is a subclass, so a single catch covers both
            throw new XmlException(e);
        }

        while (iter.hasNext()) {
            // The sheet stream is closed once the sheet has been handled;
            // previously it was left open.
            try (InputStream stream = iter.next()) {
                PackagePart sheetPart = iter.getSheetPart();
                addDrawingHyperLinks(sheetPart);
                sheetParts.add(sheetPart);

                SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
                XSSFBCommentsTable comments = iter.getXSSFBSheetComments();

                // Start, and output the sheet name
                xhtml.startElement("div");
                xhtml.element("h1", iter.getSheetName());

                // Extract the main sheet contents
                xhtml.startElement("table");
                xhtml.startElement("tbody");

                processSheet(sheetExtractor, comments, styles, strings, stream);

                xhtml.endElement("tbody");
                xhtml.endElement("table");

                // Output any headers and footers
                // (Need to process the sheet to get them, so we can't
                // do the headers before the contents)
                for (String header : sheetExtractor.headers) {
                    extractHeaderFooter(header, xhtml);
                }
                for (String footer : sheetExtractor.footers) {
                    extractHeaderFooter(footer, xhtml);
                }
                List<XSSFShape> shapes = iter.getShapes();
                processShapes(shapes, xhtml);

                //for now dump sheet hyperlinks at bottom of page
                //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
                //step 1: extract hyperlink info from bottom of page
                //step 2: process as we do now, but with cached hyperlink relationship info
                extractHyperLinks(sheetPart, xhtml);
                // All done with this sheet
                xhtml.endElement("div");
            }
        }
    }

    /**
     * Writes a non-empty header or footer string as a {@code p} element;
     * empty strings produce no output.
     */
    @Override
    protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
            throws SAXException {
        if (hf.length() > 0) {
            xhtml.element("p", hf);
        }
    }

    /**
     * Writes an {@code a} element for every sheet-hyperlink relationship of
     * {@code sheetPart}, using the target URI as both href and link text.
     */
    private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
        try {
            for (PackageRelationship rel : sheetPart.getRelationshipsByType(
                    XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                String target = rel.getTargetURI().toString();
                xhtml.startElement("a", "href", target);
                xhtml.characters(target);
                xhtml.endElement("a");
            }
        } catch (InvalidFormatException ignored) {
            // Best effort: if the relationships cannot be read, the sheet
            // hyperlinks are skipped and the rest of the sheet output is kept.
        }
    }

    /**
     * Writes the text of every {@link XSSFSimpleShape} as a {@code p} element,
     * followed by the shape's hyperlinks. Other shape types are ignored.
     *
     * @param shapes shapes of the current sheet; may be {@code null}
     */
    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
        if (shapes == null) {
            return;
        }
        for (XSSFShape shape : shapes) {
            if (shape instanceof XSSFSimpleShape) {
                XSSFSimpleShape simpleShape = (XSSFSimpleShape) shape;
                String sText = simpleShape.getText();
                if (sText != null && sText.length() > 0) {
                    xhtml.element("p", sText);
                }
                extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
            }
        }
    }

    /**
     * Writes the click hyperlink and then the hover hyperlink of a shape.
     * <p>
     * The two links are handled independently: a shape that has only a hover
     * hyperlink still gets it written (previously a missing click hyperlink
     * caused the hover hyperlink to be skipped).
     */
    private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
        if (ctShape == null) {
            return;
        }

        CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
        if (nvSpPR == null) {
            return;
        }

        CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
        if (cNvPr == null) {
            return;
        }

        writeDrawingHyperlink(cNvPr.getHlinkClick(), xhtml);
        writeDrawingHyperlink(cNvPr.getHlinkHover(), xhtml);
    }

    /**
     * Looks up the hyperlink's id in {@code drawingHyperlinks} and, when a URL
     * is found, writes it as an {@code a} element. Does nothing for a
     * {@code null} hyperlink or an id without a URL.
     */
    private void writeDrawingHyperlink(CTHyperlink ctHyperlink, XHTMLContentHandler xhtml)
            throws SAXException {
        if (ctHyperlink == null) {
            return;
        }
        String url = drawingHyperlinks.get(ctHyperlink.getId());
        if (url != null) {
            xhtml.startElement("a", "href", url);
            xhtml.characters(url);
            xhtml.endElement("a");
        }
    }

    /**
     * Runs an {@link XSSFBSheetHandler} over one sheet stream, sending cell
     * contents to {@code sheetContentsExtractor}.
     */
    private void processSheet(
            SheetContentsHandler sheetContentsExtractor,
            XSSFBCommentsTable comments,
            XSSFBStylesTable styles,
            XSSFBSharedStringsTable strings,
            InputStream sheetInputStream)
            throws IOException, SAXException {

        XSSFBSheetHandler xssfbSheetHandler = new XSSFBSheetHandler(
                sheetInputStream,
                styles,
                comments,
                strings,
                sheetContentsExtractor,
                formatter,
                false // NOTE(review): presumably POI's formulasNotResults flag - confirm
        );
        xssfbSheetHandler.parse();
    }

    /**
     * In Excel files, sheets have things embedded in them,
     * and sheet drawings which have the images
     *
     * @return every processed sheet part, each followed by the internal
     *         targets of its drawing and VML drawing relationships, and
     *         finally the office-document part(s) of the package
     * @throws TikaException if a relationship or part name is invalid
     */
    @Override
    protected List<PackagePart> getMainDocumentParts() throws TikaException {
        List<PackagePart> parts = new ArrayList<PackagePart>();
        for (PackagePart part : sheetParts) {
            // Add the sheet
            parts.add(part);

            // If it has drawings, return those too
            try {
                addInternalRelationshipTargets(
                        part, XSSFRelation.DRAWINGS.getRelation(), parts);
                addInternalRelationshipTargets(
                        part, XSSFRelation.VML_DRAWINGS.getRelation(), parts);
            } catch (InvalidFormatException e) {
                throw new TikaException("Broken OOXML file", e);
            }
        }

        //add main document so that macros can be extracted
        //by AbstractOOXMLExtractor
        for (PackagePart part : extractor.getPackage().
                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
            parts.add(part);
        }

        return parts;
    }

    /**
     * Appends to {@code parts} the target part of every relationship of
     * {@code part} that has the given type and an internal target mode.
     */
    private void addInternalRelationshipTargets(
            PackagePart part, String relationType, List<PackagePart> parts)
            throws InvalidFormatException {
        for (PackageRelationship rel : part.getRelationshipsByType(relationType)) {
            if (rel.getTargetMode() == TargetMode.INTERNAL) {
                PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
                parts.add(rel.getPackage().getPart(relName));
            }
        }
    }
}
Expand Up @@ -71,7 +71,6 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
* Allows access to headers/footers from raw xml strings
*/
protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
private final XSSFEventBasedExcelExtractor extractor;
protected final DataFormatter formatter;
protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
protected final Map<String, String> drawingHyperlinks = new HashMap<>();
Expand All @@ -84,9 +83,7 @@ public XSSFExcelExtractorDecorator(

this.parseContext = context;
this.extractor = (XSSFEventBasedExcelExtractor)extractor;
// not yet supported in POI-3.16-beta3
// this.extractor.setFormulasNotResults(false);
this.extractor.setLocale(locale);
configureExtractor(this.extractor, locale);

if (locale == null) {
formatter = new TikaExcelDataFormatter();
Expand All @@ -95,6 +92,11 @@ public XSSFExcelExtractorDecorator(
}
}

/**
 * Applies the extractor settings used by this decorator: formula results
 * rather than formulas, and the given locale. Invoked from the constructor.
 *
 * @param extractor extractor to configure; cast to
 *                  {@link XSSFEventBasedExcelExtractor}
 * @param locale    locale to set on the extractor
 */
protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
    XSSFEventBasedExcelExtractor eventExtractor = (XSSFEventBasedExcelExtractor) extractor;
    eventExtractor.setFormulasNotResults(false);
    eventExtractor.setLocale(locale);
}

@Override
public void getXHTML(
ContentHandler handler, Metadata metadata, ParseContext context)
Expand Down

0 comments on commit 67612b8

Please sign in to comment.