From 5b258608a5f576c1174d2c293e4580fb041c8bca Mon Sep 17 00:00:00 2001 From: Mariusz Olejnik Date: Sun, 19 Feb 2017 14:59:19 +0100 Subject: [PATCH 1/3] Saving as XHTML span attributes PDF document TextPosition information (coordinates, font-name). Rendering XHTML using JFX WebView (it looks like parsed PDF). --- .../java/org/apache/tika/gui/TikaGUI.java | 120 +++++++++++------- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 49 ++++--- 2 files changed, 107 insertions(+), 62 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 5ecc7630f68..9944284fd84 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -25,7 +25,7 @@ import javax.swing.JMenuBar; import javax.swing.JMenuItem; import javax.swing.JOptionPane; -import javax.swing.JPanel; +import javax.swing.JTabbedPane; import javax.swing.JScrollPane; import javax.swing.JTextPane; import javax.swing.ProgressMonitorInputStream; @@ -39,7 +39,6 @@ import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; -import java.awt.CardLayout; import java.awt.Color; import java.awt.Dimension; import java.awt.Toolkit; @@ -61,6 +60,11 @@ import java.util.Map; import java.util.Set; +import javafx.application.Platform; +import javafx.embed.swing.JFXPanel; +import javafx.scene.Scene; +import javafx.scene.web.WebEngine; +import javafx.scene.web.WebView; import org.apache.commons.io.IOUtils; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; @@ -148,19 +152,33 @@ public void run() { private final ImageSavingParser imageParser; /** - * The card layout for switching between different views. + * Container for the editor tabs. */ - private final CardLayout layout = new CardLayout(); + private final JTabbedPane tabs; /** - * Container for the editor cards. + * Tabs definitions. */ - private final JPanel cards; + private enum TabDef { + WELCOME("Welcome", ""), + METADATA("Metadata", "text/plain"), + XHTML("Formatted XHTML", ""), + TEXT("Plain text", "text/plain"), + TEXT_MAIN("Main content", "text/plain"), + XML("Structured XML", "text/plain"), + JSON("Recursive JSON", "text/plain"); + + String title, content; + TabDef(String title, String content) { + this.title=title; + this.content=content; + } + } /** * Formatted XHTML output. */ - private final JEditorPane html; + private final JFXPanel xhtml; /** * Plain text output. @@ -198,16 +216,15 @@ public TikaGUI(Parser parser) { addMenuBar(); - cards = new JPanel(layout); - addWelcomeCard(cards, "welcome"); - metadata = addCard(cards, "text/plain", "metadata"); - html = addCard(cards, "text/html", "html"); - text = addCard(cards, "text/plain", "text"); - textMain = addCard(cards, "text/plain", "main"); - xml = addCard(cards, "text/plain", "xhtml"); - json = addCard(cards, "text/plain", "json"); - add(cards); - layout.show(cards, "welcome"); + tabs = new JTabbedPane(); + addWelcomeTab(tabs, TabDef.WELCOME); + metadata = addTab(tabs, TabDef.METADATA); + xhtml = addWebViewTab(tabs, TabDef.XHTML); + text = addTab(tabs, TabDef.TEXT); + textMain = addTab(tabs, TabDef.TEXT_MAIN); + xml = addTab(tabs, TabDef.XML); + json = addTab(tabs, TabDef.JSON); + add(tabs); setPreferredSize(new Dimension(640, 480)); pack(); @@ -231,16 +248,6 @@ private void addMenuBar() { addMenuItem(file, "Exit", "exit", KeyEvent.VK_X); bar.add(file); - JMenu view = new JMenu("View"); - view.setMnemonic(KeyEvent.VK_V); - addMenuItem(view, "Metadata", "metadata", KeyEvent.VK_M); - addMenuItem(view, "Formatted text", "html", KeyEvent.VK_F); - addMenuItem(view, "Plain text", "text", KeyEvent.VK_P); - addMenuItem(view, "Main content", "main", KeyEvent.VK_C); - addMenuItem(view, "Structured text", "xhtml", KeyEvent.VK_S); - addMenuItem(view, "Recursive JSON", "json", KeyEvent.VK_J); - bar.add(view); - bar.add(Box.createHorizontalGlue()); JMenu help = new JMenu("Help"); help.setMnemonic(KeyEvent.VK_H); @@ -279,18 +286,6 @@ public void actionPerformed(ActionEvent e) { "Invalid URL", JOptionPane.ERROR_MESSAGE); } } - } else if ("html".equals(command)) { - layout.show(cards, command); - } else if ("text".equals(command)) { - layout.show(cards, command); - } else if ("main".equals(command)) { - layout.show(cards, command); - } else if ("xhtml".equals(command)) { - layout.show(cards, command); - } else if ("metadata".equals(command)) { - layout.show(cards, command); - } else if ("json".equals(command)) { - layout.show(cards, command); } else if ("about".equals(command)) { textDialog( "About Apache Tika", @@ -378,10 +373,10 @@ private void handleStream(InputStream input, Metadata md) setText(xml, xmlBuffer.toString()); setText(text, textBuffer.toString()); setText(textMain, textMainBuffer.toString()); - setText(html, htmlBuffer.toString()); + setHtml(xhtml, htmlBuffer.toString()); if (!input.markSupported()) { setText(json, "InputStream does not support mark/reset for Recursive Parsing"); - layout.show(cards, "metadata"); + selectTab(TabDef.JSON); return; } boolean isReset = false; @@ -404,7 +399,7 @@ private void handleStream(InputStream input, Metadata md) JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer); setText(json, jsonBuffer.toString()); } - layout.show(cards, "metadata"); + selectTab(TabDef.XHTML); } private void handleError(String name, Throwable t) { @@ -427,7 +422,7 @@ private void handleError(String name, Throwable t) { dialog.setVisible(true); } - private void addWelcomeCard(JPanel panel, String name) { + private void addWelcomeTab(JTabbedPane panel, TabDef tabDef) { try { JEditorPane editor = new JEditorPane(TikaGUI.class.getResource("welcome.html")); @@ -436,22 +431,36 @@ private void addWelcomeCard(JPanel panel, String name) { editor.setBackground(Color.WHITE); editor.setTransferHandler(new ParsingTransferHandler( editor.getTransferHandler(), this)); - panel.add(new JScrollPane(editor), name); + panel.addTab(tabDef.title, new JScrollPane(editor)); + selectTab(tabDef); } catch (IOException e) { e.printStackTrace(); } } - private JEditorPane addCard(JPanel panel, String type, String name) { + private JEditorPane addTab(JTabbedPane panel, TabDef tabDef) { JEditorPane editor = new JTextPane(); editor.setBackground(Color.WHITE); - editor.setContentType(type); + editor.setContentType(tabDef.content); editor.setTransferHandler(new ParsingTransferHandler( editor.getTransferHandler(), this)); - panel.add(new JScrollPane(editor), name); + panel.addTab(tabDef.title, new JScrollPane(editor)); return editor; } + private JFXPanel addWebViewTab(final JTabbedPane panel, final TabDef tabDef){ + final JFXPanel jfxPanel = new JFXPanel(); + Platform.runLater(new Runnable() { + @Override + public void run() { + WebView webView = new WebView(); + jfxPanel.setScene( new Scene( webView ) ); + panel.addTab(tabDef.title,jfxPanel); + } + }); + return jfxPanel; + } + private void textDialog(String title, URL resource) { try { JDialog dialog = new JDialog(this, title); @@ -500,6 +509,23 @@ private void setText(JEditorPane editor, String text) { editor.setCaretPosition(0); } + private void setHtml(final JFXPanel htmlPanel, final String html) { + Platform.runLater(new Runnable() { + @Override + public void run() { + WebView webView = (WebView)htmlPanel.getScene().getRoot(); + WebEngine engine = webView.getEngine(); + //webView.getEngine().setUserStyleSheetLocation(); + engine.loadContent(html); + } + }); + } + + private void selectTab(TabDef tabDef){ + tabs.setSelectedIndex(tabs.indexOfTab(tabDef.title)); + + } + /** * Creates and returns a content handler that turns XHTML input to * simplified HTML output that can be correctly parsed and displayed diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index ba5af757f7a..bc4eba6c8f2 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -61,7 +61,9 @@ * stream. */ class PDF2XHTML extends AbstractPDF2XHTML { - + private final static String XHTML_ELEMENT_NAME_P = "p"; + private final static String XHTML_ELEMENT_NAME_IMG = "img"; + private final static String XHTML_ELEMENT_NAME_SPAN = "span"; private static final List JPEG = Arrays.asList( COSName.DCT_DECODE.getName(), @@ -238,8 +240,8 @@ private void extractImages(PDResources resources, Set seenThisPage) thr AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); - xhtml.startElement("img", attr); - xhtml.endElement("img"); + xhtml.startElement(XHTML_ELEMENT_NAME_IMG, attr); + xhtml.endElement(XHTML_ELEMENT_NAME_IMG); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? @@ -316,7 +318,7 @@ private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream o protected void writeParagraphStart() throws IOException { super.writeParagraphStart(); try { - xhtml.startElement("p"); + xhtml.startElement(XHTML_ELEMENT_NAME_P); } catch (SAXException e) { throw new IOException("Unable to start a paragraph", e); } @@ -326,29 +328,46 @@ protected void writeParagraphStart() throws IOException { protected void writeParagraphEnd() throws IOException { super.writeParagraphEnd(); try { - xhtml.endElement("p"); + xhtml.endElement(XHTML_ELEMENT_NAME_P); } catch (SAXException e) { throw new IOException("Unable to end a paragraph", e); } } - @Override - protected void writeString(String text) throws IOException { - try { - xhtml.characters(text); - } catch (SAXException e) { - throw new IOException( - "Unable to write a string: " + text, e); + private AttributesImpl extractSpanAttrs(List textPositions) { + float fontResizeFactor = 0.8f; + AttributesImpl attrs = new AttributesImpl(); + if (textPositions.size() > 0) { + TextPosition startPos = textPositions.get(0); + attrs.addAttribute("", "", "pdfFontName", null, startPos.getFont().getName()); + attrs.addAttribute("", "", "pdfPageNo", null, getCurrentPageNo() + ""); + attrs.addAttribute("", "", "style", null, + "position: absolute; top: " + startPos.getY() + "px; left: " + startPos.getX() + "px; " + + "font-size: " + (startPos.getFontSize()*fontResizeFactor) + "em" + ); } + return attrs; } @Override - protected void writeCharacters(TextPosition text) throws IOException { + protected void writeString(String text, List textPositions) throws IOException { + boolean elementStarted = false; try { - xhtml.characters(text.getUnicode()); + xhtml.startElement(XHTML_ELEMENT_NAME_SPAN, extractSpanAttrs(textPositions)); + elementStarted = true; + xhtml.characters(text); } catch (SAXException e) { throw new IOException( - "Unable to write a character: " + text.getUnicode(), e); + "Unable to write a character: " + text, e); + } finally { + try { + if (elementStarted) { + xhtml.endElement(XHTML_ELEMENT_NAME_SPAN); + } + } catch (SAXException e) { + throw new IOException( + "Unable to end element " + XHTML_ELEMENT_NAME_SPAN, e); + } } } From b668477f18e5c70c3cdf7c890b248e4696739e2e Mon Sep 17 00:00:00 2001 From: Mariusz Olejnik Date: Mon, 20 Feb 2017 00:59:24 +0100 Subject: [PATCH 2/3] Add documents to Lucene Index and UI to execute queries --- tika-app/pom.xml | 10 ++ .../java/org/apache/tika/gui/TikaGUI.java | 113 ++++++++++++++---- .../apache/tika/lucene/DocumentIndexer.java | 81 +++++++++++++ .../org/apache/tika/lucene/FoundItem.java | 41 +++++++ 4 files changed, 222 insertions(+), 23 deletions(-) create mode 100644 tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java create mode 100644 tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java diff --git a/tika-app/pom.xml b/tika-app/pom.xml index 16b65d203d0..db9f0d977ed 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -73,6 +73,16 @@ tika-batch ${project.version} + + org.apache.lucene + lucene-core + 6.4.1 + + + org.apache.lucene + lucene-queryparser + 6.4.1 + org.slf4j diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 9944284fd84..e45662a61e6 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -16,21 +16,7 @@ */ package org.apache.tika.gui; -import javax.swing.Box; -import javax.swing.JDialog; -import javax.swing.JEditorPane; -import javax.swing.JFileChooser; -import javax.swing.JFrame; -import javax.swing.JMenu; -import javax.swing.JMenuBar; -import javax.swing.JMenuItem; -import javax.swing.JOptionPane; -import javax.swing.JTabbedPane; -import javax.swing.JScrollPane; -import javax.swing.JTextPane; -import javax.swing.ProgressMonitorInputStream; -import javax.swing.SwingUtilities; -import javax.swing.UIManager; +import javax.swing.*; import javax.swing.event.HyperlinkEvent; import javax.swing.event.HyperlinkEvent.EventType; import javax.swing.event.HyperlinkListener; @@ -39,9 +25,7 @@ import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; -import java.awt.Color; -import java.awt.Dimension; -import java.awt.Toolkit; +import java.awt.*; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.awt.event.KeyEvent; @@ -55,10 +39,8 @@ import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; +import java.util.*; +import java.util.List; import javafx.application.Platform; import javafx.embed.swing.JFXPanel; @@ -66,10 +48,15 @@ import javafx.scene.web.WebEngine; import javafx.scene.web.WebView; import org.apache.commons.io.IOUtils; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.lucene.DocumentIndexer; +import org.apache.tika.lucene.FoundItem; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.mime.MediaType; @@ -151,6 +138,11 @@ public void run() { */ private final ImageSavingParser imageParser; + /** + * Document indexer + */ + private DocumentIndexer docIndexer; + /** * Container for the editor tabs. */ @@ -166,7 +158,9 @@ private enum TabDef { TEXT("Plain text", "text/plain"), TEXT_MAIN("Main content", "text/plain"), XML("Structured XML", "text/plain"), - JSON("Recursive JSON", "text/plain"); + JSON("Recursive JSON", "text/plain"), + LUCENE("Lucene", ""), + ; String title, content; TabDef(String title, String content) { @@ -205,6 +199,12 @@ private enum TabDef { */ private final JEditorPane metadata; + /** + * Index queries. + */ + private JEditorPane queryEditor; + private JEditorPane resultEditor; + /** * File chooser. */ @@ -224,6 +224,7 @@ public TikaGUI(Parser parser) { textMain = addTab(tabs, TabDef.TEXT_MAIN); xml = addTab(tabs, TabDef.XML); json = addTab(tabs, TabDef.JSON); + addDocIndexerTab(tabs, TabDef.LUCENE); add(tabs); setPreferredSize(new Dimension(640, 480)); @@ -231,6 +232,12 @@ public TikaGUI(Parser parser) { this.context = new ParseContext(); this.parser = parser; + try { + this.docIndexer = new DocumentIndexer(); + } catch (IOException e) { + this.docIndexer = null; + e.printStackTrace(); + } this.imageParser = new ImageSavingParser(parser); this.context.set(DocumentSelector.class, new ImageDocumentSelector()); @@ -286,6 +293,8 @@ public void actionPerformed(ActionEvent e) { "Invalid URL", JOptionPane.ERROR_MESSAGE); } } + } else if ("query".equals(command)) { + executeQuery(); } else if ("about".equals(command)) { textDialog( "About Apache Tika", @@ -374,6 +383,7 @@ private void handleStream(InputStream input, Metadata md) setText(text, textBuffer.toString()); setText(textMain, textMainBuffer.toString()); setHtml(xhtml, htmlBuffer.toString()); + addDocumentToIndex(name,htmlBuffer.toString()); if (!input.markSupported()) { setText(json, "InputStream does not support mark/reset for Recursive Parsing"); selectTab(TabDef.JSON); @@ -461,6 +471,63 @@ public void run() { return jfxPanel; } + private JPanel addDocIndexerTab(JTabbedPane tabs, TabDef tabDef) { + JPanel docIndexerPanel = new JPanel(); + docIndexerPanel.setLayout(new BoxLayout(docIndexerPanel, BoxLayout.PAGE_AXIS)); + + this.queryEditor = new JTextPane(); + this.queryEditor.setText("contents:\"*01*\""); + JScrollPane queryScroller = new JScrollPane(queryEditor); + queryScroller.setPreferredSize(new Dimension(tabs.getWidth(), 50)); + queryScroller.setAlignmentX(LEFT_ALIGNMENT); + + //Lay out the buttons from left to right. + final JButton queryButton = new JButton("Query"); + queryButton.setActionCommand("query"); + queryButton.addActionListener(this); + JPanel buttonPane = new JPanel(); + buttonPane.setLayout(new BoxLayout(buttonPane, BoxLayout.PAGE_AXIS)); + buttonPane.setBorder(BorderFactory.createEmptyBorder(0, 10, 10, 10)); + buttonPane.add(Box.createHorizontalGlue()); + buttonPane.add(queryButton); + + this.resultEditor = new JTextPane(); + JScrollPane resultsScroller = new JScrollPane(resultEditor); + resultsScroller.setPreferredSize(new Dimension(tabs.getWidth(), 280)); + resultsScroller.setAlignmentX(LEFT_ALIGNMENT); + + //Put everything together, using the content pane's BorderLayout. + docIndexerPanel.add(queryScroller); + docIndexerPanel.add(buttonPane); + docIndexerPanel.add(resultsScroller); + tabs.add(tabDef.title,docIndexerPanel); + return docIndexerPanel; + } + + private void addDocumentToIndex(String filename, String contents) { + try { + docIndexer.addDocument(filename, contents); + } catch (IOException e) { + this.resultEditor.setText(e.getMessage()); + e.printStackTrace(); + } + } + private void executeQuery() { + try { + List fis = docIndexer.searchDocuments(this.queryEditor.getText()); + StringBuffer res = new StringBuffer(); + for (FoundItem fi : fis) { + res.append(fi.getScoreDoc().toString()+"\n - "+fi.getDocument().toString()+"\n"); + } + this.resultEditor.setText(res.toString()); + } catch (ParseException e) { + this.resultEditor.setText(e.getMessage()); + e.printStackTrace(); + } catch (IOException e) { + this.resultEditor.setText(e.getMessage()); + e.printStackTrace(); + } + } private void textDialog(String title, URL resource) { try { JDialog dialog = new JDialog(this, title); diff --git a/tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java b/tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java new file mode 100644 index 00000000000..0a16c049cff --- /dev/null +++ b/tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.lucene; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.queryparser.classic.QueryParser; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; + +/** + * Document index in RAM. + */ +public class DocumentIndexer { + private final Directory dir; + private final Analyzer analyzer; + + public DocumentIndexer() throws IOException { + this.dir = new RAMDirectory(); + this.analyzer = new StandardAnalyzer(); + } + + public void addDocument(String filename, String contents) throws IOException { + Document doc = new Document(); + doc.add(new StringField("filename", filename, Store.YES)); + doc.add(new TextField("contents", contents, Store.NO)); + IndexWriterConfig iwc = new IndexWriterConfig(analyzer); + IndexWriter writer = new IndexWriter(dir, iwc); + writer.addDocument(doc); + writer.close(); + } + + + public List searchDocuments(String queryString) throws ParseException, IOException { + IndexReader reader = DirectoryReader.open(dir); + IndexSearcher searcher = new IndexSearcher(reader); + // Build a Query object + QueryParser parser = new QueryParser("contents", analyzer); + Query query = parser.parse(queryString); + TopDocs topDocs = searcher.search(query, 10); + List fi = new ArrayList<>(topDocs.scoreDocs.length); + for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + fi.add(new FoundItem(scoreDoc,reader.document(scoreDoc.doc))); + } + reader.close(); + return fi; + } + +} diff --git a/tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java b/tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java new file mode 100644 index 00000000000..6e5d6154865 --- /dev/null +++ b/tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.lucene; + +import org.apache.lucene.document.Document; +import org.apache.lucene.search.ScoreDoc; + +/** + * Info about one found item. + */ +public class FoundItem { + private final ScoreDoc scoreDoc; + private final Document document; + + public FoundItem(ScoreDoc scoreDoc, Document document) { + this.scoreDoc = scoreDoc; + this.document = document; + } + + public ScoreDoc getScoreDoc() { + return scoreDoc; + } + + public Document getDocument() { + return document; + } +} From bd77c215a6f0041da0c7256579849ab2c7ce744c Mon Sep 17 00:00:00 2001 From: Mariusz Olejnik Date: Wed, 12 Apr 2017 13:33:03 +0200 Subject: [PATCH 3/3] no message --- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 29 +++++++------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index bc4eba6c8f2..55671d83448 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -16,19 +16,6 @@ */ package org.apache.tika.parser.pdf; -import java.awt.image.BufferedImage; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.Writer; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSStream; @@ -55,6 +42,10 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import java.awt.image.BufferedImage; +import java.io.*; +import java.util.*; + /** * Utility class that overrides the {@link PDFTextStripper} functionality * to produce a semi-structured XHTML SAX events instead of a plain text @@ -335,14 +326,16 @@ protected void writeParagraphEnd() throws IOException { } private AttributesImpl extractSpanAttrs(List textPositions) { - float fontResizeFactor = 0.8f; + float fontResizeFactor = 0.7f; AttributesImpl attrs = new AttributesImpl(); if (textPositions.size() > 0) { TextPosition startPos = textPositions.get(0); - attrs.addAttribute("", "", "pdfFontName", null, startPos.getFont().getName()); - attrs.addAttribute("", "", "pdfPageNo", null, getCurrentPageNo() + ""); - attrs.addAttribute("", "", "style", null, - "position: absolute; top: " + startPos.getY() + "px; left: " + startPos.getX() + "px; " + + int pageNo = getCurrentPageNo(); + float x = startPos.getX(); + float y = startPos.getY(); + attrs.addAttribute("", "coordinates", "coordinates", null, pageNo+"-"+x+"-"+y); + attrs.addAttribute("", "style", "style", null, + "position: absolute; top: "+y+"px; left: "+x+"px; " + "font-size: " + (startPos.getFontSize()*fontResizeFactor) + "em" ); }