From 5b258608a5f576c1174d2c293e4580fb041c8bca Mon Sep 17 00:00:00 2001
From: Mariusz Olejnik <mariusz.olejnik@infinite.pl>
Date: Sun, 19 Feb 2017 14:59:19 +0100
Subject: [PATCH 1/3] Save PDF TextPosition information (coordinates, font
 name) as XHTML span attributes. Render the XHTML using a JavaFX WebView (so
 it looks like the parsed PDF).

---
 .../java/org/apache/tika/gui/TikaGUI.java     | 120 +++++++++++-------
 .../org/apache/tika/parser/pdf/PDF2XHTML.java |  49 ++++---
 2 files changed, 107 insertions(+), 62 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index 5ecc7630f68..9944284fd84 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -25,7 +25,7 @@
 import javax.swing.JMenuBar;
 import javax.swing.JMenuItem;
 import javax.swing.JOptionPane;
-import javax.swing.JPanel;
+import javax.swing.JTabbedPane;
 import javax.swing.JScrollPane;
 import javax.swing.JTextPane;
 import javax.swing.ProgressMonitorInputStream;
@@ -39,7 +39,6 @@
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
-import java.awt.CardLayout;
 import java.awt.Color;
 import java.awt.Dimension;
 import java.awt.Toolkit;
@@ -61,6 +60,11 @@
 import java.util.Map;
 import java.util.Set;
 
+import javafx.application.Platform;
+import javafx.embed.swing.JFXPanel;
+import javafx.scene.Scene;
+import javafx.scene.web.WebEngine;
+import javafx.scene.web.WebView;
 import org.apache.commons.io.IOUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
@@ -148,19 +152,33 @@ public void run() {
     private final ImageSavingParser imageParser;
 
     /**
-     * The card layout for switching between different views.
+     * Container for the editor tabs.
      */
-    private final CardLayout layout = new CardLayout();
+    private final JTabbedPane tabs;
 
     /**
-     * Container for the editor cards.
+     * Tabs definitions.
      */
-    private final JPanel cards;
+    private enum TabDef {
+        WELCOME("Welcome", ""),
+        METADATA("Metadata", "text/plain"),
+        XHTML("Formatted XHTML", ""),
+        TEXT("Plain text", "text/plain"),
+        TEXT_MAIN("Main content", "text/plain"),
+        XML("Structured XML", "text/plain"),
+        JSON("Recursive JSON", "text/plain");
+
+        String title, content;
+        TabDef(String title, String content) {
+            this.title=title;
+            this.content=content;
+        }
+    }
 
     /**
      * Formatted XHTML output.
      */
-    private final JEditorPane html;
+    private final JFXPanel xhtml;
 
     /**
      * Plain text output.
@@ -198,16 +216,15 @@ public TikaGUI(Parser parser) {
 
         addMenuBar();
 
-        cards = new JPanel(layout);
-        addWelcomeCard(cards, "welcome");
-        metadata = addCard(cards, "text/plain", "metadata");
-        html = addCard(cards, "text/html", "html");
-        text = addCard(cards, "text/plain", "text");
-        textMain = addCard(cards, "text/plain", "main");
-        xml = addCard(cards, "text/plain", "xhtml");
-        json = addCard(cards, "text/plain", "json");
-        add(cards);
-        layout.show(cards, "welcome");
+        tabs = new JTabbedPane();
+        addWelcomeTab(tabs, TabDef.WELCOME);
+        metadata = addTab(tabs, TabDef.METADATA);
+        xhtml = addWebViewTab(tabs, TabDef.XHTML);
+        text = addTab(tabs, TabDef.TEXT);
+        textMain = addTab(tabs, TabDef.TEXT_MAIN);
+        xml = addTab(tabs, TabDef.XML);
+        json = addTab(tabs, TabDef.JSON);
+        add(tabs);
 
         setPreferredSize(new Dimension(640, 480));
         pack();
@@ -231,16 +248,6 @@ private void addMenuBar() {
         addMenuItem(file, "Exit", "exit", KeyEvent.VK_X);
         bar.add(file);
 
-        JMenu view = new JMenu("View");
-        view.setMnemonic(KeyEvent.VK_V);
-        addMenuItem(view, "Metadata", "metadata", KeyEvent.VK_M);
-        addMenuItem(view, "Formatted text", "html", KeyEvent.VK_F);
-        addMenuItem(view, "Plain text", "text", KeyEvent.VK_P);
-        addMenuItem(view, "Main content", "main", KeyEvent.VK_C);
-        addMenuItem(view, "Structured text", "xhtml", KeyEvent.VK_S);
-        addMenuItem(view, "Recursive JSON", "json", KeyEvent.VK_J);
-        bar.add(view);
-
         bar.add(Box.createHorizontalGlue());
         JMenu help = new JMenu("Help");
         help.setMnemonic(KeyEvent.VK_H);
@@ -279,18 +286,6 @@ public void actionPerformed(ActionEvent e) {
                             "Invalid URL", JOptionPane.ERROR_MESSAGE);
                 }
             }
-        } else if ("html".equals(command)) {
-            layout.show(cards, command);
-        } else if ("text".equals(command)) {
-            layout.show(cards, command);
-        } else if ("main".equals(command)) {
-            layout.show(cards, command);
-        } else if ("xhtml".equals(command)) {
-            layout.show(cards, command);
-        } else if ("metadata".equals(command)) {
-            layout.show(cards, command);
-        } else if ("json".equals(command)) {
-            layout.show(cards, command);
         } else if ("about".equals(command)) {
             textDialog(
                     "About Apache Tika",
@@ -378,10 +373,10 @@ private void handleStream(InputStream input, Metadata md)
         setText(xml, xmlBuffer.toString());
         setText(text, textBuffer.toString());
         setText(textMain, textMainBuffer.toString());
-        setText(html, htmlBuffer.toString());
+        setHtml(xhtml, htmlBuffer.toString());
         if (!input.markSupported()) {
             setText(json, "InputStream does not support mark/reset for Recursive Parsing");
-            layout.show(cards, "metadata");
+            selectTab(TabDef.JSON);
             return;
         }
         boolean isReset = false;
@@ -404,7 +399,7 @@ private void handleStream(InputStream input, Metadata md)
             JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
             setText(json, jsonBuffer.toString());
         }
-        layout.show(cards, "metadata");
+        selectTab(TabDef.XHTML);
     }
 
     private void handleError(String name, Throwable t) {
@@ -427,7 +422,7 @@ private void handleError(String name, Throwable t) {
         dialog.setVisible(true);
     }
 
-    private void addWelcomeCard(JPanel panel, String name) {
+    private void addWelcomeTab(JTabbedPane panel, TabDef tabDef) {
         try {
             JEditorPane editor =
                 new JEditorPane(TikaGUI.class.getResource("welcome.html"));
@@ -436,22 +431,36 @@ private void addWelcomeCard(JPanel panel, String name) {
             editor.setBackground(Color.WHITE);
             editor.setTransferHandler(new ParsingTransferHandler(
                     editor.getTransferHandler(), this));
-            panel.add(new JScrollPane(editor), name);
+            panel.addTab(tabDef.title, new JScrollPane(editor));
+            selectTab(tabDef);
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
 
-    private JEditorPane addCard(JPanel panel, String type, String name) {
+    private JEditorPane addTab(JTabbedPane panel, TabDef tabDef) {
         JEditorPane editor = new JTextPane();
         editor.setBackground(Color.WHITE);
-        editor.setContentType(type);
+        editor.setContentType(tabDef.content);
         editor.setTransferHandler(new ParsingTransferHandler(
                 editor.getTransferHandler(), this));
-        panel.add(new JScrollPane(editor), name);
+        panel.addTab(tabDef.title, new JScrollPane(editor));
         return editor;
     }
 
+    private JFXPanel addWebViewTab(final JTabbedPane panel, final TabDef tabDef){
+        final JFXPanel jfxPanel = new JFXPanel();
+        panel.addTab(tabDef.title,jfxPanel); // added synchronously: keeps tab order fixed and Swing off the FX thread
+        Platform.runLater(new Runnable() {
+            @Override
+            public void run() {
+                WebView webView = new WebView(); // WebView must be created on the JavaFX application thread
+                jfxPanel.setScene( new Scene( webView ) );
+            }
+        });
+        return jfxPanel;
+    }
+
     private void textDialog(String title, URL resource) {
         try {
             JDialog dialog = new JDialog(this, title);
@@ -500,6 +509,23 @@ private void setText(JEditorPane editor, String text) {
         editor.setCaretPosition(0);
     }
 
+    private void setHtml(final JFXPanel htmlPanel, final String html) {
+        Platform.runLater(new Runnable() {
+            @Override
+            public void run() {
+                // Expects the panel's scene root to be a WebView, as set up by addWebViewTab.
+                WebView webView = (WebView)htmlPanel.getScene().getRoot();
+                WebEngine engine = webView.getEngine();
+                engine.loadContent(html);
+            }
+        });
+    }
+
+    private void selectTab(TabDef tabDef){
+        int index = tabs.indexOfTab(tabDef.title);
+        if (index >= 0) { tabs.setSelectedIndex(index); } // -1 (no tab with that title) would clear the selection
+    }
+
     /**
      * Creates and returns a content handler that turns XHTML input to
      * simplified HTML output that can be correctly parsed and displayed
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index ba5af757f7a..bc4eba6c8f2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -61,7 +61,9 @@
  * stream.
  */
 class PDF2XHTML extends AbstractPDF2XHTML {
-
+    private static final String XHTML_ELEMENT_NAME_P = "p";
+    private static final String XHTML_ELEMENT_NAME_IMG = "img";
+    private static final String XHTML_ELEMENT_NAME_SPAN = "span";
 
     private static final List<String> JPEG = Arrays.asList(
             COSName.DCT_DECODE.getName(),
@@ -238,8 +240,8 @@ private void extractImages(PDResources resources, Set<COSBase> seenThisPage) thr
                 AttributesImpl attr = new AttributesImpl();
                 attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
                 attr.addAttribute("", "alt", "alt", "CDATA", fileName);
-                xhtml.startElement("img", attr);
-                xhtml.endElement("img");
+                xhtml.startElement(XHTML_ELEMENT_NAME_IMG, attr);
+                xhtml.endElement(XHTML_ELEMENT_NAME_IMG);
 
                 //Do we only want to process unique COSObject ids?
                 //If so, have we already processed this one?
@@ -316,7 +318,7 @@ private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream o
     protected void writeParagraphStart() throws IOException {
         super.writeParagraphStart();
         try {
-            xhtml.startElement("p");
+            xhtml.startElement(XHTML_ELEMENT_NAME_P);
         } catch (SAXException e) {
             throw new IOException("Unable to start a paragraph", e);
         }
@@ -326,29 +328,46 @@ protected void writeParagraphStart() throws IOException {
     protected void writeParagraphEnd() throws IOException {
         super.writeParagraphEnd();
         try {
-            xhtml.endElement("p");
+            xhtml.endElement(XHTML_ELEMENT_NAME_P);
         } catch (SAXException e) {
             throw new IOException("Unable to end a paragraph", e);
         }
     }
 
-    @Override
-    protected void writeString(String text) throws IOException {
-        try {
-            xhtml.characters(text);
-        } catch (SAXException e) {
-            throw new IOException(
-                    "Unable to write a string: " + text, e);
+    private AttributesImpl extractSpanAttrs(List<TextPosition> textPositions) {
+        float fontResizeFactor = 0.8f;
+        AttributesImpl attrs = new AttributesImpl();
+        if (textPositions.size() > 0) {
+            TextPosition startPos = textPositions.get(0);
+            attrs.addAttribute("", "", "pdfFontName", null, startPos.getFont().getName());
+            attrs.addAttribute("", "", "pdfPageNo", null, getCurrentPageNo() + "");
+            attrs.addAttribute("", "", "style", null,
+                    "position: absolute; top: " + startPos.getY() + "px; left: " + startPos.getX() + "px; " +
+                            "font-size: " + (startPos.getFontSize()*fontResizeFactor) + "em"
+            );
         }
+        return attrs;
     }
 
     @Override
-    protected void writeCharacters(TextPosition text) throws IOException {
+    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
+        boolean elementStarted = false; // the finally block closes the span only if it was opened
         try {
-            xhtml.characters(text.getUnicode());
+            xhtml.startElement(XHTML_ELEMENT_NAME_SPAN, extractSpanAttrs(textPositions));
+            elementStarted = true;
+            xhtml.characters(text);
         } catch (SAXException e) {
             throw new IOException(
-                    "Unable to write a character: " + text.getUnicode(), e);
+                    "Unable to write a string: " + text, e);
+        } finally {
+            try {
+                if (elementStarted) {
+                    xhtml.endElement(XHTML_ELEMENT_NAME_SPAN);
+                }
+            } catch (SAXException e) {
+                throw new IOException(
+                        "Unable to end element " + XHTML_ELEMENT_NAME_SPAN, e);
+            }
         }
     }
 

From b668477f18e5c70c3cdf7c890b248e4696739e2e Mon Sep 17 00:00:00 2001
From: Mariusz Olejnik <mariusz.olejnik@infinite.pl>
Date: Mon, 20 Feb 2017 00:59:24 +0100
Subject: [PATCH 2/3] Add documents to Lucene Index and UI to execute queries

---
 tika-app/pom.xml                              |  10 ++
 .../java/org/apache/tika/gui/TikaGUI.java     | 113 ++++++++++++++----
 .../apache/tika/lucene/DocumentIndexer.java   |  81 +++++++++++++
 .../org/apache/tika/lucene/FoundItem.java     |  41 +++++++
 4 files changed, 222 insertions(+), 23 deletions(-)
 create mode 100644 tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java
 create mode 100644 tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java

diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 16b65d203d0..db9f0d977ed 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -73,6 +73,16 @@
       <artifactId>tika-batch</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-core</artifactId>
+      <version>6.4.1</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-queryparser</artifactId>
+      <version>6.4.1</version>
+    </dependency>
 
     <dependency>
       <groupId>org.slf4j</groupId>
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index 9944284fd84..e45662a61e6 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -16,21 +16,7 @@
  */
 package org.apache.tika.gui;
 
-import javax.swing.Box;
-import javax.swing.JDialog;
-import javax.swing.JEditorPane;
-import javax.swing.JFileChooser;
-import javax.swing.JFrame;
-import javax.swing.JMenu;
-import javax.swing.JMenuBar;
-import javax.swing.JMenuItem;
-import javax.swing.JOptionPane;
-import javax.swing.JTabbedPane;
-import javax.swing.JScrollPane;
-import javax.swing.JTextPane;
-import javax.swing.ProgressMonitorInputStream;
-import javax.swing.SwingUtilities;
-import javax.swing.UIManager;
+import javax.swing.*;
 import javax.swing.event.HyperlinkEvent;
 import javax.swing.event.HyperlinkEvent.EventType;
 import javax.swing.event.HyperlinkListener;
@@ -39,9 +25,7 @@
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
-import java.awt.Color;
-import java.awt.Dimension;
-import java.awt.Toolkit;
+import java.awt.*;
 import java.awt.event.ActionEvent;
 import java.awt.event.ActionListener;
 import java.awt.event.KeyEvent;
@@ -55,10 +39,8 @@
 import java.io.Writer;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
+import java.util.List;
 
 import javafx.application.Platform;
 import javafx.embed.swing.JFXPanel;
@@ -66,10 +48,15 @@
 import javafx.scene.web.WebEngine;
 import javafx.scene.web.WebView;
 import org.apache.commons.io.IOUtils;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.DocumentSelector;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.lucene.DocumentIndexer;
+import org.apache.tika.lucene.FoundItem;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.mime.MediaType;
@@ -151,6 +138,11 @@ public void run() {
      */
     private final ImageSavingParser imageParser;
 
+    /**
+     * Document indexer
+     */
+    private DocumentIndexer docIndexer;
+
     /**
      * Container for the editor tabs.
      */
@@ -166,7 +158,9 @@ private enum TabDef {
         TEXT("Plain text", "text/plain"),
         TEXT_MAIN("Main content", "text/plain"),
         XML("Structured XML", "text/plain"),
-        JSON("Recursive JSON", "text/plain");
+        JSON("Recursive JSON", "text/plain"),
+        LUCENE("Lucene", ""),
+        ;
 
         String title, content;
         TabDef(String title, String content) {
@@ -205,6 +199,12 @@ private enum TabDef {
      */
     private final JEditorPane metadata;
 
+    /**
+     * Index queries.
+     */
+    private JEditorPane queryEditor;
+    private JEditorPane resultEditor;
+
     /**
      * File chooser.
      */
@@ -224,6 +224,7 @@ public TikaGUI(Parser parser) {
         textMain = addTab(tabs, TabDef.TEXT_MAIN);
         xml = addTab(tabs, TabDef.XML);
         json = addTab(tabs, TabDef.JSON);
+        addDocIndexerTab(tabs, TabDef.LUCENE);
         add(tabs);
 
         setPreferredSize(new Dimension(640, 480));
@@ -231,6 +232,12 @@ public TikaGUI(Parser parser) {
 
         this.context = new ParseContext();
         this.parser = parser;
+        try {
+            this.docIndexer = new DocumentIndexer();
+        } catch (IOException e) {
+            this.docIndexer = null;
+            e.printStackTrace();
+        }
 
         this.imageParser = new ImageSavingParser(parser);
         this.context.set(DocumentSelector.class, new ImageDocumentSelector());
@@ -286,6 +293,8 @@ public void actionPerformed(ActionEvent e) {
                             "Invalid URL", JOptionPane.ERROR_MESSAGE);
                 }
             }
+        } else if ("query".equals(command)) {
+            executeQuery();
         } else if ("about".equals(command)) {
             textDialog(
                     "About Apache Tika",
@@ -374,6 +383,7 @@ private void handleStream(InputStream input, Metadata md)
         setText(text, textBuffer.toString());
         setText(textMain, textMainBuffer.toString());
         setHtml(xhtml, htmlBuffer.toString());
+        addDocumentToIndex(name,htmlBuffer.toString());
         if (!input.markSupported()) {
             setText(json, "InputStream does not support mark/reset for Recursive Parsing");
             selectTab(TabDef.JSON);
@@ -461,6 +471,63 @@ public void run() {
         return jfxPanel;
     }
 
+    private JPanel addDocIndexerTab(JTabbedPane tabs, TabDef tabDef) {
+        JPanel docIndexerPanel = new JPanel();
+        docIndexerPanel.setLayout(new BoxLayout(docIndexerPanel, BoxLayout.PAGE_AXIS));
+
+        this.queryEditor = new JTextPane();
+        this.queryEditor.setText("contents:\"*01*\"");
+        JScrollPane queryScroller = new JScrollPane(queryEditor);
+        queryScroller.setPreferredSize(new Dimension(tabs.getWidth(), 50)); // NOTE(review): tabs is not laid out yet when the constructor calls this, so getWidth() is presumably 0 - confirm
+        queryScroller.setAlignmentX(LEFT_ALIGNMENT);
+
+        //Padded pane holding the Query button; the button sends the "query" action command to this frame.
+        final JButton queryButton = new JButton("Query");
+        queryButton.setActionCommand("query");
+        queryButton.addActionListener(this);
+        JPanel buttonPane = new JPanel();
+        buttonPane.setLayout(new BoxLayout(buttonPane, BoxLayout.PAGE_AXIS));
+        buttonPane.setBorder(BorderFactory.createEmptyBorder(0, 10, 10, 10));
+        buttonPane.add(Box.createHorizontalGlue());
+        buttonPane.add(queryButton);
+
+        this.resultEditor = new JTextPane();
+        JScrollPane resultsScroller = new JScrollPane(resultEditor);
+        resultsScroller.setPreferredSize(new Dimension(tabs.getWidth(), 280));
+        resultsScroller.setAlignmentX(LEFT_ALIGNMENT);
+
+        //Put everything together: query, button and results are stacked by the panel's BoxLayout (PAGE_AXIS).
+        docIndexerPanel.add(queryScroller);
+        docIndexerPanel.add(buttonPane);
+        docIndexerPanel.add(resultsScroller);
+        tabs.add(tabDef.title,docIndexerPanel);
+        return docIndexerPanel;
+    }
+
+    /** Adds the parsed document to the index; an I/O failure is shown in the result editor. */
+    private void addDocumentToIndex(String filename, String contents) {
+        try {
+            docIndexer.addDocument(filename, contents);
+        } catch (IOException e) {
+            this.resultEditor.setText(e.getMessage());
+            e.printStackTrace();
+        }
+    }
+
+    /** Runs the query editor's text against the index and lists the hits in the result editor. */
+    private void executeQuery() {
+        try {
+            List<FoundItem> fis = docIndexer.searchDocuments(this.queryEditor.getText());
+            StringBuilder res = new StringBuilder();
+            for (FoundItem fi : fis) {
+                res.append(fi.getScoreDoc()).append("\n  - ").append(fi.getDocument()).append("\n");
+            }
+            this.resultEditor.setText(res.toString());
+        } catch (ParseException | IOException e) {
+            this.resultEditor.setText(e.getMessage());
+            e.printStackTrace();
+        }
+    }
     private void textDialog(String title, URL resource) {
         try {
             JDialog dialog = new JDialog(this, title);
diff --git a/tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java b/tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java
new file mode 100644
index 00000000000..0a16c049cff
--- /dev/null
+++ b/tika-app/src/main/java/org/apache/tika/lucene/DocumentIndexer.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.lucene;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.queryparser.classic.QueryParser;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Document index in RAM.
+ */
+public class DocumentIndexer {
+    private final Directory dir;
+    private final Analyzer analyzer;
+
+    /** Creates an empty RAM-backed index that analyzes text with a StandardAnalyzer. */
+    public DocumentIndexer() throws IOException {
+        this.dir = new RAMDirectory();
+        this.analyzer = new StandardAnalyzer();
+    }
+
+    /** Indexes one document: "filename" is stored as a single term, "contents" is analyzed but not stored. */
+    public void addDocument(String filename, String contents) throws IOException {
+        Document doc = new Document();
+        doc.add(new StringField("filename", filename, Store.YES));
+        doc.add(new TextField("contents", contents, Store.NO));
+        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
+            writer.addDocument(doc); // the writer is closed even when adding fails
+        }
+    }
+
+    /** Returns up to 10 hits for a classic query-parser query on "contents"; an index without commits yields none. */
+    public List<FoundItem> searchDocuments(String queryString) throws ParseException, IOException {
+        Query query = new QueryParser("contents", analyzer).parse(queryString); // parse first: a bad query opens no reader
+        if (!DirectoryReader.indexExists(dir)) {
+            return new ArrayList<>();
+        }
+        try (IndexReader reader = DirectoryReader.open(dir)) {
+            TopDocs topDocs = new IndexSearcher(reader).search(query, 10);
+            List<FoundItem> fi = new ArrayList<>(topDocs.scoreDocs.length);
+            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+                fi.add(new FoundItem(scoreDoc, reader.document(scoreDoc.doc)));
+            }
+            return fi;
+        }
+    }
+}
diff --git a/tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java b/tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java
new file mode 100644
index 00000000000..6e5d6154865
--- /dev/null
+++ b/tika-app/src/main/java/org/apache/tika/lucene/FoundItem.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.lucene;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.search.ScoreDoc;
+
+/**
+ * Info about one found item.
+ */
+public class FoundItem {
+    private final ScoreDoc scoreDoc; // the search hit this item was built from
+    private final Document document; // the Lucene document paired with that hit
+
+    public FoundItem(ScoreDoc scoreDoc, Document document) {
+        this.scoreDoc = scoreDoc;
+        this.document = document;
+    }
+
+    public ScoreDoc getScoreDoc() {
+        return scoreDoc;
+    }
+
+    public Document getDocument() {
+        return document;
+    }
+}

From bd77c215a6f0041da0c7256579849ab2c7ce744c Mon Sep 17 00:00:00 2001
From: Mariusz Olejnik <mariusz.olejnik@infinite.pl>
Date: Wed, 12 Apr 2017 13:33:03 +0200
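Subject: [PATCH 3/3] PDF2XHTML: replace pdfFontName/pdfPageNo span attributes with a single coordinates attribute; font resize factor 0.7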

---
 .../org/apache/tika/parser/pdf/PDF2XHTML.java | 29 +++++++------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index bc4eba6c8f2..55671d83448 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,19 +16,6 @@
  */
 package org.apache.tika.parser.pdf;
 
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSStream;
@@ -55,6 +42,10 @@
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import java.awt.image.BufferedImage;
+import java.io.*;
+import java.util.*;
+
 /**
  * Utility class that overrides the {@link PDFTextStripper} functionality
  * to produce a semi-structured XHTML SAX events instead of a plain text
@@ -335,14 +326,16 @@ protected void writeParagraphEnd() throws IOException {
     }
 
     private AttributesImpl extractSpanAttrs(List<TextPosition> textPositions) {
-        float fontResizeFactor = 0.8f;
+        float fontResizeFactor = 0.7f;
         AttributesImpl attrs = new AttributesImpl();
         if (textPositions.size() > 0) {
             TextPosition startPos = textPositions.get(0);
-            attrs.addAttribute("", "", "pdfFontName", null, startPos.getFont().getName());
-            attrs.addAttribute("", "", "pdfPageNo", null, getCurrentPageNo() + "");
-            attrs.addAttribute("", "", "style", null,
-                    "position: absolute; top: " + startPos.getY() + "px; left: " + startPos.getX() + "px; " +
+            int pageNo = getCurrentPageNo();
+            float x = startPos.getX();
+            float y = startPos.getY();
+            attrs.addAttribute("", "coordinates", "coordinates", null, pageNo+"-"+x+"-"+y);
+            attrs.addAttribute("", "style", "style", null,
+                    "position: absolute; top: "+y+"px; left: "+x+"px; " +
                             "font-size: " + (startPos.getFontSize()*fontResizeFactor) + "em"
             );
         }