lucene/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.highlight;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.PriorityQueue;

/**
 * Marks up highlighted terms found in the best sections of text, using configurable {@link
 * Fragmenter}, {@link Scorer}, {@link Formatter}, {@link Encoder} and tokenizers.
 *
 * <p>This is Lucene's original Highlighter; there are others.
 */
public class Highlighter {
  /** Default value of the analysis limit ({@code 50 * 1024} characters). */
  public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50 * 1024;

  // Turns each (encoded) token group into its marked-up form via highlightTerm().
  private Formatter formatter;
  // Encodes every piece of original text before it is appended to the output.
  private Encoder encoder;
  // Supplies per-token and per-fragment scores while the token stream is consumed.
  private Scorer fragmentScorer;
  // Tokens whose start offset is at or beyond this value are not processed.
  private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
  // Decides where one fragment ends and the next begins; defaults to SimpleFragmenter.
  private Fragmenter textFragmenter = new SimpleFragmenter();

  /**
   * Creates a highlighter that marks up terms with a new {@link SimpleHTMLFormatter}; the encoder
   * is the one chosen by {@link #Highlighter(Formatter, Scorer)}.
   *
   * @param fragmentScorer scorer used to rate tokens and fragments; must not be null
   * @throws IllegalArgumentException if {@code fragmentScorer} is null
   */
  public Highlighter(Scorer fragmentScorer) {
    this(new SimpleHTMLFormatter(), fragmentScorer);
  }

  /**
   * Creates a highlighter that uses the given formatter together with a new {@link
   * DefaultEncoder}.
   *
   * @param formatter formatter used to mark up token groups; must not be null
   * @param fragmentScorer scorer used to rate tokens and fragments; must not be null
   * @throws IllegalArgumentException if either argument is null
   */
  public Highlighter(Formatter formatter, Scorer fragmentScorer) {
    this(formatter, new DefaultEncoder(), fragmentScorer);
  }

  /**
   * Creates a highlighter from explicitly supplied collaborators.
   *
   * @param formatter formatter used to mark up token groups; must not be null
   * @param encoder encoder applied to all text copied into the output; must not be null
   * @param fragmentScorer scorer used to rate tokens and fragments; must not be null
   * @throws IllegalArgumentException if any argument is null
   */
  public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) {
    // Each collaborator is checked immediately before it is stored; the checks run in
    // declaration order, so the first null argument determines the exception message.
    ensureArgumentNotNull(formatter, "'formatter' must not be null");
    this.formatter = formatter;

    ensureArgumentNotNull(encoder, "'encoder' must not be null");
    this.encoder = encoder;

    ensureArgumentNotNull(fragmentScorer, "'fragmentScorer' must not be null");
    this.fragmentScorer = fragmentScorer;
  }

  /**
   * Highlights chosen terms in a text, extracting the most relevant section. This is a convenience
   * method that asks the analyzer for a token stream and then calls {@link
   * #getBestFragment(TokenStream, String)}.
   *
   * @param analyzer the analyzer that will be used to split <code>text</code> into chunks
   * @param fieldName name of field used to influence analyzer's tokenization policy
   * @param text text to highlight terms in
   * @return highlighted text fragment or null if no terms found
   * @throws IOException if there is a low-level I/O error
   * @throws InvalidTokenOffsetsException thrown if any token's offsets exceed the provided text's
   *     length
   */
  public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
      throws IOException, InvalidTokenOffsetsException {
    return getBestFragment(analyzer.tokenStream(fieldName, text), text);
  }

  /**
   * Highlights chosen terms in a text, extracting the most relevant section. The document text is
   * analysed in chunks to record hit statistics across the document. After accumulating stats, the
   * fragment with the highest score is returned.
   *
   * @param tokenStream a stream of tokens identified in the text parameter, including offset
   *     information. This is typically produced by an analyzer re-parsing a document's text.
   * @param text text to highlight terms in
   * @return highlighted text fragment or null if no terms found
   * @throws IOException if there is a low-level I/O error
   * @throws InvalidTokenOffsetsException thrown if any token's offsets exceed the provided text's
   *     length
   */
  public final String getBestFragment(TokenStream tokenStream, String text)
      throws IOException, InvalidTokenOffsetsException {
    // Ask for a single fragment; an empty result means nothing scored above zero.
    String[] best = getBestFragments(tokenStream, text, 1);
    return best.length > 0 ? best[0] : null;
  }

  /**
   * Highlights chosen terms in a text, extracting the most relevant sections. This is a convenience
   * method that asks the analyzer for a token stream and then calls {@link
   * #getBestFragments(TokenStream, String, int)}.
   *
   * @param analyzer the analyzer that will be used to split <code>text</code> into chunks
   * @param fieldName the name of the field being highlighted (used by analyzer)
   * @param text text to highlight terms in
   * @param maxNumFragments the maximum number of fragments.
   * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
   * @throws IOException if there is a low-level I/O error
   * @throws InvalidTokenOffsetsException thrown if any token's offsets exceed the provided text's
   *     length
   */
  public final String[] getBestFragments(
      Analyzer analyzer, String fieldName, String text, int maxNumFragments)
      throws IOException, InvalidTokenOffsetsException {
    return getBestFragments(analyzer.tokenStream(fieldName, text), text, maxNumFragments);
  }

  /**
   * Highlights chosen terms in a text, extracting the most relevant sections. The document text is
   * analysed in chunks to record hit statistics across the document. After accumulating stats, the
   * fragments with the highest scores are returned as an array of strings in order of score
   * (contiguous fragments are merged into one in their original order to improve readability).
   *
   * @param tokenStream a stream of tokens identified in the text parameter, including offset
   *     information
   * @param text text to highlight terms in
   * @param maxNumFragments the maximum number of fragments; values below 1 are treated as 1
   * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
   * @throws IOException if there is a low-level I/O error
   * @throws InvalidTokenOffsetsException thrown if any token's offsets exceed the provided text's
   *     length
   */
  public final String[] getBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
      throws IOException, InvalidTokenOffsetsException {
    // At least one fragment is always requested, whatever the caller passed.
    int fragmentLimit = Math.max(1, maxNumFragments);
    TextFragment[] candidates = getBestTextFragments(tokenStream, text, true, fragmentLimit);

    // Render every surviving fragment that actually contains a hit.
    ArrayList<String> rendered = new ArrayList<>();
    for (TextFragment candidate : candidates) {
      if (candidate != null && candidate.getScore() > 0) {
        rendered.add(candidate.toString());
      }
    }
    return rendered.toArray(new String[0]);
  }

  /**
   * Low level api to get the most relevant (formatted) sections of the document. This method has
   * been made public to allow visibility of score information held in TextFragment objects. Thanks
   * to Jason Calabrese for help in redefining the interface.
   *
   * <p>The token stream is reset, consumed, ended and closed by this method. If the scorer's {@code
   * init} call returns a non-null stream, that stream replaces the one passed in. Tokens whose
   * start offset is at or beyond {@link #getMaxDocCharsToAnalyze()} are not processed.
   *
   * @param tokenStream a stream of tokens identified in the text parameter, including offset
   *     information
   * @param text text to highlight terms in
   * @param mergeContiguousFragments if true, fragments that were adjacent in the original text are
   *     merged and only fragments with a score above zero are returned; if false, the queue
   *     contents are returned unfiltered
   * @param maxNumFragments capacity of the queue used to select the best fragments
   * @return the selected fragments, filled from the back of the array as the queue is drained
   * @throws IOException If there is a low-level I/O error
   * @throws InvalidTokenOffsetsException thrown if any token's start or end offset exceeds the
   *     provided text's length
   */
  public final TextFragment[] getBestTextFragments(
      TokenStream tokenStream, String text, boolean mergeContiguousFragments, int maxNumFragments)
      throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    // The scorer may hand back a replacement stream; if so, that is the one we consume and close.
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
      tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
      int lastEndOffset = 0;
      textFragmenter.start(text, tokenStream);

      TokenGroup tokenGroup = new TokenGroup(tokenStream);

      tokenStream.reset();
      while (tokenStream.incrementToken() && (offsetAtt.startOffset() < maxDocCharsToAnalyze)) {
        if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
          throw new InvalidTokenOffsetsException(
              "Token "
                  + termAtt.toString()
                  + " exceeds length of provided text sized "
                  + text.length());
        }
        if ((tokenGroup.getNumTokens() > 0) && (tokenGroup.isDistinct())) {
          // the current token is distinct from previous tokens -
          // markup the cached token group info
          lastEndOffset = appendTokenGroup(text, tokenGroup, newText, lastEndOffset);
          tokenGroup.clear();

          // check if current token marks the start of a new fragment
          if (textFragmenter.isNewFragment()) {
            currentFrag.setScore(fragmentScorer.getFragmentScore());
            // record stats for a new fragment
            currentFrag.textEndPos = newText.length();
            currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
            fragmentScorer.startFragment(currentFrag);
            docFrags.add(currentFrag);
          }
        }

        tokenGroup.addToken(fragmentScorer.getTokenScore());
      }
      currentFrag.setScore(fragmentScorer.getFragmentScore());

      if (tokenGroup.getNumTokens() > 0) {
        // flush the token group that was still being accumulated when the loop ended
        lastEndOffset = appendTokenGroup(text, tokenGroup, newText, lastEndOffset);
      }

      // Test what remains of the original text beyond the point where we stopped analyzing:
      // if there is text beyond the last token considered and the whole text is within the
      // analysis limit, append it to the last fragment.
      if ((lastEndOffset < text.length()) && (text.length() <= maxDocCharsToAnalyze)) {
        newText.append(encoder.encodeText(text.substring(lastEndOffset)));
      }

      currentFrag.textEndPos = newText.length();

      // keep only the most relevant sections of the text; the queue discards overflow
      for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext(); ) {
        fragQueue.insertWithOverflow(i.next());
      }

      // drain the queue into the array, filling it from the last slot to the first
      TextFragment[] frag = new TextFragment[fragQueue.size()];
      for (int i = frag.length - 1; i >= 0; i--) {
        frag[i] = fragQueue.pop();
      }

      // merge any contiguous fragments to improve readability
      if (mergeContiguousFragments) {
        mergeContiguousFragments(frag);
        // merging leaves null holes; drop those along with fragments that have no hits
        ArrayList<TextFragment> kept = new ArrayList<>();
        for (int i = 0; i < frag.length; i++) {
          if ((frag[i] != null) && (frag[i].getScore() > 0)) {
            kept.add(frag[i]);
          }
        }
        frag = kept.toArray(new TextFragment[0]);
      }

      return frag;

    } finally {
      if (tokenStream != null) {
        try {
          try {
            tokenStream.end();
          } finally {
            // close() must run even when end() fails, otherwise the stream is leaked
            tokenStream.close();
          }
        } catch (
            @SuppressWarnings("unused")
            Exception e) {
          // best effort: a failure while shutting the stream down must not mask the result
          // (or the exception) produced by the highlighting above
        }
      }
    }
  }

  /**
   * Appends the marked-up form of {@code tokenGroup} to {@code newText}, preceded by the encoded
   * original text lying between {@code lastEndOffset} and the start of the group (if any).
   *
   * @param text the original text being highlighted
   * @param tokenGroup the group of tokens to mark up; not cleared by this method
   * @param newText the output buffer being built
   * @param lastEndOffset offset in {@code text} up to which output has already been produced
   * @return the larger of {@code lastEndOffset} and the end offset of the group
   */
  private int appendTokenGroup(
      String text, TokenGroup tokenGroup, StringBuilder newText, int lastEndOffset) {
    int startOffset = tokenGroup.getStartOffset();
    int endOffset = tokenGroup.getEndOffset();
    String tokenText = text.substring(startOffset, endOffset);
    String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
    // store any whitespace etc from between this and last group
    if (startOffset > lastEndOffset) {
      newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
    }
    newText.append(markedUpText);
    return Math.max(endOffset, lastEndOffset);
  }

  /**
   * Improves readability of a score-sorted list of TextFragments by merging any fragments that were
   * contiguous in the original text into one larger fragment with the correct order. This will
   * leave a "null" in the array entry for the lesser scored fragment.
   *
   * <p>A fragment is never compared against itself. Each merge therefore nulls out one array slot
   * for good, which bounds the number of passes by the array length.
   *
   * @param frag An array of document fragments in descending score; modified in place
   */
  private void mergeContiguousFragments(TextFragment[] frag) {
    if (frag.length <= 1) {
      return; // nothing to merge
    }
    boolean mergingStillBeingDone;
    do {
      mergingStillBeingDone = false; // initialise loop control flag
      // for each fragment, scan other frags looking for contiguous blocks
      for (int i = 0; i < frag.length; i++) {
        if (frag[i] == null) {
          continue;
        }
        // merge any contiguous blocks
        for (int x = 0; x < frag.length; x++) {
          if (frag[i] == null) {
            // slot i was emptied by a merge earlier in this scan
            break;
          }
          if (x == i || frag[x] == null) {
            // Skip self-comparison: if follows() were ever true for a fragment against itself
            // (e.g. a zero-width fragment - verify against TextFragment), the slot would be
            // nulled, restored and flagged as merged on every pass, so the loop would never end.
            continue;
          }
          TextFragment frag1 = null;
          TextFragment frag2 = null;
          int frag1Num = 0;
          int frag2Num = 0;
          // if blocks are contiguous, frag1 is the earlier one and frag2 the one following it
          if (frag[i].follows(frag[x])) {
            frag1 = frag[x];
            frag1Num = x;
            frag2 = frag[i];
            frag2Num = i;
          } else if (frag[x].follows(frag[i])) {
            frag1 = frag[i];
            frag1Num = i;
            frag2 = frag[x];
            frag2Num = x;
          }
          // merging required..
          if (frag1 != null) {
            int bestScoringFragNum;
            int worstScoringFragNum;
            if (frag1.getScore() > frag2.getScore()) {
              bestScoringFragNum = frag1Num;
              worstScoringFragNum = frag2Num;
            } else {
              bestScoringFragNum = frag2Num;
              worstScoringFragNum = frag1Num;
            }
            // the merged fragment takes the array position of the better scoring one
            frag1.merge(frag2);
            frag[worstScoringFragNum] = null;
            mergingStillBeingDone = true;
            frag[bestScoringFragNum] = frag1;
          }
        }
      }
    } while (mergingStillBeingDone);
  }

  /**
   * Highlights terms in the text, extracting the most relevant sections and concatenating the
   * chosen fragments with a separator (typically "..."). The document text is analysed in chunks to
   * record hit statistics across the document. After accumulating stats, the fragments with the
   * highest scores are returned in order as "separator" delimited strings.
   *
   * @param tokenStream a stream of tokens identified in the text parameter, including offset
   *     information
   * @param text text to highlight terms in
   * @param maxNumFragments the maximum number of fragments.
   * @param separator the separator used to intersperse the document fragments (typically "...")
   * @return highlighted text; the empty string if no fragment was selected
   * @throws IOException if there is a low-level I/O error
   * @throws InvalidTokenOffsetsException thrown if any token's offsets exceed the provided text's
   *     length
   */
  public final String getBestFragments(
      TokenStream tokenStream, String text, int maxNumFragments, String separator)
      throws IOException, InvalidTokenOffsetsException {
    StringBuilder joined = new StringBuilder();
    // Nothing precedes the first section; every later section is preceded by the separator.
    String delimiter = "";
    for (String section : getBestFragments(tokenStream, text, maxNumFragments)) {
      joined.append(delimiter).append(section);
      delimiter = separator;
    }
    return joined.toString();
  }

  /**
   * Returns the analysis limit: tokens whose start offset is at or beyond this value are not
   * processed when fragments are extracted.
   *
   * @return the current limit, {@link #DEFAULT_MAX_CHARS_TO_ANALYZE} unless changed
   */
  public int getMaxDocCharsToAnalyze() {
    return maxDocCharsToAnalyze;
  }

  /**
   * Sets the analysis limit. The value is stored as given, without validation; when the fragment
   * scorer is a {@link QueryScorer} the limit is also passed on to it at highlighting time.
   *
   * @param maxDocCharsToAnalyze start offset at or beyond which tokens are no longer processed
   */
  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
  }

  /**
   * @return the fragmenter that decides where fragments start; a {@link SimpleFragmenter} unless
   *     replaced
   */
  public Fragmenter getTextFragmenter() {
    return textFragmenter;
  }

  /**
   * Replaces the fragmenter that decides where fragments start.
   *
   * @param fragmenter the new fragmenter; must not be null
   * @throws NullPointerException if {@code fragmenter} is null
   */
  public void setTextFragmenter(Fragmenter fragmenter) {
    textFragmenter = Objects.requireNonNull(fragmenter);
  }

  /**
   * @return Object used to score each token and each text fragment
   */
  public Scorer getFragmentScorer() {
    return fragmentScorer;
  }

  /**
   * Replaces the scorer used to rate tokens and fragments.
   *
   * @param scorer the new scorer; must not be null
   * @throws NullPointerException if {@code scorer} is null
   */
  public void setFragmentScorer(Scorer scorer) {
    fragmentScorer = Objects.requireNonNull(scorer);
  }

  /**
   * @return the encoder applied to all original text before it is copied into the output
   */
  public Encoder getEncoder() {
    return encoder;
  }

  /**
   * Replaces the encoder applied to text copied into the output.
   *
   * @param encoder the new encoder; must not be null
   * @throws NullPointerException if {@code encoder} is null
   */
  public void setEncoder(Encoder encoder) {
    this.encoder = Objects.requireNonNull(encoder);
  }

  /**
   * Argument guard used by the constructors: does nothing for a non-null argument and otherwise
   * fails with an {@link IllegalArgumentException} carrying the supplied message.
   *
   * @param argument the argument to be null-checked
   * @param message the message of the exception thrown if argument == null
   */
  private static void ensureArgumentNotNull(Object argument, String message) {
    if (argument != null) {
      return;
    }
    throw new IllegalArgumentException(message);
  }

  /**
   * Bounded queue used to select the best scoring fragments. Ordering is by score; fragments with
   * equal scores are ordered by {@code fragNum}, the one with the larger number being "less".
   */
  static class FragmentQueue extends PriorityQueue<TextFragment> {
    /**
     * @param size capacity passed to the underlying priority queue
     */
    FragmentQueue(int size) {
      super(size);
    }

    /**
     * @return true if {@code fragA} has the lower score, or on a score tie the higher {@code
     *     fragNum}
     */
    @Override
    public final boolean lessThan(TextFragment fragA, TextFragment fragB) {
      if (fragA.getScore() != fragB.getScore()) {
        return fragA.getScore() < fragB.getScore();
      }
      // tie on score: the later fragment is treated as the lesser one
      return fragA.fragNum > fragB.fragNum;
    }
  }
}