src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs

﻿using J2N;
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Ja.TokenAttributes;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Threading;
using Console = Lucene.Net.Util.SystemConsole;
using JCG = J2N.Collections.Generic;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tokenizer for Japanese that uses morphological analysis.
    /// </summary>
    /// <remarks>
    /// This tokenizer sets a number of additional attributes:
    /// <list type="bullet">
    ///     <item><description><see cref="IBaseFormAttribute"/> containing base form for inflected adjectives and verbs.</description></item>
    ///     <item><description><see cref="IPartOfSpeechAttribute"/> containing part-of-speech.</description></item>
    ///     <item><description><see cref="IReadingAttribute"/> containing reading and pronunciation.</description></item>
    ///     <item><description><see cref="IInflectionAttribute"/> containing additional part-of-speech information for inflected forms.</description></item>
    /// </list>
    /// <para/>
    /// This tokenizer uses a rolling Viterbi search to find the 
    /// least cost segmentation (path) of the incoming characters.
    /// For tokens that appear to be compound (> length 2 for all
    /// Kanji, or > length 7 for non-Kanji), we see if there is a
    /// 2nd best segmentation of that token after applying
    /// penalties to the long tokens.  If so, and the Mode is
    /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation 
    /// as well.
    /// </remarks>
    public sealed class JapaneseTokenizer : Tokenizer
    {
        // LUCENENET specific: de-nested Mode and renamed JapaneseTokenizerMode

        /// <summary>
        /// Default tokenization mode. Currently this is <see cref="JapaneseTokenizerMode.SEARCH"/>.
        /// </summary>
        public static readonly JapaneseTokenizerMode DEFAULT_MODE = JapaneseTokenizerMode.SEARCH;

        // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType


#pragma warning disable CA1802 // Use literals where appropriate
        private static readonly bool VERBOSE = false; // For debugging
#pragma warning restore CA1802 // Use literals where appropriate

        private const int SEARCH_MODE_KANJI_LENGTH = 2;

        private const int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH

        private const int SEARCH_MODE_KANJI_PENALTY = 3000;

        private const int SEARCH_MODE_OTHER_PENALTY = 1700;

        // For safety:
        private const int MAX_UNKNOWN_WORD_LENGTH = 1024;
        private const int MAX_BACKTRACE_GAP = 1024;

        private readonly IDictionary<JapaneseTokenizerType, IDictionary> dictionaryMap = new Dictionary<JapaneseTokenizerType, IDictionary>();

        private readonly TokenInfoFST fst;
        private readonly TokenInfoDictionary dictionary;
        private readonly UnknownDictionary unkDictionary;
        private readonly ConnectionCosts costs;
        private readonly UserDictionary userDictionary;
        private readonly CharacterDefinition characterDefinition;

        private readonly FST.Arc<long?> arc = new FST.Arc<long?>();
        private readonly FST.BytesReader fstReader;
        private readonly Int32sRef wordIdRef = new Int32sRef();

        private readonly FST.BytesReader userFSTReader;
        private readonly TokenInfoFST userFST;

        private readonly RollingCharBuffer buffer = new RollingCharBuffer();

        private readonly WrappedPositionArray positions = new WrappedPositionArray();

        private readonly bool discardPunctuation;
        private readonly bool searchMode;
        private readonly bool extendedMode;
        private readonly bool outputCompounds;

        // Index of the last character of unknown word:
        private int unknownWordEndIndex = -1;

        // True once we've hit the EOF from the input reader:
        private bool end;

        // Last absolute position we backtraced from:
        private int lastBackTracePos;

        // Position of last token we returned; we use this to
        // figure out whether to set posIncr to 0 or 1:
        private int lastTokenPos;

        // Next absolute position to process:
        private int pos;

        // Already parsed, but not yet passed to caller, tokens:
        private readonly JCG.List<Token> pending = new JCG.List<Token>();

        private readonly ICharTermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly IPositionIncrementAttribute posIncAtt;
        private readonly IPositionLengthAttribute posLengthAtt;
        private readonly IBaseFormAttribute basicFormAtt;
        private readonly IPartOfSpeechAttribute posAtt;
        private readonly IReadingAttribute readingAtt;
        private readonly IInflectionAttribute inflectionAtt;

        /// <summary>
        /// Create a new JapaneseTokenizer.
        /// <para/>
        /// Uses the default AttributeFactory.
        /// </summary>
        /// <param name="input">TextReader containing text.</param>
        /// <param name="userDictionary">Optional: if non-null, user dictionary.</param>
        /// <param name="discardPunctuation"><c>true</c> if punctuation tokens should be dropped from the output.</param>
        /// <param name="mode">Tokenization mode.</param>
        public JapaneseTokenizer(TextReader input, UserDictionary userDictionary, bool discardPunctuation, JapaneseTokenizerMode mode)
            : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode)
        {
        }

        /// <summary>
        /// Create a new JapaneseTokenizer.
        /// </summary>
        /// <param name="factory">The AttributeFactory to use.</param>
        /// <param name="input">TextReader containing text.</param>
        /// <param name="userDictionary">Optional: if non-null, user dictionary.</param>
        /// <param name="discardPunctuation"><c>true</c> if punctuation tokens should be dropped from the output.</param>
        /// <param name="mode">Tokenization mode.</param>
        public JapaneseTokenizer
            (AttributeFactory factory, TextReader input, UserDictionary userDictionary, bool discardPunctuation, JapaneseTokenizerMode mode)
            : base(factory, input)
        {
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLengthAtt = AddAttribute<IPositionLengthAttribute>();
            this.basicFormAtt = AddAttribute<IBaseFormAttribute>();
            this.posAtt = AddAttribute<IPartOfSpeechAttribute>();
            this.readingAtt = AddAttribute<IReadingAttribute>();
            this.inflectionAtt = AddAttribute<IInflectionAttribute>();

            dictionary = TokenInfoDictionary.Instance;
            fst = dictionary.FST;
            unkDictionary = UnknownDictionary.Instance;
            characterDefinition = unkDictionary.CharacterDefinition;
            this.userDictionary = userDictionary;
            costs = ConnectionCosts.Instance;
            fstReader = fst.GetBytesReader();
            if (userDictionary != null)
            {
                userFST = userDictionary.FST;
                userFSTReader = userFST.GetBytesReader();
            }
            else
            {
                userFST = null;
                userFSTReader = null;
            }
            this.discardPunctuation = discardPunctuation;
            switch (mode)
            {
                case JapaneseTokenizerMode.SEARCH:
                    searchMode = true;
                    extendedMode = false;
                    outputCompounds = true;
                    break;
                case JapaneseTokenizerMode.EXTENDED:
                    searchMode = true;
                    extendedMode = true;
                    outputCompounds = false;
                    break;
                default:
                    searchMode = false;
                    extendedMode = false;
                    outputCompounds = false;
                    break;
            }
            buffer.Reset(this.m_input);

            ResetState();

            dictionaryMap[JapaneseTokenizerType.KNOWN] = dictionary;
            dictionaryMap[JapaneseTokenizerType.UNKNOWN] = unkDictionary;
            dictionaryMap[JapaneseTokenizerType.USER] = userDictionary;
        }

        private GraphvizFormatter dotOut;

        // LUCENENET specific - added getter and made into property
        // so we can set this during object initialization.

        /// <summary>
        /// Expert: set this to produce graphviz (dot) output of
        /// the Viterbi lattice
        /// </summary>
        public GraphvizFormatter GraphvizFormatter
        {
            get => this.dotOut;
            set => this.dotOut = value;
        }

        protected override void Dispose(bool disposing)
        {
            base.Dispose(disposing);
            if (disposing)
            {
                buffer.Reset(m_input);
            }
        }

        public override void Reset()
        {
            base.Reset();
            buffer.Reset(m_input);
            ResetState();
        }

        private void ResetState()
        {
            positions.Reset();
            unknownWordEndIndex = -1;
            pos = 0;
            end = false;
            lastBackTracePos = 0;
            lastTokenPos = -1;
            pending.Clear();

            // Add BOS:
            positions.Get(0).Add(0, 0, -1, -1, -1, JapaneseTokenizerType.KNOWN);
        }

        public override void End()
        {
            base.End();
            // Set final offset
            int finalOffset = CorrectOffset(pos);
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        // Returns the added cost that a 2nd best segmentation is
        // allowed to have.  Ie, if we see path with cost X,
        // ending in a compound word, and this method returns
        // threshold > 0, then we will also find the 2nd best
        // segmentation and if its path score is within this
        // threshold of X, we'll include it in the output:
        private int ComputeSecondBestThreshold(int pos, int length)
        {
            // TODO: maybe we do something else here, instead of just
            // using the penalty...?  EG we can be more aggressive on
            // when to also test for 2nd best path
            return ComputePenalty(pos, length);
        }

        private int ComputePenalty(int pos, int length)
        {
            if (length > SEARCH_MODE_KANJI_LENGTH)
            {
                bool allKanji = true;
                // check if node consists of only kanji
                int endPos = pos + length;
                for (int pos2 = pos; pos2 < endPos; pos2++)
                {
                    if (!characterDefinition.IsKanji((char)buffer.Get(pos2)))
                    {
                        allKanji = false;
                        break;
                    }
                }
                if (allKanji)
                {  // Process only Kanji keywords
                    return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
                }
                else if (length > SEARCH_MODE_OTHER_LENGTH)
                {
                    return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
                }
            }
            return 0;
        }

        // LUCENENET specific - de-nested Position class

        private void Add(IDictionary dict, Position fromPosData, int endPos, int wordID, JapaneseTokenizerType type, bool addPenalty)
        {
            int wordCost = dict.GetWordCost(wordID);
            int leftID = dict.GetLeftId(wordID);
            int leastCost = int.MaxValue;
            int leastIDX = -1;
            if (Debugging.AssertsEnabled) Debugging.Assert(fromPosData.count > 0);
            for (int idx = 0; idx < fromPosData.count; idx++)
            {
                // Cost is path cost so far, plus word cost (added at
                // end of loop), plus bigram cost:
                int cost = fromPosData.costs[idx] + costs.Get(fromPosData.lastRightID[idx], leftID);
                if (VERBOSE)
                {
                    Console.WriteLine("      fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.Get(fromPosData.lastRightID[idx], leftID) + " leftID=" + leftID);
                }
                if (cost < leastCost)
                {
                    leastCost = cost;
                    leastIDX = idx;
                    if (VERBOSE)
                    {
                        Console.WriteLine("        **");
                    }
                }
            }

            leastCost += wordCost;

            if (VERBOSE)
            {
                Console.WriteLine("      + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.Get(endPos).count);
            }

            if ((addPenalty || (!outputCompounds && searchMode)) && type != JapaneseTokenizerType.USER)
            {
                int penalty = ComputePenalty(fromPosData.pos, endPos - fromPosData.pos);
                if (VERBOSE)
                {
                    if (penalty > 0)
                    {
                        Console.WriteLine("        + penalty=" + penalty + " cost=" + (leastCost + penalty));
                    }
                }
                leastCost += penalty;
            }

            //positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX, wordID, type);
            if (Debugging.AssertsEnabled) Debugging.Assert(leftID == dict.GetRightId(wordID));
            positions.Get(endPos).Add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type);
        }

        public override bool IncrementToken()
        {

            // parse() is able to return w/o producing any new
            // tokens, when the tokens it had produced were entirely
            // punctuation.  So we loop here until we get a real
            // token or we end:
            while (pending.Count == 0)
            {
                if (end)
                {
                    return false;
                }

                // Push Viterbi forward some more:
                Parse();
            }

            Token token = pending[pending.Count - 1]; // LUCENENET: The above loop ensures we don't get here unless we have at least 1 item
            if (token != null)
            {
                pending.Remove(token);
            }

            int position = token.Position;
            int length = token.Length;
            ClearAttributes();
            if (Debugging.AssertsEnabled) Debugging.Assert(length > 0);
            //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
            termAtt.CopyBuffer(token.SurfaceForm, token.Offset, length);
            offsetAtt.SetOffset(CorrectOffset(position), CorrectOffset(position + length));
            basicFormAtt.SetToken(token);
            posAtt.SetToken(token);
            readingAtt.SetToken(token);
            inflectionAtt.SetToken(token);
            if (token.Position == lastTokenPos)
            {
                posIncAtt.PositionIncrement = 0;
                posLengthAtt.PositionLength = token.PositionLength;
            }
            else
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(token.Position > lastTokenPos);
                posIncAtt.PositionIncrement = 1;
                posLengthAtt.PositionLength = 1;
            }
            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ":    incToken: return token=" + token);
            }
            lastTokenPos = token.Position;
            return true;
        }

        /// <summary>
        /// Incrementally parse some more characters.  This runs
        /// the viterbi search forwards "enough" so that we
        /// generate some more tokens.  How much forward depends on
        /// the chars coming in, since some chars could cause
        /// longer-lasting ambiguity in the parsing.  Once the
        /// ambiguity is resolved, then we back trace, produce
        /// the pending tokens, and return.
        /// </summary>
        private void Parse()
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nPARSE");
            }

            // Advances over each position (character):
            while (true)
            {

                if (buffer.Get(pos) == -1)
                {
                    // End
                    break;
                }

                Position posData = positions.Get(pos);
                bool isFrontier = positions.GetNextPos() == pos + 1;

                if (posData.count == 0)
                {
                    // No arcs arrive here; move to next position:
                    if (VERBOSE)
                    {
                        Console.WriteLine("    no arcs in; skip pos=" + pos);
                    }
                    pos++;
                    continue;
                }

                if (pos > lastBackTracePos && posData.count == 1 && isFrontier)
                {
                    //  if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
                    // We are at a "frontier", and only one node is
                    // alive, so whatever the eventual best path is must
                    // come through this node.  So we can safely commit
                    // to the prefix of the best path at this point:
                    Backtrace(posData, 0);

                    // Re-base cost so we don't risk int overflow:
                    posData.costs[0] = 0;

                    if (pending.Count != 0)
                    {
                        return;
                    }
                    else
                    {
                        // This means the backtrace only produced
                        // punctuation tokens, so we must keep parsing.
                    }
                }

                if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP)
                {
                    // Safety: if we've buffered too much, force a
                    // backtrace now.  We find the least-cost partial
                    // path, across all paths, backtrace from it, and
                    // then prune all others.  Note that this, in
                    // general, can produce the wrong result, if the
                    // total best path did not in fact back trace
                    // through this partial best path.  But it's the
                    // best we can do... (short of not having a
                    // safety!).

                    // First pass: find least cost partial path so far,
                    // including ending at future positions:
                    int leastIDX = -1;
                    int leastCost = int.MaxValue;
                    Position leastPosData = null;
                    for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
                    {
                        Position posData2 = positions.Get(pos2);
                        for (int idx = 0; idx < posData2.count; idx++)
                        {
                            //System.out.println("    idx=" + idx + " cost=" + cost);
                            int cost = posData2.costs[idx];
                            if (cost < leastCost)
                            {
                                leastCost = cost;
                                leastIDX = idx;
                                leastPosData = posData2;
                            }
                        }
                    }

                    // We will always have at least one live path:
                    if (Debugging.AssertsEnabled) Debugging.Assert(leastIDX != -1);

                    // Second pass: prune all but the best path:
                    for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
                    {
                        Position posData2 = positions.Get(pos2);
                        if (posData2 != leastPosData)
                        {
                            posData2.Reset();
                        }
                        else
                        {
                            if (leastIDX != 0)
                            {
                                posData2.costs[0] = posData2.costs[leastIDX];
                                posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
                                posData2.backPos[0] = posData2.backPos[leastIDX];
                                posData2.backIndex[0] = posData2.backIndex[leastIDX];
                                posData2.backID[0] = posData2.backID[leastIDX];
                                posData2.backType[0] = posData2.backType[leastIDX];
                            }
                            posData2.count = 1;
                        }
                    }

                    Backtrace(leastPosData, 0);

                    // Re-base cost so we don't risk int overflow:
                    Arrays.Fill(leastPosData.costs, 0, leastPosData.count, 0);

                    if (pos != leastPosData.pos)
                    {
                        // We jumped into a future position:
                        if (Debugging.AssertsEnabled) Debugging.Assert(pos < leastPosData.pos);
                        pos = leastPosData.pos;
                    }

                    if (pending.Count != 0)
                    {
                        return;
                    }
                    else
                    {
                        // This means the backtrace only produced
                        // punctuation tokens, so we must keep parsing.
                        continue;
                    }
                }

                if (VERBOSE)
                {
                    Console.WriteLine("\n  extend @ pos=" + pos + " char=" + (char)buffer.Get(pos));
                }

                if (VERBOSE)
                {
                    Console.WriteLine("    " + posData.count + " arcs in");
                }

                bool anyMatches = false;

                // First try user dict:
                if (userFST != null)
                {
                    userFST.GetFirstArc(arc);
                    int output = 0;
                    for (int posAhead = posData.pos; ; posAhead++)
                    {
                        int ch = buffer.Get(posAhead);
                        if (ch == -1)
                        {
                            break;
                        }
                        if (userFST.FindTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null)
                        {
                            break;
                        }
                        output += (int)arc.Output;
                        if (arc.IsFinal)
                        {
                            if (VERBOSE)
                            {
                                Console.WriteLine("    USER word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
                            }
                            Add(userDictionary, posData, posAhead + 1, output + (int)arc.NextFinalOutput, JapaneseTokenizerType.USER, false);
                            anyMatches = true;
                        }
                    }
                }

                // TODO: we can be more aggressive about user
                // matches?  if we are "under" a user match then don't
                // extend KNOWN/UNKNOWN paths?

                if (!anyMatches)
                {
                    // Next, try known dictionary matches
                    fst.GetFirstArc(arc);
                    int output = 0;

                    for (int posAhead = posData.pos; ; posAhead++)
                    {
                        int ch = buffer.Get(posAhead);
                        if (ch == -1)
                        {
                            break;
                        }
                        //System.out.println("    match " + (char) ch + " posAhead=" + posAhead);

                        if (fst.FindTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null)
                        {
                            break;
                        }

                        output += (int)arc.Output;

                        // Optimization: for known words that are too-long
                        // (compound), we should pre-compute the 2nd
                        // best segmentation and store it in the
                        // dictionary instead of recomputing it each time a
                        // match is found.

                        if (arc.IsFinal)
                        {
                            dictionary.LookupWordIds(output + (int)arc.NextFinalOutput, wordIdRef);
                            if (VERBOSE)
                            {
                                Console.WriteLine("    KNOWN word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.Length + " wordIDs");
                            }
                            for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                            {
                                Add(dictionary, posData, posAhead + 1, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.KNOWN, false);
                                anyMatches = true;
                            }
                        }
                    }
                }

                // In the case of normal mode, it doesn't process unknown word greedily.

                if (!searchMode && unknownWordEndIndex > posData.pos)
                {
                    pos++;
                    continue;
                }

                char firstCharacter = (char)buffer.Get(pos);
                if (!anyMatches || characterDefinition.IsInvoke(firstCharacter))
                {

                    // Find unknown match:
                    int characterId = characterDefinition.GetCharacterClass(firstCharacter);
                    bool isPunct = IsPunctuation(firstCharacter);

                    // NOTE: copied from UnknownDictionary.lookup:
                    int unknownWordLength;
                    if (!characterDefinition.IsGroup(firstCharacter))
                    {
                        unknownWordLength = 1;
                    }
                    else
                    {
                        // Extract unknown word. Characters with the same character class are considered to be part of unknown word
                        unknownWordLength = 1;
                        for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++)
                        {
                            int ch = buffer.Get(posAhead);
                            if (ch == -1)
                            {
                                break;
                            }
                            if (characterId == characterDefinition.GetCharacterClass((char)ch) &&
                                IsPunctuation((char)ch) == isPunct)
                            {
                                unknownWordLength++;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }

                    unkDictionary.LookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
                    if (VERBOSE)
                    {
                        Console.WriteLine("    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.Length + " wordIDs");
                    }
                    for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                    {
                        Add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.UNKNOWN, false);
                    }

                    unknownWordEndIndex = posData.pos + unknownWordLength;
                }

                pos++;
            }

            end = true;

            if (pos > 0)
            {

                Position endPosData = positions.Get(pos);
                int leastCost = int.MaxValue;
                int leastIDX = -1;
                if (VERBOSE)
                {
                    Console.WriteLine("  end: " + endPosData.count + " nodes");
                }
                for (int idx = 0; idx < endPosData.count; idx++)
                {
                    // Add EOS cost:
                    int cost = endPosData.costs[idx] + costs.Get(endPosData.lastRightID[idx], 0);
                    //System.out.println("    idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
                    if (cost < leastCost)
                    {
                        leastCost = cost;
                        leastIDX = idx;
                    }
                }

                Backtrace(endPosData, leastIDX);
            }
            else
            {
                // No characters in the input string; return no tokens!
            }
        }

        // Eliminates arcs from the lattice that are compound
        // tokens (have a penalty) or are not congruent with the
        // compound token we've matched (ie, span across the
        // startPos).  This should be fairly efficient, because we
        // just keep the already intersected structure of the
        // graph, eg we don't have to consult the FSTs again:

        private void PruneAndRescore(int startPos, int endPos, int bestStartIDX)
        {
            if (VERBOSE)
            {
                Console.WriteLine("  pruneAndRescore startPos=" + startPos + " endPos=" + endPos + " bestStartIDX=" + bestStartIDX);
            }

            // First pass: walk backwards, building up the forward
            // arcs and pruning inadmissible arcs:
            for (int pos = endPos; pos > startPos; pos--)
            {
                Position posData = positions.Get(pos);
                if (VERBOSE)
                {
                    Console.WriteLine("    back pos=" + pos);
                }
                for (int arcIDX = 0; arcIDX < posData.count; arcIDX++)
                {
                    int backPos = posData.backPos[arcIDX];
                    if (backPos >= startPos)
                    {
                        // Keep this arc:
                        //System.out.println("      keep backPos=" + backPos);
                        positions.Get(backPos).AddForward(pos,
                                                          arcIDX,
                                                          posData.backID[arcIDX],
                                                          posData.backType[arcIDX]);
                    }
                    else
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("      prune");
                        }
                    }
                }
                if (pos != startPos)
                {
                    posData.count = 0;
                }
            }

            // Second pass: walk forward, re-scoring:
            for (int pos = startPos; pos < endPos; pos++)
            {
                Position posData = positions.Get(pos);
                if (VERBOSE)
                {
                    Console.WriteLine("    forward pos=" + pos + " count=" + posData.forwardCount);
                }
                if (posData.count == 0)
                {
                    // No arcs arrive here...
                    if (VERBOSE)
                    {
                        Console.WriteLine("      skip");
                    }
                    posData.forwardCount = 0;
                    continue;
                }

                if (pos == startPos)
                {
                    // On the initial position, only consider the best
                    // path so we "force congruence":  the
                    // sub-segmentation is "in context" of what the best
                    // path (compound token) had matched:
                    int rightID;
                    if (startPos == 0)
                    {
                        rightID = 0;
                    }
                    else
                    {
                        rightID = GetDict(posData.backType[bestStartIDX]).GetRightId(posData.backID[bestStartIDX]);
                    }
                    int pathCost = posData.costs[bestStartIDX];
                    for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++)
                    {
                        JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX];
                        IDictionary dict2 = GetDict(forwardType);
                        int wordID = posData.forwardID[forwardArcIDX];
                        int toPos = posData.forwardPos[forwardArcIDX];
                        int newCost = pathCost + dict2.GetWordCost(wordID) +
                          costs.Get(rightID, dict2.GetLeftId(wordID)) +
                          ComputePenalty(pos, toPos - pos);
                        if (VERBOSE)
                        {
                            Console.WriteLine("      + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + ComputePenalty(pos, toPos - pos) + " toPos.idx=" + positions.Get(toPos).count);
                        }
                        positions.Get(toPos).Add(newCost,
                                                 dict2.GetRightId(wordID),
                                                 pos,
                                                 bestStartIDX,
                                                 wordID,
                                                 forwardType);
                    }
                }
                else
                {
                    // On non-initial positions, we maximize score
                    // across all arriving lastRightIDs:
                    for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++)
                    {
                        JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX];
                        int toPos = posData.forwardPos[forwardArcIDX];
                        if (VERBOSE)
                        {
                            Console.WriteLine("      + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos);
                        }
                        Add(GetDict(forwardType),
                            posData,
                            toPos,
                            posData.forwardID[forwardArcIDX],
                            forwardType,
                            true);
                    }
                }
                posData.forwardCount = 0;
            }
        }

        // Backtrace from the provided position, back to the last
        // time we back-traced, accumulating the resulting tokens to
        // the pending list.  The pending list is then in-reverse
        // (last token should be returned first).
        private void Backtrace(Position endPosData, int fromIDX)
        {
            int endPos = endPosData.pos;

            if (VERBOSE)
            {
                Console.WriteLine("\n  backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
            }

            char[] fragment = buffer.Get(lastBackTracePos, endPos - lastBackTracePos);

            if (dotOut != null)
            {
                dotOut.OnBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
            }

            int pos = endPos;
            int bestIDX = fromIDX;
            Token altToken = null;

            // We trace backwards, so this will be the leftWordID of
            // the token after the one we are now on:
            int lastLeftWordID = -1;

            int backCount = 0;

            // TODO: sort of silly to make Token instances here; the
            // back trace has all info needed to generate the
            // token.  So, we could just directly set the attrs,
            // from the backtrace, in incrementToken w/o ever
            // creating Token; we'd have to defer calling freeBefore
            // until after the backtrace was fully "consumed" by
            // incrementToken.

            while (pos > lastBackTracePos)
            {
                //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
                Position posData = positions.Get(pos);
                if (Debugging.AssertsEnabled) Debugging.Assert(bestIDX < posData.count);

                int backPos = posData.backPos[bestIDX];
                if (Debugging.AssertsEnabled) Debugging.Assert(backPos >= lastBackTracePos,"backPos={0} vs lastBackTracePos={1}", backPos, lastBackTracePos);
                int length = pos - backPos;
                JapaneseTokenizerType backType = posData.backType[bestIDX];
                int backID = posData.backID[bestIDX];
                int nextBestIDX = posData.backIndex[bestIDX];

                if (outputCompounds && searchMode && altToken == null && backType != JapaneseTokenizerType.USER)
                {

                    // In searchMode, if best path had picked a too-long
                    // token, we use the "penalty" to compute the allowed
                    // max cost of an alternate back-trace.  If we find an
                    // alternate back trace with cost below that
                    // threshold, we pursue it instead (but also output
                    // the long token).
                    //System.out.println("    2nd best backPos=" + backPos + " pos=" + pos);

                    int penalty = ComputeSecondBestThreshold(backPos, pos - backPos);

                    if (penalty > 0)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("  compound=" + new string(buffer.Get(backPos, pos - backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
                        }

                        // Use the penalty to set maxCost on the 2nd best
                        // segmentation:
                        int maxCost = posData.costs[bestIDX] + penalty;
                        if (lastLeftWordID != -1)
                        {
                            maxCost += costs.Get(GetDict(backType).GetRightId(backID), lastLeftWordID);
                        }

                        // Now, prune all too-long tokens from the graph:
                        PruneAndRescore(backPos, pos,
                                        posData.backIndex[bestIDX]);

                        // Finally, find 2nd best back-trace and resume
                        // backtrace there:
                        int leastCost = int.MaxValue;
                        int leastIDX = -1;
                        for (int idx = 0; idx < posData.count; idx++)
                        {
                            int cost = posData.costs[idx];
                            //System.out.println("    idx=" + idx + " prevCost=" + cost);

                            if (lastLeftWordID != -1)
                            {
                                cost += costs.Get(GetDict(posData.backType[idx]).GetRightId(posData.backID[idx]),
                                                  lastLeftWordID);
                                //System.out.println("      += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                                //lastLeftWordID) + " -> " + cost);
                            }
                            //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
                            //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
                            if (cost < leastCost)
                            {
                                //System.out.println("      ** ");
                                leastCost = cost;
                                leastIDX = idx;
                            }
                        }
                        //System.out.println("  leastIDX=" + leastIDX);

                        if (VERBOSE)
                        {
                            Console.WriteLine("  afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
                        }

                        if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos)
                        {
                            // We should have pruned the altToken from the graph:
                            if (Debugging.AssertsEnabled) Debugging.Assert(posData.backPos[leastIDX] != backPos);

                            // Save the current compound token, to output when
                            // this alternate path joins back:
                            altToken = new Token(backID,
                                                 fragment,
                                                 backPos - lastBackTracePos,
                                                 length,
                                                 backType,
                                                 backPos,
                                                 GetDict(backType));

                            // Redirect our backtrace to 2nd best:
                            bestIDX = leastIDX;
                            nextBestIDX = posData.backIndex[bestIDX];

                            backPos = posData.backPos[bestIDX];
                            length = pos - backPos;
                            backType = posData.backType[bestIDX];
                            backID = posData.backID[bestIDX];
                            backCount = 0;
                            //System.out.println("  do alt token!");

                        }
                        else
                        {
                            // I think in theory it's possible there is no
                            // 2nd best path, which is fine; in this case we
                            // only output the compound token:
                            //System.out.println("  no alt token! bestIDX=" + bestIDX);
                        }
                    }
                }

                int offset = backPos - lastBackTracePos;
                if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0);

                if (altToken != null && altToken.Position >= backPos)
                {

                    // We've backtraced to the position where the
                    // compound token starts; add it now:

                    // The pruning we did when we created the altToken
                    // ensures that the back trace will align back with
                    // the start of the altToken:
                    if (Debugging.AssertsEnabled) Debugging.Assert(altToken.Position == backPos, "{0} vs {1}", altToken.Position, backPos);

                    // NOTE: not quite right: the compound token may
                    // have had all punctuation back traced so far, but
                    // then the decompounded token at this position is
                    // not punctuation.  In this case backCount is 0,
                    // but we should maybe add the altToken anyway...?

                    if (backCount > 0)
                    {
                        backCount++;
                        altToken.PositionLength = backCount;
                        if (VERBOSE)
                        {
                            Console.WriteLine("    add altToken=" + altToken);
                        }
                        pending.Add(altToken);
                    }
                    else
                    {
                        // This means alt token was all punct tokens:
                        if (VERBOSE)
                        {
                            Console.WriteLine("    discard all-punctuation altToken=" + altToken);
                        }
                        if (Debugging.AssertsEnabled) Debugging.Assert(discardPunctuation);
                    }
                    altToken = null;
                }

                IDictionary dict = GetDict(backType);

                if (backType == JapaneseTokenizerType.USER)
                {

                    // Expand the phraseID we recorded into the actual
                    // segmentation:
                    int[] wordIDAndLength = userDictionary.LookupSegmentation(backID);
                    int wordID = wordIDAndLength[0];
                    int current = 0;
                    for (int j = 1; j < wordIDAndLength.Length; j++)
                    {
                        int len = wordIDAndLength[j];
                        //System.out.println("    add user: len=" + len);
                        pending.Add(new Token(wordID + j - 1,
                                              fragment,
                                              current + offset,
                                              len,
                                              JapaneseTokenizerType.USER,
                                              current + backPos,
                                              dict));
                        if (VERBOSE)
                        {
                            Console.WriteLine("    add USER token=" + pending[pending.Count - 1]);
                        }
                        current += len;
                    }

                    // Reverse the tokens we just added, because when we
                    // serve them up from incrementToken we serve in
                    // reverse:
                    pending.Reverse(pending.Count - (wordIDAndLength.Length - 1),
                        wordIDAndLength.Length - 1); // LUCENENET: Converted from reverse on SubList to reverse on List<T> and converted end index to length


                    backCount += wordIDAndLength.Length - 1;
                }
                else
                {

                    if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN)
                    {
                        // In EXTENDED mode we convert unknown word into
                        // unigrams:
                        int unigramTokenCount = 0;
                        for (int i = length - 1; i >= 0; i--)
                        {
                            int charLen = 1;
                            if (i > 0 && char.IsLowSurrogate(fragment[offset + i]))
                            {
                                i--;
                                charLen = 2;
                            }
                            //System.out.println("    extended tok offset="
                            //+ (offset + i));
                            if (!discardPunctuation || !IsPunctuation(fragment[offset + i]))
                            {
                                pending.Add(new Token(CharacterDefinition.NGRAM,
                                                      fragment,
                                                      offset + i,
                                                      charLen,
                                                      JapaneseTokenizerType.UNKNOWN,
                                                      backPos + i,
                                                      unkDictionary));
                                unigramTokenCount++;
                            }
                        }
                        backCount += unigramTokenCount;

                    }
                    else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset]))
                    {
                        pending.Add(new Token(backID,
                                              fragment,
                                              offset,
                                              length,
                                              backType,
                                              backPos,
                                              dict));
                        if (VERBOSE)
                        {
                            Console.WriteLine("    add token=" + pending[pending.Count - 1]);
                        }
                        backCount++;
                    }
                    else
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("    skip punctuation token=" + new string(fragment, offset, length));
                        }
                    }
                }

                lastLeftWordID = dict.GetLeftId(backID);
                pos = backPos;
                bestIDX = nextBestIDX;
            }

            lastBackTracePos = endPos;

            if (VERBOSE)
            {
                Console.WriteLine("  freeBefore pos=" + endPos);
            }
            // Notify the circular buffers that we are done with
            // these positions:
            buffer.FreeBefore(endPos);
            positions.FreeBefore(endPos);
        }

        internal IDictionary GetDict(JapaneseTokenizerType type)
        {
            dictionaryMap.TryGetValue(type, out IDictionary result);
            return result;
        }

        private static bool IsPunctuation(char ch)
        {
            switch (Character.GetType(ch))
            {
                case UnicodeCategory.SpaceSeparator:
                case UnicodeCategory.LineSeparator:
                case UnicodeCategory.ParagraphSeparator:
                case UnicodeCategory.Control:
                case UnicodeCategory.Format:
                case UnicodeCategory.DashPunctuation:
                case UnicodeCategory.OpenPunctuation:
                case UnicodeCategory.ClosePunctuation:
                case UnicodeCategory.ConnectorPunctuation:
                case UnicodeCategory.OtherPunctuation:
                case UnicodeCategory.MathSymbol:
                case UnicodeCategory.CurrencySymbol:
                case UnicodeCategory.ModifierSymbol:
                case UnicodeCategory.OtherSymbol:
                case UnicodeCategory.InitialQuotePunctuation:
                case UnicodeCategory.FinalQuotePunctuation:
                    return true;
                default:
                    return false;
            }
        }
    }

    // LUCENENET specific - de-nested Mode and renamed JapaneseTokenizerMode

    /// <summary>
    /// Tokenization mode: this determines how the tokenizer handles
    /// compound and unknown words.
    /// </summary>
    public enum JapaneseTokenizerMode
    {
        /// <summary>
        /// Ordinary segmentation: no decomposition for compounds,
        /// </summary>
        NORMAL,

        /// <summary>
        /// Segmentation geared towards search: this includes a 
        /// decompounding process for long nouns, also including
        /// the full compound token as a synonym.
        /// </summary>
        SEARCH,

        /// <summary>
        /// Extended mode outputs unigrams for unknown words.
        /// <para/>
        /// @lucene.experimental
        /// </summary>
        EXTENDED
    }

    // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType

    /// <summary>
    /// Token type reflecting the original source of this token
    /// </summary>
    public enum JapaneseTokenizerType
    {
        /// <summary>
        /// Known words from the system dictionary.
        /// </summary>
        KNOWN,
        /// <summary>
        /// Unknown words (heuristically segmented).
        /// </summary>
        UNKNOWN,
        /// <summary>
        /// Known words from the user dictionary.
        /// </summary>
        USER
    }


    // LUCENENET specific - De-nested Position

    // Holds all back pointers arriving to this position:
    internal sealed class Position
    {

        internal int pos;

        internal int count;

        // maybe single int array * 5?
        internal int[] costs = new int[8];
        internal int[] lastRightID = new int[8];
        internal int[] backPos = new int[8];
        internal int[] backIndex = new int[8];
        internal int[] backID = new int[8];
        internal JapaneseTokenizerType[] backType = new JapaneseTokenizerType[8];

        // Only used when finding 2nd best segmentation under a
        // too-long token:
        internal int forwardCount;
        internal int[] forwardPos = new int[8];
        internal int[] forwardID = new int[8];
        internal int[] forwardIndex = new int[8];
        internal JapaneseTokenizerType[] forwardType = new JapaneseTokenizerType[8];

        public void Grow()
        {
            costs = ArrayUtil.Grow(costs, 1 + count);
            lastRightID = ArrayUtil.Grow(lastRightID, 1 + count);
            backPos = ArrayUtil.Grow(backPos, 1 + count);
            backIndex = ArrayUtil.Grow(backIndex, 1 + count);
            backID = ArrayUtil.Grow(backID, 1 + count);

            // NOTE: sneaky: grow separately because
            // ArrayUtil.grow will otherwise pick a different
            // length than the int[]s we just grew:
            JapaneseTokenizerType[] newBackType = new JapaneseTokenizerType[backID.Length];
            System.Array.Copy(backType, 0, newBackType, 0, backType.Length);
            backType = newBackType;
        }

        public void GrowForward()
        {
            forwardPos = ArrayUtil.Grow(forwardPos, 1 + forwardCount);
            forwardID = ArrayUtil.Grow(forwardID, 1 + forwardCount);
            forwardIndex = ArrayUtil.Grow(forwardIndex, 1 + forwardCount);

            // NOTE: sneaky: grow separately because
            // ArrayUtil.grow will otherwise pick a different
            // length than the int[]s we just grew:
            JapaneseTokenizerType[] newForwardType = new JapaneseTokenizerType[forwardPos.Length];
            System.Array.Copy(forwardType, 0, newForwardType, 0, forwardType.Length);
            forwardType = newForwardType;
        }

        public void Add(int cost, int lastRightID, int backPos, int backIndex, int backID, JapaneseTokenizerType backType)
        {
            // NOTE: this isn't quite a true Viterbi search,
            // because we should check if lastRightID is
            // already present here, and only update if the new
            // cost is less than the current cost, instead of
            // simply appending.  However, that will likely hurt
            // performance (usually we add a lastRightID only once),
            // and it means we actually create the full graph
            // intersection instead of a "normal" Viterbi lattice:
            if (count == costs.Length)
            {
                Grow();
            }
            this.costs[count] = cost;
            this.lastRightID[count] = lastRightID;
            this.backPos[count] = backPos;
            this.backIndex[count] = backIndex;
            this.backID[count] = backID;
            this.backType[count] = backType;
            count++;
        }

        public void AddForward(int forwardPos, int forwardIndex, int forwardID, JapaneseTokenizerType forwardType)
        {
            if (forwardCount == this.forwardID.Length)
            {
                GrowForward();
            }
            this.forwardPos[forwardCount] = forwardPos;
            this.forwardIndex[forwardCount] = forwardIndex;
            this.forwardID[forwardCount] = forwardID;
            this.forwardType[forwardCount] = forwardType;
            forwardCount++;
        }

        public void Reset()
        {
            count = 0;
            // forwardCount naturally resets after it runs:
            if (Debugging.AssertsEnabled) Debugging.Assert(forwardCount == 0,"pos={0} forwardCount={1}", pos, forwardCount);
        }
    }


    // LUCENENET specific - de-nested WrappedPositionArray

    // TODO: make generic'd version of this "circular array"?
    // It's a bit tricky because we do things to the Position
    // (eg, set .pos = N on reuse)...
    internal sealed class WrappedPositionArray
    {
        private Position[] positions = new Position[8];

        public WrappedPositionArray()
        {
            for (int i = 0; i < positions.Length; i++)
            {
                positions[i] = new Position();
            }
        }

        // Next array index to write to in positions:
        private int nextWrite;

        // Next position to write:
        private int nextPos;

        // How many valid Position instances are held in the
        // positions array:
        private int count;

        public void Reset()
        {
            nextWrite--;
            while (count > 0)
            {
                if (nextWrite == -1)
                {
                    nextWrite = positions.Length - 1;
                }
                positions[nextWrite--].Reset();
                count--;
            }
            nextWrite = 0;
            nextPos = 0;
            count = 0;
        }

        /// <summary>
        /// Get Position instance for this absolute position;
        /// this is allowed to be arbitrarily far "in the
        /// future" but cannot be before the last freeBefore.
        /// </summary>
        public Position Get(int pos)
        {
            while (pos >= nextPos)
            {
                //System.out.println("count=" + count + " vs len=" + positions.length);
                if (count == positions.Length)
                {
                    Position[] newPositions = new Position[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
                    //System.out.println("grow positions " + newPositions.length);
                    System.Array.Copy(positions, nextWrite, newPositions, 0, positions.Length - nextWrite);
                    System.Array.Copy(positions, 0, newPositions, positions.Length - nextWrite, nextWrite);
                    for (int i = positions.Length; i < newPositions.Length; i++)
                    {
                        newPositions[i] = new Position();
                    }
                    nextWrite = positions.Length;
                    positions = newPositions;
                }
                if (nextWrite == positions.Length)
                {
                    nextWrite = 0;
                }
                // Should have already been reset:
                if (Debugging.AssertsEnabled) Debugging.Assert(positions[nextWrite].count == 0);
                positions[nextWrite++].pos = nextPos++;
                count++;
            }
            if (Debugging.AssertsEnabled) Debugging.Assert(InBounds(pos));
            int index = GetIndex(pos);
            if (Debugging.AssertsEnabled) Debugging.Assert(positions[index].pos == pos);
            return positions[index];
        }

        public int GetNextPos()
        {
            return nextPos;
        }

        // For assert:
        private bool InBounds(int pos)
        {
            return pos < nextPos && pos >= nextPos - count;
        }

        private int GetIndex(int pos)
        {
            int index = nextWrite - (nextPos - pos);
            if (index < 0)
            {
                index += positions.Length;
            }
            return index;
        }

        public void FreeBefore(int pos)
        {
            int toFree = count - (nextPos - pos);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(toFree >= 0);
                Debugging.Assert(toFree <= count);
            }
            int index = nextWrite - count;
            if (index < 0)
            {
                index += positions.Length;
            }
            for (int i = 0; i < toFree; i++)
            {
                if (index == positions.Length)
                {
                    index = 0;
                }
                //System.out.println("  fb idx=" + index);
                positions[index].Reset();
                index++;
            }
            count -= toFree;
        }
    }
}