apache · NightOwl888 · Aug 2, 2020 · Jul 29, 2020 · Aug 1, 2020
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
@@ -25,7 +25,7 @@ namespace Lucene.Net.Analysis.Core
     /// <summary>
     /// A <see cref="LetterTokenizer"/> is a tokenizer that divides text at non-letters. That's to
     /// say, it defines tokens as maximal strings of adjacent letters, as defined by
-    /// <see cref="char.IsLetter(char)"/> predicate.
+    /// <see cref="Character.IsLetter(int)"/> predicate.
     /// <para>
     /// Note: this does a decent job for most European languages, but does a terrible
     /// job for some Asian languages, where words are not separated by spaces.

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
@@ -1,5 +1,6 @@
 using J2N;
 using Lucene.Net.Util;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Core
@@ -73,11 +74,11 @@ public LowerCaseTokenizer(LuceneVersion matchVersion, AttributeSource.AttributeF
 
         /// <summary>
         /// Converts char to lower case
-        /// <see cref="Character.ToLower(int)"/>.
+        /// using <see cref="Character.ToLower(int, CultureInfo)"/> in the invariant culture.
         /// </summary>
         protected override int Normalize(int c)
         {
-            return Character.ToLower(c);
+            return Character.ToLower(c, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
         }
     }
 }
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
@@ -1,4 +1,5 @@
-using Lucene.Net.Analysis.Util;
+using J2N;
+using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
 using System.IO;
 
@@ -65,7 +66,7 @@ public WhitespaceTokenizer(LuceneVersion matchVersion, AttributeFactory factory,
         /// </summary>
         protected override bool IsTokenChar(int c)
         {
-            return !char.IsWhiteSpace((char)c);
+            return !Character.IsWhiteSpace(c);
         }
     }
 }
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs
@@ -2,6 +2,7 @@
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.El
 {
@@ -37,6 +38,8 @@ public sealed class GreekLowerCaseFilter : TokenFilter
         private readonly ICharTermAttribute termAtt;
         private readonly CharacterUtils charUtils;
 
+        private static readonly CultureInfo culture = new CultureInfo("el"); // LUCENENET specific - use Greek culture when lowercasing.
+
         /// <summary>
         /// Create a <see cref="GreekLowerCaseFilter"/> that normalizes Greek token text.
         /// </summary>
@@ -127,7 +130,7 @@ private int LowerCase(int codepoint)
                     return '\u03C2'; // small final sigma
 
                 default:
-                    return Character.ToLower(codepoint);
+                    return Character.ToLower(codepoint, culture); // LUCENENET specific - need to use specific culture to override current thread
             }
         }
     }

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
@@ -841,9 +841,9 @@ public virtual bool Stem(int i0)
         //                            ch = buffer[offset++];
         //                    }
 
-        //                    if (char.IsLetter((char)ch))
+        //                    if (Character.IsLetter(ch))
         //                    {
-        //                        s.Add(char.ToLowerInvariant((char)ch));
+        //                        s.Add(Character.ToLower(ch, CultureInfo.InvariantCulture));
         //                    }
         //                    else
         //                    {

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs
@@ -1,5 +1,6 @@
 using J2N;
 using Lucene.Net.Analysis.TokenAttributes;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Ga
 {
@@ -28,6 +29,8 @@ public sealed class IrishLowerCaseFilter : TokenFilter
     {
         private readonly ICharTermAttribute termAtt;
 
+        private static readonly CultureInfo culture = new CultureInfo("ga"); // LUCENENET specific - use Irish culture when lowercasing.
+
         /// <summary>
         /// Create an <see cref="IrishLowerCaseFilter"/> that normalises Irish token text.
         /// </summary>
@@ -60,7 +63,7 @@ public override bool IncrementToken()
 
                 for (int i = idx; i < chLen;)
                 {
-                    i += Character.ToChars(Character.ToLower(chArray[i]), chArray, i);
+                    i += Character.ToChars(Character.ToLower(chArray[i], culture), chArray, i); // LUCENENET specific - use Irish culture when lowercasing
                 }
                 return true;
             }

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
@@ -3,6 +3,7 @@
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System.Collections.Generic;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Miscellaneous
@@ -134,7 +135,7 @@ public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc,
                 while (bufUpto < bufferLen)
                 {
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
-                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         return null;
                     }
@@ -192,7 +193,7 @@ public virtual bool Add(string input, string output)
                     char[] buffer = charsSpare.Chars;
                     for (int i = 0; i < length;)
                     {
-                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i)), buffer, i);
+                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
                     }
                     UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                 }

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
@@ -1,4 +1,5 @@
-using System.Globalization;
+using J2N;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Miscellaneous
 {
@@ -84,17 +85,17 @@ private static byte[] LoadDefaultWordDelimTable() // LUCENENET: Avoid static con
             for (int i = 0; i < 256; i++)
             {
                 byte code = 0;
-                if (char.IsLower((char)i))
+                if (Character.IsLower(i))
                 {
-                    code |= (byte)WordDelimiterFilter.LOWER;
+                    code |= WordDelimiterFilter.LOWER;
                 }
-                else if (char.IsUpper((char)i))
+                else if (Character.IsUpper(i))
                 {
-                    code |= (byte)WordDelimiterFilter.UPPER;
+                    code |= WordDelimiterFilter.UPPER;
                 }
-                else if (char.IsDigit((char)i))
+                else if (Character.IsDigit(i))
                 {
-                    code |= (byte)WordDelimiterFilter.DIGIT;
+                    code |= WordDelimiterFilter.DIGIT;
                 }
                 if (code == 0)
                 {
@@ -318,7 +319,7 @@ private int CharType(int ch)
         /// <returns> Type of the character </returns>
         public static byte GetType(int ch)
         {
-            switch (CharUnicodeInfo.GetUnicodeCategory((char)ch))
+            switch (Character.GetType(ch))
             {
                 case UnicodeCategory.UppercaseLetter:
                     return WordDelimiterFilter.UPPER;

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
@@ -5,6 +5,7 @@
 using Lucene.Net.Util.Fst;
 using System;
 using System.Diagnostics;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Synonym
 {
@@ -252,9 +253,10 @@ public virtual void Add(char[] output, int offset, int len, int endOffset, int p
 
         /// <param name="input"> input tokenstream </param>
         /// <param name="synonyms"> synonym map </param>
-        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int)"/>.
-        ///                   Note, if you set this to true, its your responsibility to lowercase
-        ///                   the input entries when you create the <see cref="SynonymMap"/> </param>
+        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
+        ///                   using <see cref="CultureInfo.InvariantCulture"/>.
+        ///                   Note, if you set this to <c>true</c>, it's your responsibility to lowercase
+        ///                   the input entries when you create the <see cref="SynonymMap"/>.</param>
         public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase) 
             : base(input)
         {
@@ -411,7 +413,7 @@ private void Parse()
                 while (bufUpto < bufferLen)
                 {
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
-                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         //System.out.println("    stop");
                         goto byTokenBreak;

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -95,7 +95,7 @@ protected override bool IncrementWord()
 
             // find the next set of boundaries, skipping over non-tokens
             int end = wordBreaker.Next();
-            while (end != BreakIterator.Done && !char.IsLetterOrDigit((char)Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
+            while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd)))
             {
                 start = end;
                 end = wordBreaker.Next();

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
@@ -1,5 +1,4 @@
 using J2N;
-using J2N.Globalization;
 using Lucene.Net.Analysis.TokenAttributes;
 using System;
 using System.Globalization;
@@ -35,12 +34,13 @@ namespace Lucene.Net.Analysis.Tr
     public sealed class TurkishLowerCaseFilter : TokenFilter
     {
         private const int LATIN_CAPITAL_LETTER_I = '\u0049';
-        private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130';
         private const int LATIN_SMALL_LETTER_I = '\u0069';
         private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
         private const int COMBINING_DOT_ABOVE = '\u0307';
         private readonly ICharTermAttribute termAtt;
 
+        private static readonly CultureInfo culture = new CultureInfo("tr"); // LUCENENET specific - we need to do a culture-sensitive lowercase operation in Turkish
+
         /// <summary>
         /// Create a new <see cref="TurkishLowerCaseFilter"/>, that normalizes Turkish token text 
         /// to lower case.
@@ -64,7 +64,7 @@ public override sealed bool IncrementToken()
                 {
                     int ch = Character.CodePointAt(buffer, i, length);
 
-                    iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && CharUnicodeInfo.GetUnicodeCategory((char)ch) == UnicodeCategory.NonSpacingMark));
+                    iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && Character.GetType(ch) == UnicodeCategory.NonSpacingMark));
 
                     if (iOrAfter) // all the special I turkish handling happens here.
                     {
@@ -93,32 +93,8 @@ public override sealed bool IncrementToken()
                         }
                     }
 
-                    using (var culture = new CultureContext("tr"))
-                    {
-                        switch (ch)
-                        {
-                            // LUCENENET: The .NET char.ToLower() function works correctly in 
-                            // Turkish as long as the current thread is set to tr-TR (well, technically the 
-                            // culture change is only required for the LATIN_CAPITAL_LETTER_I case). .NET does 
-                            // not split these characters into separate letter/non-spacing mark characters,
-                            // but the user might still input them that way so we still need the above
-                            // block to handle that case.
-                            //
-                            // LUCENENET TODO: Oddly, the Character.ToLowerCase() function below does not work right
-                            // for Turkish. Which begs the question, should this special case be there so Turkish works
-                            // everywhere? Or should we leave it a special case here because that is the way it works in Java?
-                            //
-                            // References:
-                            // http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/
-                            // http://www.i18nguy.com/unicode/turkish-i18n.html
-                            case LATIN_CAPITAL_LETTER_I:
-                            case LATIN_CAPITAL_LETTER_DOTTED_I:
-                                i += Character.ToChars(char.ToLower((char)ch), buffer, i);
-                                continue;
-                        }
-                    }
-
-                    i += Character.ToChars(Character.ToLower(ch), buffer, i);
+                    // LUCENENET specific - need to pass Turkish culture to get the correct lowercase results
+                    i += Character.ToChars(Character.ToLower(ch, culture), buffer, i);
                 }
 
                 termAtt.Length = length;
@@ -139,8 +115,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
             for (int i = pos; i < len;)
             {
                 int ch = Character.CodePointAt(s, i, len);
-                //if (char.getType(ch) != char.NON_SPACING_MARK)
-                if (CharUnicodeInfo.GetUnicodeCategory((char)ch) != UnicodeCategory.NonSpacingMark)
+                if (Character.GetType(ch) != UnicodeCategory.NonSpacingMark)
                 {
                     return false;
                 }
@@ -161,9 +136,7 @@ private bool IsBeforeDot(char[] s, int pos, int len)
         private int Delete(char[] s, int pos, int len)
         {
             if (pos < len)
-            {
                 Array.Copy(s, pos + 1, s, pos, len - pos - 1);
-            }
 
             return len - 1;
         }

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs
@@ -664,7 +664,7 @@ private bool Equals(char[] text1, int offset, int length, char[] text2)
                 for (int i = 0; i < length;)
                 {
                     var codePointAt = charUtils.CodePointAt(text1, offset + i, limit);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -696,7 +696,7 @@ private bool Equals(ICharSequence text1, char[] text2)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text1, i);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -728,7 +728,7 @@ private bool Equals(string text1, char[] text2)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text1, i);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -811,7 +811,7 @@ private int GetHashCode(char[] text, int offset, int length)
                 for (int i = offset; i < stop;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i, stop);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
@@ -839,7 +839,7 @@ private int GetHashCode(ICharSequence text)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
@@ -867,7 +867,7 @@ private int GetHashCode(string text)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
@@ -168,7 +168,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)
 
 
         /// <summary>
-        /// Converts each unicode codepoint to lowerCase via <see cref="Character.ToLower(int)"/> starting 
+        /// Converts each unicode codepoint to lowerCase via <see cref="TextInfo.ToLower(string)"/> in the invariant culture starting 
         /// at the given offset. </summary>
         /// <param name="buffer"> the char buffer to lowercase </param>
         /// <param name="offset"> the offset to start at </param>
@@ -199,7 +199,7 @@ public static CharacterBuffer NewCharacterBuffer(int bufferSize)
         }
 
         /// <summary>
-        /// Converts each unicode codepoint to UpperCase via <see cref="Character.ToUpper(int)"/> starting 
+        /// Converts each unicode codepoint to UpperCase via <see cref="TextInfo.ToUpper(string)"/> in the invariant culture starting 
         /// at the given offset. </summary>
         /// <param name="buffer"> the char buffer to UPPERCASE </param>
         /// <param name="offset"> the offset to start at </param>

diff --git a/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs b/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs
@@ -8,6 +8,7 @@
 using Morfologik.Stemming.Polish;
 using System;
 using System.Collections.Generic;
+using System.Globalization;
 using System.Text;
 using System.Text.RegularExpressions;
 
@@ -55,6 +56,8 @@ public class MorfologikFilter : TokenFilter
 
         private int lemmaListIndex;
 
+        private static readonly CultureInfo culture = new CultureInfo("pl"); // LUCENENET specific - do lowercasing in Polish culture
+
         /// <summary>
         /// Creates a filter with the default (Polish) dictionary.
         /// </summary>
@@ -166,7 +169,7 @@ private string ToLowercase(string chs)
             for (int i = 0; i < length;)
             {
                 i += Character.ToChars(
-                    Character.ToLower(Character.CodePointAt(chs, i)), buffer, i);
+                    Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread
             }
 
             return scratch.ToString();