BUG: Lucene.Net.Analysis.Common: Fixed classes that were originally using invariant culture to do so again.

J2N's Character class default is to use the current culture, which had changed from the prior Character class that used invariant culture. Fixes TestICUFoldingFilter::TestRandomStrings().
apache · Aug 2, 2020 · 9ce76e9 · 9ce76e9
1 parent 3c4cfa4
commit 9ce76e9
Show file tree

Hide file tree

Showing 6 changed files with 23 additions and 18 deletions.
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
@@ -1,5 +1,6 @@
 using J2N;
 using Lucene.Net.Util;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Core
@@ -73,11 +74,11 @@ public LowerCaseTokenizer(LuceneVersion matchVersion, AttributeSource.AttributeF
 
         /// <summary>
         /// Converts char to lower case
-        /// <see cref="Character.ToLower(int)"/>.
+        /// <see cref="Character.ToLower(int, CultureInfo)"/> in the invariant culture.
         /// </summary>
         protected override int Normalize(int c)
         {
-            return Character.ToLower(c);
+            return Character.ToLower(c, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
         }
     }
 }
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
@@ -3,6 +3,7 @@
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System.Collections.Generic;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Miscellaneous
@@ -134,7 +135,7 @@ public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc,
                 while (bufUpto < bufferLen)
                 {
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
-                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         return null;
                     }
@@ -192,7 +193,7 @@ public virtual bool Add(string input, string output)
                     char[] buffer = charsSpare.Chars;
                     for (int i = 0; i < length;)
                     {
-                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i)), buffer, i);
+                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
                     }
                     UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                 }

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Synonym/SynonymFilter.cs
@@ -5,6 +5,7 @@
 using Lucene.Net.Util.Fst;
 using System;
 using System.Diagnostics;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Synonym
 {
@@ -252,9 +253,10 @@ public virtual void Add(char[] output, int offset, int len, int endOffset, int p
 
         /// <param name="input"> input tokenstream </param>
         /// <param name="synonyms"> synonym map </param>
-        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int)"/>.
-        ///                   Note, if you set this to true, its your responsibility to lowercase
-        ///                   the input entries when you create the <see cref="SynonymMap"/> </param>
+        /// <param name="ignoreCase"> case-folds input for matching with <see cref="Character.ToLower(int, CultureInfo)"/>
+        ///                   using <see cref="CultureInfo.InvariantCulture"/>.
+        ///                   Note, if you set this to <c>true</c>, it's your responsibility to lowercase
+        ///                   the input entries when you create the <see cref="SynonymMap"/>.</param>
         public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase) 
             : base(input)
         {
@@ -411,7 +413,7 @@ private void Parse()
                 while (bufUpto < bufferLen)
                 {
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
-                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+                    if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         //System.out.println("    stop");
                         goto byTokenBreak;

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharArrayMap.cs
@@ -664,7 +664,7 @@ private bool Equals(char[] text1, int offset, int length, char[] text2)
                 for (int i = 0; i < length;)
                 {
                     var codePointAt = charUtils.CodePointAt(text1, offset + i, limit);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -696,7 +696,7 @@ private bool Equals(ICharSequence text1, char[] text2)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text1, i);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -728,7 +728,7 @@ private bool Equals(string text1, char[] text2)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text1, i);
-                    if (Character.ToLower(codePointAt) != charUtils.CodePointAt(text2, i, text2.Length))
+                    if (Character.ToLower(codePointAt, CultureInfo.InvariantCulture) != charUtils.CodePointAt(text2, i, text2.Length)) // LUCENENET specific - need to use invariant culture to match Java
                     {
                         return false;
                     }
@@ -811,7 +811,7 @@ private int GetHashCode(char[] text, int offset, int length)
                 for (int i = offset; i < stop;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i, stop);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
@@ -839,7 +839,7 @@ private int GetHashCode(ICharSequence text)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }
@@ -867,7 +867,7 @@ private int GetHashCode(string text)
                 for (int i = 0; i < length;)
                 {
                     int codePointAt = charUtils.CodePointAt(text, i);
-                    code = code * 31 + Character.ToLower(codePointAt);
+                    code = code * 31 + Character.ToLower(codePointAt, CultureInfo.InvariantCulture); // LUCENENET specific - need to use invariant culture to match Java
                     i += Character.CharCount(codePointAt);
                 }
             }

diff --git a/src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs b/src/Lucene.Net.TestFramework/Analysis/MockTokenizer.cs
@@ -7,6 +7,7 @@
 using Debug = Lucene.Net.Diagnostics.Debug; // LUCENENET NOTE: We cannot use System.Diagnostics.Debug because those calls will be optimized out of the release!
 using RegExp = Lucene.Net.Util.Automaton.RegExp;
 using Assert = Lucene.Net.TestFramework.Assert;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis
 {
@@ -290,7 +291,7 @@ protected virtual bool IsTokenChar(int c)
 
         protected virtual int Normalize(int c)
         {
-            return lowerCase ? Character.ToLower(c) : c;
+            return lowerCase ? Character.ToLower(c, CultureInfo.InvariantCulture) : c; // LUCENENET specific - need to use invariant culture to match Java
         }
 
         public override void Reset()

diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArraySet.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Util/TestCharArraySet.cs
@@ -419,7 +419,7 @@ public virtual void TestCopyCharArraySetBWCompat()
             IList<string> stopwordsUpper = new List<string>();
             foreach (string @string in stopwords)
             {
-                stopwordsUpper.Add(@string.ToUpper());
+                stopwordsUpper.Add(@string.ToUpperInvariant());
             }
             setIngoreCase.addAll(TEST_STOP_WORDS);
             setIngoreCase.Add(Convert.ToInt32(1));
@@ -472,7 +472,7 @@ public virtual void TestCopyCharArraySet()
             IList<string> stopwordsUpper = new List<string>();
             foreach (string @string in stopwords)
             {
-                stopwordsUpper.Add(@string.ToUpper());
+                stopwordsUpper.Add(@string.ToUpperInvariant());
             }
             setIngoreCase.addAll(TEST_STOP_WORDS);
             setIngoreCase.Add(Convert.ToInt32(1));
@@ -523,7 +523,7 @@ public virtual void TestCopyJDKSet()
             IList<string> stopwordsUpper = new List<string>();
             foreach (string @string in stopwords)
             {
-                stopwordsUpper.Add(@string.ToUpper());
+                stopwordsUpper.Add(@string.ToUpperInvariant());
             }
             set.addAll(TEST_STOP_WORDS);