diff --git a/pom.xml b/pom.xml index 3e6ebaaaed5..e606ecac5b7 100644 --- a/pom.xml +++ b/pom.xml @@ -502,6 +502,9 @@ Adrian Ber + + Mark Dacek + diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 5d22f00df7a..c770c90af23 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -46,6 +46,7 @@ The type attribute can be add,update,fix,remove. + Clarify or improve behaviour of int-based indexOf methods in StringUtils Add method for converting string to an array of code points RandomStringUtils random method can overflow and return characters outside of specified range Add methods to insert arrays into arrays at an index diff --git a/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java b/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java index a5112d9d017..e35f5aa8841 100644 --- a/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java +++ b/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java @@ -59,13 +59,42 @@ public static CharSequence subSequence(final CharSequence cs, final int start) { //----------------------------------------------------------------------- /** - *

Finds the first index in the {@code CharSequence} that matches the - * specified character.

+ * Returns the index within cs of the first occurrence of the + * specified character, starting the search at the specified index. + *

+ * If a character with value searchChar occurs in the + * character sequence represented by the cs + * object at an index no smaller than start, then + * the index of the first such occurrence is returned. For values + * of searchChar in the range from 0 to 0xFFFF (inclusive), + * this is the smallest value k such that: + *

+     * (cs.charAt(k) == searchChar) && (k >= start)
+     * 
+ * is true. For other values of searchChar, it is the + * smallest value k such that: + *
+     * (Character.codePointAt(cs, k) == searchChar) && (k >= start)
+     * 
+ * is true. In either case, if no such character occurs in cs + * at or after position start, then + * -1 is returned. + * + *

+ * There is no restriction on the value of start. If it + * is negative, it has the same effect as if it were zero: the entire + * CharSequence may be searched. If it is greater than + * the length of cs, it has the same effect as if it were + * equal to the length of cs: -1 is returned. + * + *

All indices are specified in char values + * (Unicode code units). * * @param cs the {@code CharSequence} to be processed, not null * @param searchChar the char to be searched for * @param start the start index, negative starts at the string start * @return the index where the search char was found, -1 if not found + * @since 3.6 updated to behave more like String */ static int indexOf(final CharSequence cs, final int searchChar, int start) { if (cs instanceof String) { @@ -75,9 +104,22 @@ static int indexOf(final CharSequence cs, final int searchChar, int start) { if (start < 0) { start = 0; } - for (int i = start; i < sz; i++) { - if (cs.charAt(i) == searchChar) { - return i; + if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + for (int i = start; i < sz; i++) { + if (cs.charAt(i) == searchChar) { + return i; + } + } + } + //supplementary characters (LANG1300) + if (searchChar <= Character.MAX_CODE_POINT) { + char[] chars = Character.toChars(searchChar); + for (int i = start; i < sz - 1; i++) { + char high = cs.charAt(i); + char low = cs.charAt(i + 1); + if (high == chars[0] && low == chars[1]) { + return i; + } } } return NOT_FOUND; @@ -105,13 +147,30 @@ static int indexOf(final CharSequence cs, final CharSequence searchChar, final i } /** - *

Finds the last index in the {@code CharSequence} that matches the - * specified character.

+ * Returns the index within cs of the last occurrence of + * the specified character, searching backward starting at the + * specified index. For values of searchChar in the range + * from 0 to 0xFFFF (inclusive), the index returned is the largest + * value k such that: + *
+     * (cs.charAt(k) == searchChar) && (k <= start)
+     * 
+ * is true. For other values of searchChar, it is the + * largest value k such that: + *
+     * (Character.codePointAt(cs, k) == searchChar) && (k <= start)
+     * 
+ * is true. In either case, if no such character occurs in cs + * at or before position start, then -1 is returned. + * + *

All indices are specified in char values + * (Unicode code units). * * @param cs the {@code CharSequence} to be processed * @param searchChar the char to be searched for * @param start the start index, negative returns -1, beyond length starts at end * @return the index where the search char was found, -1 if not found + * @since 3.6 updated to behave more like String */ static int lastIndexOf(final CharSequence cs, final int searchChar, int start) { if (cs instanceof String) { @@ -124,9 +183,27 @@ static int lastIndexOf(final CharSequence cs, final int searchChar, int start) { if (start >= sz) { start = sz - 1; } - for (int i = start; i >= 0; --i) { - if (cs.charAt(i) == searchChar) { - return i; + if (searchChar < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + for (int i = start; i >= 0; --i) { + if (cs.charAt(i) == searchChar) { + return i; + } + } + } + //supplementary characters (LANG1300) + //NOTE - we must do a forward traversal for this to avoid duplicating code points + if (searchChar <= Character.MAX_CODE_POINT) { + char[] chars = Character.toChars(searchChar); + //make sure it's not the last index + if (start == sz - 1) { + return NOT_FOUND; + } + for (int i = start; i >= 0; i--) { + char high = cs.charAt(i); + char low = cs.charAt(i + 1); + if (chars[0] == high && chars[1] == low) { + return i; + } } } return NOT_FOUND; diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index 2250595f5ab..dcb335dede7 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -1276,10 +1276,26 @@ public static boolean equalsAnyIgnoreCase(final CharSequence string, final CharS // IndexOf //----------------------------------------------------------------------- /** - *

Finds the first index within a CharSequence, handling {@code null}. - * This method uses {@link String#indexOf(int, int)} if possible.

- * - *

A {@code null} or empty ("") CharSequence will return {@code INDEX_NOT_FOUND (-1)}.

+ * Returns the index within seq of the first occurrence of + * the specified character. If a character with value + * searchChar occurs in the character sequence represented by + * seq CharSequence object, then the index (in Unicode + * code units) of the first such occurrence is returned. For + * values of searchChar in the range from 0 to 0xFFFF + * (inclusive), this is the smallest value k such that: + *
+     * seq.charAt(k) == searchChar
+     * 
+ * is true. For other values of searchChar, it is the + * smallest value k such that: + *
+     * Character.codePointAt(seq, k) == searchChar
+     * 
+ * is true. In either case, if no such character occurs in seq, + * then {@code INDEX_NOT_FOUND (-1)} is returned. + * + *

Furthermore, a {@code null} or empty ("") CharSequence will + * return {@code INDEX_NOT_FOUND (-1)}.

* *
      * StringUtils.indexOf(null, *)         = -1
@@ -1294,6 +1310,7 @@ public static boolean equalsAnyIgnoreCase(final CharSequence string, final CharS
      *  -1 if no match or {@code null} string input
      * @since 2.0
      * @since 3.0 Changed signature from indexOf(String, int) to indexOf(CharSequence, int)
+     * @since 3.6 Updated {@link CharSequenceUtils} call to behave more like String
      */
     public static int indexOf(final CharSequence seq, final int searchChar) {
         if (isEmpty(seq)) {
@@ -1303,13 +1320,39 @@ public static int indexOf(final CharSequence seq, final int searchChar) {
     }
 
     /**
-     * 

Finds the first index within a CharSequence from a start position, - * handling {@code null}. - * This method uses {@link String#indexOf(int, int)} if possible.

* - *

A {@code null} or empty ("") CharSequence will return {@code (INDEX_NOT_FOUND) -1}. - * A negative start position is treated as zero. - * A start position greater than the string length returns {@code -1}.

+ * Returns the index within seq of the first occurrence of the + * specified character, starting the search at the specified index. + *

+ * If a character with value searchChar occurs in the + * character sequence represented by the seq CharSequence + * object at an index no smaller than startPos, then + * the index of the first such occurrence is returned. For values + * of searchChar in the range from 0 to 0xFFFF (inclusive), + * this is the smallest value k such that: + *

+     * (seq.charAt(k) == searchChar) && (k >= startPos)
+     * 
+ * is true. For other values of searchChar, it is the + * smallest value k such that: + *
+     * (Character.codePointAt(seq, k) == searchChar) && (k >= startPos)
+     * 
+ * is true. In either case, if no such character occurs in seq + * at or after position startPos, then + * -1 is returned. + * + *

+ * There is no restriction on the value of startPos. If it + * is negative, it has the same effect as if it were zero: the entire + * CharSequence may be searched. If it is greater than the length of + * seq, it has the same effect as if it were equal to the length of + * seq: {@code (INDEX_NOT_FOUND) -1} is returned. Furthermore, a + * {@code null} or empty ("") CharSequence will + * return {@code (INDEX_NOT_FOUND) -1}. + * + *

All indices are specified in char values + * (Unicode code units). * *

      * StringUtils.indexOf(null, *, *)          = -1
@@ -1327,6 +1370,7 @@ public static int indexOf(final CharSequence seq, final int searchChar) {
      *  -1 if no match or {@code null} string input
      * @since 2.0
      * @since 3.0 Changed signature from indexOf(String, int, int) to indexOf(CharSequence, int, int)
+     * @since 3.6 Updated {@link CharSequenceUtils} call to behave more like String
      */
     public static int indexOf(final CharSequence seq, final int searchChar, final int startPos) {
         if (isEmpty(seq)) {
@@ -1586,10 +1630,23 @@ public static int indexOfIgnoreCase(final CharSequence str, final CharSequence s
     // LastIndexOf
     //-----------------------------------------------------------------------
     /**
-     * 

Finds the last index within a CharSequence, handling {@code null}. - * This method uses {@link String#lastIndexOf(int)} if possible.

- * - *

A {@code null} or empty ("") CharSequence will return {@code -1}.

+ * Returns the index within seq of the last occurrence of + * the specified character. For values of searchChar in the + * range from 0 to 0xFFFF (inclusive), the index (in Unicode code + * units) returned is the largest value k such that: + *
+     * seq.charAt(k) == searchChar
+     * 
+ * is true. For other values of searchChar, it is the + * largest value k such that: + *
+     * Character.codePointAt(seq, k) == searchChar
+     * 
+ * is true. In either case, if no such character occurs in + * seq, then -1 is returned. Furthermore, a {@code null} or empty ("") + * CharSequence will return {@code -1}. The + * seq CharSequence object is searched backwards + * starting at the last character. * *
      * StringUtils.lastIndexOf(null, *)         = -1
@@ -1598,12 +1655,13 @@ public static int indexOfIgnoreCase(final CharSequence str, final CharSequence s
      * StringUtils.lastIndexOf("aabaabaa", 'b') = 5
      * 
* - * @param seq the CharSequence to check, may be null + * @param seq the CharSequence to check, may be null * @param searchChar the character to find * @return the last index of the search character, * -1 if no match or {@code null} string input * @since 2.0 * @since 3.0 Changed signature from lastIndexOf(String, int) to lastIndexOf(CharSequence, int) + * @since 3.6 Updated {@link CharSequenceUtils} call to behave more like String */ public static int lastIndexOf(final CharSequence seq, final int searchChar) { if (isEmpty(seq)) { @@ -1613,16 +1671,29 @@ public static int lastIndexOf(final CharSequence seq, final int searchChar) { } /** - *

Finds the last index within a CharSequence from a start position, - * handling {@code null}. - * This method uses {@link String#lastIndexOf(int, int)} if possible.

- * - *

A {@code null} or empty ("") CharSequence will return {@code -1}. - * A negative start position returns {@code -1}. - * A start position greater than the string length searches the whole string. - * The search starts at the startPos and works backwards; matches starting after the start - * position are ignored. - *

+ * Returns the index within seq of the last occurrence of + * the specified character, searching backward starting at the + * specified index. For values of searchChar in the range + * from 0 to 0xFFFF (inclusive), the index returned is the largest + * value k such that: + *
+     * (seq.charAt(k) == searchChar) && (k <= startPos)
+     * 
+ * is true. For other values of searchChar, it is the + * largest value k such that: + *
+     * (Character.codePointAt(seq, k) == searchChar) && (k <= startPos)
+     * 
+ * is true. In either case, if no such character occurs in seq + * at or before position startPos, then + * -1 is returned. Furthermore, a {@code null} or empty ("") + * CharSequence will return {@code -1}. A start position greater + * than the string length searches the whole string. + * The search starts at the startPos and works backwards; + * matches starting after the start position are ignored. + * + *

All indices are specified in char values + * (Unicode code units). * *

      * StringUtils.lastIndexOf(null, *, *)          = -1
diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsEqualsIndexOfTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsEqualsIndexOfTest.java
index 9014bfa9ee1..4635a54aa23 100644
--- a/src/test/java/org/apache/commons/lang3/StringUtilsEqualsIndexOfTest.java
+++ b/src/test/java/org/apache/commons/lang3/StringUtilsEqualsIndexOfTest.java
@@ -294,6 +294,28 @@ public void testIndexOf_charInt() {
         assertEquals(2, StringUtils.indexOf("aabaabaa", 'b', -1));
 
         assertEquals(5, StringUtils.indexOf(new StringBuilder("aabaabaa"), 'b', 3));
+        
+        //LANG-1300 tests go here
+        final int CODE_POINT = 0x2070E;
+    	StringBuilder builder = new StringBuilder();
+    	builder.appendCodePoint(CODE_POINT);
+    	assertEquals(0, StringUtils.indexOf(builder, CODE_POINT, 0));
+    	assertEquals(0, StringUtils.indexOf(builder.toString(), CODE_POINT, 0));
+    	builder.appendCodePoint(CODE_POINT);
+    	assertEquals(2, StringUtils.indexOf(builder, CODE_POINT, 1));
+    	assertEquals(2, StringUtils.indexOf(builder.toString(), CODE_POINT, 1));
+    	//inner branch on the supplementary character block
+    	char[] tmp = {(char) 55361};
+    	builder = new StringBuilder();
+    	builder.append(tmp);
+    	assertEquals(-1, StringUtils.indexOf(builder, CODE_POINT, 0));
+    	assertEquals(-1, StringUtils.indexOf(builder.toString(), CODE_POINT, 0));
+    	builder.appendCodePoint(CODE_POINT);
+    	assertEquals(1, StringUtils.indexOf(builder, CODE_POINT, 0));
+    	assertEquals(1, StringUtils.indexOf(builder.toString(), CODE_POINT, 0));
+    	assertEquals(-1, StringUtils.indexOf(builder, CODE_POINT, 2));
+        assertEquals(-1, StringUtils.indexOf(builder.toString(), CODE_POINT, 2));
+        
     }
 
     @Test
@@ -525,6 +547,33 @@ public void testLastIndexOf_charInt() {
         assertEquals(0, StringUtils.lastIndexOf("aabaabaa", 'a', 0));
 
         assertEquals(2, StringUtils.lastIndexOf(new StringBuilder("aabaabaa"), 'b', 2));
+        
+        //LANG-1300 addition test
+        final int CODE_POINT = 0x2070E;
+    	StringBuilder builder = new StringBuilder();
+    	builder.appendCodePoint(CODE_POINT);
+    	assertEquals(0, StringUtils.lastIndexOf(builder, CODE_POINT, 0));
+    	builder.appendCodePoint(CODE_POINT);
+    	assertEquals(0, StringUtils.lastIndexOf(builder, CODE_POINT, 0));
+    	assertEquals(0, StringUtils.lastIndexOf(builder, CODE_POINT, 1));
+        assertEquals(2, StringUtils.lastIndexOf(builder, CODE_POINT, 2));
+
+
+
+    	builder.append("aaaaa");
+    	assertEquals(2, StringUtils.lastIndexOf(builder, CODE_POINT, 4));
+    	//inner branch on the supplementary character block
+    	char[] tmp = {(char) 55361};
+    	builder = new StringBuilder();
+    	builder.append(tmp);
+    	assertEquals(-1, StringUtils.lastIndexOf(builder, CODE_POINT, 0));
+    	builder.appendCodePoint(CODE_POINT);
+    	assertEquals(-1, StringUtils.lastIndexOf(builder, CODE_POINT, 0 ));
+        assertEquals(1, StringUtils.lastIndexOf(builder, CODE_POINT, 1 ));
+        assertEquals(-1, StringUtils.lastIndexOf(builder.toString(), CODE_POINT, 0));
+        assertEquals(1, StringUtils.lastIndexOf(builder.toString(), CODE_POINT, 1));
+
+
     }
 
     @Test