From 0b5ee167c11610e9869ba4a94760fd4f4ee70690 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Mon, 7 Aug 2017 12:38:53 -0700 Subject: [PATCH] [stdlib] Drop Unicode 8 tries in stdlib (entirely) This drops the last vestige of Unicode 8 tries from the standard library. Switches everything over to use ICU. --- stdlib/public/core/CMakeLists.txt | 1 - stdlib/public/core/GroupInfo.json | 1 - .../public/core/StringUnicodeScalarView.swift | 18 +- stdlib/public/core/UnicodeTrie.swift.gyb | 240 ---------------- stdlib/public/stubs/CMakeLists.txt | 2 +- .../UnicodeExtendedGraphemeClusters.cpp.gyb | 137 --------- validation-test/stdlib/UnicodeTrie.swift.gyb | 263 ------------------ 7 files changed, 3 insertions(+), 659 deletions(-) delete mode 100644 stdlib/public/core/UnicodeTrie.swift.gyb delete mode 100644 stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb delete mode 100644 validation-test/stdlib/UnicodeTrie.swift.gyb diff --git a/stdlib/public/core/CMakeLists.txt b/stdlib/public/core/CMakeLists.txt index 43c3765086750..9c163d3744143 100644 --- a/stdlib/public/core/CMakeLists.txt +++ b/stdlib/public/core/CMakeLists.txt @@ -141,7 +141,6 @@ set(SWIFTLIB_ESSENTIAL UnavailableStringAPIs.swift.gyb UnicodeEncoding.swift UnicodeParser.swift - UnicodeTrie.swift.gyb Unmanaged.swift UnsafeBitMap.swift UnsafeBufferPointer.swift.gyb diff --git a/stdlib/public/core/GroupInfo.json b/stdlib/public/core/GroupInfo.json index 117b000748791..5bc3821687590 100644 --- a/stdlib/public/core/GroupInfo.json +++ b/stdlib/public/core/GroupInfo.json @@ -30,7 +30,6 @@ "UnicodeEncoding.swift", "UnicodeParser.swift", "UnicodeScalar.swift", - "UnicodeTrie.swift", "UnavailableStringAPIs.swift", "UTFEncoding.swift", "UTF8.swift", diff --git a/stdlib/public/core/StringUnicodeScalarView.swift b/stdlib/public/core/StringUnicodeScalarView.swift index f452e5682d37b..63fceb096c3b2 100644 --- a/stdlib/public/core/StringUnicodeScalarView.swift +++ b/stdlib/public/core/StringUnicodeScalarView.swift @@ 
-445,22 +445,8 @@ extension String.UnicodeScalarView { return true } if !_isOnUnicodeScalarBoundary(i) { return false } - let precedingScalar = self[index(before: i)] - - let graphemeClusterBreakProperty = - _UnicodeGraphemeClusterBreakPropertyTrie() - let segmenter = _UnicodeExtendedGraphemeClusterSegmenter() - - let gcb0 = graphemeClusterBreakProperty.getPropertyRawValue( - precedingScalar.value) - - if segmenter.isBoundaryAfter(gcb0) { - return true - } - - let gcb1 = graphemeClusterBreakProperty.getPropertyRawValue(self[i].value) - - return segmenter.isBoundary(gcb0, gcb1) + let str = String(_core) + return i == str.index(before: str.index(after: i)) } } diff --git a/stdlib/public/core/UnicodeTrie.swift.gyb b/stdlib/public/core/UnicodeTrie.swift.gyb deleted file mode 100644 index 4d4cb4e983553..0000000000000 --- a/stdlib/public/core/UnicodeTrie.swift.gyb +++ /dev/null @@ -1,240 +0,0 @@ -//===--- UnicodeTrie.swift.gyb --------------------------------*- swift -*-===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors -// -//===----------------------------------------------------------------------===// -// -// A custom trie implementation to quickly retrieve Unicode property values. -// -//===----------------------------------------------------------------------===// - -%{ - -# Note: keep these constants synchronized with the data that it is actually -# generated. There is a runtime check for this, but it is only performed in -# builds with INTERNAL_CHECKS_ENABLED. 
- -BMPFirstLevelIndexBits = 8 -BMPDataOffsetBits = 8 -SuppFirstLevelIndexBits = 5 -SuppSecondLevelIndexBits = 8 -SuppDataOffsetBits = 8 - -BMPLookupBytesPerEntry = 1 -BMPDataBytesPerEntry = 1 -SuppLookup1BytesPerEntry = 1 -SuppLookup2BytesPerEntry = 1 -SuppDataBytesPerEntry = 1 - -TrieSize = 18961 - -BMPLookupBytesOffset = 0 -BMPDataBytesOffset = 256 -SuppLookup1BytesOffset = 12032 -SuppLookup2BytesOffset = 12049 -SuppDataBytesOffset = 12817 - -}% - -import SwiftShims - -// These case names must be kept in sync with the 'GraphemeClusterBreakProperty' -// enum in C++ and with the names in the GYBUnicodeDataUtils script. -public // @testable -enum _GraphemeClusterBreakPropertyValue : Int { - case Other = 0 - case CR = 1 - case LF = 2 - case Control = 3 - case Extend = 4 - case Regional_Indicator = 5 - case Prepend = 6 - case SpacingMark = 7 - case L = 8 - case V = 9 - case T = 10 - case LV = 11 - case LVT = 12 -} - -// It is expensive to convert a raw enum value to an enum, so we use this type -// safe wrapper around the raw property value to avoid paying the conversion -// cost in hot code paths. -struct _GraphemeClusterBreakPropertyRawValue { - init(_ rawValue: UInt${BMPDataBytesPerEntry * 8}) { - self.rawValue = rawValue - } - - var rawValue: UInt${BMPDataBytesPerEntry * 8} - - // Use with care: this operation is expensive (even with optimization - // turned on the compiler generates code for a switch). - var cookedValue: _GraphemeClusterBreakPropertyValue { - return _GraphemeClusterBreakPropertyValue(rawValue: Int(rawValue))! 
- } -} - -public // @testable -struct _UnicodeGraphemeClusterBreakPropertyTrie { - static func _checkParameters() { - let metadata = _swift_stdlib_GraphemeClusterBreakPropertyTrieMetadata - - _sanityCheck(metadata.BMPFirstLevelIndexBits == ${BMPFirstLevelIndexBits}) - _sanityCheck(metadata.BMPDataOffsetBits == ${BMPDataOffsetBits}) - _sanityCheck(metadata.SuppFirstLevelIndexBits == ${SuppFirstLevelIndexBits}) - _sanityCheck(metadata.SuppSecondLevelIndexBits == ${SuppSecondLevelIndexBits}) - _sanityCheck(metadata.SuppDataOffsetBits == ${SuppDataOffsetBits}) - - _sanityCheck(metadata.BMPLookupBytesPerEntry == ${BMPLookupBytesPerEntry}) - _sanityCheck(metadata.BMPDataBytesPerEntry == ${BMPDataBytesPerEntry}) - _sanityCheck(metadata.SuppLookup1BytesPerEntry == ${SuppLookup1BytesPerEntry}) - _sanityCheck(metadata.SuppLookup2BytesPerEntry == ${SuppLookup2BytesPerEntry}) - _sanityCheck(metadata.SuppDataBytesPerEntry == ${SuppDataBytesPerEntry}) - - _sanityCheck(metadata.TrieSize == ${TrieSize}) - - _sanityCheck(metadata.BMPLookupBytesOffset == ${BMPLookupBytesOffset}) - _sanityCheck(metadata.BMPDataBytesOffset == ${BMPDataBytesOffset}) - _sanityCheck(metadata.SuppLookup1BytesOffset == ${SuppLookup1BytesOffset}) - _sanityCheck(metadata.SuppLookup2BytesOffset == ${SuppLookup2BytesOffset}) - _sanityCheck(metadata.SuppDataBytesOffset == ${SuppDataBytesOffset}) - } - - let _trieData: UnsafePointer - -% if BMPLookupBytesPerEntry == 1: - @_transparent var _bmpLookup: UnsafePointer { - return _trieData + ${BMPLookupBytesOffset} - } -% end - -% if BMPDataBytesPerEntry == 1: - @_transparent var _bmpData: UnsafePointer { - return _trieData + ${BMPDataBytesOffset} - } -% end - -% if SuppLookup1BytesPerEntry == 1: - @_transparent var _suppLookup1: UnsafePointer { - return _trieData + ${SuppLookup1BytesOffset} - } -% end - -% if SuppLookup2BytesPerEntry == 1: - @_transparent var _suppLookup2: UnsafePointer { - return _trieData + ${SuppLookup2BytesOffset} - } -% end - -% if 
SuppDataBytesPerEntry == 1: - @_transparent var _suppData: UnsafePointer { - return _trieData + ${SuppDataBytesOffset} - } -% end - - public // @testable - init() { - _UnicodeGraphemeClusterBreakPropertyTrie._checkParameters() - _trieData = _swift_stdlib_GraphemeClusterBreakPropertyTrie - } - - @_transparent - func _getBMPFirstLevelIndex(_ cp: UInt32) -> Int { - return Int(cp >> ${BMPFirstLevelIndexBits}) - } - - @_transparent - func _getBMPDataOffset(_ cp: UInt32) -> Int { - return Int(cp & ((1 << ${BMPDataOffsetBits}) - 1)) - } - - @_transparent - func _getSuppFirstLevelIndex(_ cp: UInt32) -> Int { - return Int(cp >> (${SuppSecondLevelIndexBits} + ${SuppDataOffsetBits})) - } - - @_transparent - func _getSuppSecondLevelIndex(_ cp: UInt32) -> Int { - return Int((cp >> ${SuppDataOffsetBits}) & - ((1 << ${SuppSecondLevelIndexBits}) - 1)) - } - - @_transparent - func _getSuppDataOffset(_ cp: UInt32) -> Int { - return Int(cp & ((1 << ${SuppDataOffsetBits}) - 1)) - } - - func getPropertyRawValue( - _ codePoint: UInt32 - ) -> _GraphemeClusterBreakPropertyRawValue { - // Note: for optimization, the code below uses '&+' instead of '+' to avoid - // a few branches. There is no possibility of overflow here. - // - // The optimizer could figure this out, but right now it keeps extra checks - // if '+' is used. 
- - if _fastPath(codePoint <= 0xffff) { - let dataBlockIndex = Int(_bmpLookup[_getBMPFirstLevelIndex(codePoint)]) - return _GraphemeClusterBreakPropertyRawValue( - _bmpData[ - (dataBlockIndex << ${BMPDataOffsetBits}) &+ - _getBMPDataOffset(codePoint)]) - } else { - _precondition(codePoint <= 0x10ffff) - let secondLookupIndex = Int(_suppLookup1[_getSuppFirstLevelIndex(codePoint)]) - let dataBlockIndex = Int(_suppLookup2[ - (secondLookupIndex << ${SuppSecondLevelIndexBits}) &+ - _getSuppSecondLevelIndex(codePoint)]) - return _GraphemeClusterBreakPropertyRawValue( - _suppData[ - (dataBlockIndex << ${SuppDataOffsetBits}) &+ - _getSuppDataOffset(codePoint)]) - } - } - - public // @testable - func getPropertyValue( - _ codePoint: UInt32 - ) -> _GraphemeClusterBreakPropertyValue { - return getPropertyRawValue(codePoint).cookedValue - } -} - -// FIXME(ABI)#74 : don't mark this type versioned, or any of its APIs inlineable. -// Grapheme cluster segmentation uses a completely different algorithm in -// Unicode 9.0. -internal struct _UnicodeExtendedGraphemeClusterSegmenter { - let _noBoundaryRulesMatrix: UnsafePointer - - init() { - _noBoundaryRulesMatrix = - _swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrix - } - - /// Returns `true` if there is always a grapheme cluster break after a code - /// point with a given `Grapheme_Cluster_Break` property value. - func isBoundaryAfter(_ gcb: _GraphemeClusterBreakPropertyRawValue) -> Bool { - let ruleRow = _noBoundaryRulesMatrix[Int(gcb.rawValue)] - return ruleRow == 0 - } - - /// Returns `true` if there is a grapheme cluster break between code points - /// with given `Grapheme_Cluster_Break` property values. 
- func isBoundary( - _ gcb1: _GraphemeClusterBreakPropertyRawValue, - _ gcb2: _GraphemeClusterBreakPropertyRawValue - ) -> Bool { - let ruleRow = _noBoundaryRulesMatrix[Int(gcb1.rawValue)] - return (ruleRow & (1 << UInt16(gcb2.rawValue))) == 0 - } -} - -// ${'Local Variables'}: -// eval: (read-only-mode 1) -// End: diff --git a/stdlib/public/stubs/CMakeLists.txt b/stdlib/public/stubs/CMakeLists.txt index 2013824c6df6c..4f96fd35db0b5 100644 --- a/stdlib/public/stubs/CMakeLists.txt +++ b/stdlib/public/stubs/CMakeLists.txt @@ -5,7 +5,7 @@ set(swift_stubs_sources KeyPaths.cpp LibcShims.cpp Stubs.cpp - UnicodeExtendedGraphemeClusters.cpp.gyb) +) set(swift_stubs_objc_sources Availability.mm FoundationHelpers.mm diff --git a/stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb b/stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb deleted file mode 100644 index 739b8ea720631..0000000000000 --- a/stdlib/public/stubs/UnicodeExtendedGraphemeClusters.cpp.gyb +++ /dev/null @@ -1,137 +0,0 @@ -//===--- UnicodeExtendedGraphemeClusters.cpp.gyb ----------------*- C++ -*-===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors -// -//===----------------------------------------------------------------------===// - -#include "swift/Runtime/Config.h" - -%{ - -# FIXME: this table should be moved to a Swift file in stdlib. Unfortunately, -# in Swift we don't have a way to statically initialize arrays. 
- -from GYBUnicodeDataUtils import GraphemeClusterBreakPropertyTable, UnicodeTrieGenerator, get_extended_grapheme_cluster_rules_matrix - -grapheme_cluster_break_property_table = \ - GraphemeClusterBreakPropertyTable(unicodeGraphemeBreakPropertyFile) - -trie_generator = UnicodeTrieGenerator() -trie_generator.create_tables() -trie_generator.fill_from_unicode_property(grapheme_cluster_break_property_table) -trie_generator.verify(grapheme_cluster_break_property_table) - -}% - -/// Before compression: -/// bmp_lookup: ${len(trie_generator.bmp_lookup)} -/// bmp_data: ${len(trie_generator.bmp_data)} x ${len(trie_generator.bmp_data[0])} -/// supp_lookup1: ${len(trie_generator.supp_lookup1)} -/// supp_lookup2: ${len(trie_generator.supp_lookup2)} x ${len(trie_generator.supp_lookup2[0])} -/// supp_data: ${len(trie_generator.supp_data)} x ${len(trie_generator.supp_data[0])} - -%{ - -trie_generator.freeze() -trie_generator.verify(grapheme_cluster_break_property_table) - -}% - -/// After compression: -/// bmp_lookup: ${len(trie_generator.bmp_lookup)} -/// bmp_data: ${len(trie_generator.bmp_data)} x ${len(trie_generator.bmp_data[0])} -/// supp_lookup1: ${len(trie_generator.supp_lookup1)} -/// supp_lookup2: ${len(trie_generator.supp_lookup2)} x ${len(trie_generator.supp_lookup2[0])} -/// supp_data: ${len(trie_generator.supp_data)} x ${len(trie_generator.supp_data[0])} - -%{ - -trie_generator.serialize(grapheme_cluster_break_property_table) - -}% - -#include - -static const uint8_t _swift_stdlib_GraphemeClusterBreakPropertyTrieImpl[] = { -% for byte in trie_generator.trie_bytes: - ${byte}, -% end -}; - -SWIFT_RUNTIME_STDLIB_INTERFACE -const uint8_t *_swift_stdlib_GraphemeClusterBreakPropertyTrie = - _swift_stdlib_GraphemeClusterBreakPropertyTrieImpl; - -struct _swift_stdlib_GraphemeClusterBreakPropertyTrieMetadataTy { - unsigned BMPFirstLevelIndexBits; - unsigned BMPDataOffsetBits; - unsigned SuppFirstLevelIndexBits; - unsigned SuppSecondLevelIndexBits; - unsigned 
SuppDataOffsetBits; - - unsigned BMPLookupBytesPerEntry; - unsigned BMPDataBytesPerEntry; - unsigned SuppLookup1BytesPerEntry; - unsigned SuppLookup2BytesPerEntry; - unsigned SuppDataBytesPerEntry; - - unsigned TrieSize; - - unsigned BMPLookupBytesOffset; - unsigned BMPDataBytesOffset; - unsigned SuppLookup1BytesOffset; - unsigned SuppLookup2BytesOffset; - unsigned SuppDataBytesOffset; -}; - -SWIFT_RUNTIME_STDLIB_INTERFACE -const struct _swift_stdlib_GraphemeClusterBreakPropertyTrieMetadataTy -_swift_stdlib_GraphemeClusterBreakPropertyTrieMetadata = { - ${trie_generator.bmp_first_level_index_bits}, - ${trie_generator.bmp_data_offset_bits}, - ${trie_generator.supp_first_level_index_bits}, - ${trie_generator.supp_second_level_index_bits}, - ${trie_generator.supp_data_offset_bits}, - - ${trie_generator.bmp_lookup_bytes_per_entry}, - ${trie_generator.bmp_data_bytes_per_entry}, - ${trie_generator.supp_lookup1_bytes_per_entry}, - ${trie_generator.supp_lookup2_bytes_per_entry}, - ${trie_generator.supp_data_bytes_per_entry}, - - ${len(trie_generator.trie_bytes)}, - - ${trie_generator.bmp_lookup_bytes_offset}, - ${trie_generator.bmp_data_bytes_offset}, - ${trie_generator.supp_lookup1_bytes_offset}, - ${trie_generator.supp_lookup2_bytes_offset}, - ${trie_generator.supp_data_bytes_offset}, -}; - -%{ - -extended_grapheme_cluster_rules_matrix = \ - get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_property_table) - -}% - - -static const uint16_t _swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrixImpl[] = { -% for row in get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_property_table): - ${row}, -% end -}; - -SWIFT_RUNTIME_STDLIB_INTERFACE -const uint16_t *_swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrix = - _swift_stdlib_ExtendedGraphemeClusterNoBoundaryRulesMatrixImpl; - -// ${'Local Variables'}: -// eval: (read-only-mode 1) -// End: diff --git a/validation-test/stdlib/UnicodeTrie.swift.gyb 
b/validation-test/stdlib/UnicodeTrie.swift.gyb deleted file mode 100644 index 738deb42c33bb..0000000000000 --- a/validation-test/stdlib/UnicodeTrie.swift.gyb +++ /dev/null @@ -1,263 +0,0 @@ -//===--- UnicodeTrie.swift.gyb --------------------------------*- swift -*-===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors -// -//===----------------------------------------------------------------------===// - -// RUN: %empty-directory(%t) -// RUN: %gyb -DunicodeGraphemeBreakPropertyFile=%utils/UnicodeData/GraphemeBreakProperty.txt -DunicodeGraphemeBreakTestFile=%utils/UnicodeData/GraphemeBreakTest.txt %s -o %t/UnicodeTrie.swift -// RUN: %line-directive %t/UnicodeTrie.swift -- %target-build-swift %t/UnicodeTrie.swift -o %t/a.out -g -Xfrontend -disable-access-control -// RUN: %line-directive %t/UnicodeTrie.swift -- %target-run %t/a.out -// REQUIRES: executable_test - -// FIXME: rdar://problem/19648117 Needs splitting objc parts out -// XFAIL: linux - -%{ - -from GYBUnicodeDataUtils import * - -grapheme_cluster_break_property_table = \ - GraphemeClusterBreakPropertyTable(unicodeGraphemeBreakPropertyFile) - -}% - -import SwiftPrivate -import StdlibUnittest -import StdlibCollectionUnittest -import Darwin -import Foundation - -var graphemeBreakPropertyTable = [ -// 'as Int' annotations are needed to help prevent the type-checker from -// blowing the stack. 
-% for start_code_point, end_code_point, value in grapheme_cluster_break_property_table.property_value_ranges: - (${start_code_point} as Int, ${end_code_point} as Int, _GraphemeClusterBreakPropertyValue.${value}), -% end -] - -var UnicodeTrie = TestSuite("UnicodeTrie") - -UnicodeTrie.test("_UnicodeGraphemeClusterBreakPropertyTrie") { - // Verify that the trie reports correct values of the property for every code - // point. - - var trie = _UnicodeGraphemeClusterBreakPropertyTrie() - - var expected = [_GraphemeClusterBreakPropertyValue]( - repeating: _GraphemeClusterBreakPropertyValue.Other, - count: 0x110000) - for (startCodePoint, endCodePoint, value) in graphemeBreakPropertyTable { - for cp in startCodePoint...endCodePoint { - expected[cp] = value - } - } - - for cp in UInt32(0)...UInt32(0x10ffff) { - if cp % 0x10000 == 0 { - print("\(cp)...") - } - expectEqual( - expected[Int(cp)], trie.getPropertyValue(cp), "code point \(cp)") - } -} - -%{ - -grapheme_cluster_break_tests = \ - get_grapheme_cluster_break_tests_as_unicode_scalars( - unicodeGraphemeBreakTestFile) - -}% - -// The most simple subclass of NSString that CoreFoundation does not know -// about. -class NonContiguousNSString : NSString { - override init() { - _value = [] - super.init() - } - - required init(coder aDecoder: NSCoder) { - fatalError("don't call this initializer") - } - - required init(itemProviderData data: Data, typeIdentifier: String) throws { - fatalError("don't call this initializer") - } - - @nonobjc - init(_ value: [UInt16]) { - _value = value - super.init() - } - - @nonobjc - convenience init(_ scalars: [UInt32]) { - var encoded: [UInt16] = [] - let iter = scalars.makeIterator() - let output: (UInt16) -> Void = { encoded.append($0) } - let hadError = transcode( - iter, - from: UTF32.self, - to: UTF16.self, - stoppingOnError: true, - into: output) - expectFalse(hadError) - self.init(encoded) - } - - @objc(copyWithZone:) - override func copy(with zone: NSZone?) 
-> Any { - // Ensure that copying this string produces a class that CoreFoundation - // does not know about. - return self - } - - @objc override var length: Int { - return _value.count - } - - @objc override func character(at index: Int) -> unichar { - return _value[index] - } - - var _value: [UInt16] -} - -/// Verify that extended grapheme cluster boundaries in `subject` occur at -/// positions specified in `expectedBoundaries`. -func checkGraphemeClusterSegmentation( - _ expectedBoundaries: [Int], _ subject: String, _ stackTrace: SourceLocStack -) { - var actualBoundaries: [Int] = [ 0 ] - var unicodeScalarCount = 0 - for c in subject.characters { - let currentClusterSize = String(c).unicodeScalars.count - unicodeScalarCount += currentClusterSize - actualBoundaries += [unicodeScalarCount] - } - expectEqual( - expectedBoundaries, actualBoundaries, - "scalars: \(asHex(Array(subject.unicodeScalars.lazy.map { $0.value })))" - ) - - let expectedCharacters: [Character] = Array(subject.characters) - checkBidirectionalCollection(expectedCharacters, subject.characters) -} - -func checkGraphemeClusterSegmentation( - _ expectedBoundaries: [Int], scalars: [UInt32], _ stackTrace: SourceLocStack -) { - let subject = NonContiguousNSString(scalars) as String - checkGraphemeClusterSegmentation(expectedBoundaries, subject, - stackTrace.withCurrentLoc()) -} - -func checkGraphemeClusterSegmentation( - _ expectedBoundaries: [Int], codeUnits: [UInt16], _ stackTrace: SourceLocStack -) { - let subject = NonContiguousNSString(codeUnits) as String - checkGraphemeClusterSegmentation(expectedBoundaries, subject, - stackTrace.withCurrentLoc()) -} - -UnicodeTrie.test("GraphemeClusterSegmentation/UnicodeSpec") { - // Test segmentation algorithm using test data from the Unicode - // specification. 
- -% for code_points, expected_boundaries in grapheme_cluster_break_tests: - do { - let scalars: [UInt32] = - [ ${", ".join([str(cp) for cp in code_points])} ] - let expectedBoundaries: [Int] = - [ ${", ".join([str(x) for x in expected_boundaries])} ] - checkGraphemeClusterSegmentation(expectedBoundaries, scalars: scalars, - SourceLocStack().withCurrentLoc()) - } - -% end -} - -UnicodeTrie.test("GraphemeClusterSegmentation/Extra") { - // Extra tests for input Strings that contain ill-formed code unit sequences. - - // U+D800 (high-surrogate) - checkGraphemeClusterSegmentation( - [ 0, 1 ], - codeUnits: [ 0xd800 ], - SourceLocStack().withCurrentLoc()) - - // U+D800 (high-surrogate) - // U+D800 (high-surrogate) - checkGraphemeClusterSegmentation( - [ 0, 1, 2 ], - codeUnits: [ 0xd800, 0xd800 ], - SourceLocStack().withCurrentLoc()) - - // U+0041 LATIN CAPITAL LETTER A - // U+D800 (high-surrogate) - checkGraphemeClusterSegmentation( - [ 0, 1, 2 ], - codeUnits: [ 0x0041, 0xd800 ], - SourceLocStack().withCurrentLoc()) - - // U+D800 (high-surrogate) - // U+0041 LATIN CAPITAL LETTER A - checkGraphemeClusterSegmentation( - [ 0, 1, 2 ], - codeUnits: [ 0xd800, 0x0041 ], - SourceLocStack().withCurrentLoc()) - - // U+0041 LATIN CAPITAL LETTER A - // U+0301 COMBINING ACUTE ACCENT - // U+D800 (high-surrogate) - checkGraphemeClusterSegmentation( - [ 0, 2, 3 ], - codeUnits: [ 0x0041, 0x0301, 0xd800 ], - SourceLocStack().withCurrentLoc()) - - // U+D800 (high-surrogate) - // U+0041 LATIN CAPITAL LETTER A - // U+0301 COMBINING ACUTE ACCENT - checkGraphemeClusterSegmentation( - [ 0, 1, 3 ], - codeUnits: [ 0xd800, 0x0041, 0x0301 ], - SourceLocStack().withCurrentLoc()) -} - -UnicodeTrie.test("GraphemeClusterSegmentation/Unicode_7_0_0") { - // Verify that we are using Unicode 7.0.0+ data tables. - - // In Unicode 6.3.0, this sequence was segmented into two grapheme clusters. 
- // - // U+0041 LATIN CAPITAL LETTER A - // U+1122C KHOJKI VOWEL SIGN AA - checkGraphemeClusterSegmentation( - [ 0, 2 ], - scalars: [ 0x0041, 0x1122c ], - SourceLocStack().withCurrentLoc()) -} - -UnicodeTrie.test("GraphemeClusterSegmentation/Unicode_8_0_0") { - // Verify that we are using Unicode 8.0.0+ data tables. - - // In Unicode 7.0.0, this sequence was segmented into two grapheme clusters. - // - // U+0041 LATIN CAPITAL LETTER A - // U+11720 AHOM VOWEL SIGN A - checkGraphemeClusterSegmentation( - [ 0, 2 ], - scalars: [ 0x0041, 0x11720 ], - SourceLocStack().withCurrentLoc()) -} - - -runAllTests() -