Skip to content

Commit 82d2fc5

Browse files
fix: tagged string subrange calculation crash (#661)
* Tag parsing algorithm improvements: support of nested tags, unexpected extra tags, single tags. * Add convenient functions for extracting tagged/untagged substrings * Add documentation for TaggedString * Add exhaustive TaggedString unit tests
1 parent 410c34a commit 82d2fc5

File tree

4 files changed

+196
-51
lines changed

4 files changed

+196
-51
lines changed

Sources/AlgoliaSearchClient/Models/Search/Response/SearchResponse/Auxiliary/Highlighting/TaggedString.swift

Lines changed: 102 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,43 @@
77

88
import Foundation
99

10+
/// Structure embedding the calculation of tagged substring in the input string
1011
public struct TaggedString {
1112

13+
/// Input string
1214
public let input: String
15+
16+
/// Opening tag
1317
public let preTag: String
18+
19+
/// Closing tag
1420
public let postTag: String
21+
22+
/// String comparison options for tags parsing
1523
public let options: String.CompareOptions
1624

1725
private lazy var storage: (String, [Range<String.Index>]) = TaggedString.computeRanges(for: input, preTag: preTag, postTag: postTag, options: options)
1826

27+
/// Output string cleansed of the opening and closing tags
1928
public private(set) lazy var output: String = { storage.0 }()
29+
30+
/// List of ranges of tagged substrings in the output string
2031
public private(set) lazy var taggedRanges: [Range<String.Index>] = { storage.1 }()
32+
33+
/// List of ranges of untagged substrings in the output string
2134
public private(set) lazy var untaggedRanges: [Range<String.Index>] = TaggedString.computeInvertedRanges(for: output, with: taggedRanges)
2235

23-
public init(string: String, preTag: String, postTag: String, options: String.CompareOptions = []) {
36+
/**
37+
- Parameters:
38+
- string: Input string
39+
- preTag: Opening tag
40+
- postTag: Closing tag
41+
- options: String comparison options for tags parsing
42+
*/
43+
public init(string: String,
44+
preTag: String,
45+
postTag: String,
46+
options: String.CompareOptions = []) {
2447
// This string reconstruction is here to avoid a potential problems due to string encoding
2548
// Check unit test TaggedStringTests -> testWithDecodedString
2649
let string = String(string.indices.map { string [$0] })
@@ -29,45 +52,107 @@ public struct TaggedString {
2952
self.postTag = postTag
3053
self.options = options
3154
}
55+
56+
/// - Returns: The list of tagged substrings of the output string
57+
public mutating func taggedSubstrings() -> [Substring] {
58+
return taggedRanges.map { output[$0] }
59+
}
60+
61+
/// - Returns: The list of untagged substrings of the output string
62+
public mutating func untaggedSubstrings() -> [Substring] {
63+
return untaggedRanges.map { output[$0] }
64+
}
3265

3366
private static func computeRanges(for string: String, preTag: String, postTag: String, options: String.CompareOptions) -> (output: String, ranges: [Range<String.Index>]) {
67+
var output = string
68+
var preStack: [R] = []
69+
var rangesToHighlight = [R]()
70+
71+
typealias R = Range<String.Index>
3472

35-
var newText = string
36-
var rangesToHighlight = [Range<String.Index>]()
37-
38-
while let matchBegin = newText.range(of: preTag, options: options) {
39-
newText.removeSubrange(matchBegin)
40-
let tailRange = matchBegin.lowerBound..<newText.endIndex
41-
guard let matchEnd = newText.range(of: postTag, options: options, range: tailRange) else { continue }
42-
newText.removeSubrange(matchEnd)
43-
rangesToHighlight.append(.init(uncheckedBounds: (matchBegin.lowerBound, matchEnd.lowerBound)))
73+
enum Tag {
74+
case pre(R), post(R)
4475
}
45-
46-
return (newText, rangesToHighlight)
47-
76+
77+
func nextTag(in string: String) -> Tag? {
78+
switch (string.range(of: preTag, options: options), string.range(of: postTag, options: options)) {
79+
case (.some(let pre), .some(let post)) where pre.lowerBound < post.lowerBound:
80+
return .pre(pre)
81+
case (.some, .some(let post)):
82+
return .post(post)
83+
case (.some(let pre), .none):
84+
return .pre(pre)
85+
case (.none, .some(let post)):
86+
return .post(post)
87+
case (.none, .none):
88+
return nil
89+
}
90+
}
91+
92+
while let nextTag = nextTag(in: output) {
93+
switch nextTag {
94+
case .pre(let preRange):
95+
preStack.append(preRange)
96+
output.removeSubrange(preRange)
97+
case .post(let postRange):
98+
if let lastPre = preStack.last {
99+
preStack.removeLast()
100+
rangesToHighlight.append(.init(uncheckedBounds: (lastPre.lowerBound, postRange.lowerBound)))
101+
}
102+
output.removeSubrange(postRange)
103+
}
104+
}
105+
106+
return (output, mergeOverlapping(rangesToHighlight))
48107
}
49-
108+
50109
private static func computeInvertedRanges(for string: String, with ranges: [Range<String.Index>]) -> [Range<String.Index>] {
51-
110+
111+
if ranges.isEmpty {
112+
return ([string.startIndex..<string.endIndex])
113+
}
114+
52115
var lowerBound = string.startIndex
53116

54117
var invertedRanges: [Range<String.Index>] = []
55118

56-
for range in ranges {
119+
for range in ranges where range.lowerBound >= lowerBound {
57120
if lowerBound != range.lowerBound {
58121
let invertedRange = lowerBound..<range.lowerBound
59122
invertedRanges.append(invertedRange)
60123
}
61124
lowerBound = range.upperBound
62125
}
63126

64-
if lowerBound != string.endIndex {
127+
if lowerBound != string.endIndex, lowerBound != string.startIndex {
65128
invertedRanges.append(lowerBound..<string.endIndex)
66129
}
67130

68131
return invertedRanges
69132

70133
}
134+
135+
private static func mergeOverlapping<T: Comparable>(_ input: [Range<T>]) -> [Range<T>] {
136+
var output: [Range<T>] = []
137+
let sortedRanges = input.sorted(by: { $0.lowerBound < $1.lowerBound })
138+
guard let head = sortedRanges.first else { return output }
139+
let tail = sortedRanges.suffix(from: sortedRanges.index(after: sortedRanges.startIndex))
140+
var (lower, upper) = (head.lowerBound, head.upperBound)
141+
for range in tail {
142+
if range.lowerBound <= upper {
143+
if range.upperBound > upper {
144+
upper = range.upperBound
145+
} else {
146+
continue
147+
}
148+
} else {
149+
output.append(.init(uncheckedBounds: (lower: lower, upper: upper)))
150+
(lower, upper) = (range.lowerBound, range.upperBound)
151+
}
152+
}
153+
output.append(.init(uncheckedBounds: (lower: lower, upper: upper)))
154+
return output
155+
}
71156

72157
}
73158

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"source": "<div class=\"c-entry-content\">\n<p id=\"4B922A\">Supercell’s mobile games, <em>Clash of Clans</em> and <em>Clash Royale,</em> are massive: together, the pair has been downloaded more than 3 billion times, according to the developer. Now, the Finnish studio is bringing them together with a new <em>Clash</em> animated short, a 12-minute piece called <em>Lost &amp; Crowned</em>. It’s gorgeous and funny but also part of a broader strategy to make the overarching fantasy world something bigger and more enticing. “When it comes to storytelling, we feel it’s important to expand beyond what’s in the games,” says Christina Lee, who oversees story and franchise development at Supercell.</p>\n<p id=\"WVPh8p\">This isn’t technically the first animated short set in the <em>Clash of Clans</em> universe. <a href=\"https://www.theverge.com/2016/12/23/14053766/clash-a-rama-animated-series-the-simpsons-supercell-clash-of-clans\">That distinction goes to <em>Clash-A-Rama</em></a>, a YouTube series penned by a trio of <em>Simpsons</em> writers that debuted back in 2016. Lee says that the series was a big success, particularly when it came to expanding the fictional world of <em>Clash</em>. The games themselves have very little narrative, instead focusing almost entirely on combat. But <em>Clash-A-Rama</em> showed what the characters did in their off-time, adding a newfound depth to that world. It made Supercell want to do something even bigger.</p>\n<div class=\"c-float-right\"></div>\n<p id=\"z2satY\">“While <em>Clash-A-Rama</em> was a big success, we always saw it as exceptional fan fiction, and decided that it was time to tell Supercell’s take on the world and the characters, and that’s why we set out to do this animated short,” says Lee. That original series featured a bright and approachable 2D art style, whereas <em>Lost &amp; Crowned</em> has a 3D look more in-keeping with the games and CG commercials and trailers. One look, and it’s clear it’s set in the <em>Clash</em> universe.</p>\n<p id=\"wQIGYs\">It was developed in collaboration with animation and design studio Psyops. The studio has previously worked with Riot on <a href=\"https://www.theverge.com/2020/7/24/21336218/league-of-legends-yone-anime-cinematic-riot-games\">a <em>League of Legends</em> short</a>, and it even turned a Travel Oregon commercial into <a href=\"https://www.youtube.com/watch?v=doVV1a7XgyQ\">a Studio Ghibli-style anime clip</a> — and Lee says the main goal, aside from bridging the world of <em>Clash Royale</em> and <em>Clash of Clans</em>, was to focus on a character-driven narrative. “One of the driving principles for our team is to make our characters feel like real people, particularly people you can relate to,” she says. “That relatability is really key for us.”</p>\n<p id=\"TyGasf\">This kind of storytelling is something of a trend among popular multiplayer games. Titles like <em>Overwatch</em> and <em>League of Legends</em> don’t feature much storytelling inside of the actual game, but there are copious animated shorts, comic books, and other content that flesh out their fictional worlds. Lee heads up a team of three focused entirely on this kind of external storytelling for <em>Clash</em>. And while Supercell isn’t ready to announce anything beyond <em>Lost &amp; Crowned</em>, it’s not hard to foresee a future where more animated clips, or even things like comics, further flesh out <em>Clash</em>’s fantasy realm. “We’re in the process of evaluating other opportunities for storytelling,” says Lee.</p>\n</div>",
3+
"markup": "<div class=\"c-entry-content\">\n<p id=\"4B922A\">Supercell’s mobile games, <em>Clash of Clans</em> <em>a</em>nd <em>Clash Royale,</em> <em>a</em>re massive: together, the pair has been downloaded more than 3 billion times, <em>a</em>ccording to the developer. Now, the Finnish studio is bringing them together with <em>a</em> new <em>Clash</em> <em>a</em>nimated short, <em>a</em> 12-minute piece called <em>Lost &amp; Crowned</em>. It’s gorgeous <em>a</em>nd funny but <em>a</em>lso part of <em>a</em> broader strategy to make the overarching fantasy world something bigger <em>a</em>nd more enticing. “When it comes to storytelling, we feel it’s important to expand beyond what’s in the games,” says Christina Lee, who oversees story <em>a</em>nd franchise development <em>a</em>t Supercell.</p>\n<p id=\"WVPh8p\">This isn’t technically the first <em>a</em>nimated short set in the <em>Clash of Clans</em> universe. <a href=\"https://www.theverge.com/2016/12/23/14053766/clash-a-rama-animated-series-the-simpsons-supercell-clash-of-clans\">That distinction goes to <em>Clash-<em>A</em>-Rama</em></a>, <em>a</em> YouTube series penned by <em>a</em> trio of <em>Simpsons</em> writers that debuted back in 2016. Lee says that the series was <em>a</em> big success, particularly when it came to expanding the fictional world of <em>Clash</em>. The games themselves have very little narrative, instead focusing <em>a</em>lmost entirely on combat. But <em>Clash-<em>A</em>-Rama</em> showed what the characters did in their off-time, <em>a</em>dding <em>a</em> newfound depth to that world. It made Supercell want to do something even bigger.</p>\n<div class=\"c-float-right\"></div>\n<p id=\"z2satY\">“While <em>Clash-<em>A</em>-Rama</em> was <em>a</em> big success, we <em>a</em>lways saw it <em>a</em>s exceptional fan fiction, <em>a</em>nd decided that it was time to tell Supercell’s take on the world <em>a</em>nd the characters, <em>a</em>nd that’s why we set out to do this <em>a</em>nimated short,” says Lee. That original series featured <em>a</em> bright <em>a</em>nd <em>a</em>pproachable 2D <em>a</em>rt style, whereas <em>Lost &amp; Crowned</em> has <em>a</em> 3D look more in-keeping with the games <em>a</em>nd CG commercials <em>a</em>nd trailers. One look, <em>a</em>nd it’s clear it’s set in the <em>Clash</em> universe.</p>\n<p id=\"wQIGYs\">It was developed in collaboration with <em>a</em>nimation <em>a</em>nd design studio Psyops. The studio has previously worked with Riot on <a href=\"https://www.theverge.com/2020/7/24/21336218/league-of-legends-yone-anime-cinematic-riot-games\"><em>a</em> <em>League of Legends</em> short</a>, <em>a</em>nd it even turned <em>a</em> Travel Oregon commercial into <a href=\"https://www.youtube.com/watch?v=doVV1a7XgyQ\"><em>a</em> Studio Ghibli-style <em>a</em>nime clip</a> — <em>a</em>nd Lee says the main goal, <em>a</em>side from bridging the world of <em>Clash Royale</em> <em>a</em>nd <em>Clash of Clans</em>, was to focus on <em>a</em> character-driven narrative. “One of the driving principles for our team is to make our characters feel like real people, particularly people you can relate to,” she says. “That relatability is really key for us.”</p>\n<p id=\"TyGasf\">This kind of storytelling is something of <em>a</em> trend <em>a</em>mong popular multiplayer games. Titles like <em>Overwatch</em> <em>a</em>nd <em>League of Legends</em> don’t feature much storytelling inside of the <em>a</em>ctual game, but there <em>a</em>re copious <em>a</em>nimated shorts, comic books, <em>a</em>nd other content that flesh out their fictional worlds. Lee heads up <em>a</em> team of three focused entirely on this kind of external storytelling for <em>Clash</em>. <em>A</em>nd while Supercell isn’t ready to <em>a</em>nnounce <em>a</em>nything beyond <em>Lost &amp; Crowned</em>, it’s not hard to foresee <em>a</em> future where more <em>a</em>nimated clips, or even things like comics, further flesh out <em>Clash</em>’s fantasy realm. “We’re in the process of evaluating other opportunities for storytelling,” says Lee.</p>\n</div>"
4+
}

Tests/AlgoliaSearchClientTests/Unit/HighlightedStringTests.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,17 @@ class HighlightedStringTests: XCTestCase {
5555

5656
}
5757

58+
struct MarkupString: Codable {
59+
let source: String
60+
let markup: HighlightedString
61+
}
62+
63+
func testWithHTMLString() throws {
64+
let data = try Data(filename: "HighlightedHTML.json")
65+
let searchResponse = try JSONDecoder().decode(MarkupString.self, from: data)
66+
let searchResponseJSON = try JSON(searchResponse)
67+
print(searchResponseJSON)
68+
}
5869

5970
}
6071

0 commit comments

Comments
 (0)