Do not parse hashtag emoji as tag — pt 2 (#2245)

* Do not parse hashtag emoji as tag (#2242)
* fix: prevent hashtag emoji from being parsed as tag
* chore: fmt
* fix: properly calculate length of tag
* Add a couple tests

---------

Co-authored-by: Mary <148872143+mary-ext@users.noreply.github.com>
bluesky-social · Feb 29, 2024 · 61b3d25
1 parent 4d062cb
commit 61b3d25
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 12 deletions.
diff --git a/.changeset/dull-hotels-beam.md b/.changeset/dull-hotels-beam.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Prevent hashtag emoji from being parsed as a tag
diff --git a/.changeset/short-suits-destroy.md b/.changeset/short-suits-destroy.md
@@ -0,0 +1,5 @@
+---
+'@atproto/api': patch
+---
+
+Properly calculate length of tag
diff --git a/packages/api/src/rich-text/detection.ts b/packages/api/src/rich-text/detection.ts
@@ -70,27 +70,25 @@ export function detectFacets(text: UnicodeString): Facet[] | undefined {
     }
   }
   {
-    const re = /(?:^|\s)(#[^\d\s]\S*)(?=\s)?/g
+    const re = /(^|\s)#((?!\ufe0f)[^\d\s]\S*)(?=\s)?/g
     while ((match = re.exec(text.utf16))) {
-      let [tag] = match
-      const hasLeadingSpace = /^\s/.test(tag)
+      let [, leading, tag] = match
 
       tag = tag.trim().replace(/\p{P}+$/gu, '') // strip ending punctuation
 
-      // inclusive of #, max of 64 chars
-      if (tag.length > 66) continue
+      if (tag.length === 0 || tag.length > 64) continue
 
-      const index = match.index + (hasLeadingSpace ? 1 : 0)
+      const index = match.index + leading.length
 
       facets.push({
         index: {
           byteStart: text.utf16IndexToUtf8Index(index),
-          byteEnd: text.utf16IndexToUtf8Index(index + tag.length), // inclusive of last char
+          byteEnd: text.utf16IndexToUtf8Index(index + 1 + tag.length),
         },
         features: [
           {
             $type: 'app.bsky.richtext.facet#tag',
-            tag: tag.replace(/^#/, ''),
+            tag: tag,
           },
         ],
       })

diff --git a/packages/api/tests/rich-text-detection.test.ts b/packages/api/tests/rich-text-detection.test.ts
@@ -241,15 +241,16 @@ describe('detectFacets', () => {
       ['body #1', [], []],
       ['body #a1', ['a1'], [{ byteStart: 5, byteEnd: 8 }]],
       ['#', [], []],
+      ['#?', [], []],
       ['text #', [], []],
       ['text # text', [], []],
       [
-        'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
-        ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
-        [{ byteStart: 5, byteEnd: 71 }],
+        'body #thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
+        ['thisisa64characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'],
+        [{ byteStart: 5, byteEnd: 70 }],
       ],
       [
-        'body #thisisa65characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab',
+        'body #thisisa65characterstring_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab',
         [],
         [],
       ],
@@ -297,6 +298,17 @@ describe('detectFacets', () => {
           { byteStart: 17, byteEnd: 22 },
         ],
       ],
+      ['this #️⃣tag should not be a tag', [], []],
+      [
+        'this ##️⃣tag should be a tag',
+        ['#️⃣tag'],
+        [
+          {
+            byteStart: 5,
+            byteEnd: 16,
+          },
+        ],
+      ],
     ]
 
     for (const [input, tags, indices] of inputs) {