Process unicode results from ripgrep correctly

atom · May 22, 2019 · 6748b84 · 6748b84
1 parent 411e2a9
commit 6748b84
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 1 deletion.
diff --git a/spec/workspace-spec.js b/spec/workspace-spec.js
@@ -2647,6 +2647,28 @@ describe('Workspace', () => {
           })
         }
 
+        it('returns results on lines with unicode strings', async () => {
+          const results = []
+          // The expected line (see lineText below) starts with multi-byte characters.
+          await scan(
+            /line with unico/,
+            {},
+            result => results.push(result)
+          )
+          expect(results.length).toBe(1)
+          const { filePath, matches } = results[0]
+          expect(filePath).toBe(atom.project.getDirectories()[0].resolve('file-with-unicode'))
+          expect(matches).toHaveLength(1)
+          expect(matches[0]).toEqual({
+            matchText: 'line with unico',
+            lineText: 'ДДДДДДДДДДДДДДДДДД line with unicode',
+            lineTextOffset: 0,
+            range: [[0, 19], [0, 34]], // character columns, not UTF-8 byte offsets
+            leadingContextLines: [],
+            trailingContextLines: []
+          })
+        })
+
         describe('when the core.excludeVcsIgnoredPaths config is truthy', () => {
           let projectPath
           let ignoredPath

diff --git a/src/ripgrep-directory-searcher.js b/src/ripgrep-directory-searcher.js
@@ -92,6 +92,35 @@ function getPositionFromColumn (lines, column) {
   return [currentLine - 1, column - previousLength]
 }
 
+function processUnicodeMatch (match) {
+  if (match.lines.text.length === Buffer.byteLength(match.lines.text)) {
+    // fast codepath for lines that only contain characters of 1 byte length.
+    return
+  }
+
+  let remainingBuffer = Buffer.from(match.lines.text)
+  let currentLength = 0
+  let previousPosition = 0
+
+  function convertPosition (position) {
+    const currentBuffer = remainingBuffer.slice(0, position - previousPosition)
+    currentLength = currentBuffer.toString().length + currentLength
+    remainingBuffer = remainingBuffer.slice(position)
+
+    previousPosition = position
+
+    return currentLength
+  }
+
+  // Iterate over all the submatches to find the convert the start and end values
+  // (which come as bytes from ripgrep) to character positions.
+  // We can do this because submatches come ordered by position.
+  for (const submatch of match.submatches) {
+    submatch.start = convertPosition(submatch.start)
+    submatch.end = convertPosition(submatch.end)
+  }
+}
+
 // This function processes a ripgrep submatch to create the correct
 // range. This is mostly needed for multi-line results, since the range
 // will have different start and end rows and we need to calculate these
@@ -247,7 +276,6 @@ module.exports = class RipgrepDirectorySearcher {
         buffer = lines.pop()
         for (const line of lines) {
           const message = JSON.parse(line)
-
           updateTrailingContexts(message, pendingTrailingContexts, options)
 
           if (message.type === 'begin') {
@@ -261,6 +289,8 @@ module.exports = class RipgrepDirectorySearcher {
             const trailingContextLines = []
             pendingTrailingContexts.add(trailingContextLines)
 
+            processUnicodeMatch(message.data)
+
             for (const submatch of message.data.submatches) {
               const { lineText, range } = processSubmatch(
                 submatch,