Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Finish off the search code, and hide it.

--HG--
extra : convert_revision : 25e274adc6811abb9d0a8d21d8715b13ccf59282
  • Loading branch information...
commit dce830fb0e309f8bdebeea1e1c624c47485d7a14 1 parent 34d4996
@bos authored
Showing with 86 additions and 22 deletions.
  1. +77 −20 Data/Text/Search.hs
  2. +8 −1 tests/Properties.hs
  3. +1 −1  text.cabal
View
97 Data/Text/Search.hs
@@ -1,25 +1,62 @@
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
-module Data.Text.Search where
+-- |
+-- Module : Data.Text.Search
+-- Copyright : (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
+-- duncan@haskell.org
+-- Stability : experimental
+-- Portability : GHC
+--
+-- Fast substring search for 'Text', based on work by Boyer, Moore,
+-- Horspool, Sunday, and Lundh.
+--
+-- References:
+--
+-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
+-- Communications of the ACM, 20, 10, 762-772 (1977)
+--
+-- * R. N. Horspool: Practical Fast Searching in Strings. Software -
+-- Practice and Experience 10, 501-506 (1980)
+--
+-- * D. M. Sunday: A Very Fast Substring Search Algorithm.
+-- Communications of the ACM, 33, 8, 132-142 (1990)
+--
+-- * F. Lundh: The Fast Search Algorithm.
+-- <http://effbot.org/zone/stringlib.htm> (2006)
+module Data.Text.Search
+ (
+ indices
+ , slowIndices
+ ) where
-import qualified Data.Text as T ()
+import qualified Data.Text as T
import qualified Data.Text.Array as A
import Data.Word (Word64)
import Data.Text.Internal (Text(..))
import Data.Bits ((.|.), (.&.))
import Data.Text.UnsafeShift (shiftL)
+import Data.Text.Unsafe (iter_)
-indices :: Text -> Text -> [Int]
+-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
+-- @needle@ within @haystack@. In (unlikely) bad cases, this
+-- algorithm's complexity degenerates towards /O(n*m)/.
+indices :: Text -- ^ Substring to search for (@needle@)
+ -> Text -- ^ Text to search in (@haystack@)
+ -> [Int]
indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
- | ldiff < 0 = []
- | otherwise = outer 0
+ | nlen == 1 = scanOne (nindex 0)
+ | nlen <= 0 || ldiff < 0 = []
+ | otherwise = scan 0
where
- ldiff = hlen - nlen
- nlast = nlen - 1
- z = nindex nlast
+ ldiff = hlen - nlen
+ nlast = nlen - 1
+ z = nindex nlast
nindex k = A.unsafeIndex narr (noff+k)
hindex k = A.unsafeIndex harr (hoff+k)
- (mask :: Word64, skip :: Int) = buildTable 0 0 (nlen-2)
+ (mask :: Word64, skip) = buildTable 0 0 (nlen-2)
buildTable !i !msk !skp
| i >= nlast = (msk .|. swizzle z, skp)
| otherwise = buildTable (i+1) (msk .|. swizzle c) skp'
@@ -27,18 +64,38 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
skp' | c == z = nlen - i - 2
| otherwise = skp
swizzle k = 1 `shiftL` (fromEnum k .&. 0x3f)
- outer !i
- | i > ldiff = []
- | c == z = if candidateMatch 0
- then i : outer (i + nlast)
- else if nextInPattern
- then outer (i + nlen)
- else outer (i + skip)
- | nextInPattern = outer (i + nlen)
- | otherwise = outer (i + 1)
- where c = hindex (i+nlast)
+ scan !i
+ | i > ldiff = []
+ | c == z && candidateMatch 0 = i : scan (i + nlast)
+ | otherwise = scan (i + delta)
+ where c = hindex (i + nlast)
candidateMatch j
| j >= nlast = True
| hindex (i+j) /= nindex j = False
| otherwise = candidateMatch (j+1)
- nextInPattern = (mask .&. swizzle (hindex (i+nlen))) == 0
+ delta
+ | nextInPattern = nlen + 1
+ | c == z = skip + 1
+ | otherwise = 1
+ nextInPattern = (mask .&. swizzle (hindex (i+nlen))) == 0
+ scanOne c = loop 0
+ where loop !i | i >= hlen = []
+ | hindex i == c = i : loop (i+1)
+ | otherwise = loop (i+1)
+{-# INLINE indices #-}
+
+-- | /O(n*m)/ Find the offsets of all non-overlapping indices of
+-- @needle@ within @haystack@. Provided purely as a slower, reliable
+-- counterpart to 'indices'.
+slowIndices :: Text -- ^ Substring to search for (@needle@)
+ -> Text -- ^ Text to search in (@haystack@)
+ -> [Int]
+slowIndices needle@(Text _narr _noff nlen) haystack@(Text harr hoff hlen)
+ | T.null needle = []
+ | otherwise = scan 0
+ where
+ scan i | i >= hlen = []
+ | needle `T.isPrefixOf` t = i : scan (i+nlen)
+ | otherwise = scan (i+d)
+ where t = Text harr (hoff+i) (hlen-i)
+ d = iter_ haystack i
View
9 tests/Properties.hs
@@ -26,6 +26,7 @@ import Prelude hiding (replicate)
import System.IO.Unsafe (unsafePerformIO)
import Test.Framework (defaultMain, testGroup)
import Test.Framework.Providers.QuickCheck (testProperty)
+import Data.Text.Search
import QuickCheckUtils (NotEmpty(..))
@@ -486,6 +487,10 @@ sf_zipWith p c s = (L.zipWith c (L.filter p s) . L.filter p) `eqP` (unpackS . S
t_zipWith c s = L.zipWith c s `eqP` (unpackS . T.zipWith c (packS s))
tl_zipWith c s = L.zipWith c s `eqP` (unpackS . TL.zipWith c (packS s))
+t_indices = unsquare (\s -> slowIndices s `eq` indices s)
+t_indices_occurs t = unsquare (\ts -> let s = T.intercalate t ts
+ in slowIndices t s == indices t s)
+
-- Bit shifts.
shiftL w = forAll (choose (0,width-1)) $ \k -> Bits.shiftL w k == U.shiftL w k
where width = round (log (fromIntegral m) / log 2 :: Double)
@@ -836,7 +841,9 @@ tests = [
testProperty "t_elemIndices" t_elemIndices,
testProperty "tl_elemIndices" tl_elemIndices,
testProperty "t_count" t_count,
- testProperty "tl_count" tl_count
+ testProperty "tl_count" tl_count,
+ testProperty "t_indices" t_indices,
+ testProperty "t_indices_occurs" t_indices_occurs
],
testGroup "zips" [
View
2  text.cabal
@@ -22,7 +22,6 @@ library
Data.Text.Foreign
Data.Text.Lazy
Data.Text.Lazy.Encoding
- Data.Text.Search
other-modules:
Data.Text.Array
Data.Text.Encoding.Fusion
@@ -38,6 +37,7 @@ library
Data.Text.Lazy.Encoding.Fusion
Data.Text.Lazy.Fusion
Data.Text.Lazy.Internal
+ Data.Text.Search
Data.Text.Unsafe
Data.Text.UnsafeChar
Data.Text.UnsafeShift

0 comments on commit dce830f

Please sign in to comment.
Something went wrong with that request. Please try again.