Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Initial commit of edit-distance package

  • Loading branch information...
commit 8be13bce326fef9cc1f80cf35160a883b5de4a4c 0 parents
@batterseapower authored
22 LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2008, Maximilian Bolingbroke
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted
+provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, this list of
+ conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of Maximilian Bolingbroke nor the names of other contributors may be used to
+ endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4 Setup.lhs
@@ -0,0 +1,4 @@
+#! /usr/bin/env runhaskell
+
+> import Distribution.Simple
+> main = defaultMain
160 Text/EditDistance.hs
@@ -0,0 +1,160 @@
+{-# LANGUAGE PatternGuards, ScopedTypeVariables #-}
+
+module Text.EditDistance (
+ EditCosts(..), defaultEditCosts,
+ levenshteinDistance, restrictedDamerauLevenshteinDistance
+ ) where
+
+import Control.Monad
+import Control.Monad.ST
+import Data.Array.ST
+import Data.Array.MArray
+
+
+data EditCosts = EditCosts {
+ deletionCost :: Int,
+ insertionCost :: Int,
+ substitutionCost :: Int,
+ transpositionCost :: Int
+}
+
+defaultEditCosts = EditCosts {
+ deletionCost = 1,
+ insertionCost = 1,
+ substitutionCost = 1,
+ transpositionCost = 1
+}
+
+-- | Find the Levenshtein edit distance between two strings. That is to say, the number of deletion,
+-- insertion and substitution operations that are required to make the two strings equal. Note that
+-- this algorithm thereforec does not make use of the 'transpositionCost' field of the costs. See also:
+-- <http://en.wikipedia.org/wiki/Levenshtein_distance>
+levenshteinDistance :: EditCosts -> String -> String -> Int
+levenshteinDistance costs str1 str2 = runST (levenshteinDistanceST costs str1 str2)
+
+levenshteinDistanceST :: EditCosts -> String -> String -> ST s Int
+levenshteinDistanceST costs str1 str2 = do
+ -- Create string arrays
+ str1_array <- stringToArray str1
+ str2_array <- stringToArray str2
+
+ -- Create array of costs. Say we index it by (i, j) where i is the column index and j the row index.
+ -- Rows correspond to characters of str2 and columns to characters of str1.
+ cost_array <- newArray_ ((0, 0), (str1_len, str2_len)) :: ST s (STUArray s (Int, Int) Int)
+
+ -- Fill out the first row (j = 0)
+ forM_ [1..str1_len] $ \i -> writeArray cost_array (i, 0) (deletionCost costs * i)
+
+ -- Fill the remaining rows (j >= 1)
+ forM_ [1..str2_len] (\j -> do
+ row_char <- readArray str2_array j
+ prev_row_char <- readArray str2_array (j - 1)
+
+ -- Initialize the first element of the row (i = 0)
+ writeArray cost_array (0, j) (insertionCost costs * j)
+
+ -- Fill the remaining elements of the row (i >= 1)
+ forM_ [1..str1_len] (\i -> do
+ col_char <- readArray str1_array i
+ prev_col_char <- readArray str1_array (i - 1)
+
+ cost <- standardCosts costs cost_array row_char col_char (i, j)
+ writeArray cost_array (i, j) cost))
+
+ -- Return an actual answer
+ readArray cost_array (str1_len, str2_len)
+ where
+ str1_len = length str1
+ str2_len = length str2
+
+
+-- | Find the "restricted" Damerau-Levenshtein edit distance between two strings. This algorithm calculates the cost of
+-- the so-called optimal string alignment, which does not always equal the appropriate edit distance. The cost of the optimal
+-- string alignment is the number of edit operations needed to make the input strings equal under the condition that no substring
+-- is edited more than once. See also: <http://en.wikipedia.org/wiki/Damerau-Levenshtein_distance>.
+restrictedDamerauLevenshteinDistance :: EditCosts -> String -> String -> Int
+restrictedDamerauLevenshteinDistance costs str1 str2 = runST (restrictedDamerauLevenshteinDistanceST costs str1 str2)
+
+restrictedDamerauLevenshteinDistanceST :: EditCosts -> String -> String -> ST s Int
+restrictedDamerauLevenshteinDistanceST costs str1 str2 = do
+ -- Create string arrays
+ str1_array <- stringToArray str1
+ str2_array <- stringToArray str2
+
+ -- Create array of costs. Say we index it by (i, j) where i is the column index and j the row index.
+ -- Rows correspond to characters of str2 and columns to characters of str1.
+ cost_array <- newArray_ ((0, 0), (str1_len, str2_len)) :: ST s (STUArray s (Int, Int) Int)
+
+ -- Fill out the first row (j = 0)
+ forM_ [1..str1_len] $ \i -> writeArray cost_array (i, 0) (deletionCost costs * i)
+
+ -- Fill out the second row (j = 1)
+ when (str2_len > 0) $ do
+ initial_row_char <- readArray str2_array 1
+
+ -- Initialize the first element of the second row (i = 0)
+ writeArray cost_array (0, 1) (insertionCost costs)
+
+ -- Initialize the remaining elements of the row (i >= 1)
+ forM_ [1..str1_len] $ \i -> do
+ col_char <- readArray str1_array i
+
+ cost <- standardCosts costs cost_array initial_row_char col_char (i, 1)
+ writeArray cost_array (i, 1) cost
+
+ -- Fill the remaining rows (j >= 2)
+ forM_ [2..str2_len] (\j -> do
+ row_char <- readArray str2_array j
+ prev_row_char <- readArray str2_array (j - 1)
+
+ -- Initialize the first element of the row (i = 0)
+ writeArray cost_array (0, j) (insertionCost costs * j)
+
+ -- Initialize the second element of the row (i = 1)
+ when (str1_len > 0) $ do
+ col_char <- readArray str1_array 1
+
+ cost <- standardCosts costs cost_array row_char col_char (1, j)
+ writeArray cost_array (1, j) cost
+
+ -- Fill the remaining elements of the row (i >= 2)
+ forM_ [2..str1_len] (\i -> do
+ col_char <- readArray str1_array i
+ prev_col_char <- readArray str1_array (i - 1)
+
+ standard_cost <- standardCosts costs cost_array row_char col_char (i, j)
+ cost <- if prev_row_char == col_char && prev_col_char == row_char
+ then do transpose_cost <- fmap (+ (transpositionCost costs)) $ readArray cost_array (i - 2, j - 2)
+ return (standard_cost `min` transpose_cost)
+ else return standard_cost
+ writeArray cost_array (i, j) cost))
+
+ -- Return an actual answer
+ readArray cost_array (str1_len, str2_len)
+ where
+ str1_len = length str1
+ str2_len = length str2
+
+
+{-# INLINE standardCosts #-}
+standardCosts :: EditCosts -> STUArray s (Int, Int) Int -> Char -> Char -> (Int, Int) -> ST s Int
+standardCosts costs cost_array row_char col_char (i, j) = do
+ deletion_cost <- fmap (+ (deletionCost costs)) $ readArray cost_array (i - 1, j)
+ insertion_cost <- fmap (+ (insertionCost costs)) $ readArray cost_array (i, j - 1)
+ subst_cost <- fmap (+ if row_char == col_char then 0 else substitutionCost costs) $ readArray cost_array (i - 1, j - 1)
+ return $ deletion_cost `min` insertion_cost `min` subst_cost
+
+stringToArray :: String -> ST s (STUArray s Int Char)
+stringToArray str = do
+ array <- newArray_ (1, length str)
+ forM_ (zip [1..] str) (uncurry (writeArray array))
+ return array
+
+{-
+showArray :: STUArray s (Int, Int) Int -> ST s String
+showArray array = do
+ ((il, jl), (iu, ju)) <- getBounds array
+ flip (flip foldM "") [(i, j) | i <- [il..iu], j <- [jl.. ju]] $ \rest (i, j) -> do
+ elt <- readArray array (i, j)
+ return $ rest ++ show (i, j) ++ ": " ++ show elt ++ ", "
+-}
26 Text/EditDistance/Tests.hs
@@ -0,0 +1,26 @@
+module Main where
+
+import Text.EditDistance.Tests.Framework
+import Text.EditDistance.Tests.Properties
+
+import Test.QuickCheck
+
+
+myTest :: (Testable a) => a -> IO ()
+myTest | True = check (defaultConfig { configMaxTest = 300 })
+ | otherwise = test -- Quicker
+
+putStrIndented :: Int -> String -> IO ()
+putStrIndented indent what = putStr $ (replicate indent ' ') ++ what
+
+
+runTest :: Int -> Test -> IO ()
+runTest indent (Property name the_property) = do
+ putStrIndented indent name
+ myTest the_property
+runTest indent (TestGroup name tests) = do
+ putStrIndented indent (name ++ ":")
+ mapM_ (runTest (indent + 2)) tests
+
+main :: IO ()
+main = mapM_ (runTest 0) tests
93 Text/EditDistance/Tests/Framework.hs
@@ -0,0 +1,93 @@
+{-# LANGUAGE PatternGuards, RankNTypes, ExistentialQuantification #-}
+
+module Text.EditDistance.Tests.Framework where
+
+import Text.EditDistance
+
+import Test.QuickCheck
+import System.Random
+import Control.Monad
+import Data.Char
+
+
+data Test = forall a. Testable a => Property String a
+ | TestGroup String [Test]
+
+
+instance Arbitrary Char where
+ arbitrary = choose ('\32', '\128')
+ coarbitrary c = variant (ord c `rem` 4)
+
+
+class Arbitrary ops => EditOperation ops where
+ edit :: Arbitrary a => [a] -> ops -> Gen [a]
+ editCost :: EditCosts -> ops -> Int
+
+instance EditOperation op => EditOperation [op] where
+ edit = foldM edit
+ editCost costs = sum . map (editCost costs)
+
+instance EditOperation ExtendedEditOperation where
+ edit str op = do
+ gen <- rand
+ let max_split_ix | Transposition <- op = length str - 1
+ | otherwise = length str
+ (split_ix, _) = randomR (1, max_split_ix) gen
+ (str_l, str_r) = splitAt split_ix str
+ non_null = not $ null str
+ transposable = length str > 1
+ case op of
+ Deletion | non_null -> return $ init str_l ++ str_r
+ Insertion | non_null -> do
+ new_ch <- arbitrary
+ return $ str_l ++ new_ch : str_r
+ Insertion | otherwise -> fmap singleton arbitrary -- Need special case because randomR (1, 0) is undefined
+ Substitution | non_null -> do
+ new_ch <- arbitrary
+ return $ init str_l ++ new_ch : str_r
+ Transposition | transposable -> do -- Need transposable rather than non_null because randomR (1, 0) is undefined
+ return $ init str_l ++ head str_r : last str_l : tail str_r
+ _ -> return str
+ where
+ singleton :: a -> [a]
+ singleton x = [x]
+
+ editCost costs Deletion = deletionCost costs
+ editCost costs Insertion = insertionCost costs
+ editCost costs Substitution = substitutionCost costs
+ editCost costs Transposition = transpositionCost costs
+
+
+data EditedString ops = MkEditedString {
+ oldString :: String,
+ newString :: String,
+ operations :: ops
+}
+
+type ExtendedMultiplyEditedString = EditedString [ExtendedEditOperation]
+type ExtendedSinglyEditedString = EditedString ExtendedEditOperation
+
+instance Show ops => Show (EditedString ops) where
+ show (MkEditedString old_string new_string ops) = show old_string ++ " ==> " ++ show new_string ++ " (by " ++ show ops ++ ")"
+
+instance EditOperation ops => Arbitrary (EditedString ops) where
+ arbitrary = do
+ old_string <- arbitrary
+ edit_operations <- arbitrary
+ new_string <- edit old_string edit_operations
+ return $ MkEditedString {
+ oldString = old_string,
+ newString = new_string,
+ operations = edit_operations
+ }
+
+
+data ExtendedEditOperation = Deletion
+ | Insertion
+ | Substitution
+ | Transposition
+ deriving (Enum, Bounded, Show)
+
+instance Arbitrary ExtendedEditOperation where
+ arbitrary = fmap toEnum $ choose (fromEnum (minBound :: ExtendedEditOperation), fromEnum (maxBound :: ExtendedEditOperation))
+ coarbitrary op = variant (fromEnum op)
41 Text/EditDistance/Tests/Properties.hs
@@ -0,0 +1,41 @@
+{-# LANGUAGE PatternGuards, PatternSignatures #-}
+
+module Text.EditDistance.Tests.Properties (tests) where
+
+import Text.EditDistance
+import Text.EditDistance.Tests.Framework
+
+import Test.QuickCheck
+
+
+tests = [ TestGroup "Damerau-Levenshtein Distance" restrictedDamerauLevenshteinDistanceTests
+ ]
+
+
+testableCosts = EditCosts {
+ deletionCost = 1,
+ insertionCost = 2,
+ substitutionCost = 3, -- Can't be higher than deletion + insertion
+ transpositionCost = 3 -- Can't be higher than deletion + insertion
+}
+
+
+restrictedDamerauLevenshteinDistanceTests = [ Property "Self distance is zero" prop_self_distance_zero
+ , Property "Pure deletion has the right cost" prop_pure_deletion_cost_correct
+ , Property "Pure insertion has the right cost" prop_pure_insertion_cost_correct
+ , Property "Single operations have the right cost" prop_single_op_cost_is_distance
+ , Property "Cost bound is respected" prop_combined_op_cost_at_least_distance
+ ]
+ where
+ testableDistance = restrictedDamerauLevenshteinDistance testableCosts
+
+ prop_self_distance_zero str
+ = testableDistance str str == 0
+ prop_pure_deletion_cost_correct str
+ = testableDistance str "" == (deletionCost testableCosts) * length str
+ prop_pure_insertion_cost_correct str
+ = testableDistance "" str == (insertionCost testableCosts) * length str
+ prop_single_op_cost_is_distance (MkEditedString old new ops :: ExtendedSinglyEditedString)
+ = (length old > 2) ==> testableDistance old new == editCost testableCosts ops || old == new
+ prop_combined_op_cost_at_least_distance (MkEditedString old new ops :: ExtendedMultiplyEditedString)
+ = testableDistance old new <= editCost testableCosts ops
27 edit-distance.cabal
@@ -0,0 +1,27 @@
+Name: edit-distance
+Version: 0.1
+Cabal-Version: >= 1.2
+Description: Edit distances for fuzzy matching, including Levenhstein and Damerau-Levenshtein algorithms.
+License: BSD3
+License-File: LICENSE
+Author: Max Bolingbroke
+Maintainer: batterseapower@hotmail.com
+Build-Type: Simple
+
+Flag Tests
+ Description: Enable building the tests
+ Default: False
+
+
+Library
+ Exposed-Modules: Text.EditDistance
+ Build-Depends: base, array >= 0.1, random >= 1.0
+
+Executable edit-distance-tests
+ Main-Is: Text/EditDistance/Tests.hs
+ Build-Depends: base, array >= 0.1, random >= 1.0, QuickCheck >= 1.1
+ Extensions: PatternGuards, PatternSignatures,
+ RankNTypes, ExistentialQuantification
+
+ if !flag(tests)
+ Buildable: False
Please sign in to comment.
Something went wrong with that request. Please try again.