Permalink
Browse files

Initial revision

  • Loading branch information...
0 parents commit e9e674a2e86e175836b8da25459e826bea7e10bf @b4winckler committed May 15, 2012
Showing with 379 additions and 0 deletions.
  1. +1 −0 .gitignore
  2. +194 −0 Data/UpDownSignature.hs
  3. +84 −0 Data/UpDownSignature/Utility.hs
  4. +30 −0 LICENSE
  5. +13 −0 README.markdown
  6. +2 −0 Setup.hs
  7. +15 −0 tests/Tests.hs
  8. +40 −0 up-down-signature.cabal
@@ -0,0 +1 @@
+dist/
@@ -0,0 +1,194 @@
+{-# LANGUAGE BangPatterns #-}
+-- |
+-- Functions for computing up-down signatures distribution.
+
+module Data.UpDownSignature (
+ -- * Signature distribution
+ cdf
+ , pdf
+ -- * Signature score
+ , score
+ , approxScore
+ ) where
+
+import Control.Applicative ((<$>))
+import Data.Bits
+import Data.List (foldl', group, unfoldr)
+import Data.UpDownSignature.Utility (log2, fac, fst3, paths)
+import System.Random.MWC
+import qualified Data.MemoCombinators as Memo
+
+
+-- Internal representation of an up-down signature.
+--
+-- Signatures values are created using 'signature'. Do not assume that it will
+-- always be an Int.
+--
+-- Permutations on [0..N] have 2^(N-1) signatures, so Int should suffice
+-- since we can only handle N of the order 20 anyway.
+type Signature = Int
+
+-- The laps of an ordered list is the maximal number of points that are
+-- ordered monotonically. For example:
+--
+-- laps [1,2.2,3,0,4] = [2,1,1]
+type Laps = [Int]
+
+-- Representation of a count of permutations.
+--
+-- The number of permutations with a given signature grows very quickly, so
+-- Integer becomes necessary even for small signatures.
+type Count = Integer
+
+
+-- Largest signature that is likely to be used. Smaller signatures are subject
+-- to memoization.
+maxSignature :: Signature
+maxSignature = bit 20
+
+-- Convert ordered data to a signature. A standing assumption is that no two
+-- data points have the same order (this is not checked).
+signature :: Ord a => [a] -> Signature
+signature = lapsToSignature . laps
+
+-- Convert laps to signature.
+-- It is assumed that the laps only consists of positive integers!
+--
+-- Invariant:
+--
+-- lapsToSignature [n] = 2^n - 1
+lapsToSignature :: Laps -> Signature
+lapsToSignature l = fst3 $ foldl' f (0,0,odd $ length l) l
+ where
+ f (a,n,p) k = (if p then a + (2^k - 1) `shiftL` n else a, n+k, not p)
+
+-- Convert ordered data to laps.
+laps :: Ord a => [a] -> Laps
+laps = map length . group . unfoldr isIncreasing
+ where
+ isIncreasing (x:y:zs) = Just (x<y, y:zs)
+ isIncreasing _ = Nothing
+
+-- Count the number of permutations with the given signature.
+-- (This is memoized because it would be too slow otherwise.)
+count :: Signature -> Count
+count = Memo.arrayRange (1, maxSignature) count'
+ where
+ count' n | n > 1 = foldl' (\ !a !b -> a + count b) 0 (predecessors n)
+ | n == 1 = 2
+ | otherwise = 0
+
+-- The predecessors of 'n' are all numbers which can be produced by removing
+-- one bit from a block. If the topmost block consists of exactly one 1, then
+-- the result is inverted.
+--
+-- Examples: (postfix 'b' indicates base-2)
+--
+-- predecessors 110b = [11b, 10b]
+-- predecessors 101b = [10b,11b,10b] (Note: the last 10b is 01b inverted)
+predecessors :: Signature -> [Signature]
+predecessors n = go 0 n
+ where
+ go _ 0 = []
+ go !k !r
+ -- Special case for topmost block having exactly one digit
+ | k' == 1 && r' == 0 = [n .&. mask `xor` mask]
+ | otherwise = n .&. mask + n `shiftR` (k+1) `shiftL` k : go (k+k') r'
+ where
+ mask = 1 `shiftL` k - 1
+ (k',r') = shiftBlock r
+
+-- Shift away block of least significant ones or zeros and return how many
+-- digits were shifted as well as the shifted number.
+--
+-- Examples: (postfix 'b' indicates base-2)
+--
+-- shiftBlock 10111b = (3,10b)
+-- shiftBlock 10b = (1,1b)
+-- shiftBlock 1b = (1,0)
+shiftBlock :: Signature -> (Int, Signature)
+shiftBlock r = go 0 (r `mod` 2) r
+ where
+ go c _ 0 = (c,0)
+ go !c !p !n
+ | p == p' = go (c+1) p n'
+ | otherwise = (c,n)
+ where
+ (n',p') = n `divMod` 2
+
+
+-- | Probability that a permutation has the given signature.
+prob :: Signature -> Double
+prob n = count n `bigDiv` fac (log2 n + 2)
+
+-- | Probability that a permutation has a signature that is not more commmon
+-- than the given signature.
+cumulative :: Signature -> Double
+cumulative = Memo.arrayRange (1, maxSignature) cumulative'
+ where
+ cumulative' n = sum (filter (<= count n) counts) `bigDiv` fac (m+1)
+ where
+ counts = map count [2^(m-1)..2^m - 1]
+ m = 1 + log2 n
+
+bigDiv :: (Real a, Real b, Fractional c) => a -> b -> c
+bigDiv x y = fromRational $ toRational x / toRational y
+
+-- | Calculate score for the given paths.
+scorePaths :: (Ord a) => [[a]] -> Double
+scorePaths = exp . fst . foldl' step (0,0)
+ where
+ i2d = fromIntegral :: Int -> Double
+ step (!s,!n) !x =
+ let n' = succ n
+ in ((i2d n / i2d n') * s + (log . cdf) x / i2d n', n')
+
+-- | Pick one element of the given list.
+pick :: [a] -> GenIO -> IO a
+pick xs gen = (xs !!) <$> uniformR (0, length xs - 1) gen
+
+-- | Pick one path from the given configuration.
+pickPath :: [[a]] -> GenIO -> IO [a]
+pickPath xs gen = mapM (`pick` gen) xs
+
+-- | Pick one path from each configuration.
+pickPaths :: [[[a]]] -> GenIO -> IO [[a]]
+pickPaths xs gen = mapM (`pickPath` gen) xs
+
+
+-- | Probability that a permutation picked at random has the same signature as
+-- the given ordered list.
+pdf :: (Ord a) => [a] -> Double
+pdf = prob . signature
+
+-- | Probability that a permutation picked at random has a signature which is
+-- not more common than the signature of the given ordered list.
+--
+-- We say that n is not more common than m iff @count n <= count m@.
+--
+-- Another way to think of this is that @cumulative n@ is near 1 if most
+-- signatures are more unusual than n and it is near 0 if n is one of the
+-- most unusual signatures.
+cdf :: (Ord a) => [a] -> Double
+cdf = cumulative . signature
+
+-- | Score for a configuration of points. This equals the geometric mean of
+-- the cumulative probabilities of all paths through the configuration.
+score :: (Ord a)
+ => [[a]] -- ^ Configuration
+ -> Double -- ^ Exact score
+score = scorePaths . paths
+
+-- | Approximate the score for a configuration of points by sampling given
+-- number of times from all possible paths through the configuration.
+--
+-- The difference between 'approxScore' and 'score' is that the latter
+-- calculates 'cdf' for all paths through the configuration. This number grows
+-- very rapidly with the number of points in the configuration so 'approxScore'
+-- may have to be used for configurations that are not small.
+approxScore :: (Ord a)
+ => GenIO -- ^ Random number generator
+ -> Int -- ^ Number of times to sample all paths
+ -> [[a]] -- ^ Configuration
+ -> IO Double -- ^ Approximate score
+approxScore gen n conf = scorePaths <$> (pickPaths (replicate n conf) gen)
@@ -0,0 +1,84 @@
+module Data.UpDownSignature.Utility (
+ choose
+ , choosem
+ , fac
+ , fst3
+ , log2
+ , logStr
+ , logStrLn
+ , joinFirstTwo
+ , mid
+ , pad
+ , paths
+ , sums
+ ) where
+
+import Data.Bits (Bits,shiftR)
+import System.IO (hPutStr, hPutStrLn, stderr)
+
+
+-- | Integer base-2 logarithm
+log2 :: (Bits a, Ord a, Num a1) => a -> a1
+log2 n | n > 1 = 1 + log2 (n `shiftR` 1)
+ | otherwise = 0
+
+-- | Factorial
+fac :: Integer -> Integer
+fac n = product [1..n]
+
+-- | First element of a triple
+fst3 :: (a,b,c) -> a
+fst3 (x,_,_) = x
+
+-- | Binomial coefficient
+choose :: Integral a => a -> a -> a
+choose _ 0 = 1
+choose 0 _ = 0
+choose n k = choose (n-1) (k-1) * n `div` k
+
+-- | Multinomial coefficient (choose many / multinomial)
+-- choosem :: [Integer] -> Integer
+choosem :: Integral a => [a] -> a
+choosem ks = product [ n `choose` k | (n,k) <- zip (sums ks) ks ]
+
+-- | Partial sums from first to n-th element
+-- sums :: [Integer] -> [Integer]
+sums :: Num a => [a] -> [a]
+sums = scanl1 (+)
+
+-- | Pick out mid element of a list
+mid :: [a] -> a
+mid xs = xs !! n
+ where
+ n = length xs `div` 2
+
+-- | Pad string with spaces so that it is at least 'n' chars wide
+pad :: Int -> String -> String
+pad n s | n > k = replicate (n-k) ' ' ++ s
+ | otherwise = s
+ where
+ k = length s
+
+-- | Log string with newline to stderr
+logStrLn :: String -> IO ()
+logStrLn = hPutStrLn stderr
+
+-- | Log string to stderr
+logStr :: String -> IO ()
+logStr = hPutStr stderr
+
+-- | Join first two lists with the given function
+joinFirstTwo :: ([a] -> [a] -> [a]) -> [[a]] -> [[a]]
+joinFirstTwo f (x:y:zs) = f x y : zs
+joinFirstTwo _ xs = xs
+
+-- | Given a list of lists '[l1,...,ln]' enumerate all ways to pick exactly one
+-- element from each list 'li'. For example:
+--
+-- > paths [[1,2],[],[3,4],[5]] == [[1,3,5],[1,4,5],[2,3,5],[2,4,5]]
+--
+paths :: [[a]] -> [[a]]
+paths [] = [[]]
+paths ([]:xss) = paths xss
+paths (xs:xss) = [ x:path | x <- xs, path <- paths xss ]
+
30 LICENSE
@@ -0,0 +1,30 @@
+Copyright (c)2012, Björn Winckler
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of Björn Winckler nor the names of other
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,13 @@
+This project contains functions for computing with up-down signatures.
+
+Up-down signatures are described in:
+
+- Fink, Willbrand, Brown. *1-D random landscapes and non-random data series*,
+ EPL **79**, 2007
+
+The algorithms in this library are a generalization of those described above in
+that they can handle the case where each bin of the categorical value can hold
+more than one value.
+
+This library is used by the command line tool
+[sigscore](https://github.com/b4winckler/sigscore).
@@ -0,0 +1,2 @@
+import Distribution.Simple
+main = defaultMain
@@ -0,0 +1,15 @@
+import Test.Framework (Test, defaultMain, testGroup)
+import Test.Framework.Providers.QuickCheck2 (testProperty)
+import Test.QuickCheck (Arbitrary(..))
+
+
+main :: IO ()
+main = defaultMain tests
+
+tests :: [Test]
+tests = [
+ testGroup "group" [
+ testProperty "prop1" prop1
+ , testProperty "prop2" prop2
+ ]
+ ]
@@ -0,0 +1,40 @@
+name: up-down-signature
+version: 0.1
+synopsis: Library for up-down signature calculations
+homepage: https://github.com/b4winckler/up-down-signature
+license: BSD3
+license-file: LICENSE
+author: Björn Winckler <bjorn.winckler@gmail.com>
+maintainer: Björn Winckler <bjorn.winckler@gmail.com>
+copyright: 2012 Björn Winckler
+category: Math
+build-type: Simple
+cabal-version: >= 1.2
+extra-source-files: README.markdown
+
+description:
+ This library provides functions to calculate probabilities based on up-down
+ signatures.
+
+library
+ exposed-modules: Data.UpDownSignature
+ other-modules: Data.UpDownSignature.Utility
+ ghc-options: -Wall
+ build-depends: base >= 4 && < 5,
+ mwc-random >= 0.12,
+ data-memocombinators >= 0.4.3
+
+-- test-suite tests
+-- type: exitcode-stdio-1.0
+-- hs-source-dirs: tests
+-- main-is: Tests.hs
+-- ghc-options: -Wall
+-- build-depends: base
+-- -- QuickCheck >= 2
+-- -- test-framework,
+-- -- test-framework-quickcheck2,
+-- -- test-framework-hunit,
+
+source-repository head
+ type: git
+ location: https://github.com/b4winckler/up-down-signature

0 comments on commit e9e674a

Please sign in to comment.