diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..849ddff --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +dist/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..11c63b4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,30 @@ +Copyright (c)2012, Björn Winckler + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Björn Winckler nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/sigscore.cabal b/sigscore.cabal new file mode 100644 index 0000000..9fe9e6b --- /dev/null +++ b/sigscore.cabal @@ -0,0 +1,40 @@ +name: sigscore +version: 0.1 +synopsis: Signature score tool +homepage: https://github.com/b4winckler/up-down-signature +license: BSD3 +license-file: LICENSE +author: Björn Winckler +maintainer: Björn Winckler +copyright: 2012 Björn Winckler +category: Math +build-type: Simple +cabal-version: >= 1.2 +extra-source-files: README.markdown + +description: + Calculate signature scores for point configurations. + +executable sigscore + main-is: Main.hs + hs-source-dirs: src + build-depends: base >= 4 && < 5, + bytestring, + bytestring-lexing, + mwc-random >= 0.12, + up-down-signature >= 0.1 + +-- test-suite tests +-- type: exitcode-stdio-1.0 +-- hs-source-dirs: tests +-- main-is: Tests.hs +-- ghc-options: -Wall +-- build-depends: base +-- -- QuickCheck >= 2 +-- -- test-framework, +-- -- test-framework-quickcheck2, +-- -- test-framework-hunit, + +source-repository head + type: git + location: https://github.com/b4winckler/sigscore diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..9e13363 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,70 @@ +{-# LANGUAGE BangPatterns #-} +module Main where + +import Control.Applicative ((<$>)) +import Control.Monad (forM_, when) +import Data.List (nub, sort) +import Data.Maybe (catMaybes) +import System.Environment (getArgs) +import System.Exit (exitSuccess) +import System.Random.MWC (withSystemRandom) + +import Parsers (parseDoubles) +import Utility (logStr, logStrLn, exitBecause, bin, pad) + +import qualified Data.ByteString.Lazy.Char8 as B +import qualified Data.UpDownSignature as Sig + + +main = withSystemRandom $ \gen -> do + as <- getArgs + ls <- B.lines <$> if null as then B.getContents else B.readFile (head as) + + when (null ls) exitSuccess -- Nothing to do! + + -- The first line is a list of integers representing categories + let cats = map truncate $ catMaybes $ parseDoubles $ head ls + ncats = length cats + + -- Sanity check and log info about categories + logStrLn $ foundCategories cats + when (ncats < 3) $ exitBecause tooFewCategories + when (length (nub cats) > 20) $ logStrLn tooManyCategories + + -- Each line is a list of doubles, with as many elements as there were in the + -- categories line. Missing values and NAs are allowed. Go through each and + -- compute the signature score. + forM_ (zip [1..] $ tail ls) $ \(lnum,line) -> do + let elems = parseDoubles line + nelems = length elems + if ncats == nelems + then do + let bins = map catMaybes $ bin cats elems + s <- Sig.approxScore gen numPaths $ bins + -- let s = Sig.score $ trimBins binSize $ map catMaybes $ bin cats elems + putStrLn $ show s + when (lnum `mod` 100 == 0) $ logStr $ " " ++ pad 4 (show lnum) + when (lnum `mod` 1000 == 0) $ logStrLn "" + else + logStrLn $ skippingLine lnum nelems ncats + + +numPaths = 10000 +binSize = 4 + +tooManyCategories = "\ +\WARNING: More than 20 categories. Break up the data into fewer categories\n\ +\ to cut down on processing times." + +tooFewCategories = "\ +\ERROR: First line must be a list of at least three integer categories." + +foundCategories xs = "Found " ++ show n ++ " " + ++ if n == 1 then "category" else "categories" ++ ": " ++ show (sort cs) + where + cs = nub xs + n = length cs + +skippingLine lnum actual expected = + "WARNING: Skipping line " ++ show lnum ++ + " (expected " ++ show expected ++ " elements, got " ++ show actual ++ ")" diff --git a/src/Parsers.hs b/src/Parsers.hs new file mode 100644 index 0000000..29f44ea --- /dev/null +++ b/src/Parsers.hs @@ -0,0 +1,39 @@ +module Parsers (parseDoubles) where + +import Data.List (unfoldr) +import qualified Data.ByteString.Lazy.Char8 as B +import qualified Data.ByteString.Lex.Lazy.Double as B + +-- | Parse doubles separated by space or separator character (as determined by +-- 'isSeparator'). Anything else is translated into a 'Nothing'. +-- +-- Examples: +-- +-- > parseDoubles "1 2.3" = [Just 1.0,Just 2.3] +-- > parseDoubles "1,,2.3," = [Just 1.0,Nothing,Just 2.3,Nothing] +-- > parseDoubles "1 NA 3" = [Just 1.0,Nothing,Just 3.0] +-- +parseDoubles :: B.ByteString -> [Maybe Double] +parseDoubles bs | B.null bs = [] + | isSeparator (B.last bs) = ds ++ [Nothing] + | otherwise = ds + where + ds = unfoldr go (skipWhitespace bs) + go xs | B.null xs = Nothing + | otherwise = case B.readDouble xs of + Just (d, ys) -> Just (Just d, skipSep ys) + Nothing -> Just (Nothing, skipSep $ skipNA xs) + +isWhitespace c = c == ' ' || c == '\t' + +isSeparator c = c `elem` ",;" + +skipWhitespace = B.dropWhile isWhitespace + +skipNA = B.dropWhile (not . \c -> isWhitespace c || isSeparator c) + +skipSep = skipWhitespace . skip . skipWhitespace + where + skip bs = case B.uncons bs of + Just (x, bs') -> if isSeparator x then bs' else B.cons x bs' + Nothing -> B.empty diff --git a/src/Utility.hs b/src/Utility.hs new file mode 100644 index 0000000..54fa72d --- /dev/null +++ b/src/Utility.hs @@ -0,0 +1,80 @@ +-- | +-- Utility functions. +-- +module Utility ( + alternate + , bin + , equating + , exitBecause + , logStr + , logStrLn + , pad + , pleat + , trimBins + ) where + +import Data.Ord (comparing) +import Data.List (sort, sortBy, groupBy) +import System.Exit (exitFailure) +import System.IO (hPutStr, hPutStrLn, stderr) + + +-- | Put data in bins defined by the given categories. +bin :: Ord a => [a] -> [b] -> [[b]] +bin cats = map (snd . unzip) + . groupBy (equating fst) + . sortBy (comparing fst) + . zip cats + +-- | Trim bins so that each bin contains at most the given number of elements. +trimBins :: Ord a => Int -> [[a]] -> [[a]] +trimBins n = map (take n . pleat) + +-- | Take a list and reorder it so that the median element comes first and the +-- extremal elements come last. Example: +-- +-- > pleat [1..9] = [5,4,6,3,7,2,8,1] +-- > pleat [0..9] = [5,4,6,3,7,2,8,1,9,0] +-- +-- Note that the input need not be sorted. +pleat :: Ord a => [a] -> [a] +pleat xs = alternate bs (reverse as) + where + (as,bs) = splitAt (length xs `div` 2) $ sort xs + +-- | Combine two lists into one by alternately taking an element from the +-- first, then from the second. For example: +-- +-- > alternate [1..3] [5..9] = [1,5,2,6,3,7,8,9] +-- +-- Note that no elements from either list are lost. +alternate :: [a] -> [a] -> [a] +alternate (x:xs) (y:ys) = x : y : alternate xs ys +alternate xs [] = xs +alternate [] ys = ys + +-- | Use e.g. with sorting routines as in +-- +-- > sortBy (equating fst) +-- +equating :: Eq a => (t -> a) -> t -> t -> Bool +equating f x y = f x == f y + +-- | Exit program with failure and log reason to stderr. +exitBecause :: String -> IO b +exitBecause reason = logStrLn reason >> exitFailure + +-- | Log string with newline to stderr +logStrLn :: String -> IO () +logStrLn = hPutStrLn stderr + +-- | Log string to stderr +logStr :: String -> IO () +logStr = hPutStr stderr + +-- | Pad string with spaces so that it is at least 'n' chars wide +pad :: Int -> String -> String +pad n s | n > k = replicate (n-k) ' ' ++ s + | otherwise = s + where + k = length s