Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Initial commit.

  • Loading branch information...
commit 5dbc67e2918ef9f64439d874fa58f9bf0c79a67e 0 parents
@bos authored
26 LICENSE
@@ -0,0 +1,26 @@
+Copyright (c) 2009, Bryan O'Sullivan
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3  Setup.lhs
@@ -0,0 +1,3 @@
+#!/usr/bin/env runhaskell
+> import Distribution.Simple
+> main = defaultMain
61 Statistics/Distribution.hs
@@ -0,0 +1,61 @@
+{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
+-- |
+-- Module : Statistics.Distribution
+-- Copyright : (c) 2009 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Types and functions common to many probability distributions.
+
+module Statistics.Distribution
+ (
+ Distribution(..)
+ , findRoot
+ ) where
+
+-- | The interface shared by all probability distributions.
+class Distribution d where
+ -- | Probability density function. The probability that a
+ -- stochastic variable @x@ has the value @X@, i.e. @P(x=X)@.
+ probability :: d -> Double -> Double
+
+ -- | Cumulative distribution function. The probability that a
+ -- stochastic variable @x@ is less than @X@, i.e. @P(x<X)@.
+ cumulative :: d -> Double -> Double
+
+ -- | Inverse of the cumulative distribution function. The value
+ -- @X@ for which @P(x<X)@.
+ inverse :: d -> Double -> Double
+
+-- | Approximate the value of @X@ for which @P(x>X) == p@.
+--
+-- This method uses a combination of Newton-Raphson iteration and
+-- bisection with the given guess as a starting point. The upper and
+-- lower bounds specify the interval in which the probability
+-- distribution reaches the value @p@.
+findRoot :: Distribution d => d
+ -> Double -- ^ Probability @p@
+ -> Double -- ^ Initial guess
+ -> Double -- ^ Lower bound on interval
+ -> Double -- ^ Upper bound on interval
+ -> Double
+findRoot d prob = loop 0 1
+ where
+ loop !(i::Int) !dx !x !lo !hi
+ | abs dx <= accuracy || i >= maxIters = x
+ | otherwise = loop (i+1) dx'' x'' lo' hi'
+ where
+ err = cumulative d x - prob
+ (lo',hi') | err < 0 = (x, hi)
+ | otherwise = (lo, x)
+ pdf = probability d x
+ (dx',x') | pdf /= 0 = (err / pdf, x - dx)
+ | otherwise = (dx, x)
+ (dx'',x'')
+ | x' < lo' || x' > hi' || pdf == 0 = (x'-x, (lo + hi) / 2)
+ | otherwise = (dx', x')
+ accuracy = 1e-15
+ maxIters = 150
61 Statistics/Distribution/Normal.hs
@@ -0,0 +1,61 @@
+module Statistics.Distribution.Normal
+ (
+ NormalDistribution
+ , fromParams
+ , fromSample
+ , standard
+ ) where
+
+import Control.Exception (assert)
+import Data.Number.Erf (erfc)
+import Statistics.Internal (huge, sqrt_2, sqrt_2_pi)
+import qualified Statistics.Distribution as D
+import qualified Statistics.Sample as S
+
+data NormalDistribution = NormalDistribution {
+ mean :: {-# UNPACK #-} !Double
+ , variance :: {-# UNPACK #-} !Double
+ , pdfDenom :: {-# UNPACK #-} !Double
+ , cdfDenom :: {-# UNPACK #-} !Double
+ } deriving (Eq, Ord, Read, Show)
+
+instance D.Distribution NormalDistribution where
+ probability = probability
+ cumulative = cumulative
+ inverse = inverse
+
+standard :: NormalDistribution
+standard = NormalDistribution {
+ mean = 0.0
+ , variance = 1.0
+ , cdfDenom = sqrt_2
+ , pdfDenom = sqrt_2_pi
+ }
+
+fromParams :: Double -> Double -> NormalDistribution
+fromParams m v = assert (v > 0) $
+ NormalDistribution {
+ mean = m
+ , variance = v
+ , cdfDenom = sqrt_2 * sv
+ , pdfDenom = sqrt_2_pi * sv
+ }
+ where sv = sqrt v
+
+fromSample :: S.Sample -> NormalDistribution
+fromSample a = fromParams (S.mean a) (S.variance a)
+
+probability :: NormalDistribution -> Double -> Double
+probability d x = exp (-xm * xm / (2 * variance d)) / pdfDenom d
+ where xm = x - mean d
+
+cumulative :: NormalDistribution -> Double -> Double
+cumulative d x = erfc (-(x-mean d) / cdfDenom d) / 2
+
+inverse :: NormalDistribution -> Double -> Double
+inverse d p
+ | p == 0 = -huge
+ | p == 1 = huge
+ | p == 0.5 = mean d
+ | otherwise = x * sqrt (variance d) + mean d
+ where x = D.findRoot standard p 0 (-100) 100
29 Statistics/Internal.hs
@@ -0,0 +1,29 @@
+-- |
+-- Module : Statistics.Internal
+-- Copyright : (c) 2009 Bryan O'Sullivan
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Types and functions common to much statistics code.
+
+module Statistics.Internal
+ (
+ huge
+ , sqrt_2
+ , sqrt_2_pi
+ ) where
+
+huge :: Double
+huge = 1e308
+{-# INLINE huge #-}
+
+sqrt_2 :: Double
+sqrt_2 = sqrt 2
+{-# INLINE sqrt_2 #-}
+
+sqrt_2_pi :: Double
+sqrt_2_pi = sqrt (2 * pi)
+{-# INLINE sqrt_2_pi #-}
215 Statistics/Sample.hs
@@ -0,0 +1,215 @@
+-- |
+-- Module : Statistics.Sample
+-- Copyright : (c) 2008 Don Stewart
+-- License : BSD3
+--
+-- Maintainer : bos@serpentine.com
+-- Stability : experimental
+-- Portability : portable
+--
+-- Commonly used sample statistics, also known as descriptive
+-- statistics.
+
+module Statistics.Sample
+ (
+ -- * Types
+ Sample
+ , Weights
+ -- * Statistics of location
+ , mean
+ , harmonicMean
+ , geometricMean
+ -- * Statistics of dispersion
+ -- $variance
+
+ -- ** Two-pass functions (numerically robust)
+ -- $robust
+ , variance
+ , varianceUnbiased
+ , stdDev
+
+ -- ** Single-pass functions (faster, less safe)
+ -- $cancellation
+ , fastVariance
+ , fastVarianceUnbiased
+ , fastStdDev
+
+ -- * References
+ -- $references
+ ) where
+
+import Data.Array.Vector
+
+type Sample = UArr Double
+type Weights = UArr Double
+
+-- | Arithmetic mean. This uses Welford's algorithm to provide
+-- numerical stability, using a single pass over the sample data.
+mean :: Sample -> Double
+mean = fstT . foldlU k (T 0 0)
+ where
+ k (T m n) x = T m' n'
+ where m' = m + (x - m) / fromIntegral n'
+ n' = n + 1
+{-# INLINE mean #-}
+
+-- | Harmonic mean. This algorithm performs a single pass over the
+-- sample.
+harmonicMean :: Sample -> Double
+harmonicMean xs = fromIntegral a / b
+ where
+ T b a = foldlU k (T 0 0) xs
+ k (T b a) n = T (b + (1/n)) (a+1)
+{-# INLINE harmonicMean #-}
+
+-- | Geometric mean of a sample containing no negative values.
+geometricMean :: Sample -> Double
+geometricMean xs = p ** (1 / fromIntegral n)
+ where
+ T p n = foldlU k (T 1 0) xs
+ k (T p n) a = T (p * a) (n + 1)
+{-# INLINE geometricMean #-}
+
+-- $variance
+--
+-- The variance&#8212;and hence the standard deviation&#8212;of a
+-- sample of fewer than two elements are both defined to be zero.
+
+-- $robust
+--
+-- These functions use the compensated summation algorithm of Chan et
+-- al. for numerical robustness, but require two passes over the
+-- sample data as a result.
+--
+-- Because of the need for two passes, these functions are /not/
+-- subject to stream fusion.
+
+robustVar :: Sample -> T
+robustVar s = fini . foldlU go (T1 0 0 0) $ s
+ where
+ go (T1 n s c) x = T1 n' s' c'
+ where n' = n + 1
+ s' = s + d * d
+ c' = c + d
+ d = x - m
+ fini (T1 n s c) = T (s - c ** (2 / fromIntegral n)) n
+ m = mean s
+
+-- | Maximum likelihood estimate of a sample's variance.
+variance :: Sample -> Double
+variance = fini . robustVar
+ where fini (T v n)
+ | n > 1 = v / fromIntegral n
+ | otherwise = 0
+{-# INLINE variance #-}
+
+-- | Unbiased estimate of a sample's variance.
+varianceUnbiased :: Sample -> Double
+varianceUnbiased = fini . robustVar
+ where fini (T v n)
+ | n > 1 = v / fromIntegral (n-1)
+ | otherwise = 0
+{-# INLINE varianceUnbiased #-}
+
+-- | Standard deviation. This is simply the square root of the
+-- maximum likelihood estimate of the variance.
+stdDev :: Sample -> Double
+stdDev = sqrt . varianceUnbiased
+
+-- $cancellation
+--
+-- The functions prefixed with the name @fast@ below perform a single
+-- pass over the sample data using Knuth's algorithm. They usually
+-- work well, but see below for caveats. These functions are subject
+-- to array fusion.
+--
+-- /Note/: in cases where most sample data is close to the sample's
+-- mean, Knuth's algorithm gives inaccurate results due to
+-- catastrophic cancellation.
+
+fastVar :: Sample -> T1
+fastVar = foldlU go (T1 0 0 0)
+ where
+ go (T1 n m s) x = T1 n' m' s'
+ where n' = n + 1
+ m' = m + d / fromIntegral n'
+ s' = s + d * (x - m')
+ d = x - m
+
+-- | Maximum likelihood estimate of a sample's variance.
+fastVariance :: Sample -> Double
+fastVariance = fini . fastVar
+ where fini (T1 n m s)
+ | n > 1 = s / fromIntegral n
+ | otherwise = 0
+{-# INLINE fastVariance #-}
+
+-- | Unbiased estimate of a sample's variance.
+fastVarianceUnbiased :: Sample -> Double
+fastVarianceUnbiased = fini . fastVar
+ where fini (T1 n m s)
+ | n > 1 = s / fromIntegral (n - 1)
+ | otherwise = 0
+{-# INLINE fastVarianceUnbiased #-}
+
+-- | Standard deviation. This is simply the square root of the
+-- maximum likelihood estimate of the variance.
+fastStdDev :: UArr Double -> Double
+fastStdDev = sqrt . fastVariance
+{-# INLINE fastStdDev #-}
+
+------------------------------------------------------------------------
+-- Helper code. Monomorphic unpacked accumulators.
+
+-- don't support polymorphism, as we can't get unboxed returns if we use it.
+data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int
+
+data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double
+
+fstT :: T -> Double
+fstT (T a _) = a
+
+-- this is a terrible name, and probably a bad place to be doing this
+quotT1 :: T1 -> Double
+quotT1 (T1 n _ m2) = m2 / (fromIntegral $ n - 2)
+
+{-
+
+Consider this core:
+
+with data T a = T !a !Int
+
+$wfold :: Double#
+ -> Int#
+ -> Int#
+ -> (# Double, Int# #)
+
+and without,
+
+$wfold :: Double#
+ -> Int#
+ -> Int#
+ -> (# Double#, Int# #)
+
+yielding to boxed returns and heap checks.
+
+-}
+
+-- $references
+--
+-- * Chan, T. F.; Golub, G.H.; LeVeque, R.J. (1979) Updating formulae
+-- and a pairwise algorithm for computing sample
+-- variances. Technical Report STAN-CS-79-773, Department of
+-- Computer Science, Stanford
+-- University. <ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf>
+--
+-- * Knuth, D.E. (1998) The art of computer programming, volume 2:
+-- seminumerical algorithms, 3rd ed., p. 232.
+--
+-- * Welford, B.P. (1962) Note on a method for calculating corrected
+-- sums of squares and products. /Technometrics/
+-- 4(3):419&#8211;420. <http://www.jstor.org/stable/1266577>
+--
+-- * West, D.H.D. (1979) Updating mean and variance estimates: an
+-- improved method. /Communications of the ACM/
+-- 22(9):532&#8211;535. <http://doi.acm.org/10.1145/359146.359153>
34 statistics.cabal
@@ -0,0 +1,34 @@
+name: statistics
+version: 0.1
+synopsis: A library of statistical types, data, and functions.
+description: A library of statistical types, data, and functions.
+license: BSD3
+license-file: LICENSE
+author: Bryan O'Sullivan <bos@serpentine.com>
+maintainer: Bryan O'Sullivan <bos@serpentine.com>
+copyright: 2009 Bryan O'Sullivan
+category: Math, Statistics
+build-type: Simple
+cabal-version: >= 1.2
+extra-source-files: README
+
+library
+ exposed-modules:
+ Statistics.Distribution
+ Statistics.Distribution.Normal
+ Statistics.Internal
+ Statistics.Sample
+ build-depends:
+ base < 5,
+ erf,
+ uvector >= 0.1.0.4
+ if impl(ghc >= 6.10)
+ build-depends:
+ base >= 4
+
+ -- gather extensive profiling data for now
+ ghc-prof-options: -auto-all
+
+ ghc-options: -Wall -funbox-strict-fields -O2
+ if impl(ghc >= 6.8)
+ ghc-options: -fwarn-tabs

0 comments on commit 5dbc67e

Please sign in to comment.
Something went wrong with that request. Please try again.