# bos/statistics

Lots of doco tweaking.

1 parent 06f2dba commit 22f9781bf146971b95eceb079b3a3df024d113b3 committed Sep 13, 2009
46 Statistics/Autocorrelation.hs
 @@ -0,0 +1,46 @@ +-- | +-- Module : Statistics.Autocorrelation +-- Copyright : (c) 2009 Bryan O'Sullivan +-- License : BSD3 +-- +-- Maintainer : bos@serpentine.com +-- Stability : experimental +-- Portability : portable +-- +-- Functions for computing autocovariance and autocorrelation of a +-- sample. + +module Statistics.Autocorrelation + ( + autocovariance + , autocorrelation + ) where + +import Data.Array.Vector +import Statistics.Sample (Sample, mean) + +-- | Compute the autocovariance of a sample, i.e. the covariance of +-- the sample against a shifted version of itself. +autocovariance :: Sample -> UArr Double +autocovariance a = mapU f . enumFromToU 0 \$ l-2 + where + f k = sumU (zipWithU (*) (takeU (l-k) c) (sliceU c k (l-k))) + / fromIntegral l + c = mapU (subtract (mean a)) a + l = lengthU a + +-- | Compute the autocorrelation function of a sample, and the upper +-- and lower bounds of confidence intervals for each element. +-- +-- /Note/: The calculation of the 95% confidence interval assumes a +-- stationary Gaussian process. +autocorrelation :: Sample -> (UArr Double, UArr Double, UArr Double) +autocorrelation a = (r, ci (-), ci (+)) + where + r = mapU (/ headU c) c + where c = autocovariance a + dllse = mapU f . scanl1U (+) . mapU square \$ r + where f v = 1.96 * sqrt ((v * 2 + 1) / l) + l = fromIntegral (lengthU a) + ci f = consU 1 . tailU . mapU (f (-1/l)) \$ dllse + square x = x * x
2 Statistics/Constants.hs
 @@ -25,7 +25,7 @@ m_huge :: Double m_huge = 1.797693e308 {-# INLINE m_huge #-} --- | The largest 'Int' /x/ such that @2**(x-1)@ is approximately +-- | The largest 'Int' /x/ such that 2**(/x/-1) is approximately -- representable as a 'Double'. m_max_exp :: Int m_max_exp = 1024
12 Statistics/Distribution.hs
 @@ -21,15 +21,15 @@ module Statistics.Distribution -- | The interface shared by all probability distributions. class Distribution d where -- | Probability density function. The probability that a - -- stochastic variable @x@ has the value @X@, i.e. @P(x=X)@. + -- stochastic variable /x/ has the value /X/, i.e. P(/x/=/X/). probability :: d -> Double -> Double -- | Cumulative distribution function. The probability that a - -- stochastic variable @x@ is less than @X@, i.e. @P(x Double -> Double -- | Inverse of the cumulative distribution function. The value - -- @X@ for which @P(x Double -> Double class Distribution d => Mean d where @@ -38,14 +38,14 @@ class Distribution d => Mean d where class Mean d => Variance d where variance :: d -> Double --- | Approximate the value of @X@ for which @P(x>X) == p@. +-- | Approximate the value of /X/ for which P(/x/>/X/)=/p/. -- -- This method uses a combination of Newton-Raphson iteration and -- bisection with the given guess as a starting point. The upper and -- lower bounds specify the interval in which the probability --- distribution reaches the value @p@. +-- distribution reaches the value /p/. findRoot :: Distribution d => d - -> Double -- ^ Probability @p@ + -> Double -- ^ Probability /p/ -> Double -- ^ Initial guess -> Double -- ^ Lower bound on interval -> Double -- ^ Upper bound on interval
6 Statistics/Function.hs
 @@ -21,15 +21,15 @@ import Data.Array.Vector.Algorithms.Combinators (apply) import Data.Array.Vector ((:*:)(..), UA, UArr, foldlU) import qualified Data.Array.Vector.Algorithms.Intro as I --- | Sort. +-- | Sort an array. sort :: (UA e, Ord e) => UArr e -> UArr e sort = apply I.sort {-# INLINE sort #-} --- | Partially sort, such that the least @k@ elements will be +-- | Partially sort an array, such that the least /k/ elements will be -- at the front. partialSort :: (UA e, Ord e) => - Int -- ^ The number @k@ of least elements + Int -- ^ The number /k/ of least elements. -> UArr e -> UArr e partialSort k = apply (\a -> I.partialSort a k)
9 Statistics/Math.hs
 @@ -62,8 +62,9 @@ n `choose` k data F = F {-# UNPACK #-} !Word64 {-# UNPACK #-} !Word64 --- | Compute the factorial function. Returns ∞ if the input is --- above 170. +-- | Compute the factorial function /n/!. Returns ∞ if the +-- input is above 170 (above which the result cannot be represented by +-- a 64-bit 'Double'). factorial :: Int -> Double factorial n | n < 0 = error "Statistics.Math.factorial: negative input" @@ -89,7 +90,7 @@ logFactorial n 2.7777777777778e-3) * y + 8.3333333333333e-2 {-# INLINE logFactorial #-} --- | Compute the incomplete gamma integral function, γ(/s/,/x/). +-- | Compute the incomplete gamma integral function γ(/s/,/x/). -- Uses Algorithm AS 239 by Shea. incompleteGamma :: Double -- ^ /s/ -> Double -- ^ /x/ @@ -135,7 +136,7 @@ incompleteGamma x p -- Adapted from http://people.sc.fsu.edu/~burkardt/f_src/asa245/asa245.html --- | Compute the logarithm of the gamma function, Γ(/x/). Uses +-- | Compute the logarithm of the gamma function Γ(/x/). Uses -- Algorithm AS 245 by Macleod. -- -- Gives an accuracy of 10–12 significant decimal digits, except
73 Statistics/Quantile.hs
 @@ -8,15 +8,20 @@ -- Stability : experimental -- Portability : portable -- --- Functions for approximating quantiles. +-- Functions for approximating quantiles, i.e. points taken at regular +-- intervals from the cumulative distribution function of a random +-- variable. +-- +-- The number of quantiles is described below by the variable /q/, so +-- with /q/=4, a 4-quantile (also known as a /quartile/) has 4 +-- intervals, and contains 5 points. The parameter /k/ describes the +-- desired point, where 0 ≤ /k/ ≤ /q/. module Statistics.Quantile ( - -- * Types - ContParam(..) - -- * Quantile estimation functions - , weightedAvg + weightedAvg + , ContParam(..) , continuousBy -- * Parameters for the continuous sample method @@ -33,14 +38,15 @@ module Statistics.Quantile import Control.Exception (assert) import Data.Array.Vector (allU, indexU, lengthU) +import Statistics.Constants (m_epsilon) import Statistics.Function (partialSort) import Statistics.Types (Sample) --- | Use the weighted average method to estimate the @k@th --- @q@-quantile of a sample. -weightedAvg :: Int -- ^ @k@, the desired quantile - -> Int -- ^ @q@, the number of quantiles - -> Sample -- ^ @x@, the sample data +-- | Estimate the /k/th /q/-quantile of a sample, using the weighted +-- average method. +weightedAvg :: Int -- ^ /k/, the desired quantile. + -> Int -- ^ /q/, the number of quantiles. + -> Sample -- ^ /x/, the sample data. -> Double weightedAvg k q x = assert (q >= 2) . @@ -57,15 +63,17 @@ weightedAvg k q x = sx = partialSort (j+2) x {-# INLINE weightedAvg #-} --- | Parameters @a@ and @b@ to the 'quantileBy' function. +-- | Parameters /a/ and /b/ to the 'continuousBy' function. data ContParam = ContParam {-# UNPACK #-} !Double {-# UNPACK #-} !Double --- | Using the continuous sample method with the given parameters, --- estimate the @k@th @q@-quantile of a sample @x@. -continuousBy :: ContParam -- ^ Parameters @a@ and @b@ - -> Int -- ^ @k@, the desired quantile - -> Int -- ^ @q@, the number of quantiles - -> Sample -- ^ @x@, the sample data +-- | Estimate the /k/th /q/-quantile of a sample /x/, using the +-- continuous sample method with the given parameters. This is the +-- method used by most statistical software, such as R, Mathematica, +-- SPSS, and S. +continuousBy :: ContParam -- ^ Parameters /a/ and /b/. + -> Int -- ^ /k/, the desired quantile. + -> Int -- ^ /q/, the number of quantiles. + -> Sample -- ^ /x/, the sample data. -> Double continuousBy (ContParam a b) k q x = assert (q >= 2) . @@ -80,50 +88,51 @@ continuousBy (ContParam a b) k q x = h | abs r < eps = 0 | otherwise = r where r = t - fromIntegral j - eps = 8.881784e-16 + eps = m_epsilon * 4 n = lengthU x item = indexU sx . bracket sx = partialSort (bracket j + 1) x bracket m = min (max m 0) (n - 1) {-# INLINE continuousBy #-} --- | California Department of Public Works definition, @a=0,b=1@. --- Gives a linear interpolation of the empirical CDF. --- This corresponds to method 4 in R and Mathematica. +-- | California Department of Public Works definition, /a/=0, /b/=1. +-- Gives a linear interpolation of the empirical CDF. This +-- corresponds to method 4 in R and Mathematica. cadpw :: ContParam cadpw = ContParam 0 1 {-# INLINE cadpw #-} --- | Hazen's definition, @a=0.5,b=0.5@. This is claimed to be popular --- among hydrologists. This corresponds to method 5 in R and +-- | Hazen's definition, /a/=0.5, /b/=0.5. This is claimed to be +-- popular among hydrologists. This corresponds to method 5 in R and -- Mathematica. hazen :: ContParam hazen = ContParam 0.5 0.5 {-# INLINE hazen #-} --- | SPSS definition, @a=0,b=0@, also known as Weibull's definition. --- This corresponds to method 6 in R and Mathematica. +-- | Definition used by the SPSS statistics application, with /a/=0, +-- /b/=0 (also known as Weibull's definition). This corresponds to +-- method 6 in R and Mathematica. spss :: ContParam spss = ContParam 0 0 {-# INLINE spss #-} --- | S definition, @a=1,b=1@. The interpolation points divide the --- sample range into @n-1@ intervals. This corresponds to method 7 in --- R and Mathematica. +-- | Definition used by the S statistics application, with /a/=1, +-- /b/=1. The interpolation points divide the sample range into @n-1@ +-- intervals. This corresponds to method 7 in R and Mathematica. s :: ContParam s = ContParam 1 1 {-# INLINE s #-} --- | Median unbiased definition, @a=1/3,b=1/3@. The resulting quantile --- estimates are approximately median unbiased regardless of the --- distribution of @x@. This corresponds to method 8 in R and +-- | Median unbiased definition, /a/=1\/3, /b/=1\/3. The resulting +-- quantile estimates are approximately median unbiased regardless of +-- the distribution of /x/. This corresponds to method 8 in R and -- Mathematica. medianUnbiased :: ContParam medianUnbiased = ContParam third third where third = 1/3 {-# INLINE medianUnbiased #-} --- | Normal unbiased definition, @a=3/8,b=3/8@. An approximately +-- | Normal unbiased definition, /a/=3\/8, /b/=3\/8. An approximately -- unbiased estimate if the empirical distribution approximates the -- normal distribution. This corresponds to method 9 in R and -- Mathematica.
3 Statistics/Resampling.hs
 @@ -25,7 +25,8 @@ import Statistics.Types (Estimator, Sample) import System.Random.Mersenne (MTGen, random) -- | A resample drawn randomly, with replacement, from a set of data --- points. +-- points. Distinct from a normal array to make it harder for your +-- humble author's brain to go wrong. newtype Resample = Resample { fromResample :: UArr Double } deriving (Eq, Show)
4 Statistics/Sample.hs
 @@ -12,8 +12,10 @@ module Statistics.Sample ( + -- * Types + Sample -- * Statistics of location - mean + , mean , harmonicMean , geometricMean
5 Statistics/Types.hs
 @@ -18,9 +18,12 @@ module Statistics.Types import Data.Array.Vector (UArr) +-- | Sample data. type Sample = UArr Double --- | A function that estimates a property of a sample, such as 'mean'. +-- | A function that estimates a property of a sample, such as its +-- 'mean'. type Estimator = Sample -> Double +-- | Weights for affecting the importance of elements of a sample. type Weights = UArr Double
20 statistics.cabal
 @@ -1,7 +1,20 @@ name: statistics -version: 0.1 -synopsis: A library of statistical types, data, and functions. -description: A library of statistical types, data, and functions. +version: 0.2 +synopsis: A library of statistical types, data, and functions +description: + This library provides a number of common functions and types useful + in statistics. Our focus is on high performance, numerical + robustness, and use of good algorithms. Where possible, we provide + references to the statistical literature. + . + The library's facilities can be divided into three broad categories: + . + Working with widely used discrete and continuous probability + distributions. (There are dozens of exotic distributions in use; we + focus on the most common.) + . + Computing with sample data: quantile estimation, kernel density + estimation, bootstrap methods, and autocorrelation analysis. license: BSD3 license-file: LICENSE homepage: http://darcs.serpentine.com/statistics @@ -15,6 +28,7 @@ extra-source-files: README library exposed-modules: + Statistics.Autocorrelation Statistics.Constants Statistics.Distribution Statistics.Distribution.Binomial