diff --git a/lib/statistex.ex b/lib/statistex.ex index deccf5c..956f41b 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -18,10 +18,12 @@ defmodule Statistex do defstruct [ :total, :average, + :variance, :standard_deviation, :standard_deviation_ratio, :median, :percentiles, + :frequency_distribution, :mode, :minimum, :maximum, @@ -36,10 +38,12 @@ defmodule Statistex do @type t :: %__MODULE__{ total: number, average: float, + variance: float, standard_deviation: float, standard_deviation_ratio: float, median: number, percentiles: percentiles, + frequency_distribution: %{sample => pos_integer}, mode: mode, minimum: number, maximum: number, @@ -92,10 +96,18 @@ defmodule Statistex do iex> Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) %Statistex{ average: 500.0, + variance: 40_000.0, standard_deviation: 200.0, standard_deviation_ratio: 0.4, median: 500.0, percentiles: %{50 => 500.0}, + frequency_distribution: %{ + 200 => 1, + 400 => 3, + 500 => 3, + 700 => 1, + 900 => 1 + }, mode: [500, 400], minimum: 200, maximum: 900, @@ -109,10 +121,12 @@ defmodule Statistex do iex> Statistex.statistics([0, 0, 0, 0]) %Statistex{ average: 0.0, + variance: 0.0, standard_deviation: 0.0, standard_deviation_ratio: 0.0, median: 0.0, percentiles: %{50 => 0.0}, + frequency_distribution: %{0 => 4}, mode: 0, minimum: 0, maximum: 0, @@ -132,26 +146,26 @@ defmodule Statistex do total = total(samples) sample_size = length(samples) average = average(samples, total: total, sample_size: sample_size) - standard_deviation = standard_deviation(samples, average: average, sample_size: sample_size) + variance = variance(samples, average: average, sample_size: sample_size) + standard_deviation = standard_deviation(samples, variance: variance) standard_deviation_ratio = - standard_deviation_ratio( - samples, - average: average, - standard_deviation: standard_deviation - ) + standard_deviation_ratio(samples, standard_deviation: standard_deviation) percentiles = 
calculate_percentiles(samples, configuration) - median = median(samples, percentiles: percentiles) + + frequency_distribution = frequency_distribution(samples) %__MODULE__{ total: total, average: average, + variance: variance, standard_deviation: standard_deviation, standard_deviation_ratio: standard_deviation_ratio, - median: median, + median: median(samples, percentiles: percentiles), percentiles: percentiles, - mode: mode(samples), + frequency_distribution: frequency_distribution, + mode: mode(samples, frequency_distribution: frequency_distribution), minimum: minimum(samples), maximum: maximum(samples), sample_size: sample_size @@ -243,9 +257,9 @@ defmodule Statistex do end @doc """ - Calculate the standard deviation. + Calculate the variance. - A measurement how much samples vary (the higher the more the samples vary). + A measurement of how much samples vary (the higher the more the samples vary). This is the sample variance, so the sum of squared deviations is divided by sample_size - 1 (Bessel's correction). ## Options If already calculated, the `:average` and `:sample_size` options can be provided to avoid recalulating those values. @@ -254,43 +268,78 @@ defmodule Statistex do ## Examples - iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12]) - 4.0 + iex> Statistex.variance([4, 9, 11, 12, 17, 5, 8, 12, 12]) + 16.0 - iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12], sample_size: 9, average: 10.0) - 4.0 + iex> Statistex.variance([4, 9, 11, 12, 17, 5, 8, 12, 12], sample_size: 9, average: 10.0) + 16.0 - iex> Statistex.standard_deviation([42]) + iex> Statistex.variance([42]) 0.0 - iex> Statistex.standard_deviation([1, 1, 1, 1, 1, 1, 1]) + iex> Statistex.variance([1, 1, 1, 1, 1, 1, 1]) 0.0 - iex> Statistex.standard_deviation([]) + iex> Statistex.variance([]) ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number. 
""" - @spec standard_deviation(samples, keyword) :: float - def standard_deviation(samples, options \\ []) - def standard_deviation([], _), do: raise(ArgumentError, @empty_list_error_message) + @spec variance(samples, keyword) :: float + def variance(samples, options \\ []) + def variance([], _), do: raise(ArgumentError, @empty_list_error_message) - def standard_deviation(samples, options) do + def variance(samples, options) do sample_size = Keyword.get_lazy(options, :sample_size, fn -> sample_size(samples) end) average = Keyword.get_lazy(options, :average, fn -> average(samples, sample_size: sample_size) end) - do_standard_deviation(samples, average, sample_size) + do_variance(samples, average, sample_size) end - defp do_standard_deviation(_samples, _average, 1), do: 0.0 + defp do_variance(_samples, _average, 1), do: 0.0 - defp do_standard_deviation(samples, average, sample_size) do + defp do_variance(samples, average, sample_size) do total_variance = Enum.reduce(samples, 0, fn sample, total -> total + :math.pow(sample - average, 2) end) - variance = total_variance / (sample_size - 1) + total_variance / (sample_size - 1) + end + + @doc """ + Calculate the standard deviation. + + A measurement of how much samples vary (the higher the more the samples vary). It's the square root of the variance. Unlike the variance, its unit is the same as that of the sample (as calculating the variance includes squaring). + + ## Options + If already calculated, the `:variance` option can be provided to avoid recalculating it. + + `ArgumentError` is raised if the given list is empty. 
+ + ## Examples + + iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12]) + 4.0 + + iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12], variance: 16.0) + 4.0 + + iex> Statistex.standard_deviation([42]) + 0.0 + + iex> Statistex.standard_deviation([1, 1, 1, 1, 1, 1, 1]) + 0.0 + + iex> Statistex.standard_deviation([]) + ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number. + """ + @spec standard_deviation(samples, keyword) :: float + def standard_deviation(samples, options \\ []) + def standard_deviation([], _), do: raise(ArgumentError, @empty_list_error_message) + + def standard_deviation(samples, options) do + variance = Keyword.get_lazy(options, :variance, fn -> variance(samples) end) :math.sqrt(variance) end @@ -299,13 +348,13 @@ defmodule Statistex do This helps put the absolute standard deviation value into perspective expressing it relative to the average. It's what percentage of the absolute value of the average the variance takes. + `ArgumentError` is raised if the given list is empty. + ## Options If already calculated, the `:average` and `:standard_deviation` options can be provided to avoid recalulating those values. If both values are provided, the provided samples will be ignored. - `Argumenterror` is raised if the given list is empty. - ## Examples iex> Statistex.standard_deviation_ratio([4, 9, 11, 12, 17, 5, 8, 12, 12]) @@ -401,13 +450,44 @@ defmodule Statistex do defdelegate(percentiles(samples, percentiles), to: Percentile) @doc """ - Calculates the mode of the given samples. + A map showing which sample occurs how often in the samples. + Goes from a concrete occurrence of the sample to the number of times it was observed in the samples. - Mode is the sample(s) that occur the most. Often one value, but can be multiple values if they occur the same amount of times. If no value occurs at least twice, this value will be nil. 
+ ## Examples + + iex> Statistex.frequency_distribution([1, 2, 4.23, 7, 2, 99]) + %{ + 2 => 2, + 1 => 1, + 4.23 => 1, + 7 => 1, + 99 => 1 + } + + iex> Statistex.frequency_distribution([]) + ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number. + """ + @spec frequency_distribution(samples) :: %{required(sample) => pos_integer} + def frequency_distribution([]), do: raise(ArgumentError, @empty_list_error_message) + + def frequency_distribution(samples) do + Enum.reduce(samples, %{}, fn sample, counts -> + Map.update(counts, sample, 1, fn old_value -> old_value + 1 end) + end) + end + + @doc """ + Calculates the mode of the given samples. + + Mode is the sample(s) that occur the most. Often one value, but can be multiple values if they occur the same amount of times. If no value occurs at least twice, there is no mode and it hence returns `nil`. `Argumenterror` is raised if the given list is empty. + ## Options + + If already calculated, the `:frequency_distribution` option can be provided to avoid recalculating it. + ## Examples iex> Statistex.mode([5, 3, 4, 5, 1, 3, 1, 3]) @@ -423,8 +503,9 @@ defmodule Statistex do iex> Enum.sort(mode) [1, 3, 5] """ - @spec mode(samples) :: mode + @spec mode(samples, keyword) :: mode defdelegate mode(samples), to: Mode + defdelegate mode(samples, opts), to: Mode @doc """ Calculates the median of the given samples. diff --git a/lib/statistex/mode.ex b/lib/statistex/mode.ex index 7666886..d5a2410 100644 --- a/lib/statistex/mode.ex +++ b/lib/statistex/mode.ex @@ -1,21 +1,25 @@ defmodule Statistex.Mode do @moduledoc false - @spec mode(Statistex.samples()) :: Statistex.mode() - def mode([]) do + import Statistex + + @spec mode(Statistex.samples(), keyword) :: Statistex.mode() + def mode(samples, opts \\ []) + + def mode([], _) do raise( ArgumentError, "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number." 
) end - def mode(samples) do - samples - |> Enum.reduce(%{}, fn sample, counts -> - Map.update(counts, sample, 1, fn old_value -> old_value + 1 end) - end) - |> max_multiple - |> decide_mode + def mode(samples, opts) do + frequencies = + Keyword.get_lazy(opts, :frequency_distribution, fn -> frequency_distribution(samples) end) + + frequencies + |> max_multiple() + |> decide_mode() end defp max_multiple(map) do diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 444f1fb..3c602a2 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -41,6 +41,7 @@ defmodule Statistex.StatistexTest do assert stats.median == stats.percentiles[50] + assert stats.variance >= 0 assert stats.standard_deviation >= 0 assert stats.standard_deviation_ratio >= 0 @@ -58,6 +59,30 @@ defmodule Statistex.StatistexTest do mode -> assert mode in samples end + + frequency_distribution = stats.frequency_distribution + frequency_entry_count = map_size(frequency_distribution) + + assert frequency_entry_count >= 1 + assert frequency_entry_count <= stats.sample_size + + # frequencies actually occur in samples + Enum.each(frequency_distribution, fn {key, value} -> + assert key in samples + assert value >= 1 + assert is_integer(value) + end) + + # all samples are in frequencies + Enum.each(samples, fn sample -> assert Map.has_key?(frequency_distribution, sample) end) + + # counts sum up to sample_size + count_sum = + frequency_distribution + |> Map.values() + |> Enum.sum() + + assert count_sum == stats.sample_size end defp big_list_big_floats do