Stream data me (#302)

* basic properties for statistex * Fix lists full of negative floats breaking the ratio property. We want the displayed percentage to always be positive, which wasn't the case if the average was negative. * Harden test suite showing why the length 1 clause is necessary. Property tests would catch this as well with an arithmetic error, but I think the documentation gain from the doc tests is still very much worth it. * Failing percentile property. No Internet atm to research what correct behaviour even looks like. * I obviously know how to mark something pending... cough * fix our percentile misfortunes, we missed a clause * fix formatter error with access to stream_data
bencheeorg · Jun 25, 2019 · 797ef50 · 797ef50
1 parent 075bf0a
commit 797ef50
Show file tree

Hide file tree

Showing 7 changed files with 126 additions and 36 deletions.
diff --git a/.formatter.exs b/.formatter.exs
@@ -5,5 +5,6 @@
     "mix/**/*.{ex,exs}",
     "./mix.exs",
     "samples/**/*.{ex,exs}"
-  ]
+  ],
+  import_deps: [:stream_data]
 ]
diff --git a/lib/statistex.ex b/lib/statistex.ex
@@ -262,6 +262,12 @@ defmodule Statistex do
       iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12], sample_size: 9, average: 10.0)
       4.0
 
+      iex> Statistex.standard_deviation([42])
+      0.0
+
+      iex> Statistex.standard_deviation([1, 1, 1, 1, 1, 1, 1])
+      0.0
+
       iex> Statistex.standard_deviation([])
       ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number.
   """
@@ -278,7 +284,6 @@ defmodule Statistex do
     do_standard_deviation(samples, average, sample_size)
   end
 
-  defp do_standard_deviation(_samples, _average, 0), do: 0.0
   defp do_standard_deviation(_samples, _average, 1), do: 0.0
 
   defp do_standard_deviation(samples, average, sample_size) do
@@ -294,7 +299,7 @@ defmodule Statistex do
   @doc """
     Calculate the standard deviation relative to the average.
 
-    This helps put the absolute standard deviation value into perspective expressing it relative to the average.
+    This helps put the absolute standard deviation value into perspective by expressing it relative to the average. The result is the standard deviation as a fraction of the absolute value of the average, so it is never negative.
 
     ## Options
     If already calculated, the `:average` and `:standard_deviation` options can be provided to avoid recalculating those values.
@@ -308,6 +313,9 @@ defmodule Statistex do
         iex> Statistex.standard_deviation_ratio([4, 9, 11, 12, 17, 5, 8, 12, 12])
         0.4
 
+        iex> Statistex.standard_deviation_ratio([-4, -9, -11, -12, -17, -5, -8, -12, -12])
+        0.4
+
         iex> Statistex.standard_deviation_ratio([4, 9, 11, 12, 17, 5, 8, 12, 12], average: 10.0, standard_deviation: 4.0)
         0.4
 
@@ -332,7 +340,7 @@ defmodule Statistex do
     if average == 0 do
       0.0
     else
-      std_dev / average
+      abs(std_dev / average)
     end
   end
 
@@ -360,6 +368,8 @@ defmodule Statistex do
 
   Percentiles must be between 0 and 100 (excluding the boundaries).
 
+  The method used for interpolation is [described here and recommended by NIST](https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm).
+
   `ArgumentError` is raised if the given list is empty.
 
   ## Examples

diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex
@@ -27,10 +27,19 @@ defmodule Statistex.Percentile do
   end
 
   defp percentile(sorted_samples, number_of_samples, percentile_rank) do
-    rank = percentile_rank / 100 * max(0, number_of_samples + 1)
+    percent = percentile_rank / 100
+    rank = percent * (number_of_samples + 1)
     percentile_value(sorted_samples, rank)
   end
 
+  # According to https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
+  # the integer part of the rank being 0 is an edge case and we simply choose the
+  # first element. See clause 2 there; our rank is called k in that text.
+  defp percentile_value(sorted_samples, rank) when rank < 1 do
+    [first | _] = sorted_samples
+    first
+  end
+
   defp percentile_value(sorted_samples, rank) do
     index = max(0, trunc(rank) - 1)
     {pre_index, post_index} = Enum.split(sorted_samples, index)
@@ -60,6 +69,8 @@ defmodule Statistex.Percentile do
     |> to_float
   end
 
+  # Interpolation implemented according to: https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
+  #
   # "Type 6" interpolation strategy. There are many ways to interpolate a value
   # when the rank is not an integer (in other words, we don't exactly land on a
   # particular sample). Of the 9 main strategies, (types 1-9), types 6, 7, and 8
@@ -69,6 +80,7 @@ defmodule Statistex.Percentile do
   # - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
   # - http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
   defp interpolation_value(lower_bound, upper_bound, rank) do
+    # in the NIST reference, rank is called k and interpolation_weight is called d
     interpolation_weight = rank - trunc(rank)
     interpolation_weight * (upper_bound - lower_bound)
   end

diff --git a/mix.exs b/mix.exs
@@ -55,6 +55,8 @@ defmodule Benchee.Mixfile do
       {:ex_doc, "~> 0.20.0", only: :dev},
       {:earmark, "~> 1.0", only: :dev},
       {:excoveralls, "~> 0.7", only: :test},
+      # dev and test so that the formatter has access
+      {:stream_data, "~> 0.4", only: [:dev, :test]},
       {:inch_ex, "~> 2.0", only: :docs},
       {:dialyxir, "~> 1.0.0-rc.4", only: :dev, runtime: false}
     ]

diff --git a/mix.lock b/mix.lock
@@ -22,5 +22,6 @@
   "parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm"},
   "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm"},
   "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.4", "f0eafff810d2041e93f915ef59899c923f4568f4585904d010387ed74988e77b", [:make, :mix, :rebar3], [], "hexpm"},
+  "stream_data": {:hex, :stream_data, "0.4.3", "62aafd870caff0849a5057a7ec270fad0eb86889f4d433b937d996de99e3db25", [:mix], [], "hexpm"},
   "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm"},
 }
diff --git a/test/statistex/percentile_test.exs b/test/statistex/percentile_test.exs
@@ -31,64 +31,76 @@ defmodule Statistex.PercentileTest do
   end
 
   describe "a list of one element" do
-    setup do
-      {:ok, samples: [300]}
-    end
-
-    test "1st percentile", %{samples: samples} do
-      %{1 => result} = percentiles(samples, [1])
+    @samples [300]
+    test "1st percentile" do
+      %{1 => result} = percentiles(@samples, [1])
       assert result == 300.0
     end
 
-    test "50th percentile", %{samples: samples} do
-      %{50 => result} = percentiles(samples, [50])
+    test "50th percentile" do
+      %{50 => result} = percentiles(@samples, [50])
       assert result == 300.0
     end
 
-    test "99th percentile", %{samples: samples} do
-      %{99 => result} = percentiles(samples, [99])
+    test "99th percentile" do
+      %{99 => result} = percentiles(@samples, [99])
       assert result == 300.0
     end
   end
 
   describe "a list of two elements" do
-    setup do
-      {:ok, samples: [300, 200]}
-    end
-
-    test "1st percentile", %{samples: samples} do
-      %{1 => result} = percentiles(samples, [1])
-      assert result == 203.0
+    @samples [300, 200]
+    test "1st percentile (small sample size simply picks first element)" do
+      %{1 => result} = percentiles(@samples, [1])
+      assert result == 200.0
     end
 
-    test "50th percentile", %{samples: samples} do
-      %{50 => result} = percentiles(samples, [50])
+    test "50th percentile" do
+      %{50 => result} = percentiles(@samples, [50])
       assert result == 250.0
     end
 
-    test "99th percentile", %{samples: samples} do
-      %{99 => result} = percentiles(samples, [99])
+    test "99th percentile" do
+      %{99 => result} = percentiles(@samples, [99])
       assert result == 300.0
     end
   end
 
-  describe "a list of three elements" do
-    setup do
-      {:ok, samples: [100, 300, 200]}
+  describe "seemingly problematic 2 element list [9, 1]" do
+    @samples [9, 1]
+
+    percentiles = %{
+      25 => 1,
+      50 => 5,
+      75 => 9.0,
+      90 => 9.0,
+      99 => 9.0
+    }
+
+    for {percentile, expected} <- percentiles do
+      @percentile percentile
+      @expected expected
+      test "#{percentile}th percentile" do
+        %{@percentile => result} = percentiles(@samples, [@percentile])
+        assert result == @expected
+      end
     end
+  end
 
-    test "1st percentile", %{samples: samples} do
-      %{1 => result} = percentiles(samples, [1])
-      assert result == 104.0
+  describe "a list of three elements" do
+    @samples [100, 300, 200]
+    test "1st percentile (small sample size simply picks first element)" do
+      %{1 => result} = percentiles(@samples, [1])
+      assert result == 100.0
     end
 
-    test "50th percentile", %{samples: samples} do
-      %{50 => result} = percentiles(samples, [50])
+    test "50th percentile" do
+      %{50 => result} = percentiles(@samples, [50])
       assert result == 200.0
     end
 
-    test "99th percentile", %{samples: samples} do
-      %{99 => result} = percentiles(samples, [99])
+    test "99th percentile" do
+      %{99 => result} = percentiles(@samples, [99])
       assert result == 300.0
     end
   end

diff --git a/test/statistex_test.exs b/test/statistex_test.exs
@@ -1,4 +1,56 @@
 defmodule Statistex.StatistexTest do
   use ExUnit.Case, async: true
   doctest Statistex
+
+  use ExUnitProperties
+  import Statistex
+  import StreamData
+
+  describe "property testing as we might get loads of data" do
+    property "doesn't blow up no matter what kind of nonempty list of floats it's given" do
+      check all samples <- list_of(float(), min_length: 1) do
+        stats = statistics(samples)
+
+        assert stats.sample_size >= 1
+        assert stats.minimum <= stats.maximum
+
+        assert stats.minimum <= stats.average
+        assert stats.average <= stats.maximum
+
+        assert stats.minimum <= stats.median
+        assert stats.median <= stats.maximum
+
+        assert stats.median == stats.percentiles[50]
+
+        assert stats.standard_deviation >= 0
+        assert stats.standard_deviation_ratio >= 0
+
+        # mode actually occurs in the samples
+        case stats.mode do
+          [_ | _] ->
+            Enum.each(stats.mode, fn mode ->
+              assert(mode in samples)
+            end)
+
+          # nothing to do, there is no real mode
+          nil ->
+            nil
+
+          mode ->
+            assert mode in samples
+        end
+      end
+    end
+
+    property "percentiles are correctly related to each other" do
+      check all samples <- list_of(float(), min_length: 1) do
+        percies = percentiles(samples, [25, 50, 75, 90, 99])
+
+        assert percies[25] <= percies[50]
+        assert percies[50] <= percies[75]
+        assert percies[75] <= percies[90]
+        assert percies[90] <= percies[99]
+      end
+    end
+  end
 end