Merge 5a7810f into 23bb4b5
ablaom committed Nov 18, 2019
2 parents 23bb4b5 + 5a7810f commit f066add
Showing 4 changed files with 184 additions and 23 deletions.
9 changes: 5 additions & 4 deletions docs/src/evaluating_model_performance.md
@@ -86,10 +86,7 @@ Or define their own re-usable `ResamplingStrategy` objects - see
[Custom resampling strategies](@ref) below.


### Resampling strategies

`Holdout` and `CV` (cross-validation) resampling strategies are
available:
### Built-in resampling strategies


```@docs
@@ -100,6 +97,10 @@ Holdout
CV
```

```@docs
StratifiedCV
```
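For orientation, a minimal usage sketch of the strategies documented above (a hedged example: `model`, `X` and `y` are assumed to be a classifier and classification data already in scope; `cross_entropy` is one of MLJ's built-in measures):

```julia
using MLJ

# stratified 6-fold cross-validation; each test fold approximately
# preserves the class frequencies of `y`
evaluate(model, X, y,
         resampling=StratifiedCV(nfolds=6, shuffle=true, rng=1234),
         measure=cross_entropy)
```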


### Custom resampling strategies

3 changes: 2 additions & 1 deletion src/MLJ.jl
@@ -8,7 +8,8 @@ export MLJ_VERSION
export @curve, @pcurve, pretty, # utilities.jl
coerce, supervised, unsupervised, # tasks.jl
report, # machines.jl
Holdout, CV, evaluate!, Resampler, # resampling.jl
Holdout, CV, StratifiedCV, evaluate!, # resampling.jl
Resampler, # resampling.jl
Params, params, set_params!, # parameters.jl
strange, iterator, # parameters.jl
Grid, TunedModel, learning_curve!, # tuning.jl
160 changes: 142 additions & 18 deletions src/resampling.jl
@@ -15,12 +15,17 @@ train_test_pairs(s::ResamplingStrategy, rows, X, y) =
train_test_pairs(s, rows)

"""
Holdout(; fraction_train=0.7,
shuffle=false,
rng=Random.GLOBAL_RNG)
holdout = Holdout(; fraction_train=0.7,
shuffle=false,
rng=Random.GLOBAL_RNG)
Single train-test split with a (randomly selected) portion of the
data being selected for training and the rest for testing.
Holdout resampling strategy, for use in `evaluate!`, `evaluate` and in tuning.
train_test_pairs(holdout, rows)
Returns the pair `[(train, test)]`, where `train` and `test` are
vectors such that `rows=vcat(train, test)` and
`length(train)/length(rows) ≈ fraction_train`.
If `rng` is an integer, then `MersenneTwister(rng)` is the random
number generator used for shuffling rows. Otherwise some `AbstractRNG`
@@ -60,19 +65,25 @@ end
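A quick sketch of what the `Holdout` docstring above describes (expected output is indicative only; container types may differ):

```julia
using MLJ

holdout = Holdout(fraction_train=0.7)

# single train/test split of ten row indices:
MLJ.train_test_pairs(holdout, 1:10)
# expected: roughly [(1:7, 8:10)] -- seven training rows, three test
# rows, since length(train)/length(rows) ≈ fraction_train
```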


"""
CV(; nfolds=6, shuffle=false, rng=Random.GLOBAL_RNG)
cv = CV(; nfolds=6, shuffle=false, rng=Random.GLOBAL_RNG)
Cross validation resampling where the data is (randomly) partitioned
in `nfolds` folds and the model is evaluated `nfolds` times, each time
taking one fold for testing and the remaining folds for training.
Cross-validation resampling strategy, for use in `evaluate!`,
`evaluate` and tuning.
For instance, if `nfolds=3` then the data will be partitioned in three
folds A, B and C and the model will be trained three times, first with
A and B and tested on C, then on A, C and tested on B and finally on
B, C and tested on A.
train_test_pairs(cv, rows)
If `rng` is an integer, then `MersenneTwister(rng)` is the random
number generator used for shuffling rows. Otherwise some `AbstractRNG`
Returns an `nfolds`-length iterator of `(train, test)` pairs of
vectors (row indices), where each `train` and `test` is a sub-vector
of `rows`. The `test` vectors are mutually exclusive and exhaust
`rows`. Each `train` vector is the complement of the
corresponding `test` vector. With no shuffling, the order of `rows` is
preserved, in the sense that `rows` coincides precisely with the
concatenation of the `test` vectors, in the order they are
generated. All but the last `test` vector have equal length.
Declaring `shuffle=true` results in `rows` being shuffled first. If
`rng` is an integer, then `MersenneTwister(rng)` is the random number
generator used for shuffling `rows`. Otherwise some `AbstractRNG`
object is expected.
"""
@@ -113,16 +124,129 @@ function train_test_pairs(cv::CV, rows)
# define the (trainrows, testrows) pairs:
    firsts = 1:k:((nfolds - 1)*k + 1)  # first index of each `test` fold
    seconds = k:k:(nfolds*k)           # last index of each `test` fold

ret = map(1:nfolds) do k
f = firsts[k]
s = seconds[k]
k < nfolds || (s = n_observations)
return (vcat(rows[1:(f - 1)], rows[(s + 1):end]), # trainrows
                rows[f:s])                               # testrows
end

return ret
end
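To make the index arithmetic above concrete, a worked example with a remainder (ten rows, three folds, so `k = 3` and the last fold is extended):

```julia
using MLJ

# firsts = 1:3:7 = [1, 4, 7] and seconds = 3:3:9 = [3, 6, 9], with the
# final `s` bumped to 10, so the last test fold absorbs the remainder:
[test for (train, test) in MLJ.train_test_pairs(CV(nfolds=3), 1:10)]
# expected: [1:3, 4:6, 7:10] (all but the last fold of equal length)
```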

"""
stratified_cv = StratifiedCV(; nfolds=6, shuffle=false, rng=Random.GLOBAL_RNG)
Stratified cross-validation resampling strategy, for use in
`evaluate!`, `evaluate` and in tuning. Applies only to classification
problems (`OrderedFactor` or `Multiclass` targets).
train_test_pairs(stratified_cv, rows, X, y) # X is ignored
Returns an `nfolds`-length iterator of `(train, test)` pairs of
vectors (row indices) where each `train` and `test` is a sub-vector of
`rows`. The `test` vectors are mutually exclusive and exhaust
`rows`. Each `train` vector is the complement of the corresponding
`test` vector.
Unlike regular cross-validation, the distribution of the levels of the
target `y` corresponding to each `train` and `test` is constrained, as
far as possible, to replicate that of `y[rows]` as a whole.
Specifically, the data is split into a number of groups on which `y`
is constant, and each individual group is resampled according to the
ordinary cross-validation strategy `CV(nfolds=nfolds)`. To obtain the
final `(train, test)` pairs of row indices, the per-group pairs are
collated in such a way that each collated `train` and `test` respects
the original order of `rows` (after shuffling, if `shuffle=true`).
If `rng` is an integer, then `MersenneTwister(rng)` is the random
number generator used for shuffling rows. Otherwise some `AbstractRNG`
object is expected.
"""
struct StratifiedCV <: ResamplingStrategy
nfolds::Int
shuffle::Bool
rng::Union{Int,AbstractRNG}
function StratifiedCV(nfolds, shuffle, rng)
nfolds > 1 || error("Must have nfolds > 1. ")
return new(nfolds, shuffle, rng)
end
end

# Constructor with keywords
StratifiedCV(; nfolds::Int=6, shuffle::Bool=false,
rng::Union{Int,AbstractRNG}=Random.GLOBAL_RNG) =
StratifiedCV(nfolds, shuffle, rng)


function train_test_pairs(stratified_cv::StratifiedCV, rows, X, y)
if stratified_cv.rng isa Integer
rng = MersenneTwister(stratified_cv.rng)
else
rng = stratified_cv.rng
end

n_observations = length(rows)
nfolds = stratified_cv.nfolds

if stratified_cv.shuffle
        rows = shuffle!(rng, collect(rows))
end

st = scitype(y)
st <: AbstractArray{<:Finite} ||
error("Supplied target has scitpye $st but stratified "*
"cross-validation applies only to classification problems. ")


freq_given_level = countmap(y[rows])
minimum(values(freq_given_level)) >= nfolds ||
error("The number of observations for which the target takes on a "*
"given class must, for each class, exceed `nfolds`. Try "*
"reducing `nfolds`. ")

levels_seen = keys(freq_given_level) |> collect

cv = CV(nfolds=nfolds)

# the target is constant on each stratum, a subset of `rows`:
class_rows = [rows[y[rows] .== c] for c in levels_seen]

# get the cv train/test pairs for each level:
train_test_pairs_per_level = (MLJ.train_test_pairs(cv, class_rows[m])
for m in eachindex(levels_seen))

# just the train rows in each level:
trains_per_level = map(x -> first.(x),
train_test_pairs_per_level)

# just the test rows in each level:
tests_per_level = map(x -> last.(x),
train_test_pairs_per_level)

# for each fold, concatenate the train rows over levels:
trains_per_fold = map(x->vcat(x...), zip(trains_per_level...))

# for each fold, concatenate the test rows over levels:
tests_per_fold = map(x->vcat(x...), zip(tests_per_level...))

# restore ordering specified by rows:
trains_per_fold = map(trains_per_fold) do train
filter(in(train), rows)
end
tests_per_fold = map(tests_per_fold) do test
filter(in(test), rows)
end

# re-assemble:
return zip(trains_per_fold, tests_per_fold) |> collect

end
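The zip-and-splat collation used above can be seen in isolation in this standalone toy (hypothetical data, not MLJ code):

```julia
# two levels, two folds each; keep only the test halves:
tests_level_1 = [[1], [3]]   # test rows for level 1, folds 1 and 2
tests_level_2 = [[2], [4]]   # test rows for level 2, folds 1 and 2

# fold-wise concatenation across levels:
tests_per_fold = map(x -> vcat(x...), zip(tests_level_1, tests_level_2))
# == [[1, 2], [3, 4]]
```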


## DIRECT EVALUATION METHODS

@@ -131,9 +255,9 @@ end
resampling=CV(),
measure=nothing,
weights=nothing,
             operation=predict,
acceleration=DEFAULT_RESOURCE[],
             force=false,
verbosity=1)
Estimate the performance of a machine `mach` wrapping a supervised
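A hedged example of the `evaluate!` call whose signature appears above (`mach` is assumed to be an already-constructed supervised machine; `rms` is one of MLJ's built-in measures):

```julia
using MLJ

# mach = machine(model, X, y)   # assumed already defined
evaluate!(mach,
          resampling=CV(nfolds=5),
          measure=rms,
          verbosity=0)
```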
35 changes: 35 additions & 0 deletions test/resampling.jl
@@ -118,6 +118,41 @@ end
@test shuffled.measurement[1] != result.measurement[1]
end

@testset "stratified_cv" begin

# check in explicit example:
y = categorical(['c', 'a', 'b', 'a', 'c', 'x',
'c', 'a', 'a', 'b', 'b', 'b', 'b', 'b'])
rows = [14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 2, 1]
@test y[rows] == collect("bbbbbaaccabac")
scv = StratifiedCV(nfolds=3)
pairs = MLJ.train_test_pairs(scv, rows, nothing, y)
@test pairs == [([12, 11, 10, 8, 5, 4, 3, 2, 1], [14, 13, 9, 7]),
([14, 13, 10, 9, 7, 4, 3, 2, 1], [12, 11, 8, 5]),
([14, 13, 12, 11, 9, 8, 7, 5], [10, 4, 3, 2, 1])]
scv_random = StratifiedCV(nfolds=3, shuffle=true)
pairs_random = MLJ.train_test_pairs(scv_random, rows, nothing, y)
@test pairs != pairs_random

# wrong target type throws error:
@test_throws Exception MLJ.train_test_pairs(scv, rows, nothing, get.(y))

# too many folds throws error:
@test_throws Exception MLJ.train_test_pairs(StratifiedCV(nfolds=4),
rows, nothing, y)

# check class distribution is preserved in a larger randomized example:
N = 3
y = shuffle(vcat(fill(:a, N), fill(:b, 2N),
fill(:c, 3N), fill(:d, 4N))) |> categorical;
d = fit(UnivariateFinite, y)
pairs = MLJ.train_test_pairs(scv, 1:10N, nothing, y)
folds = vcat(first.(pairs), last.(pairs))
    @test all([fit(UnivariateFinite, y[fold]) ≈ d for fold in folds])


end

@testset "weights" begin

# cv:
