
Commit

Merge ba65d9d into 48f23c1
ablaom committed Nov 27, 2020
2 parents 48f23c1 + ba65d9d commit 5311eed
Showing 10 changed files with 182 additions and 217 deletions.
8 changes: 6 additions & 2 deletions .github/workflows/TagBot.yml
@@ -1,11 +1,15 @@
name: TagBot
on:
schedule:
- cron: 0 * * * *
issue_comment:
types:
- created
workflow_dispatch:
jobs:
TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.DOCUMENTER_KEY }}
8 changes: 4 additions & 4 deletions Project.toml
@@ -21,12 +21,12 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CategoricalArrays = "^0.8"
CategoricalArrays = "^0.8,^0.9"
ComputationalResources = "^0.3"
Distributions = "^0.21,^0.22,^0.23, 0.24"
MLJBase = "^0.15.1"
MLJModels = "^0.12.1"
MLJScientificTypes = "^0.3.0"
MLJBase = "^0.16"
MLJModels = "^0.12.1,^0.13"
MLJScientificTypes = "^0.4.1"
MLJTuning = "^0.5.1"
ProgressMeter = "^1.1"
StatsBase = "^0.32,^0.33"
5 changes: 2 additions & 3 deletions docs/Project.toml
@@ -2,20 +2,19 @@
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
MLJMultivariateStatsInterface = "1b6a4a23-ba22-4f51-9698-8599985d3728"
MLJScientificTypes = "2e2323e0-db8b-457b-ae0d-bdfb3bc63afd"
MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
12 changes: 4 additions & 8 deletions docs/src/adding_models_for_general_use.md
@@ -964,16 +964,12 @@ user using the package `JLSO`. There are two scenarios in which a new
MLJ model API implementation will want to overload two additional
methods `save` and `restore` to support serialization:

1. The algorithm-providing package already has its own serialization
format for learned parameters and/or hyper-parameters, which users
may want to access. In that case *the implementation overloads* `save`.
1. The algorithm-providing package already has its own serialization format for learned parameters and/or hyper-parameters, which users may want to access. In that case *the implementation overloads* `save`.

2. The `fitresult` is not a sufficiently persistent object; for
example, it is a pointer passed from wrapped C code. In that case
*the implementation overloads* `save` *and* `restore`.
2. The `fitresult` is not a sufficiently persistent object; for example, it is a pointer passed from wrapped C code. In that case *the implementation overloads* `save` *and* `restore`.

In case 2, case 1 presumably holds also, for otherwise MLJ serialization
is probably not going to be possible without changes to the
In case 2, case 1 presumably applies also, for otherwise MLJ serialization
is probably not going to be possible without changes to the
algorithm-providing package. An example is given below.
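For orientation, here is a minimal sketch of what the two overloads might look like in case 2, for a hypothetical model type `SomeWrappedModel` whose fitresult wraps a non-persistent pointer; `library_serialize` and `library_deserialize` are stand-ins for whatever persistence mechanism the wrapped library actually provides:

```julia
import MLJModelInterface

# hypothetical model whose fitresult is a non-persistent pointer wrapper:
struct SomeWrappedModel <: MLJModelInterface.Deterministic end

# return a persistent stand-in for the fitresult (e.g., raw bytes):
function MLJModelInterface.save(filename, ::SomeWrappedModel, fitresult; kwargs...)
    return library_serialize(fitresult)
end

# rebuild a usable fitresult from the persistent representation:
function MLJModelInterface.restore(filename, ::SomeWrappedModel, serializable_fitresult)
    return library_deserialize(serializable_fitresult)
end
```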

Note that in case 1, MLJ will continue to create its own
7 changes: 3 additions & 4 deletions docs/src/getting_started.md
@@ -271,7 +271,7 @@ vectors, matrices and tables - have a scientific type.
```@repl doda
scitype(4.6)
scitype(42)
x1 = categorical(["yes", "no", "yes", "maybe"]);
x1 = coerce(["yes", "no", "yes", "maybe"], Multiclass);
scitype(x1)
X = (x1=x1, x2=rand(4), x3=rand(4)) # a "column table"
scitype(X)
@@ -367,9 +367,8 @@ are the key features of that convention:

- Any `Integer` is interpreted as `Count`.

- Any `CategoricalValue` or `CategoricalString`, `x`, is interpreted
as `Multiclass` or `OrderedFactor`, depending on the value of
`x.pool.ordered`.
- Any `CategoricalValue` `x` is interpreted as `Multiclass` or
`OrderedFactor`, depending on the value of `x.pool.ordered`.

- `String`s and `Char`s are *not* interpreted as `Multiclass` or
`OrderedFactor` (they have scitypes `Textual` and `Unknown`,
respectively).
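A quick illustration of the convention (a sketch; the comments show the scitypes expected under the defaults):

```julia
using MLJ
scitype(3)                                        # Count
scitype("male")                                   # Textual
x = coerce(["yes", "no", "yes"], OrderedFactor);  # an ordered CategoricalVector
scitype(x[1])                                     # OrderedFactor{2}
```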
2 changes: 1 addition & 1 deletion docs/src/internals.md
@@ -36,7 +36,7 @@ machine(model::M, Xtable, y) = Machine{M}(model, Xtable, y)
### fit! and predict/transform

````julia
function fit!(machine::Machine; rows=nothing, force=false, verbosity=1)
function fit!(mach::Machine; rows=nothing, force=false, verbosity=1)

warning = clean!(mach.model)
isempty(warning) || verbosity < 0 || @warn warning
2 changes: 1 addition & 1 deletion docs/src/model_search.md
@@ -8,7 +8,7 @@ the `matching` method, and the search executed with the `models`
methods, as detailed below.

A table of all models is also given at [List of Supported
Models](@ref).
Models](@ref model_list).
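For example, a search along the following lines (a sketch using the bundled iris data) lists metadata entries for all models compatible with the data, optionally narrowed by a trait:

```julia
using MLJ

X, y = @load_iris                         # a small bundled dataset
compatible = models(matching(X, y))       # all models that can train on (X, y)
filter(m -> m.is_pure_julia, compatible)  # narrow to pure-Julia models
```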


## Model metadata
220 changes: 60 additions & 160 deletions docs/src/performance_measures.md
@@ -1,17 +1,20 @@
# Performance Measures

In MLJ, loss functions, scoring rules, sensitivities, and so on, are collectively referred
to as *measures*. Presently, MLJ includes a few built-in measures,
provides support for the loss functions in the
[LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl) library,
and allows users to define their own custom measures.
In MLJ, loss functions, scoring rules, sensitivities, and so on, are
collectively referred to as *measures*. These include re-exported loss
functions from the
[LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl)
library, overloaded to behave the same way as the built-in measures.

Providing further measures for probabilistic predictors, such as
proper scoring rules, and for constructing multi-target product
measures, is a work in progress.
To list all measures, run `measures()`. Further measures for
probabilistic predictors, such as proper scoring rules, and for
constructing multi-target product measures, are planned. If you'd like
to see a measure added to MLJ, post a comment
[here](https://github.com/alan-turing-institute/MLJBase.jl/issues/299).

*Note for developers:* The measures interface and the built-in measures
described here are defined in MLJBase.
*Note for developers:* The measures interface and the built-in
measures described here are defined in MLJBase, but will ultimately live
in a separate package.


## Using built-in measures
@@ -31,20 +34,29 @@ measure(ŷ, y, w)
where `y` iterates over observations of some target variable, and `ŷ`
iterates over predictions (`Distribution` or `Sampler` objects in the
probabilistic case). Here `w` is an optional vector of sample weights,
which can be provided when the measure supports this.
or a dictionary of class weights, when these are supported by the
measure.

```@repl losses_and_scores
using MLJ
y = [1, 2, 3, 4];
ŷ = [2, 3, 3, 3];
w = [1, 2, 2, 1];
rms(ŷ, y) # reports an aggregate loss
l1(ŷ, y, w) # reports per observation losses
y = categorical(["male", "female", "female"])
male = y[1]; female = y[2];
d = UnivariateFinite([male, female], [0.55, 0.45]);
l2(ŷ, y, w) # reports per observation losses
y = coerce(["male", "female", "female"], Multiclass)
d = UnivariateFinite(["male", "female"], [0.55, 0.45], pool=y);
ŷ = [d, d, d];
cross_entropy(ŷ, y)
log_loss(ŷ, y)
```

The measures `rms`, `l2` and `log_loss` illustrated here are actually
instances of measure *types*. For example, `l2 = LPLoss(p=2)` and
`log_loss = LogLoss() = LogLoss(tol=eps())`. Common aliases are
provided:

```@repl losses_and_scores
cross_entropy
```
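Accordingly, a measure with non-default parameters can be constructed directly from its type; a sketch, assuming `LPLoss` accepts other powers via the same keyword shown above:

```julia
using MLJ
y = [1, 2, 3, 4];
ŷ = [2, 3, 3, 3];
l3 = LPLoss(p=3)   # an L3 per-observation loss
l3(ŷ, y)           # reports per-observation losses
```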

## Traits and custom measures
@@ -58,9 +70,18 @@ method:
info(l1)
```

Use `measures()` to list all measures and `measures(conditions...)` to
Query the doc-string for a measure using the name of its type:

```@repl losses_and_scores
rms
@doc RootMeanSquaredError # same as `?RootMeanSquaredError`
```

Use `measures()` to list all measures, and `measures(conditions...)` to
search for measures with given traits (as you would [query
models](model_search.md)).
models](model_search.md)). The trait `instances` lists the actual
callable instances of a given measure type (typically aliases for the
default instance).
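For example, a query along these lines (a sketch; `prediction_type` is one of the measure traits displayed by `info`) returns only the probabilistic measures:

```julia
using MLJ
# conditions are Bool-valued functions of the measure metadata entries:
measures(m -> m.prediction_type == :probabilistic)
```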

```@docs
measures(conditions...)
@@ -118,160 +139,39 @@ dispatched.

The [LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl)
package includes "distance loss" functions for `Continuous` targets,
and "marginal loss" functions for `Binary` targets. While the
LossFunctions,jl interface differs from the present one (for, example
`Binary` observations must be +1 or -1), one can safely pass the loss
functions defined there to any MLJ algorithm, which re-interprets it
under the hood. Note that the "distance losses" in the package apply
to deterministic predictions, while the "marginal losses" apply to
probabilistic predictions.

```@repl losses_and_scores
using LossFunctions
X = (x1=rand(5), x2=rand(5)); y = categorical(["y", "y", "y", "n", "y"]); w = [1, 2, 1, 2, 3];
mach = machine(ConstantClassifier(), X, y);
holdout = Holdout(fraction_train=0.6);
evaluate!(mach,
measure=[ZeroOneLoss(), L1HingeLoss(), L2HingeLoss(), SigmoidLoss()],
resampling=holdout,
operation=predict,
weights=w,
verbosity=0)
```

*Note:* Although `ZeroOneLoss(ŷ, y)` makes no sense (neither `ŷ` nor
`y` has a type expected by LossFunctions.jl), one can instead use the
adaptor `MLJ.value` as discussed above:

```@repl losses_and_scores
ŷ = predict(mach, X);
loss = MLJ.value(ZeroOneLoss(), ŷ, X, y, w) # X is ignored here
mean(loss) ≈ misclassification_rate(mode.(ŷ), y, w)
```


## Built-in measures


```@docs
area_under_curve
```

```@docs
accuracy
```

```@docs
balanced_accuracy
```

```@docs
BrierScore
```

```@docs
cross_entropy
```

```@docs
FScore
```

```@docs
false_discovery_rate
```

```@docs
false_negative
```

```@docs
false_negative_rate
```
and "marginal loss" functions for `Finite{2}` (binary) targets. While the
LossFunctions.jl interface differs from the present one (for, example
binary observations must be +1 or -1), MLJ has overloaded instances
of the LossFunctions.jl types to behave the same as the built-in
types.
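So a LossFunctions.jl loss can be called with the same syntax as a built-in measure; a sketch consistent with the overloading just described (as a marginal loss, `ZeroOneLoss` applies to probabilistic predictions):

```julia
using MLJ
y = coerce(["y", "n", "y", "y"], Multiclass);
d = UnivariateFinite(["y", "n"], [0.8, 0.2], pool=y);  # a probabilistic prediction
ŷ = [d, d, d, d];
ZeroOneLoss()(ŷ, y)   # per-observation losses, as for built-in measures
```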

```@docs
false_positive
```

```@docs
false_positive_rate
```

```@docs
l1
```

```@docs
l2
```

```@docs
mae
```

```@docs
matthews_correlation
```

```@docs
misclassification_rate
```

```@docs
negative_predictive_value
```

```@docs
positive_predictive_value
```

```@docs
rms
```

```@docs
rmsl
```

```@docs
rmslp1
```
Note that the "distance losses" in the package apply to deterministic
predictions, while the "marginal losses" apply to probabilistic
predictions.

```@docs
rmsp
```

```@docs
true_negative
```

```@docs
true_negative_rate
```
## List of measures

```@docs
true_positive
```@setup losses_and_scores
using DataFrames
```

```@docs
true_positive_rate
```@example losses_and_scores
ms = measures()
types = map(m -> m.name, ms)
instances = map(m -> m.instances, ms)
t = (type=types, instances=instances)
DataFrame(t)
```

## List of LossFunctions.jl measures

`DWDMarginLoss()`, `ExpLoss()`, `L1HingeLoss()`, `L2HingeLoss()`,
`L2MarginLoss()`, `LogitMarginLoss()`, `ModifiedHuberLoss()`,
`PerceptronLoss()`, `ScaledMarginLoss()`, `SigmoidLoss()`,
`SmoothedL1HingeLoss()`, `ZeroOneLoss()`, `HuberLoss()`,
`L1EpsilonInsLoss()`, `L2EpsilonInsLoss()`, `LPDistLoss()`,
`LogitDistLoss()`, `PeriodicLoss()`, `QuantileLoss()`,
`ScaledDistanceLoss()`.


## Other performance related tools

In MLJ one computes a confusion matrix by calling an instance of the
`ConfusionMatrix` measure type on the data:

```@docs
confusion_matrix
ConfusionMatrix
```
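A minimal usage sketch (hypothetical data; `confusion_matrix` is the built-in instance of `ConfusionMatrix`):

```julia
using MLJ
y = coerce(["a", "b", "a", "a", "b"], OrderedFactor);
ŷ = coerce(["a", "a", "a", "b", "b"], OrderedFactor);
confusion_matrix(ŷ, y)   # equivalently: ConfusionMatrix()(ŷ, y)
```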

```@docs
