Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1687 lines (1209 sloc) 66.2 KB
(*
Monadic contextual classification Mathematica package
Copyright (C) 2017 Anton Antonov
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Written by Anton Antonov,
antononcube @ gmail.com,
Windermere, Florida, USA.
*)
(*
Mathematica is (C) Copyright 1988-2017 Wolfram Research, Inc.
Protected by copyright law and international treaties.
Unauthorized reproduction or distribution subject to severe civil
and criminal penalties.
Mathematica is a registered trademark of Wolfram Research, Inc.
*)
(* :Title: MonadicContextualClassification *)
(* :Context: MonadicContextualClassification` *)
(* :Author: Anton Antonov *)
(* :Date: 2017-06-05 *)
(* :Package Version: 1.0 *)
(* :Mathematica Version: *)
(* :Copyright: (c) 2017 Anton Antonov *)
(* :Keywords: *)
(* :Discussion:
## Introduction
This package provides functions for classification workflows in which the classifiers are kept in contexts.
That is achieved by extending the functions of a State monad generated by the package
"StateMonadCodeGenerator.m", [1], with functions specific to the classification workflow.
See [2] for explanations on monad code generation and extension.
Here is an example of a pipeline created with the functions in the package:
res =
ClCon[ds, <||>] ⟹
ClConSplitData[0.75] ⟹
ClConMakeClassifier["NearestNeighbors"] ⟹
ClConEchoFunctionContext[ClassifierInformation[#["classifier"]] &] ⟹
ClConClassifierMeasurements[{"Accuracy", "Precision", "Recall"}] ⟹
ClConEchoValue ⟹
(If[#1["Accuracy"] > 0.7, None, ClCon[#1, #2]] &) ⟹
ClConMakeClassifier["RandomForest"] ⟹
ClConEchoFunctionContext[ClassifierInformation[#["classifier"]] &] ⟹
ClConClassifierMeasurements[{"Accuracy", "Precision", "Recall"}] ⟹
ClConEchoValue;
## Contexts
The classifier contexts are Association objects. The pipeline values can have the form:
ClCon[ val, context:(_String|_Association) ]
see the explanations in [1] for more details.
Some of the specific functions set or retrieve values from contexts for the keys:
"trainingData", "testData", "classifier".
## Error messages
The error messages are print-outs with `Echo`. They can be easily changed to use `Message` instead.
(`Echo` is used since it fits the monadic pipeline "culture.")
## Examples
### Data
Assume we have the Titanic data as our working dataset:
dataName = "Titanic";
ds = Dataset[Flatten@*List @@@ ExampleData[{"MachineLearning", dataName}, "Data"]];
varNames = Flatten[List @@ ExampleData[{"MachineLearning", dataName}, "VariableDescriptions"]];
ds = ds[All, AssociationThread[varNames -> #] &];
### Complete usage example
TBD...
### Complete usage example with string contexts
TBD...
## References
[1] Anton Antonov, StateMonadCodeGenerator.m, 2017, MathematicaForPrediction at GitHub.
URL: https://github.com/antononcube/MathematicaForPrediction/blob/master/MonadicProgramming/StateMonadCodeGenerator.m
[2] Anton Antonov, "Monad code generation and extension", 2017, MathematicaForPrediction at GitHub.
URL: https://github.com/antononcube/MathematicaForPrediction/blob/master/MarkdownDocuments/Monad-code-generation-and-extension.md
## End matters
This file was created by Mathematica Plugin for IntelliJ IDEA.
Anton Antonov
Windermere, FL, USA
2017-06-05
*)
(*
TODO:
1. Add examples explained in detail.
2. CANCELED Make a true package.
3. DONE Add classifier ensemble handling.
4. Give examples of tracking symbols.
*)
(**************************************************************)
(* Importing packages (if needed) *)
(**************************************************************)
If[Length[DownValues[MathematicaForPredictionUtilities`RecordsSummary]] == 0,
Echo["MathematicaForPredictionUtilities.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/MathematicaForPredictionUtilities.m"]
];
If[Length[DownValues[StateMonadCodeGenerator`GenerateStateMonadCode]] == 0,
Echo["StateMonadCodeGenerator.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/MonadicProgramming/StateMonadCodeGenerator.m"]
];
If[Length[DownValues[ClassifierEnsembles`EnsembleClassifierMeasurements]] == 0,
Echo["ClassifierEnsembles.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/ClassifierEnsembles.m"]
];
(* Loaded in ClassifierEnsembles.m . *)
(*If[Length[DownValues[ROCFunctions`ROCPlot]] == 0,
Echo["ROCFunctions.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/ROCFunctions.m"]
];*)
If[Length[DownValues[VariableImportanceByClassifiers`AccuracyByVariableShuffling]] == 0,
Echo["VariableImportanceByClassifiers.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/VariableImportanceByClassifiers.m"]
];
If[Length[DownValues[CrossTabulate`CrossTabulate]] == 0,
Echo["CrossTabulate.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/CrossTabulate.m"]
];
If[Length[DownValues[SSparseMatrix`SSparseMatrixToTriplets]] == 0,
Echo["SSparseMatrix.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/SSparseMatrix.m"]
];
If[Length[DownValues[OutlierIdentifiers`HampelIdentifierParameters]] == 0,
Echo["OutlierIdentifiers.m", "Importing from GitHub:"];
Import["https://raw.githubusercontent.com/antononcube/MathematicaForPrediction/master/OutlierIdentifiers.m"]
];
(**************************************************************)
(* Package definition *)
(**************************************************************)
BeginPackage["MonadicContextualClassification`"]
$ClConFailure::usage = "Failure symbol for the monad ClCon."
ClConSplitData::usage = "ClConSplitData[fr_?NumberQ] splits the pipeline value into training and test parts. "
ClConRecoverData::usage = "ClConRecoverData joins split data from context or the current pipeline value into the pipeline value. \
The Association values of \"trainingData\", \"testData\", \"validationData\" are combined/joined into one."
ClConMakeClassifier::usage = "ClConMakeClassifier[methodSpec_?MethodSpecQ] makes a classifier with the specified method \
Using Association values of \"trainingData\", \"testData\", \"validationData\". \
The obtained classifier object is put as the result pipeline value; also in the context under the key \"classifier\". \
The Association values of \"trainingData\", \"testData\", \"validationData\" are put in the context too, if taken from \
the current pipeline value. "
ClConClassifierMeasurements::usage = "ClConClassifierMeasurements[measures : (_String | {_String ..})] \
computes the specified measurements for the classifier in the context. (Does not modify the context.)"
ClConClassifierMeasurementsByThreshold::usage = "ClConClassifierMeasurementsByThreshold[measures : (_String | {_String ..}), clLbl->th_?NumberQ] \
computes the specified measurements for the classifier in the context \
using the threshold th for the specified class label clLbl. \
(Does not modify the context.)"
ClConAccuracyByVariableShuffling::usage = "ClConAccuracyByVariableShuffling[opts : OptionsPattern[]] computes \
the variable importance. (Does not modify the context.)"
(* Usage of ClConSummarizeData. The previous text was a copy of the long form function's usage, *)
(* although the function chooses between wide form and long form summaries via its "Type" option. *)
ClConSummarizeData::usage = "Summarizes the data. The option \"Type\" selects the summary form: \
\"LongForm\" gives a long form summary, any other explicit value gives a wide form (records) summary, \
and Automatic decides by the number of data columns. Does not modify the context. \
Echoes the result with the default option values."
ClConSummarizeDataLongForm::usage = "Summarizes the data in long form. Does not modify the context. \
Does not echo the result."
ClConToNormalClassifierData::usage = "Non-monadic function. Converts data of different forms into record-label rules. \
I.e. in the form { (rec:{___}->lbl_)..} ."
ClConSetTrainingData::usage = "Sets the training data in the context. Does not change the pipeline value."
ClConSetTestData::usage = "Sets the test data in the context. Does not change the pipeline value."
ClConSetValidationData::usage = "Sets the validation data in the context. Does not change the pipeline value."
ClConSetClassifier::usage = "Sets the classifier in the context. Does not change the pipeline value."
ClConSetVariableNames::usage = "Sets the variable names in the context. Does not change the pipeline value."
ClConTakeTrainingData::usage = "Takes the training data in the context."
ClConTakeTestData::usage = "Takes the test data in the context."
ClConTakeValidationData::usage = "Takes the validation data in the context."
ClConTakeData::usage = "Recovers the data and gives it as a non-monadic value."
ClConTakeClassifier::usage = "Gives the classifier as non-monadic value."
ClConTakeROCData::usage = "Gives the ROC data as non-monadic value."
ClConTakeVariableNames::usage = "Finds the variable names and returns them as a non-monadic value."
ClConGetVariableNames::usage = "Finds the variable names and puts them as the pipeline value. Does not modify the context."
ClConEchoVariableNames::usage = "Finds and echoes the variable names. Does not modify the context."
ClConROCData::usage = "Computes the ROC data using the classifier and test data in the context. \
The obtained ROC data is put as the result pipeline value; also in the context under the key \"rocData\"."
ClConROCPlot::usage = "Makes a ROC plot and echoes it. The result pipeline value is the plot."
ClConROCListLinePlot::usage = "Makes ListLinePlot over specified ROC functions and echoes it. The result pipeline value is the plot."
ClConSuggestROCThresholds::usage = "Suggest thresholds based on ROC data."
ClConAssignVariableNames::usage = "Puts a value for \"variableNames\" in the context in correspondence to \"trainingData\" in the context. \
If an empty list is given the variable names are automatically derived."
ClConOutlierPosition::usage = "Find outlier positions in the data."
(*ClConFindOutliersPerClassLabel::usage = "Find outlier positions in the data per class label."*)
(*ClConDropOutliersPerClassLabel::usage = "Find and from outliers in the data per class label."*)
ClConReduceDimension::usage = "Applies dimension reduction with SVD. \
(If the non-label parts of the training data and test data can be converted numerical matrices.)"
(*ClConDeleteMissing::usage = "Deletes records with missing data values."*)
Begin["`Private`"]
Needs["MathematicaForPredictionUtilities`"]
Needs["StateMonadCodeGenerator`"]
Needs["ClassifierEnsembles`"]
Needs["ROCFunctions`"]
Needs["VariableImportanceByClassifiers`"]
Needs["CrossTabulate`"]
Needs["SSparseMatrix`"]
Needs["OutlierIdentifiers`"]
(* The definitions are made to have a prefix "ClCon" . *)
(**************************************************************)
(* Generation *)
(**************************************************************)
(* Generate the base functions of the ClCon monad (ClassifierWithContext). *)
(* The monad name is given with its full context, and $ClConFailure is registered as the failure symbol. *)
(* NOTE(review): ClConUnit, ClConBind, ClConTakeValue and the ClConEcho* functions used below are *)
(* presumably produced by this call -- confirm against StateMonadCodeGenerator.m. *)
GenerateStateMonadCode["MonadicContextualClassification`ClCon", "FailureSymbol" -> $ClConFailure]
(**************************************************************)
(* Infix operators *)
(**************************************************************)
(* This should be already done by GenerateStateMonadCode. *)
(*DoubleLongRightArrow[x_?ClConUnitQ, f_] := ClConBind[x, f];*)
(**************************************************************)
(* General functions *)
(**************************************************************)
(* ClConToNormalClassifierData -- non-monadic conversion of data into a form usable for classifier work. *)
(* Arguments: the data (empty list, classifier rules, Association of atom -> list, Dataset, or matrix). *)
(* Options: *)
(*   "DeleteMissing"    -- if True (default) parts with missing values are removed with DeleteMissing; *)
(*   "ClassLabelColumn" -- the label column of matrix data; Automatic (default) means the last column, *)
(*                         negative integers count from the end. *)
(* Returns: the converted data, or $ClConFailure if no definition applies. *)

(* Clear (not ClearAll) is used so that the ::usage message declared in the public section is kept. *)
Clear[ClConToNormalClassifierData]

Options[ClConToNormalClassifierData] = {"DeleteMissing"->True, "ClassLabelColumn" -> Automatic };

(* Here we use MathematicaForPredictionUtilities`DataArrayRulesForClassifyQ *)

(* Catch-all: anything not handled by the definitions below is a failure. *)
ClConToNormalClassifierData[___] := $ClConFailure;

ClConToNormalClassifierData[ {}, opts:OptionsPattern[] ] := {};

(* Data that already satisfies DataRulesForClassifyQ: only the missing values treatment is applied. *)
ClConToNormalClassifierData[ data_?DataRulesForClassifyQ, opts:OptionsPattern[] ] :=
    If[ TrueQ[ OptionValue[ClConToNormalClassifierData, "DeleteMissing"] ],
      DeleteMissing[data, 1, 2],
      data
    ];

(* Association of atomic keys to lists: the result stays an Association. *)
ClConToNormalClassifierData[ data_Association, opts:OptionsPattern[] ] :=
    If[ TrueQ[ OptionValue[ClConToNormalClassifierData, "DeleteMissing"] ],
      DeleteMissing[data, 2, 2],
      data
    ] /; MatchQ[ data, Association[(_?AtomQ -> _List) ..]];

(* Dataset: the row values are taken and the conversion is delegated to the definitions for normal data. *)
ClConToNormalClassifierData[ td_Dataset, opts:OptionsPattern[] ] :=
    ClConToNormalClassifierData[ Normal[td[All, Values]], opts];

(* Matrix: every row becomes a rule  featureValues -> label . *)
ClConToNormalClassifierData[ data_?MatrixQ, opts:OptionsPattern[] ] :=
    Block[{deleteMissingQ, labelCol, featureCols, ncols, mat},

      ncols = Dimensions[data][[2]];

      deleteMissingQ = TrueQ[ OptionValue[ClConToNormalClassifierData, "DeleteMissing"] ];
      labelCol = OptionValue[ClConToNormalClassifierData, "ClassLabelColumn"];

      If[ TrueQ[labelCol === Automatic], labelCol = ncols ];

      (* A negative index is valid for Part but would not be removed by Complement below, *)
      (* which would leave the label among the feature columns; map it to the positive index. *)
      If[ IntegerQ[labelCol] && labelCol < 0, labelCol = ncols + labelCol + 1 ];

      featureCols = Complement[Range[ncols], {labelCol}];

      mat = If[ deleteMissingQ, DeleteMissing[data, 1, 2], data ];

      Thread[ mat[[All, featureCols]] -> mat[[All, labelCol]] ]
    ];
(**************************************************************)
(* Data splitting and recovery functions *)
(**************************************************************)
(* ClConSplitData[fr, valFr, opts][xs, context] -- splits the pipeline value into training, test, *)
(* and (optionally) validation parts. *)
(*   fr    -- fraction (0 < fr <= 1) of the data that goes to the training part; *)
(*   valFr -- fraction (0 <= valFr < 1) of the training part that is split off as validation data; *)
(*            0 (the default of the one-argument form) means no validation data. *)
(* Option Method -> "LabelsProportional" samples per class label (the label is taken to be the last *)
(* element of each record); any other value gives a plain random split. *)
(* The result value is an Association with the keys "trainingData", "testData" and, if requested, *)
(* "validationData"; the same pairs are also joined into the context. *)

(* Clear (not ClearAll) is used so that the ::usage message declared in the public section is kept. *)
Clear[ClConSplitData]

(* This function does not respect specified label column yet. *)
Options[ClConSplitData] = {Method->"LabelsProportional", "ClassLabelColumn" -> Automatic};

ClConSplitData[___][$ClConFailure] := $ClConFailure;

ClConSplitData[fr_?NumberQ, opts:OptionsPattern[]][xs_, context_Association] :=
    ClConSplitData[fr, 0, opts][xs, context];

ClConSplitData[fr_?NumberQ, valFr_?NumberQ, opts:OptionsPattern[]][xs_, context_Association] :=
    Block[{method, dataLabels, indGroups, t,
      trainingData, testData, validationData},

      method = OptionValue[ClConSplitData, Method];

      If[ ! ( TrueQ[Head[xs] === Dataset] || DataRulesForClassifyQ[xs] || MatrixQ[xs] ),
        Echo["No data to split or unknown data form.", "ClConSplitData:" ];
        Return[$ClConFailure];
      ];

      Which[
        (* SameQ is used on purpose: with Equal a non-string method value (e.g. Automatic) leaves the *)
        (* test unevaluated and Which would return unevaluated without assigning the splits. *)
        method === "LabelsProportional",
        dataLabels =
            Transpose[{Range[Length[xs]], Normal[xs[[All, -1]]]}];
        indGroups = Map[#[[All, 1]] &, GroupBy[dataLabels, Last]];
        t = TakeDrop[RandomSample[#], Floor[fr*Length[#]]] & /@ indGroups;
        trainingData = xs[[ Join @@ t[[All, 1]], All]];
        testData = xs[[ Join @@ t[[All, 2]], All]],

        True,
        {trainingData, testData} = TakeDrop[RandomSample[xs], Floor[fr*Length[xs]]];
      ];

      If[ TrueQ[valFr == 0],
        t = AssociationThread[{"trainingData", "testData"} -> {trainingData, testData}],
        (*ELSE*)
        (* Split the training part once more; the "test" part of that split is the validation data. *)
        t = Fold[ ClConBind, ClConUnit[trainingData, <||>], { ClConSplitData[1-valFr, opts], ClConTakeValue }];
        If[ AssociationQ[t],
          {trainingData, validationData} = { t["trainingData"], t["testData"]};
          t = AssociationThread[{"trainingData", "testData", "validationData"} -> {trainingData, testData, validationData}],
          (*ELSE*)
          (* The inner split failed; propagate the failure instead of indexing into a non-Association. *)
          t = $ClConFailure
        ]
      ];

      If[ TrueQ[t === $ClConFailure],
        $ClConFailure,
        ClConUnit[ t, Join[context, t] ]
      ]
    ] /; 0 < fr <= 1 && 0 <= valFr < 1;

(* Catch-all for wrong argument specifications. *)
ClConSplitData[___][xs_, context_Association] :=
    Block[{},
      Echo[ "One or two arguments are expected, both numbers between 0 and 1.", "ClConSplitData:" ];
      $ClConFailure
    ];
(* ClConRecoverData -- joins previously split data back into one pipeline value. *)
(* The parts are looked up, in this order of preference: *)
(*   1. training, test and validation data in the pipeline value (if it is an Association); *)
(*   2. training, test and validation data in the context; *)
(*   3. training and test data in the pipeline value; *)
(*   4. training and test data in the context. *)
(* The context is passed through unchanged. If nothing is found $ClConFailure is returned. *)

(* Clear (not ClearAll) is used so that the ::usage message declared in the public section is kept. *)
Clear[ClConRecoverData]

ClConRecoverData[$ClConFailure] := $ClConFailure;

ClConRecoverData[][xs_, context_Association] := ClConRecoverData[xs, context];

ClConRecoverData[xs_, context_Association] :=
    With[{
      keys3 = {"trainingData", "testData", "validationData"},
      keys2 = {"trainingData", "testData"},
      (* True if a is an Association that has all of the given keys. *)
      hasKeysQ = Function[{a, keys}, MatchQ[a, _Association] && AllTrue[keys, KeyExistsQ[a, #] &]],
      (* Joins the values of the given keys in the given order. *)
      joinParts = Function[{a, keys}, Join @@ Lookup[a, keys]]
    },
      Which[
        hasKeysQ[xs, keys3],
        ClCon[joinParts[xs, keys3], context],

        hasKeysQ[context, keys3],
        ClCon[joinParts[context, keys3], context],

        hasKeysQ[xs, keys2],
        ClCon[joinParts[xs, keys2], context],

        hasKeysQ[context, keys2],
        ClCon[joinParts[context, keys2], context],

        True,
        Echo["Cannot recover data.","ClConRecoverData:"];
        $ClConFailure
      ]
    ];
(**************************************************************)
(* Setters / getters *)
(**************************************************************)
(* Context setters. *)
(* Each setter has the form  ClConSetXXX[value][xs, context] : the pipeline value xs is left as it is *)
(* and the given value is stored in the context under the corresponding key *)
(* ("trainingData", "testData", "validationData", "classifier", "variableNames"). *)
(* Applying a setter directly to [xs, context], i.e. without a value to set, gives $ClConFailure. *)

(* Clear (not ClearAll) is used so that the ::usage messages declared in the public section are kept. *)
(* ClConSetClassifier and ClConSetVariableNames are cleared as well, like the other setters. *)
Clear["ClConSet*Data", "ClConTake*Data", ClConSetClassifier, ClConSetVariableNames, ClConTakeClassifier, ClConTakeClassLabelIndex, ClConTakeVariableNames];

ClConSetTrainingData[___][$ClConFailure] := $ClConFailure;
ClConSetTrainingData[xs_, context_Association] := $ClConFailure;
ClConSetTrainingData[data_][xs_, context_Association] :=
    ClConUnit[xs, Join[ context, <| "trainingData" -> data |> ] ];

ClConSetTestData[___][$ClConFailure] := $ClConFailure;
ClConSetTestData[xs_, context_Association] := $ClConFailure;
ClConSetTestData[data_][xs_, context_Association] :=
    ClConUnit[xs, Join[ context, <| "testData" -> data |> ] ];

ClConSetValidationData[___][$ClConFailure] := $ClConFailure;
ClConSetValidationData[xs_, context_Association] := $ClConFailure;
ClConSetValidationData[data_][xs_, context_Association] :=
    ClConUnit[xs, Join[ context, <| "validationData" -> data |> ] ];

ClConSetClassifier[___][$ClConFailure] := $ClConFailure;
ClConSetClassifier[xs_, context_Association] := $ClConFailure;
ClConSetClassifier[cl_][xs_, context_Association] :=
    ClConUnit[xs, Join[ context, <| "classifier" -> cl |> ] ];

ClConSetVariableNames[___][$ClConFailure] := $ClConFailure;
ClConSetVariableNames[xs_, context_Association] := $ClConFailure;
ClConSetVariableNames[nms_][xs_, context_Association] :=
    ClConUnit[xs, Join[ context, <| "variableNames" -> nms |> ] ];
(* Takers of the split data parts. *)
(* Each of them gives the (non-monadic) value stored in the context under the corresponding key; *)
(* if the key is absent the result is the Missing[...] value of the key lookup. *)
(* Both the forms  f[xs, context]  and  f[][xs, context]  are supported; failures are passed through. *)

ClConTakeTrainingData[___][$ClConFailure] := $ClConFailure;
ClConTakeTrainingData[$ClConFailure] := $ClConFailure;
ClConTakeTrainingData[][val_, ctx_] := ClConTakeTrainingData[val, ctx];
ClConTakeTrainingData[val_, ctx_Association] := Lookup[ctx, "trainingData"];

ClConTakeTestData[___][$ClConFailure] := $ClConFailure;
ClConTakeTestData[$ClConFailure] := $ClConFailure;
ClConTakeTestData[][val_, ctx_] := ClConTakeTestData[val, ctx];
ClConTakeTestData[val_, ctx_Association] := Lookup[ctx, "testData"];

ClConTakeValidationData[___][$ClConFailure] := $ClConFailure;
ClConTakeValidationData[$ClConFailure] := $ClConFailure;
ClConTakeValidationData[][val_, ctx_] := ClConTakeValidationData[val, ctx];
ClConTakeValidationData[val_, ctx_Association] := Lookup[ctx, "validationData"];
(* ClConTakeData -- binds the monad unit first to ClConRecoverData and then to ClConTakeValue. *)
(* Failures are passed through. *)
ClConTakeData[___][$ClConFailure] := $ClConFailure;
ClConTakeData[$ClConFailure] := $ClConFailure;
ClConTakeData[][val_, ctx_] := ClConTakeData[val, ctx];
ClConTakeData[val_, ctx_] :=
    ClConBind[ ClConBind[ ClConUnit[val, ctx], ClConRecoverData ], ClConTakeValue ];
(* ClConTakeClassifier and ClConTakeROCData give the (non-monadic) context values of the keys *)
(* "classifier" and "rocData" respectively. If the key is absent the result is the Missing[...] *)
(* value of the key lookup. Failures are passed through. *)

ClConTakeClassifier[___][$ClConFailure] := $ClConFailure;
ClConTakeClassifier[$ClConFailure] := $ClConFailure;
ClConTakeClassifier[][val_, ctx_] := ClConTakeClassifier[val, ctx];
ClConTakeClassifier[val_, ctx_Association] := Lookup[ctx, "classifier"];

ClConTakeROCData[___][$ClConFailure] := $ClConFailure;
ClConTakeROCData[$ClConFailure] := $ClConFailure;
ClConTakeROCData[][val_, ctx_] := ClConTakeROCData[val, ctx];
ClConTakeROCData[val_, ctx_Association] := Lookup[ctx, "rocData"];
(* ClConTakeClassLabelIndex[classLabel][xs, context] -- finds the column index of the class label. *)
(* The variable names are obtained with ClConTakeVariableNames. The result is an Association *)
(*   <| label -> index |>  or $ClConFailure. *)
(* With classLabel being Automatic (also the default of the form without arguments): *)
(*   - if the context has the key "classLabel" that value is looked up in the variable names; *)
(*   - otherwise the last variable name is taken to be the class label. *)
(* An explicitly given classLabel has to be one of the variable names. *)

ClConTakeClassLabelIndex[___][$ClConFailure] := $ClConFailure;

ClConTakeClassLabelIndex[][xs_, context_Association] := ClConTakeClassLabelIndex[Automatic][xs, context];

ClConTakeClassLabelIndex[classLabel_][xs_, context_Association] :=
    Block[{varNames},

      varNames = ClConBind[ ClConUnit[xs, context], ClConTakeVariableNames ];

      (* Assigning automatic variable names here when they cannot be found was tried and deliberately *)
      (* left out ("not a good idea"); a failure to find the names is simply propagated. *)

      Which[
        TrueQ[varNames === $ClConFailure],
        $ClConFailure,

        TrueQ[classLabel===Automatic] && KeyExistsQ[context, "classLabel"] && MemberQ[varNames, context["classLabel"]],
        <| context["classLabel"] -> First@Flatten@Position[varNames, context["classLabel"]] |>,

        (* The context class label is not among the variable names: the message has to say "not". *)
        TrueQ[classLabel===Automatic] && KeyExistsQ[context, "classLabel"] && !MemberQ[varNames, context["classLabel"]],
        Echo["The context value for \"classLabel\" is not one of " <> ToString[varNames], "ClConTakeClassLabelIndex::"];
        $ClConFailure,

        TrueQ[classLabel===Automatic],
        <| varNames[[-1]] -> Length[varNames] |>,

        MemberQ[varNames, classLabel],
        <| classLabel -> First@Flatten@Position[varNames, classLabel] |>,

        (* The given class label is not among the variable names: the message has to say "not". *)
        True,
        Echo["The specified class label "<>ToString[classLabel]<>" is not one of " <> ToString[varNames], "ClConTakeClassLabelIndex::"];
        $ClConFailure
      ]
    ];
(* ClConTakeVariableNames -- binds the monad unit first to ClConGetVariableNames and then to *)
(* ClConTakeValue. Failures are passed through. *)
ClConTakeVariableNames[][$ClConFailure] := $ClConFailure;
ClConTakeVariableNames[$ClConFailure] := $ClConFailure;
ClConTakeVariableNames[][val_, ctx_] := ClConTakeVariableNames[val, ctx];
ClConTakeVariableNames[val_, ctx_Association] :=
    ClConBind[ ClConBind[ ClConUnit[val, ctx], ClConGetVariableNames ], ClConTakeValue ];
(**************************************************************)
(* Dealing with variable names *)
(**************************************************************)
(* DatasetWithColumnNamesQ[ds] -- True if taking the keys of the first row of the Dataset ds gives *)
(* a result free of Missing[...]; False for anything that is not a Dataset. *)
Clear[DatasetWithColumnNamesQ]

DatasetWithColumnNamesQ[dataset_Dataset] :=
    With[{firstRowKeys = Normal[dataset[1, Keys]]},
      FreeQ[ firstRowKeys, _Missing ]
    ];

DatasetWithColumnNamesQ[___] := False;
(* ClConGetVariableNames -- finds the variable names and puts them as the pipeline value. *)
(* The context is not modified. The names are looked up, in this order, from: *)
(*   1. the pipeline value, if it is a Dataset with named columns; *)
(*   2. the "trainingData" element of the pipeline value (an Association), if it is such a Dataset; *)
(*   3. the "trainingData" element of the context, if it is such a Dataset; *)
(*   4. the "variableNames" element of the context. *)
(* If none applies a message is echoed and $ClConFailure is returned. *)

(* Clear (not ClearAll) is used so that the ::usage message declared in the public section is kept. *)
Clear[ClConGetVariableNames];

ClConGetVariableNames[][$ClConFailure] := $ClConFailure;
ClConGetVariableNames[$ClConFailure] := $ClConFailure;

ClConGetVariableNames[][xs_, context_] := ClConGetVariableNames[xs, context];

ClConGetVariableNames[xs_, context_Association] :=
    Which[
      DatasetWithColumnNamesQ[xs],
      ClCon[Normal[xs[1,Keys]], context],

      MatchQ[xs, _Association] && KeyExistsQ[xs, "trainingData"] && DatasetWithColumnNamesQ[xs["trainingData"]],
      ClCon[Normal[xs["trainingData"][1,Keys]], context],

      (* NOTE(review): this branch also requires the key "testData" in the context although only *)
      (* "trainingData" is used and the message below mentions only "trainingData" -- confirm the intent. *)
      KeyExistsQ[context, "trainingData"] && KeyExistsQ[context, "testData"] && DatasetWithColumnNamesQ[context["trainingData"]],
      ClCon[Normal[context["trainingData"][1,Keys]], context],

      KeyExistsQ[context, "variableNames"],
      ClCon[context["variableNames"], context],

      True,
      Echo["Cannot find the variable names: (1) there is no context key \"variableNames\", " <>
          "(2) the pipeline value is not a Dataset with named columns, and " <>
          "(3) there is no \"trainingData\" key in the context or the corresponding value is not a Dataset with named columns.",
        "ClConGetVariableNames:"];
      $ClConFailure
    ];
(* ClConEchoVariableNames -- finds the variable names (with ClConGetVariableNames) and echoes them. *)
(* The pipeline value and the context are passed through unchanged. *)
(* If the variable names cannot be found $ClConFailure is returned. *)

(* Clear (not ClearAll) is used so that the ::usage message declared in the public section is kept. *)
Clear[ClConEchoVariableNames];

ClConEchoVariableNames[$ClConFailure] := $ClConFailure;
ClConEchoVariableNames[][$ClConFailure] := $ClConFailure;

ClConEchoVariableNames[][xs_, context_] := ClConEchoVariableNames[xs, context];

ClConEchoVariableNames[xs_, context_Association] :=
    Block[{names},
      names = Fold[ ClConBind, ClConUnit[xs, context], {ClConGetVariableNames, ClConTakeValue}];
      If[ TrueQ[ names === $ClConFailure ],
        $ClConFailure,
        (*ELSE*)
        Echo[names, "variable names:"];
        ClConUnit[xs, context]
      ]
    ];
(* ClConAssignVariableNames[varNames][xs, context] -- puts "variableNames" into the context in *)
(* correspondence to the number of columns of the context "trainingData". *)
(*   varNames -- a list of strings (possibly empty) or Automatic; no argument means the empty list. *)
(* If fewer names than columns are given the remaining names are the column indices as strings; *)
(* if more names than columns are given the extra ones are dropped. *)
(* The pipeline value is not changed. $ClConFailure is returned if there is no "trainingData" in *)
(* the context, its type is not recognized, or the argument is not a list of strings or Automatic. *)

(* Clear (not ClearAll) is used so that the ::usage message declared in the public section is kept. *)
Clear[ClConAssignVariableNames]

ClConAssignVariableNames[$ClConFailure] := $ClConFailure;
ClConAssignVariableNames[][$ClConFailure] := $ClConFailure;

ClConAssignVariableNames[xs_, context_Association] := ClConAssignVariableNames[{}][xs, context];

ClConAssignVariableNames[][xs_, context_] := ClConAssignVariableNames[{}][xs, context];

ClConAssignVariableNames[Automatic][xs_, context_] := ClConAssignVariableNames[{}][xs, context];

ClConAssignVariableNames[varNamesArg:{_String...}][xs_, context_Association] :=
    Block[{varNames = varNamesArg, ncols, dsQ, mlrQ, arrQ},

      If[ KeyExistsQ[context, "trainingData"],

        dsQ = TrueQ[ Head[context["trainingData"]] === Dataset ];
        mlrQ = DataArrayRulesForClassifyQ[ context["trainingData"] ];
        arrQ = MatrixQ[context["trainingData"]];

        (* Number of columns, the label column included. *)
        ncols =
            Which[
              dsQ || arrQ, Dimensions[context["trainingData"]][[2]],
              mlrQ, Dimensions[context["trainingData"][[All,1]]][[2]] + 1,
              True, Missing["NA"]
            ];

        Which[
          NumberQ[ncols] && (dsQ || mlrQ || arrQ) && Length[varNames] < ncols,
          varNames = Join[ varNames, ToString /@ Range[Length[varNames] + 1, ncols] ],

          NumberQ[ncols] && (dsQ || mlrQ || arrQ) && Length[varNames] >= ncols,
          varNames = Take[varNames, ncols],

          True,
          Echo["Unknown training data type or the specified variable names do not correspond to the \"trainingData\" dimensions.", "ClConAssignVariableNames:"];
          Return[$ClConFailure]
        ],
        (*ELSE*)
        Echo["No training data in the context.", "ClConAssignVariableNames:"];
        Return[$ClConFailure]
      ];

      ClConUnit[xs, Join[context, <|"variableNames"->varNames|>] ]
    ];

(* Catch-all for wrong argument specifications. *)
(* The semicolon after Echo matters: without it the two expressions are multiplied and the result *)
(* is not $ClConFailure. *)
ClConAssignVariableNames[___][xs_, context_Association] :=
    Block[{},
      Echo["The first argument is expected to be a list of strings or Automatic.", "ClConAssignVariableNames:"];
      $ClConFailure
    ];
(************************************************************)
(* ClConSummarizeData *)
(************************************************************)
(* GetData[xs, context] -- private helper that finds the data to be summarized. *)
(* Returns: *)
(*   - <|Anonymous -> xs|> if the pipeline value xs is itself data (Dataset, classifier rules, or matrix); *)
(*   - the "trainingData", "testData", "validationData" elements of xs, if xs is an Association with *)
(*     at least one of these keys; *)
(*   - otherwise the same elements of the context, if it has at least one of these keys; *)
(*   - {} if no data is found. *)
(* The result is only returned; nothing is assigned to the symbol ctData, which the callers *)
(* localize themselves. *)
Clear[GetData]

GetData[xs_, context_] :=
    Which[
      Head[xs] === Dataset || DataRulesForClassifyQ[xs] || MatrixQ[xs],
      <|Anonymous->xs|>,

      MatchQ[xs, _Association] && ( KeyExistsQ[xs, "trainingData"] || KeyExistsQ[xs, "testData"] || KeyExistsQ[xs, "validationData"] ),
      KeyTake[xs, {"trainingData", "testData", "validationData"}],

      KeyExistsQ[context, "trainingData"] || KeyExistsQ[context, "testData"] || KeyExistsQ[context, "validationData"],
      KeyTake[context, {"trainingData", "testData", "validationData"}],

      True,
      {}
    ];
(* ClConSummarizeData[opts][xs, context] -- summarizes the data found with GetData. *)
(* Options (besides those of DataColumnsSummary): *)
(*   "Type" -- Automatic (default), "LongForm", or another explicit value, e.g. "WideForm"; *)
(*   "Echo" -- whether to echo the wide form summaries (default True). *)
(* With "Type" -> Automatic the wide form (RecordsSummary) is used if the data has fewer than 9 *)
(* columns, otherwise the work is delegated to ClConSummarizeDataLongForm. An explicit type other *)
(* than "LongForm" gives the wide form. Data of the form {(atom -> label)..} is always summarized *)
(* in wide form. *)
(* The result pipeline value is the summaries; the context is not modified. *)
(* $ClConFailure is returned if no data is found. *)

(* Clear (not ClearAll) is used so that the ::usage messages declared in the public section are kept. *)
Clear[ClConSummarizeData, ClConSummarizeDataLongForm];

Options[ClConSummarizeData] = Join[ {"Type" -> Automatic, "Echo"->True }, Options[DataColumnsSummary]];

ClConSummarizeData[$ClConFailure] := $ClConFailure;

ClConSummarizeData[___][$ClConFailure] := $ClConFailure;

ClConSummarizeData[xs_, context_Association] := ClConSummarizeData[][xs,context];

ClConSummarizeData[opts:OptionsPattern[]][xs_, context_] :=
    Block[{ctData=GetData[xs,context], res, rsOpts, dims, type},

      type = OptionValue[ClConSummarizeData, "Type"];

      (* The options passed on to the summary functions: everything except "Type" and "Echo". *)
      rsOpts = DeleteCases[{opts},("Type"->_)|("Echo"->_)];

      If[ Length[ctData] == 0 || And @@ Map[ Length[#]==0 &, ctData],
        Echo["Cannot find data in the context.", "ClConSummarizeData:"];
        Return[$ClConFailure]
      ];

      If[ DataRulesForClassifyQ[ctData[[1]]] && ! DataArrayRulesForClassifyQ[ctData[[1]]],
        (* I.e {(_?AtomQ->_lbl)..} *)
        type = "WideForm"
      ];

      dims =
          If[DataArrayRulesForClassifyQ[ctData[[1]]],
            Dimensions[ ctData[[1, All, 1]] ],
            Dimensions[ ctData[[1]] ]
          ];

      (* Unequal (not UnsameQ) is intended in the second test: for type Automatic it stays *)
      (* unevaluated, TrueQ gives False, and the decision is left to the column count test. *)
      If[ TrueQ[ type === Automatic] && Length[dims] > 1 && dims[[2]] < 9 ||
          TrueQ[ type != "LongForm"],

        res = RecordsSummary[#, rsOpts, Thread->True]& /@ ctData;

        (* The head is given explicitly, as for "Type" above: the left hand side of this definition *)
        (* has the head ClConSummarizeData[...], not the symbol that owns the options. *)
        If[ TrueQ[OptionValue[ClConSummarizeData, "Echo"]],
          ClConBind[ ClConUnit[ Normal@res, context], ClConEchoFunctionValue["summaries:", Identity] ],
          ClConUnit[ Normal@res, context]
        ],
        (* ELSE *)
        ClConSummarizeDataLongForm[rsOpts][xs, context]
      ]
    ];
Options[ClConSummarizeDataLongForm] = Join[ {"Echo"->True}, Options[DataColumnsSummary]];

(* Failure propagation. *)
ClConSummarizeDataLongForm[$ClConFailure] := $ClConFailure;

ClConSummarizeDataLongForm[___][$ClConFailure] := $ClConFailure;

(* Direct use as a monad function, without options. *)
ClConSummarizeDataLongForm[xs_, context_Association] := ClConSummarizeDataLongForm[][xs,context];

(* Summarizes the data in long form: each data set is converted to
   {"RowID", "Variable", "Value"} triplets (through an SSparseMatrix object)
   and the triplets are summarized with RecordsSummary.
   Options:
     "Echo" - whether to echo the summaries;
     the remaining options are passed to RecordsSummary.
   The new pipeline value is the collection of summaries; the context is unchanged.
   Gives $ClConFailure if no data or no variable names can be found. *)
ClConSummarizeDataLongForm[opts:OptionsPattern[]][xs_, context_] :=
    Block[{rsOpts, varNames, ctData, data, sMat, dataLongForm, res, echoQ},

      (* Explicit-head form of OptionValue, since this is a SubValues definition. *)
      echoQ = TrueQ[OptionValue[ClConSummarizeDataLongForm, "Echo"]];

      (* Options meant for RecordsSummary only. *)
      rsOpts = DeleteCases[{opts},("Type"->_)|("Echo"->_)];

      ctData = GetData[xs, context];

      If[ Length[ctData] == 0 || And @@ Map[ Length[#]==0 &, ctData],
        Echo["Cannot find data in the context.", "ClConSummarizeDataLongForm:"];
        Return[$ClConFailure]
      ];

      varNames = ClConBind[ ClConUnit[xs,context], ClConTakeVariableNames ];

      (* No variable names available: assign them in a temporary context made from the found data.
         For anonymous data only (at most) the first three records are used for that. *)
      If[ TrueQ[varNames === $ClConFailure],
        If[ Keys[ctData] === {Anonymous},
          res = <| "trainingData" -> First @ Values @Map[Take[#,UpTo[3]]&, ctData] |>,
          res = ctData
        ];
        varNames = Fold[ ClConBind, ClConUnit[{}, res], { ClConAssignVariableNames, ClConTakeVariableNames} ]
      ];

      If[ TrueQ[varNames === $ClConFailure], Return[$ClConFailure] ];

      ctData = ClConToNormalClassifierData /@ ctData;

      res =
          Function[{data},
            If[Length[data] == 0, {},
              (* Each record lhs -> rhs is flattened into one row of the matrix. *)
              sMat = ToSSparseMatrix[SparseArray@(Flatten@*List @@@ data),
                "ColumnNames" -> varNames,
                "RowNames" -> ToString /@ Range[Length[data]]];
              dataLongForm = SSparseMatrixToTriplets[sMat];
              RecordsSummary[dataLongForm, {"RowID", "Variable", "Value"}, rsOpts]
            ]
          ] /@ ctData;

      If[ echoQ,
        ClConBind[ ClConUnit[ Normal@res, context], ClConEchoFunctionValue["summaries:", Identity] ],
        ClConUnit[ Normal@res, context]
      ]
    ];
(**************************************************************)
(* DeleteMissing *)
(**************************************************************)
ClearAll[ClConDeleteMissing];

(* Failure propagation. *)
ClConDeleteMissing[$ClConFailure] := $ClConFailure;

(* Direct use as a monad function. *)
ClConDeleteMissing[xs_, context_] := ClConDeleteMissing[][xs, context];

(* Obtains the data with ClConGetData, removes the missing values with DeleteMissing[data, 1, 2],
   and places the result both as pipeline value and under the context key "data". *)
ClConDeleteMissing[][xs_, context_] :=
    Block[{data},

      data = ClConBind[ ClConGetData[xs, context], ClConTakeValue];

      If[ data =!= $ClConFailure,
        data = DeleteMissing[data, 1, 2];
        ClConUnit[ data, Join[ context, <|"data"->data|>] ],
        (*ELSE*)
        $ClConFailure
      ]
    ];

(* Any other signature is a failure. *)
ClConDeleteMissing[___][__] := $ClConFailure;
(************************************************************)
(* ClConMakeClassifier *)
(************************************************************)
ClearAll[ClConMethodQ, ClConMethodListQ, ClConResamplingMethodQ, ClConResamplingMethodListQ, ClConMethodSpecQ, ClConClassifierQ, ClConMakeClassifier];

(* A single classifier method: a method name or {name, option rules..}. *)
(* TODO: also check that the method is known by Classify. *)
ClConMethodQ[x_] := StringQ[x] || MatchQ[ x, {_String, _Rule..} ];

(* A non-empty list of classifier methods. *)
ClConMethodListQ[x_] := MatchQ[ x, {_?ClConMethodQ..} ];

(* A single resampling specification. *)
ClConResamplingMethodQ[x_] := MatchQ[ x, (Automatic | _Association | _String | {_String, _?NumberQ} | {_String, _?NumberQ, _Integer} | {_String, _?NumberQ, _Integer, RandomSample | RandomChoice}) ];

(* A non-empty list of resampling specifications. *)
(* Note that this covers lists of method names, but not method lists that have {name, option rules..} elements. *)
ClConResamplingMethodListQ[x_] := MatchQ[ x, { _?ClConResamplingMethodQ .. } ];

(* Any specification accepted as first argument of ClConMakeClassifier. *)
(* ClConMethodListQ is tested explicitly in order to accept method lists with {name, option rules..} elements. *)
ClConMethodSpecQ[x_] := ClConMethodQ[x] || ClConMethodListQ[x] || ClConResamplingMethodQ[x] || ClConResamplingMethodListQ[x];
(* Gives True if cl is a ClassifierFunction object, or -- when definitions of
   ClassifierEnsembles`EnsembleClassifierMeasurements are present -- an association
   with ClassifierFunction values. Gives False otherwise.
   The ensemble test is combined with && so that the result is always True or False;
   an If without an else branch would give Null here. *)
ClConClassifierQ[ cl_ ] :=
    MatchQ[ cl, _ClassifierFunction] ||
        ( Length[DownValues[ClassifierEnsembles`EnsembleClassifierMeasurements]] > 0 &&
            MatchQ[ cl, Association[(_ -> _ClassifierFunction) ..] ] );
Options[ClConMakeClassifier] = Options[Classify];

(* Failure propagation. *)
ClConMakeClassifier[___][$ClConFailure] := $ClConFailure;

(* Direct use as a monad function. *)
ClConMakeClassifier[xs_, context_Association] := ClConMakeClassifier[][xs, context];

(* Without arguments the method "LogisticRegression" is used. *)
ClConMakeClassifier[][xs_, context_] := ClConMakeClassifier["LogisticRegression"][xs, context];

(* Options only: the method specification is Automatic. *)
ClConMakeClassifier[opts:OptionsPattern[]][xs_, context_] := ClConMakeClassifier[Automatic,opts][xs, context];

(* Makes a classifier (or a classifier ensemble) over the training data.
   The training data is taken from the pipeline value, if that is an association
   with the key "trainingData", otherwise from the context.
   methodSpecArg - a method, a list of methods, or (a list of) resampling specification(s);
   opts          - options passed to the classifier making function.
   On success the classifier is the new pipeline value, it is stored in the context
   under "classifier", and the context key "rocData" is dropped.
   If there is no training data the monad is returned unchanged.
   Gives $ClConFailure if the classifier making fails. *)
ClConMakeClassifier[methodSpecArg_?ClConMethodSpecQ, opts:OptionsPattern[]][xs_, context_] :=
    Block[{cf, dataAssoc, newContext, methodSpec = methodSpecArg},

      Which[
        MatchQ[xs, _Association] && KeyExistsQ[xs, "trainingData"],
        dataAssoc = xs; newContext = Join[context, xs],

        KeyExistsQ[context, "trainingData"],
        dataAssoc = context; newContext = <||>,

        True,
        Echo["No training data. (No changes in argument and context were made.)", "ClConMakeClassifier:"];
        Return[ClCon[xs, context]]
      ];

      (* Note that if opts has Method->_ then this setting is ignored. *)
      If[ TrueQ[methodSpec===Automatic], methodSpec = "LogisticRegression" ];

      Which[
        (* Single method, no usable validation data. *)
        ClConMethodQ[methodSpec] && ( !KeyExistsQ[dataAssoc, "validationData"] || TrueQ[dataAssoc["validationData"] === Automatic] ),
        cf = Classify[ClConToNormalClassifierData[dataAssoc@"trainingData"], opts, Method -> methodSpec ],

        (* Single method with validation data. *)
        ClConMethodQ[methodSpec] && KeyExistsQ[dataAssoc, "validationData"],
        cf = Classify[ClConToNormalClassifierData[dataAssoc@"trainingData"], opts, Method -> methodSpec,
          ValidationSet -> ClConToNormalClassifierData[dataAssoc@"validationData"] ],

        (* Ensemble of methods. *)
        ClConMethodListQ[methodSpec],
        cf = EnsembleClassifier[ methodSpec, ClConToNormalClassifierData[dataAssoc@"trainingData"], opts ],

        (* Ensemble by a single resampling specification. *)
        ClConResamplingMethodQ[methodSpec],
        cf = ResamplingEnsembleClassifier[ {methodSpec}, ClConToNormalClassifierData[dataAssoc@"trainingData"], opts ],

        (* Ensemble by a list of resampling specifications. *)
        ClConResamplingMethodListQ[methodSpec],
        cf = ResamplingEnsembleClassifier[ methodSpec, ClConToNormalClassifierData[dataAssoc@"trainingData"], opts ],

        True,
        Echo["Unknown classifier specification.", "ClConMakeClassifier:"];
        cf = None;
      ];

      (* TrueQ guards against a predicate result that is neither True nor False;
         with such a result the If would otherwise remain unevaluated. *)
      If[ ! TrueQ[ClConClassifierQ[cf]],
        Echo["Classifier making failure.", "ClConMakeClassifier:"];
        $ClConFailure,
        (* ELSE *)
        ClCon[cf, KeyDrop[ Join[context, newContext, <|"classifier" -> cf|>], "rocData"] ]
      ]
    ];

(* Fallback for unsupported argument signatures. *)
ClConMakeClassifier[___][xs_, context_Association] :=
    Block[{},
      Echo[ "The first argument is expected to be a classifier ensemble specification.",
        "ClConMakeClassifier:"
      ];
      $ClConFailure
    ];
(************************************************************)
(* ClConClassifierMeasurements *)
(************************************************************)
ClearAll[ClConClassifierMeasurements]

Options[ClConClassifierMeasurements] = { Method -> Automatic, "ROCRange" -> Range[0,1,0.025]};

(* Direct use as a monad function and use without arguments:
   the measures "Accuracy", "Precision", and "Recall" are computed. *)
ClConClassifierMeasurements[xs_, context_Association] := ClConClassifierMeasurements[{"Accuracy", "Precision", "Recall"}][xs,context];

ClConClassifierMeasurements[][xs_, context_Association] := ClConClassifierMeasurements[{"Accuracy", "Precision", "Recall"}][xs,context];

(* Failure propagation. *)
ClConClassifierMeasurements[___][$ClConFailure] := $ClConFailure;

(* Computes the specified measures for the context classifier over the context test data.
   measuresArg - a measure name or a list of measure names;
   opts        - "ROCRange" is used for ensemble "ROCCurve" plots;
                 all options are passed to EnsembleClassifierMeasurements.
   The new pipeline value is an association measure -> value; the context is unchanged.
   Gives $ClConFailure if the context has no (proper) classifier or no test data. *)
ClConClassifierMeasurements[measuresArg : (_String | {_String ..}), opts:OptionsPattern[]][xs_, context_] :=
    Block[{cm, measures = Flatten[{measuresArg}], cmROC, rocRange},

      rocRange = OptionValue[ ClConClassifierMeasurements, "ROCRange"];
      If[ !( VectorQ[rocRange,NumberQ] && Apply[And, 0 <= # <= 1& /@ rocRange] ),
        Echo["The value of the option \"ROCRange\" is expected to be a list of numbers between 0 and 1.", "ClConClassifierMeasurements:"];
        Echo["Continuing with \"ROCRange\"-> Range[0,1,0.025].", "ClConClassifierMeasurements:"];
        rocRange = Range[0,1,0.025];
      ];

      Which[
        !KeyExistsQ[context, "classifier"],
        Echo["Make a classifier first.", "ClConClassifierMeasurements:"];
        $ClConFailure,

        !( MatchQ[ context["classifier"], _ClassifierFunction] || MatchQ[ context["classifier"], Association[(_ -> _ClassifierFunction) ..] ] ),
        Echo["The value of \"classifier\" in the context is not a ClassifierFunction object or an Association of ClassifierFunction objects.", "ClConClassifierMeasurements:"];
        $ClConFailure,

        !KeyExistsQ[context, "testData"] || Length[context["testData"]] == 0,
        Echo["Cannot find test data in the context.","ClConClassifierMeasurements:"];
        $ClConFailure,

        (* Single classifier. *)
        MatchQ[ context["classifier"], _ClassifierFunction],
        cm = ClassifierMeasurements[context["classifier"], ClConToNormalClassifierData[context@"testData"]];
        ClCon[AssociationThread[measures -> cm /@ measures], context],

        (* Classifier ensemble, no ROC curve requested. *)
        !MemberQ[measures, "ROCCurve"],
        cm = EnsembleClassifierMeasurements[ context["classifier"], ClConToNormalClassifierData[context@"testData"], measures, opts];
        ClCon[AssociationThread[measures -> cm], context],

        True,
        (* This here reconciles EnsembleClassifierMeasurements with ClassifierMeasurements.
           It is probably better to be handled in ClassifierEnsembles.m .
           The advantage of handling it here is that ClassifierEnsembles.m does not have to be changed. *)
        cmROC =
            EnsembleClassifierROCPlots[
              context["classifier"], ClConToNormalClassifierData @ context["testData"], rocRange,
              PlotJoined -> True, PlotRange -> {{0, 1}, {0, 1}},
              GridLines -> Automatic];
        measures = DeleteCases[measures, "ROCCurve"];
        cm = EnsembleClassifierMeasurements[ context["classifier"], ClConToNormalClassifierData[context@"testData"], measures, opts];
        ClCon[Join[AssociationThread[measures -> cm], <|"ROCCurve"->cmROC|>], context]
      ]
    ];

(* Fallback for unsupported argument signatures. *)
ClConClassifierMeasurements[___][xs_, context_Association] :=
    Block[{},
      Echo[ "The first argument is expected to be a list of classifier measures.",
        "ClConClassifierMeasurements:"
      ];
      $ClConFailure
    ];
(************************************************************)
(* ClConClassifierMeasurementsByThreshold *)
(************************************************************)
ClearAll[ClConClassifierMeasurementsByThreshold]

Options[ClConClassifierMeasurementsByThreshold] = { Method -> Automatic, "ROCRange" -> Range[0,1,0.025]};

(* Failure propagation. *)
ClConClassifierMeasurementsByThreshold[___][$ClConFailure] := $ClConFailure;

(* Computes classifier measures using EnsembleClassifyByThreshold with the rule classLabel -> th.
   measures         - a measure name or a list of measure names;
   classLabel -> th - a class label and a numerical threshold for it;
   opts             - passed to ClConClassifierMeasurements.
   A single ClassifierFunction in the context is first wrapped into a one element association
   (keyed by its method), so that the ensemble functions can be applied to it.
   Gives $ClConFailure if there is no classifier in the context. *)
ClConClassifierMeasurementsByThreshold[measures : (_String | {_String ..}), classLabel_ -> th_?NumberQ, opts:OptionsPattern[]][xs_, context_] :=
    Block[{aCl},
      Which[
        KeyExistsQ[context, "classifier"] && TrueQ[ Head[ context["classifier"] ] === ClassifierFunction ],
        aCl = <|ClassifierInformation[context["classifier"], Method] -> context["classifier"] |>;
        ClConBind[
          ClConUnit[ xs, Join[context, <|"classifier"-> aCl|>] ],
          ClConClassifierMeasurements[ measures, Method -> (EnsembleClassifyByThreshold[##, classLabel -> th]&), opts ]
        ],

        KeyExistsQ[context, "classifier"] && AssociationQ[ context["classifier"] ],
        ClConBind[
          ClConUnit[ xs, context ],
          ClConClassifierMeasurements[ measures, Method -> (EnsembleClassifyByThreshold[##, classLabel -> th]&), opts ]
        ],

        True,
        Echo["Make a classifier first.", "ClConClassifierMeasurementsByThreshold:"];
        $ClConFailure
      ]
    ];

(* Fallback for unsupported argument signatures. *)
ClConClassifierMeasurementsByThreshold[___][xs_, context_Association] :=
    Block[{},
      Echo[ "The first argument is expected to be a list of classifier measures; " <>
          "the second argument is expected to be a rule of the form classLabel_ -> th_?NumberQ.",
        "ClConClassifierMeasurementsByThreshold:"
      ];
      $ClConFailure
    ];
(************************************************************)
(* ClConROCData *)
(************************************************************)
(* This is done as a separate function because it is important to be able to extract and manipulate that data. *)
(* Another reason is that there is no easy way of extracting that from the ClassifierMeasurements objects. *)
(* Here we extract that data in a "black box" manner. *)
(* Meaning the classifier is repeatedly called over a test set. *)
(* Note that there is an *Echo* version. This suggests a possible computation optimization. *)
ClearAll[ClConROCData]

Options[ClConROCData] = { "ROCRange" -> Automatic, "ClassLabels" -> All };

(* Failure propagation. (Delayed definitions, as for the other monad functions.) *)
ClConROCData[$ClConFailure] := $ClConFailure;

ClConROCData[___][$ClConFailure] := $ClConFailure;

(* Direct use as a monad function. *)
ClConROCData[xs_, context_Association] := ClConROCData[][xs, context];

(* (Of course) this implementation is very similar to ClConClassifierMeasurements. *)
(* So, some proper refactoring has to be done. *)
(* Module is used on purpose: with Block some unexpected execution flow involving res would happen. *)

(* Computes ROC data for the context classifier over the context test data.
   Options:
     "ROCRange"    - Automatic (meaning Range[0,1,0.025]) or a list of numbers between 0 and 1;
     "ClassLabels" - All, Automatic, a class label, or a list of class labels.
   The result is the new pipeline value and is also stored in the context under "rocData".
   Gives $ClConFailure if the context has no (proper) classifier or no test data. *)
ClConROCData[opts:OptionsPattern[]][xs_,context_]:=
    Module[{ rocRange, targetClasses, cl, res},

      rocRange = OptionValue[ ClConROCData, "ROCRange"];
      Which[
        TrueQ[ rocRange === Automatic ],
        rocRange = Range[0,1,0.025],

        !( VectorQ[rocRange,NumberQ] && Apply[And, 0 <= # <= 1& /@ rocRange] ),
        Echo["The value of the option \"ROCRange\" is expected to be a list of numbers between 0 and 1.", "ClConROCData:"];
        Echo["Continuing with \"ROCRange\"-> Range[0,1,0.025].", "ClConROCData:"];
        rocRange = Range[0,1,0.025];
      ];

      targetClasses = OptionValue[ ClConROCData, "ClassLabels"];
      If[ ! ( TrueQ[ targetClasses === All ] || TrueQ[ targetClasses === Automatic ] ) && AtomQ[targetClasses],
        (* Assuming that most likely the labels are _?AtomQ . *)
        targetClasses = Flatten[List[targetClasses]];
      ];

      Which[
        !KeyExistsQ[context, "classifier"],
        Echo["Make a classifier first.", "ClConROCData:"];
        Return[$ClConFailure],

        !( MatchQ[ context["classifier"], _ClassifierFunction] || MatchQ[ context["classifier"], Association[(_ -> _ClassifierFunction) ..] ] ),
        Echo["The value of \"classifier\" in the context is not a ClassifierFunction object or an Association of ClassifierFunction objects.", "ClConROCData:"];
        Return[$ClConFailure],

        !KeyExistsQ[context, "testData"] || Length[context["testData"]] == 0,
        Echo["Cannot find test data in the context.","ClConROCData:"];
        Return[$ClConFailure],

        (* A single classifier is wrapped into a one element "ensemble" keyed by its method. *)
        MatchQ[ context["classifier"], _ClassifierFunction],
        cl = <| ClassifierInformation[context["classifier"],Method] -> context["classifier"] |>,

        True,
        cl = context["classifier"]
      ];

      res = EnsembleClassifierROCData[ cl, ClConToNormalClassifierData[context["testData"]], rocRange, targetClasses];
      res = Association[res];

      ClConUnit[ res, Join[context, <|"rocData"->res|>] ]
    ];

(* Fallback for unsupported argument signatures. *)
ClConROCData[___][xs_,context_Association] :=
    Block[{},
      Echo["No arguments are expected.", "ClConROCData:"];
      $ClConFailure
    ];
(************************************************************)
(* ClConROCPlot *)
(************************************************************)
ClearAll[ClConROCPlot];

Options[ClConROCPlot] = Join[ Options[ClConROCData], Options[ROCPlot], {"Echo"->True} ];

(* Failure propagation. *)
ClConROCPlot[___][$ClConFailure] := $ClConFailure;

(*ClConROCPlot[][xs_,context_]:= ClConROCPlot[ "FPR", "TPR"][xs, context];*)

(* Options only: the standard ROC plot, "TPR" vs "FPR". *)
ClConROCPlot[opts:OptionsPattern[]][xs_,context_]:= ClConROCPlot[ "FPR", "TPR", opts][xs, context];

(* Makes ROC plots, one per element of the ROC data.
   xFuncName, yFuncName - names of ROC functions for the x and y axes;
   opts                 - options of ClConROCData and ROCPlot, and "Echo".
   The ROC data in the context ("rocData") is used if it is present and no ClConROCData options
   are given; otherwise the ROC data is (re)computed with ClConROCData.
   The plots are the new pipeline value; they are also echoed if "Echo" is True. *)
ClConROCPlot[ xFuncName_String, yFuncName_String, opts:OptionsPattern[]][xs_,context_]:=
    Block[{rocDataOpts, rocPlotOpts, rocPlotFunc, echoQ},

      echoQ = TrueQ[OptionValue[ClConROCPlot, "Echo"]];

      (* The given options that belong to ClConROCData. *)
      rocDataOpts =
          With[{lhs = Alternatives @@ Options[ClConROCData][[All, 1]]},
            Cases[{opts}, HoldPattern[Rule[lhs, _]]]
          ];

      (* The given options that belong to ROCPlot. *)
      rocPlotOpts =
          With[{lhs = Alternatives @@ Options[ROCPlot][[All, 1]]},
            Cases[{opts}, HoldPattern[Rule[lhs, _]]]
          ];

      rocPlotFunc = ROCPlot[ xFuncName, yFuncName, #, Sequence @@ rocPlotOpts, "PlotJoined" -> True, GridLines -> Automatic, ImageSize -> Small] &;

      Which[
        KeyExistsQ[context, "rocData"] && Length[rocDataOpts] == 0 && !echoQ,
        ClConBind[ ClCon[xs,context], (ClCon[ rocPlotFunc /@ #2["rocData"], #2 ]&) ],

        KeyExistsQ[context, "rocData"] && Length[rocDataOpts] == 0 && echoQ,
        Fold[ClConBind,
          ClCon[xs,context],
          { (ClCon[ rocPlotFunc /@ #2["rocData"], #2 ]&), ClConEchoFunctionValue["ROC plot(s):", # &] }
        ],

        !echoQ,
        Fold[ClConBind,
          ClCon[xs,context],
          { ClConROCData[Sequence @@ rocDataOpts], (ClCon[ rocPlotFunc /@ #1, #2 ]&) }
        ],

        True, (* echoQ *)
        Fold[ClConBind,
          ClCon[xs,context],
          { ClConROCData[Sequence @@ rocDataOpts], (ClCon[ rocPlotFunc /@ #1, #2 ]&), ClConEchoFunctionValue["ROC plot(s):", # &] }
        ]
      ]
    ];

(* Fallback for unsupported argument signatures. *)
ClConROCPlot[___][xs_,context_] :=
    Block[{},
      Echo["No arguments or two string arguments are expected. The arguments are names of ROC functions.", "ClConROCPlot:"];
      $ClConFailure
    ];
(************************************************************)
(* ClConROCListLinePlot *)
(************************************************************)
ClearAll[ClConROCListLinePlot];

Options[ClConROCListLinePlot] = Join[ Options[ClConROCData], Options[ListLinePlot], {"Echo"->True} ];

(* Failure propagation. *)
ClConROCListLinePlot[$ClConFailure] := $ClConFailure;

(*ClConROCListLinePlot[___] := $ClConFailure;*)

ClConROCListLinePlot[___][$ClConFailure] := $ClConFailure;

(* Makes list line plots of ROC functions over the ROC parameter, one plot per element of the ROC data.
   rocFuncs - a non-empty list of names of ROC functions;
   opts     - options of ClConROCData and ListLinePlot, and "Echo".
   The monadic steps are collected into one list that is folded with ClConBind:
   the context "rocData" is plotted if it is present and no ClConROCData options are given,
   otherwise the ROC data is computed first; an echo step is added if "Echo" is True. *)
ClConROCListLinePlot[rocFuncs:{_String...}, opts:OptionsPattern[]][xs_,context_]:=
    Block[{rocDataOpts, linePlotOpts, rocValsFunc, rocPlotFunc, echoQ },

      If[ rocFuncs === {},
        Echo["The first argument is expected to be a list of strings that are names of ROC functions.", "ClConROCListLinePlot::"];
        Return[$ClConFailure]
      ];

      echoQ = TrueQ[OptionValue[ClConROCListLinePlot, "Echo"]];

      (* Separate the given options by the function they belong to. *)
      rocDataOpts =
          With[{lhs = Alternatives @@ Options[ClConROCData][[All, 1]]},
            Cases[{opts}, HoldPattern[Rule[lhs, _]]]
          ];

      linePlotOpts =
          With[{lhs = Alternatives @@ Options[ListLinePlot][[All, 1]]},
            Cases[{opts}, HoldPattern[Rule[lhs, _]]]
          ];

      (* For a list of ROC records: ROC function name -> {{parameter, value}..}. *)
      rocValsFunc =
          Function[{roc},
            AssociationThread[rocFuncs,
              Map[Transpose[{Through[roc["ROCParameter"]], #}] &,
                N[Through[ROCFunctions[rocFuncs][roc]]]]]];

      rocPlotFunc = ListLinePlot[#, Sequence @@ linePlotOpts, PlotTheme -> "Detailed", ImageSize -> Small] &;

      Fold[ClConBind,
        ClCon[xs,context],
        Join[
          (* Plotting step(s). *)
          If[ KeyExistsQ[context, "rocData"] && Length[rocDataOpts] == 0,
            { ClCon[ rocPlotFunc /@ rocValsFunc /@ #2["rocData"], #2 ]& },
            (* ELSE *)
            { ClConROCData[Sequence @@ rocDataOpts], ClCon[ rocPlotFunc /@ rocValsFunc /@ #1, #2 ]& }
          ],
          (* Optional echo step. *)
          If[ echoQ,
            { ClConEchoFunctionValue["ROC line plot(s):", # &] },
            {}
          ]
        ]
      ]
    ];

(* Fallback for unsupported argument signatures. *)
ClConROCListLinePlot[___][xs_,context_Association] :=
    Block[{},
      Echo["The first argument is expected to be a list of strings that are names of ROC functions.", "ClConROCListLinePlot::"];
      $ClConFailure
    ];
(************************************************************)
(* ClConSuggestROCThresholds *)
(************************************************************)
ClearAll[ClConSuggestROCThresholds]

(* The options are passed to ClConROCData, hence its options are used. *)
Options[ClConSuggestROCThresholds] = Options[ClConROCData];

(* Failure propagation. *)
ClConSuggestROCThresholds[$ClConFailure] := $ClConFailure;

ClConSuggestROCThresholds[___][$ClConFailure] := $ClConFailure;

(* Direct use as a monad function and use without arguments: one threshold is suggested. *)
ClConSuggestROCThresholds[xs_, context_Association] := ClConSuggestROCThresholds[1][xs, context];

ClConSuggestROCThresholds[][xs_, context_] := ClConSuggestROCThresholds[1][xs, context];

(* For each element of the ROC data suggests (up to) n values of the ROC parameter:
   those with {FPR, TPR} points closest, by Euclidean distance, to the point {0, 1}.
   n    - number of thresholds to suggest;
   opts - if given, the ROC data is (re)computed with ClConROCData[opts];
          otherwise the context "rocData" is used, if present.
   The new pipeline value is an association key -> list of ROC parameter values;
   the context is unchanged. Gives $ClConFailure if the ROC data cannot be obtained. *)
ClConSuggestROCThresholds[n_Integer:1, opts:OptionsPattern[]][xs_, context_] :=
    Module[{rocData, rocPoints, rocDists},

      Which[
        KeyExistsQ[context, "rocData"] && Length[{opts}] == 0,
        rocData = context["rocData"],

        True,
        rocData = Fold[ ClConBind, ClConUnit[xs, context], { ClConROCData[opts], ClConTakeROCData} ]
      ];

      If[ TrueQ[rocData === $ClConFailure], Return[$ClConFailure] ];

      rocPoints = Map[Transpose[{ROCFunctions["FPR"][#], ROCFunctions["TPR"][#]}] &, rocData];

      (* key -> <| ROC parameter -> distance of the corresponding ROC point to {0, 1} |> *)
      rocDists =
          Association[
            KeyValueMap[
              Function[{k, rps},
                k ->
                    Association[
                      MapThread[#2 -> EuclideanDistance[#1, {0, 1}] &, {N@rps, Through[rocData[k]["ROCParameter"]]}]]],
              rocPoints]
          ];

      ClCon[ Keys[TakeSmallest[#, UpTo[n]]] & /@ rocDists, context]
    ];
(************************************************************)
(* ClConAccuracyByVariableShuffling *)
(************************************************************)
ClearAll[ClConAccuracyByVariableShuffling];

Options[ClConAccuracyByVariableShuffling] = { "ClassLabels" -> None };

(* Failure propagation. *)
ClConAccuracyByVariableShuffling[$ClConFailure] := $ClConFailure;

ClConAccuracyByVariableShuffling[___][$ClConFailure] := $ClConFailure;

(* Direct use as a monad function and use without arguments. *)
ClConAccuracyByVariableShuffling[xs_, context_Association] :=
    ClConAccuracyByVariableShuffling["ClassLabels" -> None][xs, context];

ClConAccuracyByVariableShuffling[][xs_, context_] :=
    ClConAccuracyByVariableShuffling["ClassLabels" -> None][xs, context];

(* Applies AccuracyByVariableShuffling to the context classifier and the context test data.
   The result is the new pipeline value; the context is unchanged.
   Gives $ClConFailure if the context has no test data or no classifier,
   or if the test data is neither a Dataset nor array rules data. *)
ClConAccuracyByVariableShuffling[opts : OptionsPattern[]][xs_, context_] :=
    Block[{fsClasses = FilterRules[{opts}, "ClassLabels"]},

      (* NOTE(review): FilterRules gives a list of rules, so fsClasses is either {} or
         {"ClassLabels" -> value}; the comparison with Automatic cannot hold -- confirm
         what the fourth argument of AccuracyByVariableShuffling is expected to be. *)
      If[ fsClasses === {} || fsClasses === Automatic, fsClasses = None ];

      (* Guard clauses: both the test data and the classifier are required. *)
      If[ !KeyExistsQ[context, "testData"],
        Echo["No test data is the context.", "ClConAccuracyByVariableShuffling:"];
        Return[$ClConFailure]
      ];

      If[ !KeyExistsQ[context, "classifier"],
        Echo["No classifier in the context.", "ClConAccuracyByVariableShuffling:"];
        Return[$ClConFailure]
      ];

      Which[
        (* Dataset: the variable names are the column names without the last one. *)
        TrueQ[ Head[context["testData"]] === Dataset ],
        ClCon[
          AccuracyByVariableShuffling[
            context["classifier"],
            ClConToNormalClassifierData[context["testData"]],
            Most@Keys[Normal@context["testData"][[1]]],
            fsClasses],
          context],

        (* Array rules: automatic variable names. *)
        DataArrayRulesForClassifyQ[context["testData"]],
        ClCon[
          AccuracyByVariableShuffling[
            context["classifier"],
            context["testData"],
            Automatic,
            fsClasses],
          context],

        True,
        Echo["Unknown data type of the test data is the context.", "ClConAccuracyByVariableShuffling:"];
        $ClConFailure
      ]
    ];
(************************************************************)
(* ClConReduceDimension *)
(************************************************************)
ClearAll[ClConReduceDimension]
Options[ClConReduceDimension] = { "Echo" -> True };
(* Failure propagation. Note that the second rule matches only the form without arguments; *)
(* the other forms applied to $ClConFailure are handled by the catch-all rule at the end. *)
ClConReduceDimension[$ClConFailure] := $ClConFailure;
ClConReduceDimension[][$ClConFailure] := $ClConFailure;
(* Reduces the dimension of the data in the context using a truncated Singular Value Decomposition. *)
(* k    - integer passed as second argument of SingularValueDecomposition; *)
(* opts - "Echo" -> True|False, whether to echo a plot of the singular values. *)
(* The context keys "mean" and "svdRes" are added, and the context data sets *)
(* "trainingData", "testData", and (if present) "validationData" are replaced. *)
ClConReduceDimension[k_Integer, opts:OptionsPattern[]][xs_, context_] :=
Module[{echoQ},
echoQ = TrueQ[ OptionValue[ClConReduceDimension, "Echo"] ];
DoubleLongRightArrow[
ClConUnit[xs, context],
(* Replace the data sets in the context with their ClConToNormalClassifierData forms. *)
ClConModifyContext[Join[#, ClConToNormalClassifierData /@ KeyTake[#, {"trainingData", "testData", "validationData"}]] &],
(* Fail if the training data or the test data is missing. *)
ClConWhen[! (KeyExistsQ[#2, "trainingData"] && KeyExistsQ[#2, "testData"]) &,
DoubleLongRightArrow[
ClConUnit[],
ClConEcho["Cannot find training data and test data in the context.", "ClConReduceDimension:"],
ClConFail
]&
],
(* Fail if the [[All, 1]] parts of the training and test data are not numerical matrices. *)
ClConWhen[! (MatrixQ[#2["trainingData"][[All, 1]], NumberQ] && MatrixQ[#2["testData"][[All, 1]], NumberQ]) &,
DoubleLongRightArrow[
ClConUnit[],
ClConEcho["The non-label parts of the training data and test data are not numerical matrices.", "ClConReduceDimension:"],
ClConFail
]&
],
(* Store the mean of the [[All, 1]] part of the training data under "mean". *)
ClConModifyContext[Join[#, <|"mean" -> Mean[#["trainingData"][[All, 1]]]|>] &],
(* Store the SVD (with second argument k) of the mean-subtracted training part under "svdRes". *)
ClConModifyContext[Function[{ct}, Join[ct, <|"svdRes" -> SingularValueDecomposition[Map[# - ct["mean"] &, ct["trainingData"][[All, 1]]], k] |>]]],
(* New training data: the rows of svdRes[[1]].svdRes[[2]] threaded with the [[All, 2]] parts. *)
(* New test data: the mean-subtracted test part times svdRes[[3]], threaded with its [[All, 2]] parts. *)
ClConModifyContext[Function[{ct}, Join[ct, <|
"trainingData" ->
Thread[(ct["svdRes"][[1]].ct["svdRes"][[2]]) -> ct["trainingData"][[All, 2]]],
"testData" ->
Thread[(Map[# - ct["mean"] &, ct["testData"][[All, 1]]].ct["svdRes"][[3]]) -> ct["testData"][[All, 2]]] |>]]],
(* The validation data, if present, is transformed in the same way as the test data. *)
ClConWhen[KeyExistsQ[#2, "validationData"] &,
ClConModifyContext[
Function[{ct}, Join[ct, <|
"validationData" -> Thread[(Map[# - ct["mean"] &, ct["validationData"][[All, 1]]].ct["svdRes"][[3]]) -> ct["validationData"][[All, 2]]] |>]]]
],
(* Optionally echo a plot of the diagonal of svdRes[[2]]. *)
ClConWhen[ echoQ &,
ClConEchoFunctionContext["Singular values:", ListPlot[Diagonal[#["svdRes"][[2]]],
PlotRange -> All, PlotTheme -> "Detailed", Filling -> Axis, PlotStyle -> PointSize[0.02], ImageSize -> Small] &]
]
]
];
(* Any other signature is a failure. *)
ClConReduceDimension[___][__] := $ClConFailure;
(********************************************************************************************************************)
(* Experimental functions *)
(********************************************************************************************************************)
(*
Note that at this point the functions below are not exposed to the outside world --
there are no '::usage' declarations for them.
*)
(************************************************************)
(* ClConToLinearVectorSpaceRepresentation *)
(************************************************************)
Clear[ClConToLinearVectorSpaceRepresentation];

(* Converts the data into a sparse matrix made of column-wise cross tabulations.
   The data is first processed with ToCategoricalColumns; then for each column index
   the pairs {row index, column value} are cross tabulated, and the obtained matrices
   are joined column-wise.
   The result is an association with keys "SparseMatrix", "RowNames", "ColumnNames".
   NOTE(review): the number of columns is taken as Length[data[1]], which presumably
   assumes a Dataset argument -- confirm for plain matrices. *)
ClConToLinearVectorSpaceRepresentation[data:(_?MatrixQ|_Dataset)] :=
    Block[{catData, smats, resMat, res},

      catData = ToCategoricalColumns[data];

      (* One cross tabulation per column. *)
      smats =
          Map[
            Function[{i},
              CrossTabulate[ Transpose[{Range[Length[catData]], Normal[catData[All, i]]}] ]
            ],
            Range[Length[data[1]]]
          ];

      (* Column-wise join, done as a row-wise join of the transposed matrices. *)
      resMat = Transpose[Join @@ (Transpose[#["SparseMatrix"]] & /@ smats)];

      <|
        "SparseMatrix"->resMat,
        "RowNames"-> smats[[1]]["RowNames"],
        "ColumnNames" -> Join @@ (#["ColumnNames"] & /@ smats)
      |>
    ];

(* Failure propagation. *)
ClConToLinearVectorSpaceRepresentation[][$ClConFailure] := $ClConFailure;

(* Monadic version: converts the data obtained with ClConTakeData.
   The representation is the new pipeline value; the context is unchanged. *)
ClConToLinearVectorSpaceRepresentation[][xs_, context_] :=
    Block[{t},
      t = ClConBind[ ClConUnit[xs, context], ClConTakeData ];
      If[ t === $ClConFailure,
        $ClConFailure,
        (* ELSE *)
        ClConUnit[ ClConToLinearVectorSpaceRepresentation[t], context]
      ]
    ];
(************************************************************)
(* ClConOutlierPosition *)
(************************************************************)
ClearAll[ClConOutlierPosition, ClConDataOutlierPosition]

Options[ClConOutlierPosition] = {
  "CentralItemFunction" -> Mean,
  DistanceFunction -> EuclideanDistance,
  "OutlierIdentifierParameters" -> (TopOutliers@*SPLUSQuartileIdentifierParameters),
  "ClassLabel" -> Automatic,
  "ConversionFunction" -> None
};

Options[ClConDataOutlierPosition] = Options[ClConOutlierPosition];

(* Finds the positions of the outlier records of a matrix or a Dataset.
   The records are turned into numerical vectors, a central item is computed with the
   "CentralItemFunction", and OutlierPosition is applied to the distances
   (by DistanceFunction) from the records to the central item.
   Conversion to numerical vectors:
     - numerical Dataset objects and numerical matrices are used as they are;
     - with "ConversionFunction" -> None only the numerical columns are kept
       (missing values are replaced with 0);
     - otherwise FeatureExtraction is used. *)
ClConDataOutlierPosition[ data:(_?MatrixQ|_Dataset), opts:OptionsPattern[] ] :=
    Block[{avgFunc, distFunc, olParams, conversionFunction, fef, smat, avgItem, dists},

      avgFunc = OptionValue[ ClConDataOutlierPosition, "CentralItemFunction" ];
      distFunc = OptionValue[ ClConDataOutlierPosition, DistanceFunction ];
      olParams = OptionValue[ ClConDataOutlierPosition, "OutlierIdentifierParameters" ];

      (* The option value is kept as it is. (Wrapping it with TrueQ would turn it into
         True or False and the comparison with None below could never succeed.) *)
      conversionFunction = OptionValue[ ClConDataOutlierPosition, "ConversionFunction" ];

      Which[
        TrueQ[Head[data]===Dataset] && MatrixQ[Normal[data], NumberQ],
        smat = Normal[data],

        TrueQ[Head[data]===Dataset] && MatrixQ[Normal[data[Values]], NumberQ],
        smat = Normal[data[Values]],

        (* A numerical matrix needs no conversion. *)
        MatrixQ[data, NumberQ],
        smat = data,

        TrueQ[conversionFunction === None],
        (* Keep the numerical values of each record; the records can be associations or lists. *)
        smat = Map[ Select[ If[AssociationQ[#], Values[#], #], NumberQ ]&, ReplaceAll[Normal[data], _Missing -> 0] ];
        If[VectorQ[smat], smat = Transpose[{smat}] ],

        True, (* conversionFunction === FeatureExtraction *)
        (*smat = ClConToLinearVectorSpaceRepresentation[data]["SparseMatrix"]*)
        fef = FeatureExtraction[data];
        smat = fef[data]
      ];

      avgItem = avgFunc[N@smat];
      dists = Map[distFunc[#, avgItem] &, smat];
      OutlierPosition[dists, olParams]
    ];
(* Failure propagation. *)
ClConOutlierPosition[___][$ClConFailure] := $ClConFailure;

ClConOutlierPosition[$ClConFailure] := $ClConFailure;

(* Direct use as a monad function. *)
ClConOutlierPosition[xs_, context_Association] := ClConOutlierPosition[][xs, context];

(* Finds outlier positions with ClConDataOutlierPosition.
   The data is looked for in this order:
     - data sets ("trainingData", "testData", "validationData") in the pipeline value,
       if that is an association with such keys, otherwise in the context;
       the result is an association of outlier positions per data set;
     - the pipeline value itself, if it is array rules data, a Dataset, or a matrix.
   The option "ClassLabel" specifies the class label column to be dropped from
   Dataset / matrix data (Automatic means the last column; None means no column);
   it is ignored for rules data. The remaining options are passed to ClConDataOutlierPosition.
   The context is unchanged. Gives $ClConFailure if no data is found. *)
ClConOutlierPosition[opts:OptionsPattern[]][xs_, context_] :=
    Block[{contextDataQ, pipelineDataQ, classLabel, classLabelInd, asc, newOpts, t},

      classLabel = OptionValue[ ClConOutlierPosition, "ClassLabel" ];

      contextDataQ = KeyExistsQ[context, "trainingData"] || KeyExistsQ[context, "testData"] || KeyExistsQ[context, "validationData"];
      pipelineDataQ = MatchQ[xs, _Association] && ( KeyExistsQ[xs, "trainingData"] || KeyExistsQ[xs, "testData"] || KeyExistsQ[xs, "validationData"] );

      (* The pipeline value data sets have priority over the context ones. *)
      asc = If[pipelineDataQ, xs, context];

      newOpts = Sequence @@ DeleteCases[{opts}, "ClassLabel"->_];

      Which[

        (* Data sets of array rules: the left hand sides are the records. *)
        ( contextDataQ || pipelineDataQ ) && DataArrayRulesForClassifyQ[asc["trainingData"]],
        If[ !TrueQ[classLabel === Automatic],
          Echo["Ignoring the value given to the option \"ClassLabel\": the data is a list of rules.", "ClConOutlierPosition:"]
        ];
        ClConUnit[
          ClConDataOutlierPosition[#[[All,1]], newOpts]& /@ KeyTake[asc, {"trainingData", "testData", "validationData"}],
          context],

        (* Other data sets: the class label column is dropped. *)
        contextDataQ || pipelineDataQ,
        classLabelInd =
            If[TrueQ[classLabel === Automatic], -1,
              First @ Values @ ClConBind[ ClConUnit[xs, context], ClConTakeClassLabelIndex[classLabel]]
            ];
        ClConUnit[
          ClConDataOutlierPosition[ Drop[#, None, {classLabelInd}], opts]& /@ KeyTake[asc, {"trainingData", "testData", "validationData"}],
          context],

        (* The pipeline value is array rules data: the left hand sides are the records. *)
        DataArrayRulesForClassifyQ[xs],
        If[ !TrueQ[classLabel === Automatic],
          Echo["Ignoring the value given to the option \"ClassLabel\": the data is a list of rules.", "ClConOutlierPosition:"]
        ];
        ClConUnit[ClConDataOutlierPosition[xs[[All,1]], newOpts], context],

        TrueQ[ classLabel === None ] && ( TrueQ[Head[xs] === Dataset] || TrueQ[MatrixQ[xs]] ),
        ClConUnit[ClConDataOutlierPosition[xs, opts], context],

        TrueQ[ classLabel === Automatic ] && ( TrueQ[Head[xs] === Dataset] || TrueQ[MatrixQ[xs]] ),
        ClConUnit[ClConDataOutlierPosition[ Drop[xs, None, {-1}], opts], context],

        (* A specific class label is given: treat the pipeline value as training data.
           The options are passed along, so that the "ClassLabel" setting is not lost. *)
        TrueQ[Head[xs] === Dataset] || TrueQ[MatrixQ[xs]],
        t = ClConBind[ ClConUnit[<| "trainingData"->xs |>, context], ClConOutlierPosition[opts] ];
        If[ TrueQ[ t === $ClConFailure ],
          $ClConFailure,
          ClConUnit[ ClConBind[ t, ClConTakeTrainingData ], context ]
        ],

        True,
        Echo["Cannot find data.","ClConOutlierPosition:"];
        $ClConFailure
      ]
    ];
(************************************************************)
(* ClConFindOutliersPerClassLabel *)
(************************************************************)
ClearAll[ClConFindOutliersPerClassLabel]

Options[ClConFindOutliersPerClassLabel] = {
  "OutlierIdentifierParameters" -> (TopOutliers@*SPLUSQuartileIdentifierParameters),
  "TrainingDataOnly" -> True,
  "ClassLabel" -> Automatic,
  "ConversionFunction" -> None
};

(* Failure propagation. *)
ClConFindOutliersPerClassLabel[___][$ClConFailure] := $ClConFailure;

ClConFindOutliersPerClassLabel[$ClConFailure] := $ClConFailure;

(* Direct use as a monad function and use without arguments. *)
ClConFindOutliersPerClassLabel[xs_, context_Association] := ClConFindOutliersPerClassLabel[][xs, context];

ClConFindOutliersPerClassLabel[][xs_, context_Association] :=
    ClConFindOutliersPerClassLabel["OutlierIdentifierParameters" -> (TopOutliers@*SPLUSQuartileIdentifierParameters) ][xs, context];

(* Finds the outlier positions within each class label group of a Dataset object.
   The options are processed with ClConOutliersOperationsProcessing.
   The new pipeline value is an association class label -> outlier positions
   (the positions are relative to the group); the context is unchanged.
   Gives $ClConFailure if the options processing fails. *)
ClConFindOutliersPerClassLabel[opts : OptionsPattern[]][xs_, context_Association] :=
    Block[{data, res},

      res = Fold[ ClConBind, ClConUnit[xs,context], {ClConOutliersOperationsProcessing[opts][##]&, ClConTakeValue}];

      (* The processing step gives an association; anything else means it failed. *)
      If[ !AssociationQ[res], Return[$ClConFailure] ];

      data = res["data"][ GroupBy[#[res["classLabel"]]&] ];

      res =
          AssociationThread[
            Normal[Keys[data]],
            Table[
              ClConDataOutlierPosition[ data[i],
                "OutlierIdentifierParameters" -> res["outlierIdentifier"],
                "ConversionFunction" -> res["conversionFunction"] ],
              {i, Length[data]}]
          ];

      ClConUnit[res, context]
    ];
(************************************************************)
(* ClConDropOutliersPerClassLabel *)
(************************************************************)
(*
This function drops the outliers for each class label in a Dataset object.
Here are the steps.
1. The Dataset object is split into parts corresponding to the unique values of the
specified class label.
2. For each part the outliers are found with the specified outlier identifier and dropped.
3. The obtained parts are joined into a dataset.
4. That dataset is returned as monad pipeline value.
*)
ClearAll[ClConDropOutliersPerClassLabel]

Options[ClConDropOutliersPerClassLabel] = Options[ClConFindOutliersPerClassLabel];

(* Failure propagation. *)
ClConDropOutliersPerClassLabel[___][$ClConFailure] := $ClConFailure;

ClConDropOutliersPerClassLabel[$ClConFailure] := $ClConFailure;

(* Direct use as a monad function and use without arguments. *)
ClConDropOutliersPerClassLabel[xs_, context_Association] := ClConDropOutliersPerClassLabel[][xs, context];

ClConDropOutliersPerClassLabel[][xs_, context_Association] :=
    ClConDropOutliersPerClassLabel["OutlierIdentifierParameters" -> (TopOutliers@*SPLUSQuartileIdentifierParameters) ][xs, context];

(* Drops the outliers within each class label group of a Dataset object and joins the groups.
   The options are processed with ClConOutliersOperationsProcessing.
   The joined data is the new pipeline value; the context is unchanged.
   Gives $ClConFailure if the options processing fails. *)
ClConDropOutliersPerClassLabel[opts : OptionsPattern[]][xs_, context_Association] :=
    Block[{data, res, t},

      res = Fold[ ClConBind, ClConUnit[xs,context], {ClConOutliersOperationsProcessing[opts][##]&, ClConTakeValue}];

      (* The processing step gives an association; anything else means it failed. *)
      If[ !AssociationQ[res], Return[$ClConFailure] ];

      data = res["data"][ GroupBy[#[res["classLabel"]]&] ];

      res =
          Table[(
            (* Outlier positions within the i-th group ... *)
            t = ClConDataOutlierPosition[ data[i], "OutlierIdentifierParameters" -> res["outlierIdentifier"], "ConversionFunction" -> res["conversionFunction"] ];
            (* ... and the records of the group at the remaining positions. *)
            data[i][ Complement[ Range[Length[data[i]]], t] ]
          ), {i, Length[data]}];

      res = Join @@ res;

      ClConUnit[res, context]
    ];
(************************************************************)
(* ClConOutliersOperationsProcessing *)
(************************************************************)
ClearAll[ClConOutliersOperationsProcessing]

Options[ClConOutliersOperationsProcessing] = Options[ClConFindOutliersPerClassLabel];

(* Failure propagation. *)
ClConOutliersOperationsProcessing[___][$ClConFailure] := $ClConFailure;

(* Common options and data processing for the per-class-label outlier functions.
   The data is the training data (from the pipeline value or the context) if "TrainingDataOnly"
   is True, otherwise the result of ClConTakeData; it has to be a Dataset object.
   The option "ClassLabel" can be Automatic (the last variable), a variable index, or a variable name.
   The new pipeline value is an association with the keys
   "data", "varNames", "trainingDataOnly", "conversionFunction", "classLabel", "classLabelInd",
   "outlierIdentifier"; the context is unchanged.
   Gives $ClConFailure if the data, the variable names, or the class label cannot be determined. *)
ClConOutliersOperationsProcessing[opts : OptionsPattern[]][xs_, context_Association] :=
    Block[{data, outlierIdentifier, trainingDataOnly, conversionFunc, classLabel, classLabelInd, varNames, res},

      outlierIdentifier = OptionValue[ ClConOutliersOperationsProcessing, "OutlierIdentifierParameters"];
      trainingDataOnly = TrueQ[ OptionValue[ ClConOutliersOperationsProcessing, "TrainingDataOnly" ] ];
      classLabel = OptionValue[ ClConOutliersOperationsProcessing, "ClassLabel" ];

      (* The option value is kept as it is; TrueQ would reduce it to True or False. *)
      conversionFunc = OptionValue[ ClConOutliersOperationsProcessing, "ConversionFunction" ];

      If[ trainingDataOnly,
        Which[
          AssociationQ[xs] && KeyExistsQ[xs, "trainingData"],
          data = xs["trainingData"],

          KeyExistsQ[context, "trainingData"],
          data = context["trainingData"],

          True,
          Echo["No training data.", "ClConOutliersOperationsProcessing::"];
          Return[$ClConFailure]
        ],
        (* ELSE *)
        data = ClConTakeData[xs, context]
      ];

      If[ TrueQ[Head[data] =!= Dataset],
        Echo[ "The obtained data is not a Dataset object.", "ClConOutliersOperationsProcessing::"];
        Return[$ClConFailure]
      ];

      (* It is assumed below that data is a Dataset. *)

      varNames = ClConBind[ ClConUnit[xs,context], ClConTakeVariableNames ];

      If[ TrueQ[varNames === $ClConFailure], Return[$ClConFailure] ];

      (* Determine both the class label name and its index. *)
      Which[
        TrueQ[classLabel === Automatic],
        classLabel = varNames[[-1]];
        classLabelInd = Length[varNames],

        IntegerQ[classLabel],
        classLabelInd = classLabel;
        If[ !(1 <= classLabelInd <= Length[varNames]),
          (* ToString is needed: the option value is an integer here. *)
          Echo[ "The \"ClassLabel\" option value " <> ToString[classLabel] <> " is not an integer between 1 and " <> ToString[Length[varNames]] <>".", "ClConOutliersOperationsProcessing::"];
          Return[$ClConFailure]
        ];
        classLabel = varNames[[classLabelInd]],

        MemberQ[ varNames, classLabel ],
        classLabelInd = First@Flatten@Position[varNames, classLabel],

        True,
        (* ToString is needed: the option value is not necessarily a string. *)
        Echo[ "The \"ClassLabel\" option value " <> ToString[classLabel] <> " is not Automatic, a variable index, or one of " <> ToString[varNames] <> ".", "ClConOutliersOperationsProcessing::"];
        Return[$ClConFailure]
      ];

      res = <| "data"->data, "varNames"->varNames,
        "trainingDataOnly" -> trainingDataOnly,
        "conversionFunction" -> conversionFunc,
        "classLabel"->classLabel, "classLabelInd"->classLabelInd,
        "outlierIdentifier"->outlierIdentifier |>;

      ClConUnit[res, context]
    ];
End[] (*`Private`*)
EndPackage[]