From 2d4114e96ef6858b0dff0059e63644bfca8189ad Mon Sep 17 00:00:00 2001 From: baunsgaard Date: Wed, 10 May 2023 10:42:17 +0200 Subject: [PATCH] [MINOR] Python autogenerate new Builtins --- .../systemds/operator/algorithm/__init__.py | 6 ++ .../operator/algorithm/builtin/auc.py | 49 ++++++++++ .../algorithm/builtin/decisionTree.py | 63 ++++++------ .../algorithm/builtin/decisionTreePredict.py | 46 ++++----- .../algorithm/builtin/lmPredictStats.py | 50 ++++++++++ .../operator/algorithm/builtin/pca.py | 39 ++++++-- .../algorithm/builtin/randomForest.py | 97 ++++++++----------- .../algorithm/builtin/randomForestPredict.py | 56 +++++++++++ .../operator/algorithm/builtin/scale.py | 17 +++- .../operator/algorithm/builtin/slicefinder.py | 4 +- 10 files changed, 303 insertions(+), 124 deletions(-) create mode 100644 src/main/python/systemds/operator/algorithm/builtin/auc.py create mode 100644 src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py create mode 100644 src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py diff --git a/src/main/python/systemds/operator/algorithm/__init__.py b/src/main/python/systemds/operator/algorithm/__init__.py index 2dd6578833e..769ca66229a 100644 --- a/src/main/python/systemds/operator/algorithm/__init__.py +++ b/src/main/python/systemds/operator/algorithm/__init__.py @@ -31,6 +31,7 @@ from .builtin.alsTopkPredict import alsTopkPredict from .builtin.apply_pipeline import apply_pipeline from .builtin.arima import arima +from .builtin.auc import auc from .builtin.autoencoder_2layer import autoencoder_2layer from .builtin.bandit import bandit from .builtin.bivar import bivar @@ -110,6 +111,7 @@ from .builtin.lmCG import lmCG from .builtin.lmDS import lmDS from .builtin.lmPredict import lmPredict +from .builtin.lmPredictStats import lmPredictStats from .builtin.logSumExp import logSumExp from .builtin.matrixProfile import matrixProfile from .builtin.mcc import mcc @@ -137,6 +139,7 @@ from .builtin.pnmf import 
pnmf from .builtin.ppca import ppca from .builtin.randomForest import randomForest +from .builtin.randomForestPredict import randomForestPredict from .builtin.scale import scale from .builtin.scaleApply import scaleApply from .builtin.scaleMinMax import scaleMinMax @@ -182,6 +185,7 @@ 'alsTopkPredict', 'apply_pipeline', 'arima', + 'auc', 'autoencoder_2layer', 'bandit', 'bivar', @@ -261,6 +265,7 @@ 'lmCG', 'lmDS', 'lmPredict', + 'lmPredictStats', 'logSumExp', 'matrixProfile', 'mcc', @@ -288,6 +293,7 @@ 'pnmf', 'ppca', 'randomForest', + 'randomForestPredict', 'scale', 'scaleApply', 'scaleMinMax', diff --git a/src/main/python/systemds/operator/algorithm/builtin/auc.py b/src/main/python/systemds/operator/algorithm/builtin/auc.py new file mode 100644 index 00000000000..8df68353112 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/auc.py @@ -0,0 +1,49 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/auc.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.script_building.dag import OutputType +from systemds.utils.consts import VALID_INPUT_TYPES + + +def auc(Y: Matrix, + P: Matrix): + """ + This builtin function computes the area under the ROC curve (AUC) + for binary classifiers. + + + + :param Y: Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding + :param P: Prediction scores (predictor such as estimated probabilities) + for true class (shape: n x 1), assumed in [0,1] + :return: Area under the ROC curve (AUC) + """ + + params_dict = {'Y': Y, 'P': P} + return Matrix(Y.sds_context, + 'auc', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py index 4fb2ee56886..399a21fd502 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py +++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py @@ -30,43 +30,46 @@ def decisionTree(X: Matrix, - Y: Matrix, - R: Matrix, + y: Matrix, + ctypes: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]): """ - Builtin script implementing classification trees with scale and categorical features + This script implements decision trees for recoded and binned categorical and + numerical input features. We train a single CART (classification and + regression tree) decision tree depending on the provided labels y, either + classification (majority vote per leaf) or regression (average per leaf).
- :param X: Feature matrix X; note that X needs to be both recoded and dummy coded - :param Y: Label matrix Y; note that Y needs to be both recoded and dummy coded - :param R: Matrix R which for each feature in X contains the following information - - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates - a scalar feature vector, other positive Integers indicate the number of categories - If R is not provided by default all variables are assumed to be scale - :param bins: Number of equiheight bins per scale feature to choose thresholds - :param depth: Maximum depth of the learned tree - :param verbose: boolean specifying if the algorithm should print information while executing - :return: Matrix M where each column corresponds to a node in the learned tree and each row - contains the following information: - M[1,j]: id of node j (in a complete binary tree) - M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 - M[3,j]: Feature index of the feature (scale feature id if the feature is scale or - categorical feature id if the feature is categorical) - that node j looks at if j is an internal node, otherwise 0 - M[4,j]: Type of the feature that node j looks at if j is an internal node: holds - the same information as R input vector - M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, - otherwise the size of the subset of values - stored in rows 6,7,... if j is categorical - If j is a leaf node: number of misclassified samples reaching at node j - M[6:,j]: If j is an internal node: Threshold the example's feature value is compared - to is stored at M[6,j] if the feature chosen for j is scale, - otherwise if the feature chosen for j is categorical rows 6,7,... 
depict the value subset chosen for j - If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 + :param X: Feature matrix in recoded/binned representation + :param y: Label matrix in recoded/binned representation + :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical] + of shape 1-by-(ncol(X)+1), where the last entry is the y type + :param max_depth: Maximum depth of the learned tree (stopping criterion) + :param min_leaf: Minimum number of samples in leaf nodes (stopping criterion), + odd number recommended to avoid 50/50 leaf label decisions + :param min_split: Minimum number of samples in leaf for attempting a split + :param max_features: Parameter controlling the number of features used as split + candidates at tree nodes: m = ceil(num_features^max_features) + :param max_values: Parameter controlling the number of values per feature used + as split candidates: nb = ceil(num_values^max_values) + :param impurity: Impurity measure: entropy, gini (default), rss (regression) + :param seed: Fixed seed for randomization of samples and split candidates + :param verbose: Flag indicating verbose debug output + :return: Matrix M containing the learned tree, in linearized form + For example, given a feature matrix with features [a,b,c,d] + and the following tree, M would look as follows: + (L1) |d<5| + / \ + (L2) P1:2 |a<7| + / \ + (L3) P2:2 P3:1 + --> M := + [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]] + |(L1)| | (L2) | | (L3) | """ - params_dict = {'X': X, 'Y': Y, 'R': R} + params_dict = {'X': X, 'y': y, 'ctypes': ctypes} params_dict.update(kwargs) return Matrix(X.sds_context, 'decisionTree', diff --git a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py index 51a396eef7e..32bb06609ba 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py +++
b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py @@ -29,40 +29,32 @@ from systemds.utils.consts import VALID_INPUT_TYPES -def decisionTreePredict(M: Matrix, - X: Matrix, - strategy: str): +def decisionTreePredict(X: Matrix, + ctypes: Matrix, + M: Matrix, + **kwargs: Dict[str, VALID_INPUT_TYPES]): """ - - Builtin script implementing prediction based on classification trees with scale features using prediction methods of the + This script implements decision tree prediction for recoded and binned + categorical and numerical input features, using prediction methods of the Hummingbird paper (https://www.usenix.org/system/files/osdi20-nakandala.pdf). - :param M: Decision tree matrix M, as generated by scripts/builtin/decisionTree.dml, where each column corresponds - to a node in the learned tree and each row contains the following information: - M[1,j]: id of node j (in a complete binary tree) - M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 - M[3,j]: Feature index of the feature (scale feature id if the feature is scale or - categorical feature id if the feature is categorical) - that node j looks at if j is an internal node, otherwise 0 - M[4,j]: Type of the feature that node j looks at if j is an internal node: holds - the same information as R input vector - M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, - otherwise the size of the subset of values - stored in rows 6,7,... if j is categorical - If j is a leaf node: number of misclassified samples reaching at node j - M[6:,j]: If j is an internal node: Threshold the example's feature value is compared - to is stored at M[6,j] if the feature chosen for j is scale, - otherwise if the feature chosen for j is categorical rows 6,7,...
depict the value subset chosen for j - If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 - :param X: Feature matrix X - :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"], referring to "Generic matrix multiplication", + :param X: Feature matrix in recoded/binned representation + :param y: Label matrix in recoded/binned representation, + optional for accuracy evaluation + :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical] + :param M: Matrix M holding the learned tree in linearized form + see decisionTree() for the detailed tree representation. + :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"], + referring to "Generic matrix multiplication", "Tree traversal", and "Perfect tree traversal", respectively - :return: Matrix containing the predicted labels for X + :param verbose: Flag indicating verbose debug output + :return: Label vector of predictions """ - params_dict = {'M': M, 'X': X, 'strategy': strategy} - return Matrix(M.sds_context, + params_dict = {'X': X, 'ctypes': ctypes, 'M': M} + params_dict.update(kwargs) + return Matrix(X.sds_context, 'decisionTreePredict', named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py new file mode 100644 index 00000000000..731d6d232c1 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py @@ -0,0 +1,50 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/lmPredictStats.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.script_building.dag import OutputType +from systemds.utils.consts import VALID_INPUT_TYPES + + +def lmPredictStats(yhat: Matrix, + ytest: Matrix, + lm: bool): + """ + This builtin function computes and prints a summary of accuracy + measures for regression problems. + + + + :param yhat: column vector of predicted response values y + :param ytest: column vector of actual response values y + :param lm: indicator if used for linear regression model + :return: column vector holding avg_res, ss_avg_res, and R2 + """ + + params_dict = {'yhat': yhat, 'ytest': ytest, 'lm': lm} + return Matrix(yhat.sds_context, + 'lmPredictStats', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/pca.py b/src/main/python/systemds/operator/algorithm/builtin/pca.py index 403f9cfca1a..016c7caf7f6 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/pca.py +++ b/src/main/python/systemds/operator/algorithm/builtin/pca.py @@ -32,18 +32,45 @@ def pca(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]): """ - The function Principal Component Analysis (PCA) is used for dimensionality reduction + This builtin defines PCA that is a technique typically used to + reduce the number of dimensions of a matrix. 
+ This implementation is based on calculating eigenvectors on + the covariance matrix of the input. + + An example of calling in DML: + + .. code-block:: + + data = read($1) + [data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE) + print(Components) + + + An example in an ML pipeline containing PCA: + + .. code-block:: + + X = read($1) + [X_reduced, Components] = pca(data=X, K=4) + Y = read($2) + bias = l2svm(X=X, Y=Y) + X_test = read($3) + [y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias) + write($5, Y_predict_rounded) + :param X: Input feature matrix - :param K: Number of reduced dimensions (i.e., columns) - :param Center: Indicates whether or not to center the feature matrix - :param Scale: Indicates whether or not to scale the feature matrix + :param K: Number of components returned + :param center: Indicates whether or not to center the feature matrix + :param scale: Indicates whether or not to scale the feature matrix + :param onlyComponents: Indicates whether only the components should be calculated and returned, + not the application of the components on X :return: Output feature matrix with K columns - :return: Output dominant eigen vectors (can be used for projections) + :return: Output dominant eigen vectors sorted by influence :return: The column means of the input, subtracted to construct the PCA - :return: The Scaling of the values, to make each dimension same size. + :return: The scaling of the values, to make each dimension same size.
""" params_dict = {'X': X} diff --git a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py index b2b4424ff64..5c4bb0438ad 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py +++ b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py @@ -30,66 +30,55 @@ def randomForest(X: Matrix, - Y: Matrix, - R: Matrix, + y: Matrix, + ctypes: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]): """ - This script implement classification random forest with both scale and categorical features. + This script implements random forest for recoded and binned categorical and + numerical input features. In detail, we train multiple CART (classification + and regression trees) decision trees in parallel and use them as an ensemble + classifier/regressor. Each tree is trained on a sample of observations (rows) + and optionally a subset of features (columns). During tree construction, split + candidates are additionally chosen on a sample of remaining features.
- :param X: Feature matrix X; note that X needs to be both recoded and dummy coded - :param Y: Label matrix Y; note that Y needs to be both recoded and dummy coded - :param R: Matrix which for each feature in X contains the following information - - R[,1]: column ids TODO pass recorded and binned - - R[,2]: start indices - - R[,3]: end indices - If R is not provided by default all variables are assumed to be scale - :param bins: Number of equiheight bins per scale feature to choose thresholds - :param depth: Maximum depth of the learned tree - :param num_leaf: Number of samples when splitting stops and a leaf node is added - :param num_samples: Number of samples at which point we switch to in-memory subtree building + :param X: Feature matrix in recoded/binned representation + :param y: Label matrix in recoded/binned representation + :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical] + of shape 1-by-(ncol(X)+1), where the last entry is the y type :param num_trees: Number of trees to be learned in the random forest model - :param subsamp_rate: Parameter controlling the size of each tree in the forest; samples are selected from a - Poisson distribution with parameter subsamp_rate (the default value is 1.0) - :param feature_subset: Parameter that controls the number of feature used as candidates for splitting at each tree node - as a power of number of features in the dataset; - by default square root of features (i.e., feature_subset = 0.5) are used at each tree node - :param impurity: Impurity measure: entropy or Gini (the default) - :return: Matrix M containing the learned tree, where each column corresponds to a node - in the learned tree and each row contains the following information: - M[1,j]: id of node j (in a complete binary tree) - M[2,j]: tree id to which node j belongs - M[3,j]: Offset (no. 
of columns) to left child of j - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 - for categorical features, - otherwise the label that leaf node j is supposed to predict - M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the - size of the subset of values - stored in rows 7,8,... if j is categorical - M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is - compared to is stored at M[7,j] if the feature chosen for j is scale; - If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j - :return: Matrix C containing the number of times samples are chosen in each tree of the random forest - :return: Mappings from scale feature ids to global feature ids - :return: Mappings from categorical feature ids to global feature ids + :param sample_frac: Sample fraction of examples for each tree in the forest + :param feature_frac: Sample fraction of features for each tree in the forest + :param max_depth: Maximum depth of the learned tree (stopping criterion) + :param min_leaf: Minimum number of samples in leaf nodes (stopping criterion) + :param min_split: Minimum number of samples in leaf for attempting a split + :param max_features: Parameter controlling the number of features used as split + candidates at tree nodes: m = ceil(num_features^max_features) + :param max_values: Parameter controlling the number of values per feature used + as split candidates: nb = ceil(num_values^max_values) + :param impurity: Impurity measure: entropy, gini (default), rss (regression) + :param seed: Fixed seed for randomization of samples and split candidates + :param verbose: Flag indicating verbose debug output + :return: Matrix M containing the learned trees, in linearized form + For example, given a feature matrix with features [a,b,c,d] + and the
following two trees, M would look as follows: + (L1) |a<7| |d<5| + / \ / \ + (L2) |c<3| |b<4| |a<7| P3:2 + / \ / \ / \ + (L3) P1:2 P2:1 P3:1 P4:2 P1:2 P2:1 + --> M := + [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2], (1st tree) + [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]] (2nd tree) + |(L1)| | (L2) | | (L3) | + With feature sampling (feature_frac < 1), each tree is + prefixed by a one-hot vector of sampled features + (e.g., [1,1,1,0] if we sampled a,b,c of the four features) """ - params_dict = {'X': X, 'Y': Y, 'R': R} + params_dict = {'X': X, 'y': y, 'ctypes': ctypes} params_dict.update(kwargs) - - vX_0 = Matrix(X.sds_context, '') - vX_1 = Matrix(X.sds_context, '') - vX_2 = Matrix(X.sds_context, '') - vX_3 = Matrix(X.sds_context, '') - output_nodes = [vX_0, vX_1, vX_2, vX_3, ] - - op = MultiReturn(X.sds_context, 'randomForest', output_nodes, named_input_nodes=params_dict) - - vX_0._unnamed_input_nodes = [op] - vX_1._unnamed_input_nodes = [op] - vX_2._unnamed_input_nodes = [op] - vX_3._unnamed_input_nodes = [op] - - return op + return Matrix(X.sds_context, + 'randomForest', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py new file mode 100644 index 00000000000..c7a598faa56 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py @@ -0,0 +1,56 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/randomForestPredict.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.script_building.dag import OutputType +from systemds.utils.consts import VALID_INPUT_TYPES + + +def randomForestPredict(X: Matrix, + ctypes: Matrix, + M: Matrix, + **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + This script implements random forest prediction for recoded and binned + categorical and numerical input features. + + + + :param X: Feature matrix in recoded/binned representation + :param y: Label matrix in recoded/binned representation, + optional for accuracy evaluation + :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical] + :param M: Matrix M holding the learned trees (one tree per row), + see randomForest() for the detailed tree representation. 
+ :param verbose: Flag indicating verbose debug output + :return: Label vector of predictions + """ + + params_dict = {'X': X, 'ctypes': ctypes, 'M': M} + params_dict.update(kwargs) + return Matrix(X.sds_context, + 'randomForestPredict', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py b/src/main/python/systemds/operator/algorithm/builtin/scale.py index 015709d8c68..33203fafb68 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/scale.py +++ b/src/main/python/systemds/operator/algorithm/builtin/scale.py @@ -32,16 +32,23 @@ def scale(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]): """ - This function scales and center individual features in the input matrix (column wise.) using z-score to scale the values. + This function scales and centers individual features in the input + matrix (column-wise) using z-score to scale the values. + The transformation is sometimes also called scale and shift, + but it is shifted first and then subsequently scaled. + + The method is not resistant to inputs containing NaN nor overflows + of doubles, but handles them by guaranteeing that no extra NaN values + are introduced and columns that contain NaN will not be scaled or shifted.
:param X: Input feature matrix - :param center: Indicates whether or not to center the feature matrix - :param scale: Indicates whether or not to scale the feature matrix - :return: Output feature matrix with K columns + :param center: Indicates to center the feature matrix + :param scale: Indicates to scale the feature matrix according to z-score + :return: Output feature matrix scaled and shifted :return: The column means of the input, subtracted if Center was TRUE - :return: The Scaling of the values, to make each dimension have similar value ranges + :return: The scaling of the values, to make each dimension have similar value ranges """ params_dict = {'X': X} diff --git a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py index a8c34cc0b98..2ca2991391d 100644 --- a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py +++ b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py @@ -41,8 +41,8 @@ def slicefinder(X: Matrix, - :param X: Recoded dataset into Matrix - :param e: Trained model + :param X: Feature matrix in recoded/binned representation + :param e: Error vector of trained model :param k: Number of subsets required :param maxL: maximum level L (conjunctions of L predicates), 0 unlimited :param minSup: minimum support (min number of rows per slice)