From 2d4114e96ef6858b0dff0059e63644bfca8189ad Mon Sep 17 00:00:00 2001
From: baunsgaard <baunsgaard@tu-berlin.de>
Date: Wed, 10 May 2023 10:42:17 +0200
Subject: [PATCH] [MINOR] Python autogenerate new Builtins

---
 .../systemds/operator/algorithm/__init__.py   |  6 ++
 .../operator/algorithm/builtin/auc.py         | 49 ++++++++++
 .../algorithm/builtin/decisionTree.py         | 63 ++++++------
 .../algorithm/builtin/decisionTreePredict.py  | 46 ++++-----
 .../algorithm/builtin/lmPredictStats.py       | 50 ++++++++++
 .../operator/algorithm/builtin/pca.py         | 39 ++++++--
 .../algorithm/builtin/randomForest.py         | 97 ++++++++-----------
 .../algorithm/builtin/randomForestPredict.py  | 56 +++++++++++
 .../operator/algorithm/builtin/scale.py       | 17 +++-
 .../operator/algorithm/builtin/slicefinder.py |  4 +-
 10 files changed, 303 insertions(+), 124 deletions(-)
 create mode 100644 src/main/python/systemds/operator/algorithm/builtin/auc.py
 create mode 100644 src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
 create mode 100644 src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py

diff --git a/src/main/python/systemds/operator/algorithm/__init__.py b/src/main/python/systemds/operator/algorithm/__init__.py
index 2dd6578833e..769ca66229a 100644
--- a/src/main/python/systemds/operator/algorithm/__init__.py
+++ b/src/main/python/systemds/operator/algorithm/__init__.py
@@ -31,6 +31,7 @@
 from .builtin.alsTopkPredict import alsTopkPredict 
 from .builtin.apply_pipeline import apply_pipeline 
 from .builtin.arima import arima 
+from .builtin.auc import auc 
 from .builtin.autoencoder_2layer import autoencoder_2layer 
 from .builtin.bandit import bandit 
 from .builtin.bivar import bivar 
@@ -110,6 +111,7 @@
 from .builtin.lmCG import lmCG 
 from .builtin.lmDS import lmDS 
 from .builtin.lmPredict import lmPredict 
+from .builtin.lmPredictStats import lmPredictStats 
 from .builtin.logSumExp import logSumExp 
 from .builtin.matrixProfile import matrixProfile 
 from .builtin.mcc import mcc 
@@ -137,6 +139,7 @@
 from .builtin.pnmf import pnmf 
 from .builtin.ppca import ppca 
 from .builtin.randomForest import randomForest 
+from .builtin.randomForestPredict import randomForestPredict 
 from .builtin.scale import scale 
 from .builtin.scaleApply import scaleApply 
 from .builtin.scaleMinMax import scaleMinMax 
@@ -182,6 +185,7 @@
  'alsTopkPredict',
  'apply_pipeline',
  'arima',
+ 'auc',
  'autoencoder_2layer',
  'bandit',
  'bivar',
@@ -261,6 +265,7 @@
  'lmCG',
  'lmDS',
  'lmPredict',
+ 'lmPredictStats',
  'logSumExp',
  'matrixProfile',
  'mcc',
@@ -288,6 +293,7 @@
  'pnmf',
  'ppca',
  'randomForest',
+ 'randomForestPredict',
  'scale',
  'scaleApply',
  'scaleMinMax',
diff --git a/src/main/python/systemds/operator/algorithm/builtin/auc.py b/src/main/python/systemds/operator/algorithm/builtin/auc.py
new file mode 100644
index 00000000000..8df68353112
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/auc.py
@@ -0,0 +1,49 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/auc.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
+from systemds.script_building.dag import OutputType
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def auc(Y: Matrix,
+        P: Matrix):
+    """
+     This builtin function computes the area under the ROC curve (AUC)
+     for binary classifiers.
+    
+    
+    
+    :param Y: Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding
+    :param P: Prediction scores (predictor such as estimated probabilities)
+        for true class (shape: n x 1), assumed in [0,1]
+    :return: Area under the ROC curve (AUC)
+    """
+
+    params_dict = {'Y': Y, 'P': P}
+    return Matrix(Y.sds_context,
+        'auc',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
index 4fb2ee56886..399a21fd502 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
@@ -30,43 +30,46 @@
 
 
 def decisionTree(X: Matrix,
-                 Y: Matrix,
-                 R: Matrix,
+                 y: Matrix,
+                 ctypes: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     Builtin script implementing classification trees with scale and categorical features
+     This script implements decision trees for recoded and binned categorical and
+     numerical input features. We train a single CART (classification and
+     regression tree) decision tree depending on the provided labels y, either
+     classification (majority vote per leaf) or regression (average per leaf).
     
     
     
-    :param X: Feature matrix X; note that X needs to be both recoded and dummy coded
-    :param Y: Label matrix Y; note that Y needs to be both recoded and dummy coded
-    :param R: Matrix R which for each feature in X contains the following information
-        - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
-        a scalar feature vector, other positive Integers indicate the number of categories
-        If R is not provided by default all variables are assumed to be scale
-    :param bins: Number of equiheight bins per scale feature to choose thresholds
-    :param depth: Maximum depth of the learned tree
-    :param verbose: boolean specifying if the algorithm should print information while executing
-    :return: Matrix M where each column corresponds to a node in the learned tree and each row
-        contains the following information:
-        M[1,j]: id of node j (in a complete binary tree)
-        M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-        M[3,j]: Feature index of the feature (scale feature id if the feature is scale or
-        categorical feature id if the feature is categorical)
-        that node j looks at if j is an internal node, otherwise 0
-        M[4,j]: Type of the feature that node j looks at if j is an internal node: holds
-        the same information as R input vector
-        M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale,
-        otherwise the size of the subset of values
-        stored in rows 6,7,... if j is categorical
-        If j is a leaf node: number of misclassified samples reaching at node j
-        M[6:,j]: If j is an internal node: Threshold the example's feature value is compared
-        to is stored at M[6,j] if the feature chosen for j is scale,
-        otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
-        If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+        of shape 1-by-(ncol(X)+1), where the last entry is the y type
+    :param max_depth: Maximum depth of the learned tree (stopping criterion)
+    :param min_leaf: Minimum number of samples in leaf nodes (stopping criterion),
+        odd number recommended to avoid 50/50 leaf label decisions
+    :param min_split: Minimum number of samples in leaf for attempting a split
+    :param max_features: Parameter controlling the number of features used as split
+        candidates at tree nodes: m = ceil(num_features^max_features)
+    :param max_values: Parameter controlling the number of values per feature used
+        as split candidates: nb = ceil(num_values^max_values)
+    :param impurity: Impurity measure: entropy, gini (default), rss (regression)
+    :param seed: Fixed seed for randomization of samples and split candidates
+    :param verbose: Flag indicating verbose debug output
+    :return: Matrix M containing the learned tree, in linearized form
+        For example, given a feature matrix with features [a,b,c,d]
+        and the following tree, M would look as follows:
+        (L1)               |d<5|
+                          /     \\
+        (L2)           P1:2    |a<7|
+                               /   \\
+        (L3)                 P2:2 P3:1
+        --> M :=
+        [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
+        |(L1)| |  (L2)   | |        (L3)         |
     """
 
-    params_dict = {'X': X, 'Y': Y, 'R': R}
+    params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
     params_dict.update(kwargs)
     return Matrix(X.sds_context,
         'decisionTree',
diff --git a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
index 51a396eef7e..32bb06609ba 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/decisionTreePredict.py
@@ -29,40 +29,32 @@
 from systemds.utils.consts import VALID_INPUT_TYPES
 
 
-def decisionTreePredict(M: Matrix,
-                        X: Matrix,
-                        strategy: str):
+def decisionTreePredict(X: Matrix,
+                        ctypes: Matrix,
+                        M: Matrix,
+                        **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-    
-     Builtin script implementing prediction based on classification trees with scale features using prediction methods of the
+     This script implements decision tree prediction for recoded and binned
+     categorical and numerical input features, using prediction methods of the
      Hummingbird paper (https://www.usenix.org/system/files/osdi20-nakandala.pdf).
     
     
     
-    :param M: Decision tree matrix M, as generated by scripts/builtin/decisionTree.dml, where each column corresponds
-        to a node in the learned tree and each row contains the following information:
-        M[1,j]: id of node j (in a complete binary tree)
-        M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-        M[3,j]: Feature index of the feature (scale feature id if the feature is scale or
-        categorical feature id if the feature is categorical)
-        that node j looks at if j is an internal node, otherwise 0
-        M[4,j]: Type of the feature that node j looks at if j is an internal node: holds
-        the same information as R input vector
-        M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale,
-        otherwise the size of the subset of values
-        stored in rows 6,7,... if j is categorical
-        If j is a leaf node: number of misclassified samples reaching at node j
-        M[6:,j]: If j is an internal node: Threshold the example's feature value is compared
-        to is stored at M[6,j] if the feature chosen for j is scale,
-        otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
-        If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
-    :param X: Feature matrix X
-    :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"], referring to "Generic matrix multiplication",
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation,
+        optional for accuracy evaluation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+    :param M: Matrix M holding the learned tree in linearized form
+        see decisionTree() for the detailed tree representation.
+    :param strategy: Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
+        referring to "Generic matrix multiplication",
         "Tree traversal", and "Perfect tree traversal", respectively
-    :return: Matrix containing the predicted labels for X
+    :param verbose: Flag indicating verbose debug output
+    :return: Label vector of predictions
     """
 
-    params_dict = {'M': M, 'X': X, 'strategy': strategy}
-    return Matrix(M.sds_context,
+    params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
+    params_dict.update(kwargs)
+    return Matrix(X.sds_context,
         'decisionTreePredict',
         named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
new file mode 100644
index 00000000000..731d6d232c1
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/lmPredictStats.py
@@ -0,0 +1,50 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/lmPredictStats.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
+from systemds.script_building.dag import OutputType
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def lmPredictStats(yhat: Matrix,
+                   ytest: Matrix,
+                   lm: bool):
+    """
+     This builtin function computes and prints a summary of accuracy
+     measures for regression problems.
+    
+    
+    
+    :param yhat: column vector of predicted response values y
+    :param ytest: column vector of actual response values y
+    :param lm: indicator if used for linear regression model
+    :return: column vector holding avg_res, ss_avg_res, and R2
+    """
+
+    params_dict = {'yhat': yhat, 'ytest': ytest, 'lm': lm}
+    return Matrix(yhat.sds_context,
+        'lmPredictStats',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/pca.py b/src/main/python/systemds/operator/algorithm/builtin/pca.py
index 403f9cfca1a..016c7caf7f6 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/pca.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/pca.py
@@ -32,18 +32,45 @@
 def pca(X: Matrix,
         **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     The function Principal Component Analysis (PCA) is used for dimensionality reduction
+     This builtin defines PCA that is a technique typically used to
+     reduce the number of dimensions of a matrix.
+     This implementation is based on calculating eigenvectors on
+     the covariance matrix of the input.
+    
+     An example of calling in DML:
+    
+     .. code-block::
+    
+       data = read($1)
+       [data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE)
+       print(Components)
+    
+    
+     An example in a ML pipeline containing PCA:
+    
+     .. code-block::
+    
+       X = read($1)
+       [X_reduced, Components] = pca(data=X, K=4)
+       Y = read($2)
+       bias = l2svm(X=X, Y=Y)
+       X_test = read($3)
+       [y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias)
+       write($5, Y_predict_rounded)
+    
     
     
     
     :param X: Input feature matrix
-    :param K: Number of reduced dimensions (i.e., columns)
-    :param Center: Indicates whether or not to center the feature matrix
-    :param Scale: Indicates whether or not to scale the feature matrix
+    :param K: Number of components returned
+    :param center: Indicates whether or not to center the feature matrix
+    :param scale: Indicates whether or not to scale the feature matrix
+    :param onlyComponents: Indicates whether only the components should be calculated and returned,
+        not the application of the components on X
     :return: Output feature matrix with K columns
-    :return: Output dominant eigen vectors (can be used for projections)
+    :return: Output dominant eigen vectors sorted by influence
     :return: The column means of the input, subtracted to construct the PCA
-    :return: The Scaling of the values, to make each dimension same size.
+    :return: The scaling of the values, to make each dimension same size.
     """
 
     params_dict = {'X': X}
diff --git a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
index b2b4424ff64..5c4bb0438ad 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForest.py
@@ -30,66 +30,55 @@
 
 
 def randomForest(X: Matrix,
-                 Y: Matrix,
-                 R: Matrix,
+                 y: Matrix,
+                 ctypes: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     This script implement classification random forest with both scale and categorical features.
+     This script implements random forest for recoded and binned categorical and
+     numerical input features. In detail, we train multiple CART (classification
+     and regression trees) decision trees in parallel and use them as an ensemble
+     classifier/regressor. Each tree is trained on a sample of observations (rows)
+     and optionally subset of features (columns). During tree construction, split
+     candidates are additionally chosen on a sample of remaining features.
     
     
     
-    :param X: Feature matrix X; note that X needs to be both recoded and dummy coded
-    :param Y: Label matrix Y; note that Y needs to be both recoded and dummy coded
-    :param R: Matrix which for each feature in X contains the following information
-        - R[,1]: column ids       TODO pass recorded and binned
-        - R[,2]: start indices
-        - R[,3]: end indices
-        If R is not provided by default all variables are assumed to be scale
-    :param bins: Number of equiheight bins per scale feature to choose thresholds
-    :param depth: Maximum depth of the learned tree
-    :param num_leaf: Number of samples when splitting stops and a leaf node is added
-    :param num_samples: Number of samples at which point we switch to in-memory subtree building
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+        of shape 1-by-(ncol(X)+1), where the last entry is the y type
     :param num_trees: Number of trees to be learned in the random forest model
-    :param subsamp_rate: Parameter controlling the size of each tree in the forest; samples are selected from a
-        Poisson distribution with parameter subsamp_rate (the default value is 1.0)
-    :param feature_subset: Parameter that controls the number of feature used as candidates for splitting at each tree node
-        as a power of number of features in the dataset;
-        by default square root of features (i.e., feature_subset = 0.5) are used at each tree node
-    :param impurity: Impurity measure: entropy or Gini (the default)
-    :return: Matrix M containing the learned tree, where each column corresponds to a node
-        in the learned tree and each row contains the following information:
-        M[1,j]: id of node j (in a complete binary tree)
-        M[2,j]: tree id to which node j belongs
-        M[3,j]: Offset (no. of columns) to left child of j
-        M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
-        M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2
-        for categorical features,
-        otherwise the label that leaf node j is supposed to predict
-        M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the
-        size of the subset of values
-        stored in rows 7,8,... if j is categorical
-        M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is
-        compared to is stored at M[7,j] if the feature chosen for j is scale;
-        If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j
-    :return: Matrix C containing the number of times samples are chosen in each tree of the random forest
-    :return: Mappings from scale feature ids to global feature ids
-    :return: Mappings from categorical feature ids to global feature ids
+    :param sample_frac: Sample fraction of examples for each tree in the forest
+    :param feature_frac: Sample fraction of features for each tree in the forest
+    :param max_depth: Maximum depth of the learned tree (stopping criterion)
+    :param min_leaf: Minimum number of samples in leaf nodes (stopping criterion)
+    :param min_split: Minimum number of samples in leaf for attempting a split
+    :param max_features: Parameter controlling the number of features used as split
+        candidates at tree nodes: m = ceil(num_features^max_features)
+    :param max_values: Parameter controlling the number of values per feature used
+        as split candidates: nb = ceil(num_values^max_values)
+    :param impurity: Impurity measure: entropy, gini (default), rss (regression)
+    :param seed: Fixed seed for randomization of samples and split candidates
+    :param verbose: Flag indicating verbose debug output
+    :return: Matrix M containing the learned trees, in linearized form
+        For example, given a feature matrix with features [a,b,c,d]
+        and the following two trees, M would look as follows:
+        (L1)          |a<7|                   |d<5|
+                     /     \\                 /     \\
+        (L2)     |c<3|     |b<4|         |a<7|     P3:2
+                 /   \\     /   \\         /   \\
+        (L3)   P1:2 P2:1 P3:1 P4:2     P1:2 P2:1
+        --> M :=
+        [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
+        [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
+        |(L1)| |  (L2)   | |        (L3)         |
+        With feature sampling (feature_frac < 1), each tree is
+        prefixed by a one-hot vector of sampled features
+        (e.g., [1,1,1,0] if we sampled a,b,c of the four features)
     """
 
-    params_dict = {'X': X, 'Y': Y, 'R': R}
+    params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
     params_dict.update(kwargs)
-    
-    vX_0 = Matrix(X.sds_context, '')
-    vX_1 = Matrix(X.sds_context, '')
-    vX_2 = Matrix(X.sds_context, '')
-    vX_3 = Matrix(X.sds_context, '')
-    output_nodes = [vX_0, vX_1, vX_2, vX_3, ]
-
-    op = MultiReturn(X.sds_context, 'randomForest', output_nodes, named_input_nodes=params_dict)
-
-    vX_0._unnamed_input_nodes = [op]
-    vX_1._unnamed_input_nodes = [op]
-    vX_2._unnamed_input_nodes = [op]
-    vX_3._unnamed_input_nodes = [op]
-
-    return op
+    return Matrix(X.sds_context,
+        'randomForest',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
new file mode 100644
index 00000000000..c7a598faa56
--- /dev/null
+++ b/src/main/python/systemds/operator/algorithm/builtin/randomForestPredict.py
@@ -0,0 +1,56 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By   : src/main/python/generator/generator.py
+# Autogenerated From : scripts/builtin/randomForestPredict.dml
+
+from typing import Dict, Iterable
+
+from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
+from systemds.script_building.dag import OutputType
+from systemds.utils.consts import VALID_INPUT_TYPES
+
+
+def randomForestPredict(X: Matrix,
+                        ctypes: Matrix,
+                        M: Matrix,
+                        **kwargs: Dict[str, VALID_INPUT_TYPES]):
+    """
+     This script implements random forest prediction for recoded and binned
+     categorical and numerical input features.
+    
+    
+    
+    :param X: Feature matrix in recoded/binned representation
+    :param y: Label matrix in recoded/binned representation,
+        optional for accuracy evaluation
+    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
+    :param M: Matrix M holding the learned trees (one tree per row),
+        see randomForest() for the detailed tree representation.
+    :param verbose: Flag indicating verbose debug output
+    :return: Label vector of predictions
+    """
+
+    params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
+    params_dict.update(kwargs)
+    return Matrix(X.sds_context,
+        'randomForestPredict',
+        named_input_nodes=params_dict)
diff --git a/src/main/python/systemds/operator/algorithm/builtin/scale.py b/src/main/python/systemds/operator/algorithm/builtin/scale.py
index 015709d8c68..33203fafb68 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/scale.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/scale.py
@@ -32,16 +32,23 @@
 def scale(X: Matrix,
           **kwargs: Dict[str, VALID_INPUT_TYPES]):
     """
-     This function scales and center individual features in the input matrix (column wise.) using z-score to scale the values.
+     This function scales and centers individual features in the input
+     matrix (column-wise) using z-score to scale the values.
+     The transformation is sometimes also called scale and shift,
+     but it is shifted first and then subsequently scaled.
+    
+     The method is not resistant to inputs containing NaN or overflows
+     of doubles, but handles them by guaranteeing that no extra NaN values
+     are introduced and columns that contain NaN will not be scaled or shifted.
     
     
     
     :param X: Input feature matrix
-    :param center: Indicates whether or not to center the feature matrix
-    :param scale: Indicates whether or not to scale the feature matrix
-    :return: Output feature matrix with K columns
+    :param center: Indicates whether to center the feature matrix
+    :param scale: Indicates whether to scale the feature matrix according to z-score
+    :return: Output feature matrix scaled and shifted
     :return: The column means of the input, subtracted if Center was TRUE
-    :return: The Scaling of the values, to make each dimension have similar value ranges
+    :return: The scaling of the values, to make each dimension have similar value ranges
     """
 
     params_dict = {'X': X}
diff --git a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
index a8c34cc0b98..2ca2991391d 100644
--- a/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
+++ b/src/main/python/systemds/operator/algorithm/builtin/slicefinder.py
@@ -41,8 +41,8 @@ def slicefinder(X: Matrix,
     
     
     
-    :param X: Recoded dataset into Matrix
-    :param e: Trained model
+    :param X: Feature matrix in recoded/binned representation
+    :param e: Error vector of trained model
     :param k: Number of subsets required
     :param maxL: maximum level L (conjunctions of L predicates), 0 unlimited
     :param minSup: minimum support (min number of rows per slice)