Skip to content

Commit

Permalink
[MINOR] Python autogenerate new Builtins
Browse files Browse the repository at this point in the history
  • Loading branch information
Baunsgaard committed May 10, 2023
1 parent c8a8506 commit 2d4114e
Show file tree
Hide file tree
Showing 10 changed files with 303 additions and 124 deletions.
6 changes: 6 additions & 0 deletions src/main/python/systemds/operator/algorithm/__init__.py
Expand Up @@ -31,6 +31,7 @@
from .builtin.alsTopkPredict import alsTopkPredict
from .builtin.apply_pipeline import apply_pipeline
from .builtin.arima import arima
from .builtin.auc import auc
from .builtin.autoencoder_2layer import autoencoder_2layer
from .builtin.bandit import bandit
from .builtin.bivar import bivar
Expand Down Expand Up @@ -110,6 +111,7 @@
from .builtin.lmCG import lmCG
from .builtin.lmDS import lmDS
from .builtin.lmPredict import lmPredict
from .builtin.lmPredictStats import lmPredictStats
from .builtin.logSumExp import logSumExp
from .builtin.matrixProfile import matrixProfile
from .builtin.mcc import mcc
Expand Down Expand Up @@ -137,6 +139,7 @@
from .builtin.pnmf import pnmf
from .builtin.ppca import ppca
from .builtin.randomForest import randomForest
from .builtin.randomForestPredict import randomForestPredict
from .builtin.scale import scale
from .builtin.scaleApply import scaleApply
from .builtin.scaleMinMax import scaleMinMax
Expand Down Expand Up @@ -182,6 +185,7 @@
'alsTopkPredict',
'apply_pipeline',
'arima',
'auc',
'autoencoder_2layer',
'bandit',
'bivar',
Expand Down Expand Up @@ -261,6 +265,7 @@
'lmCG',
'lmDS',
'lmPredict',
'lmPredictStats',
'logSumExp',
'matrixProfile',
'mcc',
Expand Down Expand Up @@ -288,6 +293,7 @@
'pnmf',
'ppca',
'randomForest',
'randomForestPredict',
'scale',
'scaleApply',
'scaleMinMax',
Expand Down
49 changes: 49 additions & 0 deletions src/main/python/systemds/operator/algorithm/builtin/auc.py
@@ -0,0 +1,49 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

# Autogenerated By : src/main/python/generator/generator.py
# Autogenerated From : scripts/builtin/auc.dml

from typing import Dict, Iterable

from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES


def auc(Y: Matrix,
        P: Matrix):
    """
    This builtin function computes the area under the ROC curve (AUC)
    for binary classifiers.

    :param Y: Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding
    :param P: Prediction scores (predictor such as estimated probabilities)
        for true class (shape: n x 1), assumed in [0,1]
    :return: Area under the ROC curve (AUC)
    """
    # Forward both inputs as named arguments to the 'auc' DML builtin;
    # the resulting operation node lives in the same SystemDS context as Y.
    params_dict = {'Y': Y, 'P': P}
    return Matrix(Y.sds_context,
        'auc',
        named_input_nodes=params_dict)
63 changes: 33 additions & 30 deletions src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
Expand Up @@ -30,43 +30,46 @@


def decisionTree(X: Matrix,
Y: Matrix,
R: Matrix,
y: Matrix,
ctypes: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
Builtin script implementing classification trees with scale and categorical features
This script implements decision trees for recoded and binned categorical and
numerical input features. We train a single CART (classification and
regression tree) decision trees depending on the provided labels y, either
classification (majority vote per leaf) or regression (average per leaf).
:param X: Feature matrix X; note that X needs to be both recoded and dummy coded
:param Y: Label matrix Y; note that Y needs to be both recoded and dummy coded
:param R: Matrix R which for each feature in X contains the following information
- R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
a scalar feature vector, other positive Integers indicate the number of categories
If R is not provided by default all variables are assumed to be scale
:param bins: Number of equiheight bins per scale feature to choose thresholds
:param depth: Maximum depth of the learned tree
:param verbose: boolean specifying if the algorithm should print information while executing
:return: Matrix M where each column corresponds to a node in the learned tree and each row
contains the following information:
M[1,j]: id of node j (in a complete binary tree)
M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
M[3,j]: Feature index of the feature (scale feature id if the feature is scale or
categorical feature id if the feature is categorical)
that node j looks at if j is an internal node, otherwise 0
M[4,j]: Type of the feature that node j looks at if j is an internal node: holds
the same information as R input vector
M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale,
otherwise the size of the subset of values
stored in rows 6,7,... if j is categorical
If j is a leaf node: number of misclassified samples reaching at node j
M[6:,j]: If j is an internal node: Threshold the example's feature value is compared
to is stored at M[6,j] if the feature chosen for j is scale,
otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
:param X: Feature matrix in recoded/binned representation
:param y: Label matrix in recoded/binned representation
:param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
of shape 1-by-(ncol(X)+1), where the last entry is the y type
:param max_depth: Maximum depth of the learned tree (stopping criterion)
:param min_leaf: Minimum number of samples in leaf nodes (stopping criterion),
odd number recommended to avoid 50/50 leaf label decisions
:param min_split: Minimum number of samples in leaf for attempting a split
:param max_features: Parameter controlling the number of features used as split
candidates at tree nodes: m = ceil(num_features^max_features)
:param max_values: Parameter controlling the number of values per feature used
as split candidates: nb = ceil(num_values^max_values)
:param impurity: Impurity measure: entropy, gini (default), rss (regression)
:param seed: Fixed seed for randomization of samples and split candidates
:param verbose: Flag indicating verbose debug output
:return: Matrix M containing the learned trees, in linearized form
For example, given a feature matrix with features [a,b,c,d]
and the following trees, M would look as follows:
(L1) |d<5|
/ \
(L2) P1:2 |a<7|
/ \
(L3) P2:2 P3:1
--> M :=
[[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
|(L1)| | (L2) | | (L3) |
"""

params_dict = {'X': X, 'Y': Y, 'R': R}
params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
params_dict.update(kwargs)
return Matrix(X.sds_context,
'decisionTree',
Expand Down
Expand Up @@ -29,40 +29,32 @@
from systemds.utils.consts import VALID_INPUT_TYPES


def decisionTreePredict(M: Matrix,
X: Matrix,
strategy: str):
def decisionTreePredict(X: Matrix,
                        ctypes: Matrix,
                        M: Matrix,
                        **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    This script implements decision tree prediction for recoded and binned
    categorical and numerical input features, using the prediction methods of
    the Hummingbird paper
    (https://www.usenix.org/system/files/osdi20-nakandala.pdf).

    :param X: Feature matrix in recoded/binned representation
    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
    :param M: Matrix M holding the learned tree in linearized form;
        see decisionTree() for the detailed tree representation.
    :param y: (optional, via kwargs) Label matrix in recoded/binned
        representation, used for accuracy evaluation
    :param strategy: (optional, via kwargs) Prediction strategy, can be one of
        ["GEMM", "TT", "PTT"], referring to "Generic matrix multiplication",
        "Tree traversal", and "Perfect tree traversal", respectively
    :param verbose: (optional, via kwargs) Flag indicating verbose debug output
    :return: Label vector of predictions
    """
    # Required inputs first; optional ones (y, strategy, verbose) arrive
    # through kwargs and are merged into the same named-argument dict.
    params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
    params_dict.update(kwargs)
    return Matrix(X.sds_context,
        'decisionTreePredict',
        named_input_nodes=params_dict)
@@ -0,0 +1,50 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

# Autogenerated By : src/main/python/generator/generator.py
# Autogenerated From : scripts/builtin/lmPredictStats.dml

from typing import Dict, Iterable

from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES


def lmPredictStats(yhat: Matrix,
                   ytest: Matrix,
                   lm: bool):
    """
    Compute and print a summary of accuracy measures for regression problems.

    :param yhat: column vector of predicted response values y
    :param ytest: column vector of actual response values y
    :param lm: indicator if used for linear regression model
    :return: column vector holding avg_res, ss_avg_res, and R2
    """
    # Collect the named inputs and dispatch to the 'lmPredictStats' DML
    # builtin within the SystemDS context of the predictions.
    inputs = {
        'yhat': yhat,
        'ytest': ytest,
        'lm': lm,
    }
    return Matrix(yhat.sds_context, 'lmPredictStats',
                  named_input_nodes=inputs)
39 changes: 33 additions & 6 deletions src/main/python/systemds/operator/algorithm/builtin/pca.py
Expand Up @@ -32,18 +32,45 @@
def pca(X: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
The function Principal Component Analysis (PCA) is used for dimensionality reduction
This builtin defines PCA that is a technique typically used to
reduce the number of dimensions of a matrix.
This implementation is based on calculating eigenvectors on
the covariance matrix of the input.
An example of calling in DML:
.. code-block::
data = read($1)
[data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE)
print(Components)
An example in a ML pipeline containing PCA:
.. code-block::
X = read($1)
[X_reduced, Components] = pca(data=X, K=4)
Y = read($2)
bias = l2svm(X=X, Y=Y)
X_test = read($3)
[y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias)
write($5, Y_predict_rounded)
:param X: Input feature matrix
:param K: Number of reduced dimensions (i.e., columns)
:param Center: Indicates whether or not to center the feature matrix
:param Scale: Indicates whether or not to scale the feature matrix
:param K: Number of components returned
:param center: Indicates whether or not to center the feature matrix
:param scale: Indicates whether or not to scale the feature matrix
:param onlyComponents: Indicate if only the components should be calculated and returned
not the application of the components on X
:return: Output feature matrix with K columns
:return: Output dominant eigen vectors (can be used for projections)
:return: Output dominant eigen vectors sorted by influence
:return: The column means of the input, subtracted to construct the PCA
:return: The Scaling of the values, to make each dimension same size.
:return: The scaling of the values, to make each dimension same size.
"""

params_dict = {'X': X}
Expand Down

0 comments on commit 2d4114e

Please sign in to comment.