Skip to content

Commit

Permalink
[MINOR] Python autogenerate new Builtins
Browse files Browse the repository at this point in the history
  • Loading branch information
Baunsgaard committed May 10, 2023
1 parent c8a8506 commit 2d4114e
Show file tree
Hide file tree
Showing 10 changed files with 303 additions and 124 deletions.
6 changes: 6 additions & 0 deletions src/main/python/systemds/operator/algorithm/__init__.py
Expand Up @@ -31,6 +31,7 @@
from .builtin.alsTopkPredict import alsTopkPredict
from .builtin.apply_pipeline import apply_pipeline
from .builtin.arima import arima
from .builtin.auc import auc
from .builtin.autoencoder_2layer import autoencoder_2layer
from .builtin.bandit import bandit
from .builtin.bivar import bivar
Expand Down Expand Up @@ -110,6 +111,7 @@
from .builtin.lmCG import lmCG
from .builtin.lmDS import lmDS
from .builtin.lmPredict import lmPredict
from .builtin.lmPredictStats import lmPredictStats
from .builtin.logSumExp import logSumExp
from .builtin.matrixProfile import matrixProfile
from .builtin.mcc import mcc
Expand Down Expand Up @@ -137,6 +139,7 @@
from .builtin.pnmf import pnmf
from .builtin.ppca import ppca
from .builtin.randomForest import randomForest
from .builtin.randomForestPredict import randomForestPredict
from .builtin.scale import scale
from .builtin.scaleApply import scaleApply
from .builtin.scaleMinMax import scaleMinMax
Expand Down Expand Up @@ -182,6 +185,7 @@
'alsTopkPredict',
'apply_pipeline',
'arima',
'auc',
'autoencoder_2layer',
'bandit',
'bivar',
Expand Down Expand Up @@ -261,6 +265,7 @@
'lmCG',
'lmDS',
'lmPredict',
'lmPredictStats',
'logSumExp',
'matrixProfile',
'mcc',
Expand Down Expand Up @@ -288,6 +293,7 @@
'pnmf',
'ppca',
'randomForest',
'randomForestPredict',
'scale',
'scaleApply',
'scaleMinMax',
Expand Down
49 changes: 49 additions & 0 deletions src/main/python/systemds/operator/algorithm/builtin/auc.py
@@ -0,0 +1,49 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

# Autogenerated By : src/main/python/generator/generator.py
# Autogenerated From : scripts/builtin/auc.dml

from typing import Dict, Iterable

from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES


def auc(Y: Matrix,
        P: Matrix):
    """
    This builtin function computes the area under the ROC curve (AUC)
    for binary classifiers.

    :param Y: Binary response vector (shape: n x 1), in -1/+1 or 0/1 encoding
    :param P: Prediction scores (predictor such as estimated probabilities)
        for true class (shape: n x 1), assumed in [0,1]
    :return: Area under the ROC curve (AUC)
    """
    # Forward both inputs as named arguments to the 'auc' DML builtin;
    # the resulting operation node lives in the same SystemDS context as Y.
    params_dict = {'Y': Y, 'P': P}
    return Matrix(Y.sds_context,
        'auc',
        named_input_nodes=params_dict)
63 changes: 33 additions & 30 deletions src/main/python/systemds/operator/algorithm/builtin/decisionTree.py
Expand Up @@ -30,43 +30,46 @@


def decisionTree(X: Matrix,
Y: Matrix,
R: Matrix,
y: Matrix,
ctypes: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
Builtin script implementing classification trees with scale and categorical features
This script implements decision trees for recoded and binned categorical and
numerical input features. We train a single CART (classification and
regression tree) decision trees depending on the provided labels y, either
classification (majority vote per leaf) or regression (average per leaf).
:param X: Feature matrix X; note that X needs to be both recoded and dummy coded
:param Y: Label matrix Y; note that Y needs to be both recoded and dummy coded
:param R: Matrix R which for each feature in X contains the following information
- R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
a scalar feature vector, other positive Integers indicate the number of categories
If R is not provided by default all variables are assumed to be scale
:param bins: Number of equiheight bins per scale feature to choose thresholds
:param depth: Maximum depth of the learned tree
:param verbose: boolean specifying if the algorithm should print information while executing
:return: Matrix M where each column corresponds to a node in the learned tree and each row
contains the following information:
M[1,j]: id of node j (in a complete binary tree)
M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
M[3,j]: Feature index of the feature (scale feature id if the feature is scale or
categorical feature id if the feature is categorical)
that node j looks at if j is an internal node, otherwise 0
M[4,j]: Type of the feature that node j looks at if j is an internal node: holds
the same information as R input vector
M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale,
otherwise the size of the subset of values
stored in rows 6,7,... if j is categorical
If j is a leaf node: number of misclassified samples reaching at node j
M[6:,j]: If j is an internal node: Threshold the example's feature value is compared
to is stored at M[6,j] if the feature chosen for j is scale,
otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
:param X: Feature matrix in recoded/binned representation
:param y: Label matrix in recoded/binned representation
:param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
of shape 1-by-(ncol(X)+1), where the last entry is the y type
:param max_depth: Maximum depth of the learned tree (stopping criterion)
:param min_leaf: Minimum number of samples in leaf nodes (stopping criterion),
odd number recommended to avoid 50/50 leaf label decisions
:param min_split: Minimum number of samples in leaf for attempting a split
:param max_features: Parameter controlling the number of features used as split
candidates at tree nodes: m = ceil(num_features^max_features)
:param max_values: Parameter controlling the number of values per feature used
as split candidates: nb = ceil(num_values^max_values)
:param impurity: Impurity measure: entropy, gini (default), rss (regression)
:param seed: Fixed seed for randomization of samples and split candidates
:param verbose: Flag indicating verbose debug output
:return: Matrix M containing the learned trees, in linearized form
For example, given a feature matrix with features [a,b,c,d]
and the following trees, M would look as follows:
(L1) |d<5|
/ \
(L2) P1:2 |a<7|
/ \
(L3) P2:2 P3:1
--> M :=
[[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
|(L1)| | (L2) | | (L3) |
"""

params_dict = {'X': X, 'Y': Y, 'R': R}
params_dict = {'X': X, 'y': y, 'ctypes': ctypes}
params_dict.update(kwargs)
return Matrix(X.sds_context,
'decisionTree',
Expand Down
Expand Up @@ -29,40 +29,32 @@
from systemds.utils.consts import VALID_INPUT_TYPES


def decisionTreePredict(M: Matrix,
X: Matrix,
strategy: str):
def decisionTreePredict(X: Matrix,
                        ctypes: Matrix,
                        M: Matrix,
                        **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    This script implements decision tree prediction for recoded and binned
    categorical and numerical input features, using the prediction methods of
    the Hummingbird paper
    (https://www.usenix.org/system/files/osdi20-nakandala.pdf).

    :param X: Feature matrix in recoded/binned representation
    :param ctypes: Row-Vector of column types [1 scale/ordinal, 2 categorical]
    :param M: Matrix M holding the learned tree in linearized form;
        see decisionTree() for the detailed tree representation.
    :param y: (optional, via kwargs) Label matrix in recoded/binned
        representation, used for accuracy evaluation
    :param strategy: (optional, via kwargs) Prediction strategy, can be one of
        ["GEMM", "TT", "PTT"], referring to "Generic matrix multiplication",
        "Tree traversal", and "Perfect tree traversal", respectively
    :param verbose: (optional, via kwargs) Flag indicating verbose debug output
    :return: Label vector of predictions
    """
    # Required inputs first; optional ones (y, strategy, verbose) arrive
    # through kwargs and are merged into the same named-argument dict.
    params_dict = {'X': X, 'ctypes': ctypes, 'M': M}
    params_dict.update(kwargs)
    return Matrix(X.sds_context,
        'decisionTreePredict',
        named_input_nodes=params_dict)
@@ -0,0 +1,50 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

# Autogenerated By : src/main/python/generator/generator.py
# Autogenerated From : scripts/builtin/lmPredictStats.dml

from typing import Dict, Iterable

from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar
from systemds.script_building.dag import OutputType
from systemds.utils.consts import VALID_INPUT_TYPES


def lmPredictStats(yhat: Matrix,
                   ytest: Matrix,
                   lm: bool):
    """
    Compute and print a summary of accuracy measures for regression problems.

    :param yhat: column vector of predicted response values y
    :param ytest: column vector of actual response values y
    :param lm: indicator if used for linear regression model
    :return: column vector holding avg_res, ss_avg_res, and R2
    """
    # Collect the named inputs and dispatch to the 'lmPredictStats' DML
    # builtin within the SystemDS context of the predictions.
    inputs = {
        'yhat': yhat,
        'ytest': ytest,
        'lm': lm,
    }
    return Matrix(yhat.sds_context, 'lmPredictStats',
                  named_input_nodes=inputs)
39 changes: 33 additions & 6 deletions src/main/python/systemds/operator/algorithm/builtin/pca.py
Expand Up @@ -32,18 +32,45 @@
def pca(X: Matrix,
**kwargs: Dict[str, VALID_INPUT_TYPES]):
"""
The function Principal Component Analysis (PCA) is used for dimensionality reduction
This builtin defines PCA that is a technique typically used to
reduce the number of dimensions of a matrix.
This implementation is based on calculating eigenvectors on
the covariance matrix of the input.
An example of calling in DML:
.. code-block::
data = read($1)
[data_reduced, Components] = pca(data=data, K=4, onlyComponents=TRUE)
print(Components)
An example in a ML pipeline containing PCA:
.. code-block::
X = read($1)
[X_reduced, Components] = pca(data=X, K=4)
Y = read($2)
bias = l2svm(X=X, Y=Y)
X_test = read($3)
[y_predict_normal, Y_predict_rounded] = l2svmPredict(X=X_test, W=bias)
write($5, Y_predict_rounded)
:param X: Input feature matrix
:param K: Number of reduced dimensions (i.e., columns)
:param Center: Indicates whether or not to center the feature matrix
:param Scale: Indicates whether or not to scale the feature matrix
:param K: Number of components returned
:param center: Indicates whether or not to center the feature matrix
:param scale: Indicates whether or not to scale the feature matrix
:param onlyComponents: Indicate if only the components should be calculated and returned
not the application of the components on X
:return: Output feature matrix with K columns
:return: Output dominant eigen vectors (can be used for projections)
:return: Output dominant eigen vectors sorted by influence
:return: The column means of the input, subtracted to construct the PCA
:return: The Scaling of the values, to make each dimension same size.
:return: The scaling of the values, to make each dimension same size.
"""

params_dict = {'X': X}
Expand Down

0 comments on commit 2d4114e

Please sign in to comment.