apache · krishnakalyan3 · Jun 9, 2017 · Jun 11, 2017 · Jun 12, 2017 · Jun 12, 2017
diff --git a/scripts/perftest/python/configuration.py b/scripts/perftest/python/configuration.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import os
+from os.path import join
+from utils import split_rowcol, config_writer
+import sys
+import logging
+
+# Value written as the third entry of the naive-bayes datagen config for each
+# matrix type (presumably the sparsity/density of the generated matrix --
+# TODO confirm against the datagen script).
+mat_type = dict(dense=0.9, sparse=0.01)
+
+# Data format string embedded in the generated datagen configs.
+# NOTE(review): this module-level name shadows the builtin ``format``.
+format = 'csv'
+
+# Algorithms for which init_conf writes a predict config.
+has_predict = [
+    'GLM',
+    'Kmeans',
+    'l2-svm',
+    'm-svm',
+    'naive-bayes',
+]
+
+
+def naive_bayes_datagen(matrix_type, mat_shapes, conf_dir):
+    """Write one naive-bayes data generation config file per matrix shape.
+
+    Each file is named ``naive_bayes_datagen_<matrix_type>_<index>.json`` and
+    is written into ``conf_dir`` through ``config_writer``.
+
+    Args:
+        matrix_type: Key into the module-level ``mat_type`` table
+            ('dense' or 'sparse').
+        mat_shapes: Sequence of (rows, cols) pairs; ``index`` in the file
+            name is the position of the pair in this sequence.
+        conf_dir: Directory that receives the config files.
+
+    Raises:
+        KeyError: If ``matrix_type`` is not a key of ``mat_type`` (only
+            reached when ``mat_shapes`` is non-empty).
+    """
+    for index, dim in enumerate(mat_shapes):
+        file_name = 'naive_bayes_datagen_{}_{}.json'.format(matrix_type, index)
+
+        # Positional config list; the order presumably mirrors the argument
+        # order of the datagen script, and 150 / 0 are fixed settings whose
+        # meaning is not visible here -- TODO confirm.
+        config = [dim[0], dim[1], mat_type[matrix_type], 150, 0,
+                  'X.data', 'Y.data', format]
+        config_writer(conf_dir, config, file_name)
+
+
+def kmeans_datagen(matrix_type, mat_shapes, conf_dir):
+    """Write one k-means data generation config file per matrix shape.
+
+    Files are named ``kmeans_datagen_<index>.json`` and are written into
+    ``conf_dir`` through ``config_writer``. ``matrix_type`` is accepted so the
+    signature matches the other datagen writers, but it is not used here.
+    """
+    for position, shape in enumerate(mat_shapes):
+        settings = {
+            'nr': shape[0],
+            'nf': shape[1],
+            'nc': '5',
+            'dc': '10.0',
+            'dr': '1.0',
+            'fbf': '100.0',
+            'cbf': '100.0',
+            'X': 'X.data',
+            'C': 'C.data',
+            'Y': 'Y.data',
+            'YbyC': 'YbyC.data',
+            'fmt': format,
+        }
+        target_name = 'kmeans_datagen_' + str(position) + '.json'
+        config_writer(conf_dir, settings, target_name)
+    return None
+
+
+def kmeans_train(conf_dir):
+    """Write the k-means training config to ``kmeans_train.json`` in ``conf_dir``."""
+    settings = {
+        'X': 'X.data',
+        'k': 5,
+        'maxi': 10,
+        'runs': 10,
+        'tol': 1e-8,
+        'samp': 20,
+        'C': 'C.data',
+        'isY': 'TRUE',
+        'Y': 'Y.data',
+        'verb': 'TRUE',
+    }
+    config_writer(conf_dir, settings, 'kmeans_train.json')
+    return None
+
+
+def kmeans_predict(conf_dir):
+    """Write the k-means predict config to ``kmeans_predict.json`` in ``conf_dir``."""
+    settings = {
+        'X': 'X.data',
+        'C': 'C.data',
+        'prY': 'prY.data',
+    }
+    config_writer(conf_dir, settings, 'kmeans_predict.json')
+    return None
+
+
+def _lookup_conf_writer(algo_name, stage):
+    """Return the module-level ``<algo>_<stage>`` config writer function.
+
+    The algorithm name is lower-cased and '-' is mapped to '_' so that names
+    such as 'naive-bayes' resolve to valid Python identifiers for every stage.
+
+    Raises:
+        KeyError: If no such function is defined in this module.
+    """
+    func_name = '{}_{}'.format(algo_name.lower().replace('-', '_'), stage)
+    try:
+        return globals()[func_name]
+    except KeyError:
+        raise KeyError('no {} config writer for algorithm {!r} (expected a function '
+                       'named {})'.format(stage, algo_name, func_name)) from None
+
+
+def init_conf(algo, temp_dir, matrix_type, matrix_shape, job):
+    """Create the working directories and write the requested config files.
+
+    Args:
+        algo: Iterable of algorithm names (e.g. 'Kmeans', 'naive-bayes').
+        temp_dir: Root directory; the 'conf', 'data_gen', 'train' and 'pred'
+            sub-directories are created below it.
+        matrix_type: Passed through to the datagen config writers.
+        matrix_shape: Shape specifications handed to ``split_rowcol``.
+        job: Three flags ``[generate_data, train, predict]``; a stage is
+            processed when its flag equals 1.
+
+    Raises:
+        KeyError: If a selected stage has no config writer for an algorithm.
+    """
+    # Create directories
+    conf_dir = join(temp_dir, 'conf')
+    gen_dir = join(temp_dir, 'data_gen')
+    train_dir = join(temp_dir, 'train')
+    pred_dir = join(temp_dir, 'pred')
+
+    for directory in [conf_dir, gen_dir, train_dir, pred_dir]:
+        # exist_ok avoids the check-then-create race of os.path.exists().
+        os.makedirs(directory, exist_ok=True)
+
+    mat_shapes = split_rowcol(matrix_shape)
+
+    if job[0] == 1:
+        for current_algo in algo:
+            write_conf = _lookup_conf_writer(current_algo, 'datagen')
+            write_conf(matrix_type, mat_shapes, conf_dir)
+            logging.info('Completed writing {} datagen file'.format(current_algo))
+
+    if job[1] == 1:
+        for current_algo in algo:
+            write_conf = _lookup_conf_writer(current_algo, 'train')
+            write_conf(conf_dir)
+            logging.info('Completed writing {} training file'.format(current_algo))
+
+    if job[2] == 1:
+        for current_algo in algo:
+            # Only algorithms listed in has_predict get a predict config.
+            if current_algo in has_predict:
+                write_conf = _lookup_conf_writer(current_algo, 'predict')
+                write_conf(conf_dir)
+                logging.info('Completed writing {} predict file'.format(current_algo))
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# TODO:
+# Handle Intercept
+
+import sys
+import argparse
+from functools import reduce
+import os
+from os.path import join
+from utils import get_algo
+from configuration import init_conf
+import logging
+import time
+
+# Algorithm families and the algorithms each one contains. An algorithm may
+# be listed under more than one family.
+ml_algo = dict(
+    binomial=['MultiLogReg', 'l2-svm', 'm-svm'],
+    clustering=['Kmeans'],
+    multinomial=['naive-bayes', 'MultiLogReg', 'm-svm'],
+    regression=['LinearRegDS', 'LinearRegCG', 'GLM'],
+    stats=['Univar-Stats', 'bivar-stats', 'stratstats'],
+)
+
+
+def main(family, algo, exec_type, mat_type, mat_shape, temp_dir, generate_data, train, predict):
+    """Resolve the algorithm list and write config files for the selected stages.
+
+    When ``algo`` is None the algorithms are derived from ``family`` through
+    ``get_algo``. The three stage switches are converted to ints and handed to
+    ``init_conf`` as ``[generate_data, train, predict]``. ``exec_type`` is
+    accepted but not used by this function.
+    """
+    selected = get_algo(family, ml_algo) if algo is None else algo
+    stage_flags = [int(flag) for flag in (generate_data, train, predict)]
+    init_conf(selected, temp_dir, mat_type, mat_shape, stage_flags)
+    return None
+
+
+if __name__ == '__main__':
+    # Flat list of every known algorithm. Algorithms that belong to several
+    # families appear more than once, which is harmless for argparse choices.
+    algo_flat = [name for names in ml_algo.values() for name in names]
+
+    cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
+    group = cparser.add_mutually_exclusive_group(required=True)
+    group.add_argument('--family', help='specify class of algorithms (e.g regression, binomial)', metavar='',
+                       choices=ml_algo.keys(), nargs='+')
+
+    group.add_argument('--algo', help='specify the type of algorithm to run', metavar='',
+                       choices=algo_flat, nargs='+')
+    # The single-dash spellings are kept as aliases so existing invocations
+    # keep working; argparse derives dest from the first '--' option string.
+    cparser.add_argument('--exec-type', '-exec-type', default='singlenode',
+                         help='System-ML backend (e.g singlenode, hybrid_spark)', metavar='',
+                         choices=['hybrid_spark', 'singlenode'])
+    cparser.add_argument('--mat-type', default='dense', help='Type of matrix to generate (e.g dense '
+                                                             'or sparse)', metavar='', choices=['sparse', 'dense'])
+    # Required: the shapes are always parsed, so omitting them used to fail
+    # later with a TypeError instead of a usage message.
+    cparser.add_argument('--mat-shape', help='Shape of matrix to generate (e.g '
+                                             '10k_1k)', metavar='', nargs='+', required=True)
+
+    # Optional Arguments
+    cparser.add_argument('--temp-dir', '-temp-dir', help='specify temporary directory', metavar='')
+    cparser.add_argument('--generate-data', help='generate data', action='store_true')
+    cparser.add_argument('--train', help='train algorithms', action='store_true')
+    cparser.add_argument('--predict', help='predict (if available)', action='store_true')
+
+    # --family and --algo values are already validated by argparse through
+    # ``choices``, so no manual membership check is needed here.
+    args = cparser.parse_args()
+
+    if args.temp_dir is None:
+        systemml_home = os.environ.get('SYSTEMML_HOME')
+        if systemml_home is None:
+            # cparser.error prints the usage text and exits with status 2.
+            cparser.error('SYSTEMML_HOME is not set; specify a temporary directory explicitly')
+        args.temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
+
+    # The log file lives inside temp_dir, so the directory has to exist
+    # before logging is configured.
+    os.makedirs(args.temp_dir, exist_ok=True)
+
+    start_time = time.time()
+    logging.basicConfig(filename=join(args.temp_dir, 'perftest.out'), level=logging.INFO)
+    logging.info('New experiment state time {}'.format(start_time))
+    logging.info(args)
+
+    # The parsed option names match main()'s parameter names one-to-one.
+    main(**vars(args))
diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from functools import reduce
+import os
+import json
+
+
+def get_algo(family, ml_algo):
+    """Return the algorithms of the given families as one flat list.
+
+    Args:
+        family: Iterable of family names; each must be a key of ``ml_algo``.
+        ml_algo: Mapping from family name to a list of algorithm names.
+
+    Returns:
+        A new list holding the algorithms of every requested family, in
+        order. Algorithms shared by several families are repeated. An empty
+        ``family`` yields an empty list.
+
+    Raises:
+        KeyError: If a family name is not present in ``ml_algo``.
+    """
+    # Building a fresh list (rather than reducing with +) handles an empty
+    # ``family`` and never returns the very list object stored in ``ml_algo``.
+    return [name for fam in family for name in ml_algo[fam]]
+
+
+def split_rowcol(matrix_dim):
+    """Expand shape strings such as '10k_1M' into (rows, cols) string pairs.
+
+    Every 'M' is replaced by six zeros and every 'k' by three zeros, then the
+    text is split on '_'. The pair members stay strings, e.g. '10k_1k'
+    becomes ('10000', '1000').
+
+    Raises:
+        ValueError: If a specification does not split into exactly two parts.
+    """
+    thousand = '0' * 3
+    million = '0' * 6
+
+    def expand(spec):
+        rows, cols = spec.replace('M', million).replace('k', thousand).split('_')
+        return (rows, cols)
+
+    return [expand(spec) for spec in matrix_dim]
+
+
+def config_writer(path, config_dict, file_name):
+    """Serialise a configuration to ``<path>/<file_name>`` as JSON.
+
+    Args:
+        path: Target directory; created (including parents) when missing.
+        config_dict: JSON-serialisable configuration. Despite the name, any
+            value accepted by ``json.dump`` works (e.g. a list).
+        file_name: Name of the file to write; an existing file is overwritten.
+    """
+    # exist_ok avoids the check-then-create race of os.path.exists().
+    os.makedirs(path, exist_ok=True)
+
+    with open(os.path.join(path, file_name), 'w') as json_file:
+        json.dump(config_dict, json_file)