From e7cfcadc9b0e72637c67c8d6a6dcc62f62ba5177 Mon Sep 17 00:00:00 2001
From: krishnakalyan3
Date: Sun, 2 Jul 2017 00:00:49 -0700
Subject: [PATCH] [SYSTEMML-1451][GSoC Phase 1] Single script to run perf tests

- Single entry point to run perf tests in any combination of algorithms,
  families, matrix shapes & densities
- Reports the time taken by a single perf test by parsing the output and
  grep-ing for the time
- Detects tests that did not run and reports them in the generated log
- Robust error handling and reporting, informative help message

Closes #537
---
 scripts/perftest/python/datagen.py      | 252 +++++++++++++++
 scripts/perftest/python/predict.py      | 285 ++++++++++++++++
 scripts/perftest/python/run_perftest.py | 339 +++++++++++++++++++
 scripts/perftest/python/train.py        | 411 ++++++++++++++++++++++++
 scripts/perftest/python/utils.py        | 296 +++++++++++++++++
 5 files changed, 1583 insertions(+)
 create mode 100755 scripts/perftest/python/datagen.py
 create mode 100755 scripts/perftest/python/predict.py
 create mode 100755 scripts/perftest/python/run_perftest.py
 create mode 100755 scripts/perftest/python/train.py
 create mode 100755 scripts/perftest/python/utils.py

diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
new file mode 100755
index 00000000000..d9c49e9cca6
--- /dev/null
+++ b/scripts/perftest/python/datagen.py
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import itertools
+from os.path import join
+from utils import split_rowcol, config_writer
+
+# This file contains configuration settings for data generation
+DATA_FORMAT = 'csv'
+
+MATRIX_TYPE_DICT = {'dense': '0.9',
+                    'sparse': '0.01'}
+
+FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2']
+
+
+def multinomial_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    num_categories = '150'
+    intercept = '0'
+    X = join(full_path, 'X.data')
+    Y = join(full_path, 'Y.data')
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, sparsity, num_categories, intercept,
+              X, Y, fmt, '1']
+
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    maxFeatureValue = '5'
+    maxWeight = '5'
+    loc_weights = join(full_path, 'weight.data')
+    loc_data = join(full_path, 'X.data')
+    loc_labels = join(full_path, 'Y.data')
+    noise = '1'
+    intercept = '0'
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    transform_labels = '1'
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
+              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def regression1_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    maxFeatureValue = '5'
+    maxWeight = '5'
+    loc_weights = join(full_path, 'weight.data')
+    loc_data = join(full_path, 'X.data')
+    loc_labels = join(full_path, 'Y.data')
+    noise = '1'
+    intercept = '0'
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    transform_labels = '1'
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
+              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def regression2_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    maxFeatureValue = '5'
+    maxWeight = '5'
+    loc_weights = join(full_path, 'weight.data')
+    loc_data = join(full_path, 'X.data')
+    loc_labels = join(full_path, 'Y.data')
+    noise = '1'
+    intercept = '0'
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    transform_labels = '1'
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
+              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def clustering_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['clustering', matrix_type, str(matrix_dim)])
+
+    full_path = join(datagen_dir, path_name)
+    X = join(full_path, 'X.data')
+    Y = join(full_path, 'Y.data')
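+    # Output paths and generator arguments for the clustering data; the short
+    # names below (nc, dc, dr, fbf, cbf) are forwarded to the
+    # genRandData4Kmeans script (see ML_GENDATA in run_perftest.py)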
+    YbyC = join(full_path, 'YbyC.data')
+    C = join(full_path, 'C.data')
+    nc = '50'
+    dc = '10.0'
+    dr = '1.0'
+    fbf = '100.0'
+    cbf = '100.0'
+
+    config = dict(nr=row, nf=col, nc=nc, dc=dc, dr=dr, fbf=fbf, cbf=cbf, X=X, C=C, Y=Y,
+                  YbyC=YbyC, fmt=DATA_FORMAT)
+
+    config_writer(full_path + '.json', config)
+    return full_path
+
+
+def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['stats1', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    DATA = join(full_path, 'X.data')
+    TYPES = join(full_path, 'types')
+    TYPES1 = join(full_path, 'set1.types')
+    TYPES2 = join(full_path, 'set2.types')
+    INDEX1 = join(full_path, 'set1.indices')
+    INDEX2 = join(full_path, 'set2.indices')
+    MAXDOMAIN = '1100'
+    SETSIZE = '20'
+    LABELSETSIZE = '10'
+
+    # NC should satisfy num0 < NC < C, where num0 = NC/2;
+    # we therefore set NC = C/2 (the old fixed value was NC = 10)
+    NC = int(int(col) / 2)
+
+    config = dict(R=row, C=col, NC=NC, MAXDOMAIN=MAXDOMAIN, DATA=DATA, TYPES=TYPES,
+                  SETSIZE=SETSIZE, LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2,
+                  INDEX1=INDEX1, INDEX2=INDEX2, fmt=DATA_FORMAT)
+
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['stats2', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    D = join(full_path, 'X.data')
+    Xcid = join(full_path, 'Xcid.data')
+    Ycid = join(full_path, 'Ycid.data')
+    A = join(full_path, 'A.data')
+
+    config = dict(nr=row, nf=col, D=D, Xcid=Xcid, Ycid=Ycid,
+                  A=A, fmt=DATA_FORMAT)
+
+    config_writer(full_path + '.json', config)
+    return full_path
+
+
+def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir):
+    """
+    This function has two responsibilities. Generate the configuration files for
+    datagen algorithms and return a dictionary that will be used for execution.
+
+    algo_payload : List of tuples
+        The first tuple index contains algorithm name and the second index contains
+        family type.
+
+    matrix_type: String
+        Type of matrix to generate, e.g. dense or sparse
+
+    matrix_shape: String
+        Shape of matrix to generate, e.g. 100k_10
+
+    return: Dictionary {string: list}
+        This dictionary contains the families to be executed as keys and the paths of
+        the configuration json files to be executed as values.
+    """
+
+    config_bundle = {}
+
+    distinct_families = set(map(lambda x: x[1], algo_payload))
+
+    # Cross product of all configurations
+    for current_family in distinct_families:
+        if current_family in FAMILY_NO_MATRIX_TYPE:
+            # e.g. clustering: [('10k_1', 'dense'), ('10k_2', 'dense'), ...]
+            config = list(itertools.product(matrix_shape, ['dense']))
+            config_bundle[current_family] = config
+        else:
+            config = list(itertools.product(matrix_shape, matrix_type))
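+            # Illustrative example: matrix_shape=['10k_100'] with
+            # matrix_type=['dense', 'sparse'] yields
+            # [('10k_100', 'dense'), ('10k_100', 'sparse')]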
+            config_bundle[current_family] = config
+
+    config_packets = {}
+    for current_family, configs in config_bundle.items():
+        config_packets[current_family] = []
+        for size, mat_type in configs:
+            family_func = current_family.lower() + '_datagen'
+            conf_path = globals()[family_func](size, mat_type, datagen_dir)
+            config_packets[current_family].append(conf_path)
+
+    return config_packets
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
new file mode 100755
index 00000000000..bc034dad0ee
--- /dev/null
+++ b/scripts/perftest/python/predict.py
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import os
+from os.path import join
+import glob
+from utils import create_dir, config_writer
+
+# This file contains configuration settings for prediction
+DATA_FORMAT = 'csv'
+
+
+def m_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    icpt = save_file_name.split('.')[-1]
+    model = join(train_dir, 'model.data')
+    fmt = DATA_FORMAT
+
+    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def l2_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    icpt = save_file_name.split('.')[-1]
+    model = join(train_dir, 'model.data')
+    fmt = DATA_FORMAT
+
+    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+    B = join(train_dir, 'B.data')
+    M = join(train_dir, 'M.data')
+    dfam = '3'
+    vpow = '-1'
+    link = '2'
+    fmt = DATA_FORMAT
+
+    config = dict(dfam=dfam, vpow=vpow, link=link, fmt=fmt, X=X, B=B, Y=Y, M=M)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def naive_bayes_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    prior = join(train_dir, 'prior')
+    conditionals = join(train_dir, 'conditionals')
+    fmt = DATA_FORMAT
+    probabilities = join(train_dir, 'probabilities')
+    config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=fmt,
+                  probabilities=probabilities)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def kmeans_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    C = join(datagen_dir, 'C.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    prY = join(full_path_predict, 'prY.data')
+
+    config = dict(X=X, C=C, prY=prY)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def linearregcg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '0.0'
+    lpow = '1.0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def linearregds_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '0.0'
+    lpow = '1.0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def glm_poisson_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '1'
+    lpow = '1.0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def glm_binomial_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '2'
+    link = '3'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '2'
+    lpow = '0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
+    """
+    This function has two responsibilities. Generate the configuration files for
+    prediction algorithms and return a dictionary that will be used for execution.
+
+    algo_payload : List of tuples
+        The first tuple index contains algorithm name and the second index contains
+        family type.
+
+    datagen_dir: String
+        Path of the data generation directory
+
+    train_dir: String
+        Path of the training directory
+
+    predict_dir: String
+        Path of the prediction directory
+
+    return: Dictionary {string: list}
+        This dictionary contains the algorithms to be executed as keys and the paths
+        of the configuration json files to be executed as values.
+    """
+
+    algo_payload_distinct = set(map(lambda x: x[0], algo_payload))
+
+    config_bundle = {}
+
+    for k, v in algo_payload:
+        config_bundle[k] = []
+
+    for current_algo in algo_payload_distinct:
+        # Get all train folders related to the algorithm
+        train_path = join(train_dir, current_algo)
+        train_subdir = glob.glob(train_path + "*")
+        train_folders = list(filter(lambda x: os.path.isdir(x), train_subdir))
+
+        if len(train_folders) == 0:
+            print('training folders not present for {}'.format(current_algo))
+            sys.exit()
+
+        for current_train_folder in train_folders:
+            save_name = current_train_folder.split('/')[-1]
+            # Get all datagen folders
+            data_gen_folder_name = '.'.join(save_name.split('.')[1:-1])
+            data_gen_path = join(datagen_dir, data_gen_folder_name)
+            data_gen_subdir = glob.glob(data_gen_path + "*")
+            data_gen_folder = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
+
+            if len(data_gen_folder) == 0:
+                print('data-gen folders not present for {}'.format(current_algo))
+                sys.exit()
+
+            # More than one datagen directory may match; take the first one
+            current_data_gen_dir = list(data_gen_folder)[0]
+
+            algo_func = '_'.join([current_algo.lower().replace('-', '_')] + ['predict'])
+            conf_path = globals()[algo_func](save_name, current_data_gen_dir,
+                                             current_train_folder, predict_dir)
+
+            config_bundle[current_algo].append(conf_path)
+
+    return config_bundle
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
new file mode 100755
index 00000000000..1421c2c6674
--- /dev/null
+++ b/scripts/perftest/python/run_perftest.py
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import time
+import argparse
+from functools import reduce
+import os
+from os.path import join
+from utils import get_families, config_reader, create_dir, get_existence, \
+    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
+import logging
+from datetime import datetime
+from datagen import config_packets_datagen
+from train import config_packets_train
+from predict import config_packets_predict
+
+# A packet is a dictionary with the algorithm as key and
+# a list of configuration json files as value
+
+
+ML_ALGO = {'binomial': ['MultiLogReg', 'l2-svm', 'm-svm'],
+           'clustering': ['Kmeans'],
+           'multinomial': ['naive-bayes', 'MultiLogReg', 'm-svm'],
+           'regression1': ['LinearRegDS', 'LinearRegCG'],
+           'regression2': ['GLM_poisson', 'GLM_gamma', 'GLM_binomial'],
+           'stats1': ['Univar-Stats', 'bivar-stats'],
+           'stats2': ['stratstats']}
+
+ML_GENDATA = {'binomial': 'genRandData4LogisticRegression',
+              'clustering': 'genRandData4Kmeans',
+              'multinomial': 'genRandData4Multinomial',
+              'regression1': 'genRandData4LogisticRegression',
+              'regression2': 'genRandData4LogisticRegression',
+              'stats1': 'genRandData4DescriptiveStats',
+              'stats2': 'genRandData4StratStats'}
+
+ML_TRAIN = {'GLM_poisson': 'GLM',
+            'GLM_gamma': 'GLM',
+            'GLM_binomial': 'GLM',
+            'LinearRegCG': 'LinearRegCG',
+            'LinearRegDS': 'LinearRegDS',
+            'stratstats': 'stratstats',
+            'Univar-Stats': 'Univar-Stats',
+            'bivar-stats': 'bivar-stats',
+            'Kmeans': 'Kmeans',
+            'm-svm': 'm-svm',
+            'l2-svm': 'l2-svm',
+            'MultiLogReg': 'MultiLogReg',
+            'naive-bayes': 'naive-bayes'}
+
+ML_PREDICT = {'Kmeans': 'Kmeans-predict',
+              'LinearRegCG': 'GLM-predict',
+              'LinearRegDS': 'GLM-predict',
+              'm-svm': 'm-svm-predict',
+              'l2-svm': 'l2-svm-predict',
+              'MultiLogReg': 'GLM-predict',
+              'naive-bayes': 'naive-bayes-predict',
+              'GLM_poisson': 'GLM-predict',
+              'GLM_gamma': 'GLM-predict',
+              'GLM_binomial': 'GLM-predict'}
+
+
+# Responsible for execution and metric logging
+def algorithm_workflow(algo, exec_type, config_path, file_name, action_mode):
+    """
+    This function is responsible for the overall workflow. It does the following:
+    checks whether the input is a key-value argument or a list of positional args,
+    executes and times the DML script, and logs the metrics.
+
+    algo : String
+        Input algorithm specified
+
+    exec_type : String
+        Contains the execution type singlenode / hybrid_spark
+
+    config_path : String
+        Path to read the json file from
+
+    file_name : String
+        DML file name to be used while processing the arguments given
+
+    action_mode : String
+        Type of action data-gen, train ...
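+
+    As an illustration, each call prints and logs one metrics line of the form
+    'algo,action_mode,intercept,mat_type,mat_shape,time', e.g.
+    'Kmeans,train,none,dense,10k_100,12.3' (example values only).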
+ """ + + config_data = config_reader(config_path + '.json') + + if isinstance(config_data, dict): + dict_args = ' '.join([str(key) + '=' + str(val) for key, val in config_data.items()]) + args = {'-nvargs': dict_args} + + if isinstance(config_data, list): + list_args = ' '.join(config_data) + args = {'-args': list_args} + + folder_name = config_path.split('/')[-1] + mat_type, mat_shape, intercept = get_folder_metrics(folder_name, action_mode) + + exit_flag_success = get_existence(config_path, action_mode) + + if exit_flag_success: + print('data already exists {}'.format(config_path)) + time = 'data_exists' + else: + time = exec_dml_and_parse_time(exec_type, file_name, args) + + # Write a _SUCCESS file only if time is found and in data-gen action_mode + if len(time.split('.')) == 2 and action_mode == 'data-gen': + full_path = join(config_path, '_SUCCESS') + open(full_path, 'w').close() + + print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time)) + current_metrics = [algo, action_mode, intercept, mat_type, mat_shape, time] + logging.info(','.join(current_metrics)) + + +# Perf test entry point +def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode): + """ + This function is the entry point for performance testing + + family: List + A family may contain one or more algorithm based on data generation script used + + algo: List + Input algorithms + + exec_type: String + Contains the execution type singlenode / hybrid_spark + + mat_type: List + Type of matrix to generate dense or sparse + + mat_shape: List + Dimensions of the input matrix with rows and columns + + temp_dir: String + Location to store all files created during perf test + + mode: List + Type of workload to run. data-gen, train ... + """ + + # algos to run is a list of tuples with + # [(m-svm, binomial), (m-svm, multinomial)...] 
+    # Basic block for execution of scripts
+    algos_to_run = []
+
+    # Sections below build algos_to_run for our performance test
+
+    # Handles algorithms like m-svm and MultiLogReg which have multiple
+    # data generation scripts (dual datagen)
+    # --family is taken into consideration only when there are multiple
+    # datagen scripts for an algo
+    if family is not None and algo is not None:
+        for current_algo in algo:
+            family_list = get_families(current_algo, ML_ALGO)
+            if len(family_list) == 1:
+                algos_to_run.append((current_algo, family_list[0]))
+            else:
+                intersection = set(family).intersection(family_list)
+                for valid_family in intersection:
+                    algos_to_run.append((current_algo, valid_family))
+
+    # When the user inputs just algorithms to run
+    elif algo is not None:
+        for current_algo in algo:
+            family_list = get_families(current_algo, ML_ALGO)
+            for f in family_list:
+                algos_to_run.append((current_algo, f))
+
+    # When the user specifies only families to run
+    elif family is not None:
+        for current_family in family:
+            algos = ML_ALGO[current_family]
+            for current_algo in algos:
+                algos_to_run.append((current_algo, current_family))
+
+    if 'data-gen' in mode:
+        data_gen_dir = join(temp_dir, 'data-gen')
+        create_dir(data_gen_dir)
+        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir)
+        for family_name, config_folders in conf_packet.items():
+            for config in config_folders:
+                file_name = ML_GENDATA[family_name]
+                algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')
+
+                # Statistics families do not require a test data split
+                if family_name not in ['stats1', 'stats2']:
+                    exec_test_data(exec_type, config)
+
+    if 'train' in mode:
+        data_gen_dir = join(temp_dir, 'data-gen')
+        train_dir = join(temp_dir, 'train')
+        create_dir(train_dir)
+        conf_packet = config_packets_train(algos_to_run, data_gen_dir, train_dir)
+        for algo_name, config_files in conf_packet.items():
+            for config in config_files:
+                file_name = ML_TRAIN[algo_name]
+                algorithm_workflow(algo_name, exec_type, config, file_name, 'train')
+
+    if 'predict' in mode:
+        data_gen_dir = join(temp_dir, 'data-gen')
+        train_dir = join(temp_dir, 'train')
+        predict_dir = join(temp_dir, 'predict')
+        create_dir(predict_dir)
+        algos_to_run_predict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT),
+                                           algos_to_run))
+        conf_packet = config_packets_predict(algos_to_run_predict, data_gen_dir, train_dir,
+                                             predict_dir)
+        for algo_name, config_files in conf_packet.items():
+            for config in config_files:
+                file_name = ML_PREDICT[algo_name]
+                algorithm_workflow(algo_name, exec_type, config, file_name, 'predict')
+
+if __name__ == '__main__':
+
+    # SystemML environment variable check and error handling
+    systemml_home = os.environ.get('SYSTEMML_HOME')
+    if systemml_home is None:
+        print('SYSTEMML_HOME not found')
+        sys.exit()
+
+    # Default Arguments
+    default_mat_type = ['dense', 'sparse']
+    default_workload = ['data-gen', 'train', 'predict']
+    default_mat_shape = ['10k_100']
+    default_execution_mode = ['hybrid_spark', 'singlenode']
+
+    # Default temp directory, contains everything generated in perftest
+    default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
+    create_dir(default_temp_dir)
+
+    # Initialize time
+    start_time = time.time()
+
+    # Default Date Time
+    time_now = str(datetime.now())
+
+    # Remove duplicate algorithms; used as default inputs
+    all_algos = set(reduce(lambda x, y: x + y, ML_ALGO.values()))
+
+    # Argparse Module
+    cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
+    cparser.add_argument('--family', help='specify class of algorithms (e.g. regression, binomial)',
+                         metavar='', choices=ML_ALGO.keys(), nargs='+')
+    cparser.add_argument('--algo', help='specify the type of algorithm to run (Overrides --family)',
+                         metavar='', choices=all_algos, nargs='+')
+    cparser.add_argument('--exec-type', default='singlenode', help='SystemML backend '
+                         '(e.g. singlenode, hybrid_spark)', metavar='',
+                         choices=default_execution_mode)
+    cparser.add_argument('--mat-type', default=default_mat_type, help='type of matrix to generate '
+                         '(e.g. dense or sparse)', metavar='', choices=default_mat_type,
+                         nargs='+')
+    cparser.add_argument('--mat-shape', default=default_mat_shape, help='shape of matrix '
+                         'to generate (e.g. 10k_1k)', metavar='', nargs='+')
+    cparser.add_argument('--temp-dir', default=default_temp_dir, help='specify temporary directory',
+                         metavar='')
+    cparser.add_argument('--filename', default='perf_test', help='specify output file for the perf'
+                         ' metrics', metavar='')
+    cparser.add_argument('--mode', default=default_workload,
+                         help='specify type of workload to run (e.g. data-gen, train, predict)',
+                         metavar='', choices=default_workload, nargs='+')
+
+    # Args is a namespace
+    args = cparser.parse_args()
+    arg_dict = vars(args)
+
+    # Debug arguments
+    # print(arg_dict)
+
+    # Check for validity of input arguments
+    if args.family is not None:
+        for fam in args.family:
+            if fam not in ML_ALGO.keys():
+                print('{} family not present in the performance test suite'.format(fam))
+                sys.exit()
+
+    if args.algo is not None:
+        for algo in args.algo:
+            if algo not in all_algos:
+                print('{} algorithm not present in the performance test suite'.format(algo))
+                sys.exit()
+
+        # This section checks the validity of dual datagen algorithms like m-svm
+        algo_families = {}
+        for current_algo in args.algo:
+            algo_families[current_algo] = get_families(current_algo, ML_ALGO)
+
+            if len(algo_families[current_algo]) > 1:
+                if args.family is None:
+                    print('family should be present for {}'.format(current_algo))
+                    sys.exit()
+
+                valid_families = set(algo_families[current_algo])
+                input_families = set(args.family)
+                common_families = input_families.intersection(valid_families)
+                if len(common_families) == 0:
+                    print('Please specify a valid family for {} and the '
+                          'valid families are {}'.format(current_algo, ' '.join(valid_families)))
+                    sys.exit()
+
+    # Set level to 0 -> debug mode
+    # Set level to 20 -> Plain metrics
+    log_filename = args.filename + '_' + args.exec_type + '.out'
+    logging.basicConfig(filename=join(default_temp_dir, log_filename), level=20)
+    logging.info('New performance test started at {}'.format(time_now))
+    logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec')
+
+    # Remove the filename item from the dictionary as it's already been used
+    # to create the log above
+    del arg_dict['filename']
+
+    perf_test_entry(**arg_dict)
+
+    total_time = (time.time() - start_time)
+    logging.info('Performance tests complete {0:.3f} secs \n'.format(total_time))
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
new file mode 100755
index 00000000000..1ab2880e214
--- /dev/null
+++ b/scripts/perftest/python/train.py
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import glob
+import os
+from os.path import join
+from utils import config_writer
+from functools import reduce
+
+# This file contains configuration settings for training
+DATA_FORMAT = 'csv'
+
+
+def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxiter = '20'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        model = join(full_path_train, 'model.data')
+        Log = join(full_path_train, 'Log.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter,
+                      model=model, Log=Log, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def binomial_l2_svm_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxiter = '100'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        model = join(full_path_train, 'model.data')
+        Log = join(full_path_train, 'Log.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, reg=reg, tol=tol, maxiter=maxiter, model=model,
+                      Log=Log, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def binomial_multilogreg_train(save_folder_name, datagen_dir, train_dir):
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        moi = '100'
+        mii = '5'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, reg=reg, tol=tol, moi=moi, mii=mii,
+                      B=B)
+        config_writer(full_path_train + '.json', config)
+    return data_folders
+
+
+def multinomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxiter = '20'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        model = join(full_path_train, 'model.data')
+        Log = join(full_path_train, 'Log.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter,
+                      model=model, Log=Log, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+        data_folders.append(full_path_train)
+
+    return data_folders
+
+
+def clustering_kmeans_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+
+    full_path_train = join(train_dir, save_folder_name)
+    C = join(full_path_train, 'C.data')
+    k = '50'
+    maxi = '50'
+    tol = '0.0001'
+    config = dict(X=X, k=k, maxi=maxi, tol=tol, C=C)
+
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def stats1_univar_stats_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    TYPES = join(datagen_dir, 'types')
+
+    full_path_train = join(train_dir, save_folder_name)
+    STATS = join(full_path_train, 'STATS.data')
+
+    config = dict(X=X, TYPES=TYPES, STATS=STATS)
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def stats1_bivar_stats_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    index1 = join(datagen_dir, 'set1.indices')
+    index2 = join(datagen_dir, 'set2.indices')
+    types1 = join(datagen_dir, 'set1.types')
+    types2 = join(datagen_dir, 'set2.types')
+
+    full_path_train = join(train_dir, save_folder_name)
+    OUTDIR = full_path_train
+
+    config = dict(X=X, index1=index1, index2=index2, types1=types1, types2=types2, OUTDIR=OUTDIR)
+    config_writer(full_path_train + '.json', config)
+    return [full_path_train]
+
+
+def stats2_stratstats_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    Xcid = join(datagen_dir, 'Xcid.data')
+    Ycid = join(datagen_dir, 'Ycid.data')
+
+    full_path_train = join(train_dir, save_folder_name)
+    O = join(full_path_train, 'O.data')
+
+    config = dict(X=X, Xcid=Xcid, Ycid=Ycid, O=O, fmt=DATA_FORMAT)
+
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def multinomial_naive_bayes_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    Y = join(datagen_dir, 'Y.data')
+    classes = '150'
+
+    full_path_train = join(train_dir, save_folder_name)
+    prior = join(full_path_train, 'prior')
+    conditionals = join(full_path_train, 'conditionals')
+    accuracy = join(full_path_train, 'accuracy')
+    fmt = DATA_FORMAT
+    probabilities = join(full_path_train, 'probabilities')
+
+    config = dict(X=X, Y=Y, classes=classes, prior=prior, conditionals=conditionals,
+                  accuracy=accuracy, fmt=fmt, probabilities=probabilities)
+
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
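+
+
+# Note: the regression and multilogreg train functions below sweep the icpt
+# (intercept) settings and write one json config per setting; the trailing
+# '.<icpt>' suffix on each train folder name is parsed back out later by
+# get_folder_metrics in utils.py.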
+def multinomial_multilogreg_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        moi = '100'
+        mii = '0'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, reg=reg, tol=tol, moi=moi, mii=mii, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression1_linearregds_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=DATA_FORMAT, reg=reg)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression1_linearregcg_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxi = '20'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=DATA_FORMAT, maxi=maxi, tol=tol, reg=reg)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression2_glm_gamma_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+        icpt = str(i)
+        fmt = DATA_FORMAT
+        moi = '200'
+        mii = '5'
+        dfam = '1'
+        vpow = '2.0'
+        link = '1'
+        lpow = '0.0'
+        tol = '0.0001'
+        reg = '0.01'
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii, dfam=dfam,
+                      vpow=vpow, link=link, lpow=lpow, tol=tol, reg=reg)
+
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression2_glm_binomial_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+        icpt = str(i)
+        fmt = DATA_FORMAT
+        moi = '200'
+        mii = '5'
+        dfam = '2'
+        link = '3'
+        yneg = '2'
+        tol = '0.0001'
+        reg = '0.01'
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii,
+                      dfam=dfam, link=link, yneg=yneg, tol=tol, reg=reg)
+
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression2_glm_poisson_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+        icpt = str(i)
+        fmt = DATA_FORMAT
+        moi = '200'
+        mii = '5'
+        dfam = '1'
+        vpow = '1'
+        link = '1'
+        lpow = '0'
+        tol = '0.0001'
+        reg = '0.01'
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii,
+                      dfam=dfam, vpow=vpow, link=link, lpow=lpow, tol=tol, reg=reg)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def config_packets_train(algo_payload, datagen_dir, train_dir):
+    """
+    This function has two responsibilities. Generate the configuration files for
+    input training algorithms and return a dictionary that will be used for execution.
+
+    algo_payload : List of tuples
+        The first tuple index contains algorithm name and the second index contains
+        family type.
+
+    datagen_dir: String
+        Path of the data generation directory
+
+    train_dir: String
+        Path of the training directory
+
+    return: {string: list}
+        This dictionary contains the algorithms to be executed as keys and the paths
+        of the configuration json files to be executed as values.
+    """
+
+    config_bundle = {}
+
+    for k, v in algo_payload:
+        config_bundle[k] = []
+
+    for current_algo, current_family in algo_payload:
+        data_gen_path = join(datagen_dir, current_family)
+        data_gen_subdir = glob.glob(data_gen_path + "*")
+
+        # Filter for the specific data-gen directories
+        data_gen_folders = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
+        if len(data_gen_folders) == 0:
+            print('datagen folders not present for {}'.format(current_family))
+            sys.exit()
+
+        for current_folder in data_gen_folders:
+            file_path_last = current_folder.split('/')[-1]
+            save_name = '.'.join([current_algo] + [file_path_last])
+            algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')]
+                                 + ['train'])
+            conf_path = globals()[algo_func](save_name, current_folder, train_dir)
+            config_bundle[current_algo].append(conf_path)
+
+    config_packets = {}
+
+    # Flatten the per-algorithm lists of folder lists
+    for current_algo, path_lists in config_bundle.items():
+        config_packets[current_algo] = reduce(lambda x, y: x + y, path_lists)
+
+    return config_packets
diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py
new file mode 100755
index 00000000000..7ff3b548b58
--- /dev/null
+++ b/scripts/perftest/python/utils.py
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from os.path import join
+import os
+import json
+import subprocess
+import shlex
+import re
+import logging
+
+# This file contains all the utility functions required for the performance test module
+
+
+def get_families(current_algo, ML_ALGO):
+    """
+    Given the current algorithm, return the families it belongs to.
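+
+    For example, get_families('MultiLogReg', ML_ALGO) returns
+    ['binomial', 'multinomial'] (order may vary) with the ML_ALGO
+    mapping defined in run_perftest.py.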
+
+    current_algo : String
+        Input algorithm specified
+
+    ML_ALGO : Dictionary
+        Dictionary with family as key and list of algorithms as value
+
+    return: List
+        List of families returned
+    """
+
+    family_list = []
+    for family, algos in ML_ALGO.items():
+        if current_algo in algos:
+            family_list.append(family)
+    return family_list
+
+
+def split_rowcol(matrix_dim):
+    """
+    Split the input matrix dimensions into rows and columns
+
+    matrix_dim: String
+        Input concatenated string with row and column
+
+    return: Tuple
+        Row and column split based on the suffix
+    """
+
+    k = str(0) * 3
+    M = str(0) * 6
+    replace_M = matrix_dim.replace('M', str(M))
+    replace_k = replace_M.replace('k', str(k))
+    row, col = replace_k.split('_')
+    return row, col
+
+
+def config_writer(write_path, config_obj):
+    """
+    Writes the input object as a configuration json file at the given path
+
+    write_path: String
+        Absolute path of the file name to be written
+
+    config_obj: List or Dictionary
+        Can be a dictionary or a list based on the object passed
+    """
+
+    with open(write_path, 'w') as input_file:
+        json.dump(config_obj, input_file, indent=4)
+
+
+def config_reader(read_path):
+    """
+    Read a json file from the given path
+
+    return: List or Dictionary
+        Reading the json file can give us a list if we have positional args or
+        a dictionary if we have key-value pairs
+    """
+
+    with open(read_path, 'r') as input_file:
+        conf_file = json.load(input_file)
+
+    return conf_file
+
+
+def create_dir(directory):
+    """
+    Create the directory at the given path if it does not already exist
+
+    directory: String
+        Input folder path
+    """
+
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+
+def get_existence(path, action_mode):
+    """
+    Check if the _SUCCESS file is present in the input path
+
+    path: String
+        Input folder path
+
+    action_mode : String
+        Type of action data-gen, train ...
+
+    return: Boolean
+        Whether the _SUCCESS file exists
+    """
+
+    if action_mode == 'data-gen':
+        full_path = join(path, '_SUCCESS')
+        exist = os.path.isfile(full_path)
+    else:
+        # The file does not exist for other modes; return False to continue.
+        # For example, some predict algorithms do not generate an output folder,
+        # so checking for _SUCCESS would fail.
+        exist = False
+
+    return exist
+
+
+def exec_dml_and_parse_time(exec_type, file_name, args, Time=True):
+    """
+    This function is responsible for executing the input arguments via a Python
+    subprocess; we also extract the time from the output of this subprocess.
+
+    exec_type: String
+        Contains the execution type singlenode / hybrid_spark
+
+    file_name: String
+        DML file name to be used while processing the arguments given
+
+    args: Dictionary
+        Key-value pairs depending on the arg type
+
+    Time: Boolean (default=True)
+        Boolean argument used to extract time from raw output logs.
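+
+    return: String
+        Total execution time in seconds parsed from the output, or one of the
+        sentinel strings 'failure', 'time_not_found' or 'not_specified'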
+ """ + + algorithm = file_name + '.dml' + if exec_type == 'singlenode': + exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-standalone.py') + + args = ''.join(['{} {}'.format(k, v) for k, v in args.items()]) + cmd = [exec_script, algorithm, args] + cmd_string = ' '.join(cmd) + + if exec_type == 'hybrid_spark': + exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-spark-submit.py') + args = ''.join(['{} {}'.format(k, v) for k, v in args.items()]) + cmd = [exec_script, '-f', algorithm, args] + cmd_string = ' '.join(cmd) + + # Debug + # print(cmd_string) + + # Subprocess to execute input arguments + # proc1_log contains the shell output which is used for time parsing + proc1 = subprocess.Popen(shlex.split(cmd_string), stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + if Time: + proc1_log = [] + while proc1.poll() is None: + raw_std_out = proc1.stdout.readline() + decode_raw = raw_std_out.decode('ascii').strip() + proc1_log.append(decode_raw) + logging.log(10, decode_raw) + + out1, err1 = proc1.communicate() + + if "Error" in str(err1): + print('Error Found in {}'.format(file_name)) + total_time = 'failure' + else: + total_time = parse_time(proc1_log) + + else: + total_time = 'not_specified' + + return total_time + + +def parse_time(raw_logs): + """ + Parses raw input list and extracts time + + raw_logs : List + Each line obtained from the standard output is in the list + + return: String + Extracted time in seconds or time_not_found + """ + # Debug + # print(raw_logs) + + for line in raw_logs: + if line.startswith('Total execution time'): + extract_time = re.findall(r'\d+', line) + total_time = '.'.join(extract_time) + + return total_time + + return 'time_not_found' + + +def exec_test_data(exec_type, path): + """ + Creates the test data split from the given input path + + exec_type : String + Contains the execution type singlenode / hybrid_spark + + path : String + Location of the input folder to pick X and Y + """ + systemml_home = os.environ.get('SYSTEMML_HOME') + test_split_script = join(systemml_home, 'scripts', 'perftest', 'extractTestData') + X = join(path, 'X.data') + Y = join(path, 'Y.data') + X_test = join(path, 'X_test.data') + Y_test = join(path, 'Y_test.data') + args = {'-args': ' '.join([X, Y, X_test, Y_test, 'csv'])} + + # Call the exec script without time + exec_dml_and_parse_time(exec_type, test_split_script, args, False) + + +def check_predict(current_algo, ML_PREDICT): + """ + To check if the current algorithm requires to run the predict + + current_algo: String + Algorithm being processed + + ML_PREDICT: Dictionary + Key value pairs of algorithm and predict file to process + """ + if current_algo in ML_PREDICT.keys(): + return True + else: + return False + + +def get_folder_metrics(folder_name, action_mode): + """ + Gets metrics from folder name + + folder_name: String + Folder from which we want to grab details + + return: List(3) + A list with mat_type, mat_shape, intercept + """ + + if action_mode == 'data-gen': + split_name = folder_name.split('.') + mat_type = split_name[1] + mat_shape = split_name[2] + intercept = 'none' + + try: + if action_mode == 'train': + split_name = folder_name.split('.') + mat_type = split_name[3] + mat_shape = split_name[2] + intercept = split_name[4] + + if action_mode == 'predict': + split_name = folder_name.split('.') + mat_type = split_name[3] + mat_shape = split_name[2] + intercept = split_name[4] + except IndexError: + intercept = 'none' + + return mat_type, mat_shape, intercept \ No newline at 
end of file