New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SYSTEMML-1451][Phase 1] Automate performance suite and report performance numbers #537
Changes from 3 commits
7dfc5e9
33c06dc
8db9959
bddfda5
99d246b
ad785b6
df57a3d
d06e87b
bb1f148
9b56486
a7ae3d2
faad7f4
e8f97bf
fec1c2d
c01e992
c4607ce
7f07eda
08bf5bd
093222d
bdb7cc8
c83154c
8356eea
c1e84fe
180b48c
e72d73b
33683f5
6c222d7
e6664d1
8689cff
29e310b
19bbbbd
e6733be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/usr/bin/env python3 | ||
#------------------------------------------------------------- | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# | ||
#------------------------------------------------------------- | ||
|
||
import os | ||
from os.path import join | ||
from utils import split_rowcol, config_writer | ||
import sys | ||
import logging | ||
|
||
# Value placed in the third slot of the naive-bayes datagen config for each
# matrix type (presumably the sparsity of the generated matrix -- TODO confirm).
mat_type = dict(dense=0.9, sparse=0.01)

# Output format recorded in the data-generation configs.
# NOTE: this shadows the ``format`` builtin; the name is kept because the
# writer functions in this module read it.
format = 'csv'

# Algorithms for which init_conf writes a prediction config.
has_predict = [
    'GLM',
    'Kmeans',
    'l2-svm',
    'm-svm',
    'naive-bayes',
]
|
||
|
||
def naive_bayes_datagen(matrix_type, mat_shapes, conf_dir):
    """Write one naive-bayes data-generation config per matrix shape.

    This function used to be defined twice with identical bodies; the
    redundant copy has been dropped so there is a single definition.

    Args:
        matrix_type: Key into the module-level ``mat_type`` table
            ('dense' or 'sparse').
        mat_shapes: Sequence of shapes; item ``[0]`` is the row count and
            item ``[1]`` the column count.
        conf_dir: Directory that receives the generated JSON files.

    Returns:
        None. One file named
        ``naive_bayes_datagen_<matrix_type>_<index>.json`` is written for
        every entry of ``mat_shapes``.
    """
    for index, dim in enumerate(mat_shapes):
        file_name = '_'.join(['naive_bayes_datagen', matrix_type,
                              str(index) + '.json'])

        # Positional argument list for the datagen script.  The meaning of
        # the fixed 150 and 0 is defined by that script -- TODO confirm.
        config = [dim[0], dim[1], mat_type[matrix_type], 150, 0,
                  'X.data', 'Y.data', format]
        config_writer(conf_dir, config, file_name)
    return None
|
||
|
||
def kmeans_datagen(matrix_type, mat_shapes, conf_dir):
    """Write one k-means data-generation config per matrix shape.

    Args:
        matrix_type: Accepted for signature parity with the other datagen
            writers; it is not used here.
        mat_shapes: Sequence of shapes; item ``[0]`` is written as ``nr``
            and item ``[1]`` as ``nf``.
        conf_dir: Directory that receives the generated JSON files.

    Returns:
        None. Files are named ``kmeans_datagen_<index>.json``.
    """
    for position, shape in enumerate(mat_shapes):
        target = 'kmeans_datagen' + '_' + str(position) + '.json'
        settings = {
            'nr': shape[0],
            'nf': shape[1],
            'nc': '5',
            'dc': '10.0',
            'dr': '1.0',
            'fbf': '100.0',
            'cbf': '100.0',
            'X': 'X.data',
            'C': 'C.data',
            'Y': 'Y.data',
            'YbyC': 'YbyC.data',
            'fmt': format,
        }
        config_writer(conf_dir, settings, target)
    return None
|
||
|
||
def kmeans_train(conf_dir):
    """Write the k-means training config to ``kmeans_train.json``.

    Args:
        conf_dir: Directory that receives the generated JSON file.

    Returns:
        None.
    """
    settings = {
        'X': 'X.data',
        'k': 5,
        'maxi': 10,
        'runs': 10,
        'tol': 1e-8,
        'samp': 20,
        'C': 'C.data',
        'isY': 'TRUE',
        'Y': 'Y.data',
        'verb': 'TRUE',
    }
    config_writer(conf_dir, settings, 'kmeans_train' + '.json')
    return None
|
||
|
||
def kmeans_predict(conf_dir):
    """Write the k-means prediction config to ``kmeans_predict.json``.

    Args:
        conf_dir: Directory that receives the generated JSON file.

    Returns:
        None.
    """
    settings = {'X': 'X.data', 'C': 'C.data', 'prY': 'prY.data'}
    config_writer(conf_dir, settings, 'kmeans_predict' + '.json')
    return None
|
||
|
||
# Positions of the per-stage flags inside the ``job`` list given to init_conf.
_JOB_DATAGEN, _JOB_TRAIN, _JOB_PREDICT = range(3)


def _call_conf_writer(algo_name, stage, *writer_args):
    """Invoke the module-level ``<algo>_<stage>`` config writer for one algorithm."""
    # Writer functions are snake_case, while algorithm names may contain
    # dashes and capitals (e.g. 'naive-bayes', 'Kmeans').
    writer_name = '{}_{}'.format(algo_name.lower().replace('-', '_'), stage)
    try:
        writer = globals()[writer_name]
    except KeyError:
        raise KeyError('no config writer {!r} defined for algorithm {!r}'
                       .format(writer_name, algo_name)) from None
    writer(*writer_args)


def init_conf(algo, temp_dir, matrix_type, matrix_shape, job):
    """Create the working directories and write the requested config files.

    Args:
        algo: Iterable of algorithm names (e.g. 'Kmeans', 'naive-bayes').
        temp_dir: Root directory; ``conf``, ``data_gen``, ``train`` and
            ``pred`` sub-directories are created beneath it.
        matrix_type: Forwarded to the ``<algo>_datagen`` writers.
        matrix_shape: Shape strings handed to ``split_rowcol``.  Only read
            when the datagen stage is requested, so it may be None otherwise.
        job: Three flags ``[generate_data, train, predict]``; a stage runs
            when its flag equals 1.

    Returns:
        None.

    Raises:
        KeyError: If no ``<algo>_<stage>`` writer exists for a requested
            algorithm and stage.
    """
    conf_dir = join(temp_dir, 'conf')
    stage_dirs = [conf_dir] + [join(temp_dir, name)
                               for name in ('data_gen', 'train', 'pred')]
    for stage_dir in stage_dirs:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(stage_dir, exist_ok=True)

    if job[_JOB_DATAGEN] == 1:
        # Parsed here rather than up front so train/predict-only runs do not
        # need a matrix shape.
        mat_shapes = split_rowcol(matrix_shape)
        for current_algo in algo:
            _call_conf_writer(current_algo, 'datagen',
                              matrix_type, mat_shapes, conf_dir)
            logging.info('Completed writing {} datagen file'.format(current_algo))

    if job[_JOB_TRAIN] == 1:
        for current_algo in algo:
            _call_conf_writer(current_algo, 'train', conf_dir)
            logging.info('Completed writing {} training file'.format(current_algo))

    if job[_JOB_PREDICT] == 1:
        for current_algo in algo:
            # Only some algorithms ship a prediction script.
            if current_algo in has_predict:
                _call_conf_writer(current_algo, 'predict', conf_dir)
                logging.info('Completed writing {} predict file'.format(current_algo))

    return None
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
#!/usr/bin/env python3 | ||
# ------------------------------------------------------------- | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# | ||
# ------------------------------------------------------------- | ||
|
||
# TODO: | ||
# Handle Intercept | ||
|
||
import sys | ||
import argparse | ||
from functools import reduce | ||
import os | ||
from os.path import join | ||
from utils import get_algo | ||
from configuration import init_conf | ||
import logging | ||
import time | ||
|
||
# Algorithm names grouped by family.  The command line accepts the keys for
# --family and the individual names for --algo.
ml_algo = dict(
    binomial=['MultiLogReg', 'l2-svm', 'm-svm'],
    clustering=['Kmeans'],
    multinomial=['naive-bayes', 'MultiLogReg', 'm-svm'],
    regression=['LinearRegDS', 'LinearRegCG', 'GLM'],
    stats=['Univar-Stats', 'bivar-stats', 'stratstats'],
)
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not a fan of using the function name |
||
def main(family, algo, exec_type, mat_type, mat_shape, temp_dir, generate_data, train, predict):
    """Resolve the algorithms to run and write their configuration files.

    Args:
        family: Algorithm families; consulted only when ``algo`` is None.
        algo: Explicit list of algorithm names, or None.
        exec_type: Accepted from the command line; not used here.
        mat_type: Matrix type forwarded to ``init_conf``.
        mat_shape: Matrix shape strings forwarded to ``init_conf``.
        temp_dir: Working directory forwarded to ``init_conf``.
        generate_data: Truthy to write data-generation configs.
        train: Truthy to write training configs.
        predict: Truthy to write prediction configs.

    Returns:
        None.
    """
    selected = get_algo(family, ml_algo) if algo is None else algo

    # init_conf expects the stage switches as a list of integers.
    stage_flags = [int(flag) for flag in (generate_data, train, predict)]
    init_conf(selected, temp_dir, mat_type, mat_shape, stage_flags)

    return None
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: parse the options, prepare the working
    # directory and log file, then hand everything to main().
    algo_flat = reduce(lambda x, y: x + y, ml_algo.values())

    cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
    group = cparser.add_mutually_exclusive_group(required=True)
    group.add_argument('--family', help='specify class of algorithms (e.g regression, binomial)', metavar='',
                       choices=ml_algo.keys(), nargs='+')
    group.add_argument('--algo', help='specify the type of algorithm to run', metavar='',
                       choices=algo_flat, nargs='+')

    # The single-dash spelling is kept for existing callers; the double-dash
    # alias matches the other options.  The help text lists the real choices.
    cparser.add_argument('-exec-type', '--exec-type', dest='exec_type', default='singlenode',
                         help='System-ML backend (e.g singlenode, hybrid_spark)', metavar='',
                         choices=['hybrid_spark', 'singlenode'])
    cparser.add_argument('--mat-type', default='dense', help='Type of matrix to generate (e.g dense '
                                                             'or sparse)', metavar='', choices=['sparse', 'dense'])
    cparser.add_argument('--mat-shape', help='Shape of matrix to generate (e.g '
                                             '10k_1k)', metavar='', nargs='+')

    # Optional Arguments
    cparser.add_argument('-temp-dir', '--temp-dir', dest='temp_dir',
                         help='specify temporary directory', metavar='')
    cparser.add_argument('--generate-data', help='generate data', action='store_true')
    cparser.add_argument('--train', help='train algorithms', action='store_true')
    cparser.add_argument('--predict', help='predict (if available)', action='store_true')

    # No manual family/algorithm validation is needed: argparse already
    # rejects any value outside ``choices`` with a usage error.
    args = cparser.parse_args()

    if args.temp_dir is None:
        systemml_home = os.environ.get('SYSTEMML_HOME')
        if systemml_home is None:
            # Without this guard join(None, ...) fails with an opaque TypeError.
            cparser.error('SYSTEMML_HOME is not set; pass -temp-dir explicitly')
        args.temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')

    # The directory must exist before logging opens its file inside it.
    os.makedirs(args.temp_dir, exist_ok=True)

    start_time = time.time()
    logging.basicConfig(filename=join(args.temp_dir, 'perftest.out'), level=logging.INFO)
    logging.info('New experiment start time {}'.format(start_time))
    logging.info(args)

    main(**vars(args))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python3 | ||
#------------------------------------------------------------- | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# | ||
#------------------------------------------------------------- | ||
|
||
from functools import reduce | ||
import os | ||
import json | ||
|
||
|
||
def get_algo(family, ml_algo):
    """Return the algorithms belonging to the given families as one flat list.

    Args:
        family: Iterable of family names (keys of ``ml_algo``).
        ml_algo: Mapping from family name to a list of algorithm names.

    Returns:
        A new list with the algorithms of every requested family, in the
        order the families were given.  Duplicates are kept.  An empty
        ``family`` yields an empty list.

    Raises:
        KeyError: If a family name is not present in ``ml_algo``.
    """
    # Building a fresh list keeps callers from mutating the lists stored in
    # ``ml_algo`` and, unlike reduce() without an initial value, handles an
    # empty ``family``.
    return [name for fam in family for name in ml_algo[fam]]
|
||
|
||
def split_rowcol(matrix_dim):
    """Expand shorthand matrix shapes into (rows, cols) pairs of strings.

    Every ``'M'`` in a shape is replaced by six zeros and every ``'k'`` by
    three zeros, then the text is split on ``'_'``; for example ``'10k_1k'``
    becomes ``('10000', '1000')``.

    Args:
        matrix_dim: Iterable of shape strings such as ``'10k_1k'``.

    Returns:
        A list of ``(row, col)`` tuples.  Both entries are strings.

    Raises:
        ValueError: If a shape does not split into exactly two parts.
    """
    thousand = '0' * 3
    million = '0' * 6

    def expand(shape):
        row, col = shape.replace('M', million).replace('k', thousand).split('_')
        return (row, col)

    return [expand(shape) for shape in matrix_dim]
|
||
|
||
def config_writer(path, config_dict, file_name):
    """Serialize a configuration to ``<path>/<file_name>`` as JSON.

    Args:
        path: Directory to write into; created (with parents) if missing.
        config_dict: Any JSON-serializable object.  Despite the name,
            callers pass lists as well as dicts.
        file_name: Name of the JSON file inside ``path``.  An existing file
            is overwritten.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)

    with open(os.path.join(path, file_name), 'w') as json_file:
        json.dump(config_dict, json_file)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A little blurb about the contents/purpose of this file would be great.