[SYSTEMML-1451][GSoC Phase 1] Single script to run perf tests
- Single entry point to run perf tests in any combination of algorithms,
  families, matrix shapes & densities
- Reports the time taken by a single perf test by parsing the output and
  grepping for the time
- Detects tests that did not run and reports them in the generated log
- Robust error handling and reporting, with an informative help message

Closes #537
krishnakalyan3 authored and nakul02 committed Jul 2, 2017
1 parent 31952e4 commit e7cfcad
Showing 5 changed files with 1,583 additions and 0 deletions.
252 changes: 252 additions & 0 deletions scripts/perftest/python/datagen.py
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

import itertools
from os.path import join
from utils import split_rowcol, config_writer
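
# Note on the imported helpers (defined in utils.py, which is not part of this diff):
# split_rowcol is assumed to expand a shape string such as '10k_100' into its
# (row, col) components, and config_writer is assumed to serialize the given
# list or dict of arguments as JSON at the given path.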

# This file contains configuration settings for data generation
DATA_FORMAT = 'csv'

MATRIX_TYPE_DICT = {'dense': '0.9',
                    'sparse': '0.01'}

FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2']
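# Families listed above are always generated with dense data, regardless of the
# requested matrix type (see the cross product in config_packets_datagen below).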


def multinomial_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
    full_path = join(datagen_dir, path_name)

    numSamples = row
    numFeatures = col
    sparsity = MATRIX_TYPE_DICT[matrix_type]
    num_categories = '150'
    intercept = '0'
    X = join(full_path, 'X.data')
    Y = join(full_path, 'Y.data')
    fmt = DATA_FORMAT

    config = [numSamples, numFeatures, sparsity, num_categories, intercept,
              X, Y, fmt, '1']

    config_writer(full_path + '.json', config)

    return full_path


def binomial_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
    full_path = join(datagen_dir, path_name)

    numSamples = row
    numFeatures = col
    maxFeatureValue = '5'
    maxWeight = '5'
    loc_weights = join(full_path, 'weight.data')
    loc_data = join(full_path, 'X.data')
    loc_labels = join(full_path, 'Y.data')
    noise = '1'
    intercept = '0'
    sparsity = MATRIX_TYPE_DICT[matrix_type]
    transform_labels = '1'
    fmt = DATA_FORMAT

    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
    config_writer(full_path + '.json', config)

    return full_path


def regression1_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
    full_path = join(datagen_dir, path_name)

    numSamples = row
    numFeatures = col
    maxFeatureValue = '5'
    maxWeight = '5'
    loc_weights = join(full_path, 'weight.data')
    loc_data = join(full_path, 'X.data')
    loc_labels = join(full_path, 'Y.data')
    noise = '1'
    intercept = '0'
    sparsity = MATRIX_TYPE_DICT[matrix_type]
    transform_labels = '1'
    fmt = DATA_FORMAT

    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
    config_writer(full_path + '.json', config)

    return full_path


def regression2_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
    full_path = join(datagen_dir, path_name)

    numSamples = row
    numFeatures = col
    maxFeatureValue = '5'
    maxWeight = '5'
    loc_weights = join(full_path, 'weight.data')
    loc_data = join(full_path, 'X.data')
    loc_labels = join(full_path, 'Y.data')
    noise = '1'
    intercept = '0'
    sparsity = MATRIX_TYPE_DICT[matrix_type]
    transform_labels = '1'
    fmt = DATA_FORMAT

    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
    config_writer(full_path + '.json', config)

    return full_path


def clustering_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['clustering', matrix_type, str(matrix_dim)])

    full_path = join(datagen_dir, path_name)
    X = join(full_path, 'X.data')
    Y = join(full_path, 'Y.data')
    YbyC = join(full_path, 'YbyC.data')
    C = join(full_path, 'C.data')
    nc = '50'
    dc = '10.0'
    dr = '1.0'
    fbf = '100.0'
    cbf = '100.0'

    config = dict(nr=row, nf=col, nc=nc, dc=dc, dr=dr, fbf=fbf, cbf=cbf, X=X, C=C, Y=Y,
                  YbyC=YbyC, fmt=DATA_FORMAT)

    config_writer(full_path + '.json', config)
    return full_path


def stats1_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['stats1', matrix_type, str(matrix_dim)])
    full_path = join(datagen_dir, path_name)

    DATA = join(full_path, 'X.data')
    TYPES = join(full_path, 'types')
    TYPES1 = join(full_path, 'set1.types')
    TYPES2 = join(full_path, 'set2.types')
    INDEX1 = join(full_path, 'set1.indices')
    INDEX2 = join(full_path, 'set2.indices')
    MAXDOMAIN = '1100'
    SETSIZE = '20'
    LABELSETSIZE = '10'

    # NC must satisfy num0 < NC < C, where num0 = NC/2.
    # The previous fixed value was NC = 10; here NC is derived from the column count as NC = C/2.
    NC = int(int(col) / 2)

    config = dict(R=row, C=col, NC=NC, MAXDOMAIN=MAXDOMAIN, DATA=DATA, TYPES=TYPES, SETSIZE=SETSIZE,
                  LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, INDEX2=INDEX2,
                  fmt=DATA_FORMAT)

    config_writer(full_path + '.json', config)

    return full_path


def stats2_datagen(matrix_dim, matrix_type, datagen_dir):

    row, col = split_rowcol(matrix_dim)
    path_name = '.'.join(['stats2', matrix_type, str(matrix_dim)])
    full_path = join(datagen_dir, path_name)

    D = join(full_path, 'X.data')
    Xcid = join(full_path, 'Xcid.data')
    Ycid = join(full_path, 'Ycid.data')
    A = join(full_path, 'A.data')

    config = dict(nr=row, nf=col, D=D, Xcid=Xcid, Ycid=Ycid,
                  A=A, fmt=DATA_FORMAT)

    config_writer(full_path + '.json', config)
    return full_path


def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir):
"""
This function has two responsibilities. Generate the configuration files for
datagen algorithms and return a dictionary that will be used for execution.
algo_payload : List of tuples
The first tuple index contains algorithm name and the second index contains
family type.
matrix_type: String
Type of matrix to generate e.g dense or sparse
matrix_shape: String
Shape of matrix to generate e.g 100k_10
return: Dictionary {string: list}
This dictionary contains algorithms to be executed as keys and the path of configuration
json files to be executed list of values.
"""

config_bundle = {}

distinct_families = set(map(lambda x: x[1], algo_payload))

# Cross Product of all configurations
for current_family in distinct_families:
if current_family in FAMILY_NO_MATRIX_TYPE:
config = list(itertools.product(matrix_shape, ['dense']))
config_bundle[current_family] = config
else:
config = list(itertools.product(matrix_shape, matrix_type))
# clustering : [[10k_1, dense], [10k_2, dense], ...]
config_bundle[current_family] = config

config_packets = {}
for current_family, configs in config_bundle.items():
config_packets[current_family] = []
for size, type in configs:
family_func = current_family.lower() + '_datagen'
conf_path = globals()[family_func](size, type, datagen_dir)
config_packets[current_family].append(conf_path)

return config_packets
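
# Illustrative example of the returned structure (the ('Kmeans', 'clustering') payload
# entry and the 'datagen' directory are example values, not fixed names):
#
#   config_packets_datagen([('Kmeans', 'clustering')], ['dense'], ['10k_100'], 'datagen')
#
# would write the config file 'datagen/clustering.dense.10k_100.json' and return
#
#   {'clustering': ['datagen/clustering.dense.10k_100']}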
