From 11be840a5c5ed12758924b49bb16ab0092cd8d35 Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Thu, 15 Oct 2015 16:19:26 -0700 Subject: [PATCH 01/14] SVM: Add regression for SVM --- CMakeLists.txt | 2 + src/modules/convex/linear_svm_igd.cpp | 49 +++-- src/modules/convex/task/l1.hpp | 69 ++++--- src/modules/convex/task/l2.hpp | 37 ++-- src/modules/convex/task/linear_svm.hpp | 44 ++++- src/ports/postgres/modules/svm/svm.py_in | 166 ++++++++++++----- src/ports/postgres/modules/svm/svm.sql_in | 174 +++++++++++++++++- .../modules/svm/test/linear_svm.sql_in | 96 +++++++++- 8 files changed, 495 insertions(+), 142 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5f40f4f9..c00b79ff0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,8 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(APPLE) set(CMAKE_INCLUDE_SYSTEM_FLAG_CXX "-isystem ") endif(APPLE) +elseif(CMAKE_C_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "-stdlib=libstdc++") endif(CMAKE_COMPILER_IS_GNUCXX) # force a `m4_' prefix to all builtins diff --git a/src/modules/convex/linear_svm_igd.cpp b/src/modules/convex/linear_svm_igd.cpp index 2fb171efa..b0ded7660 100644 --- a/src/modules/convex/linear_svm_igd.cpp +++ b/src/modules/convex/linear_svm_igd.cpp @@ -53,12 +53,9 @@ linear_svm_igd_transition::run(AnyType &args) { // For other tuples: args[0] holds the computation state until last tuple GLMIGDState > state = args[0]; - const double lambda = args[6].getAs(); - const bool isL2 = args[7].getAs(); - const int nTuples = args[8].getAs(); - - // initilize the state if first tuple + // initialize the state if first tuple if (state.algo.numRows == 0) { + LinearSVM::epsilon = args[9].getAs();; if (!args[3].isNull()) { GLMIGDState > previousState = args[3]; state.allocate(*this, previousState.task.dimension); @@ -66,20 +63,18 @@ linear_svm_igd_transition::run(AnyType &args) { } else { // configuration parameters uint32_t dimension = args[4].getAs(); - double stepsize = args[5].getAs(); - 
state.allocate(*this, dimension); // with zeros - state.task.stepsize = stepsize; } // resetting in either case state.reset(); - if (isL2) { - state.algo.loss += L2::loss(state.task.model, lambda); - L2::gradient(state.task.model, lambda, state.algo.gradient); - } else { - state.algo.loss += L1::loss(state.task.model, lambda); - L1::gradient(state.task.model, lambda, state.algo.gradient); - } + state.task.stepsize = args[5].getAs(); + const bool isL2 = args[7].getAs(); + const double lambda = args[6].getAs(); + const int nTuples = args[8].getAs(); + L1::n_tuples = nTuples; + L2::n_tuples = nTuples; + if (isL2) L2::lambda = lambda; + else L1::lambda = lambda; } // Skip the current record if args[1] (features) contains NULL values, @@ -95,19 +90,16 @@ linear_svm_igd_transition::run(AnyType &args) { using madlib::dbal::eigen_integration::MappedColumnVector; GLMTuple tuple; tuple.indVar.rebind(args[1].getAs().memoryHandle(), - state.task.dimension); - tuple.depVar = args[2].getAs() ? 1. : -1.; + state.task.dimension); + tuple.depVar = args[2].getAs(); // Now do the transition step // apply IGD with regularization - if (isL2) { - L2::scaling(state.algo.incrModel, lambda, nTuples, state.task.stepsize); - LinearSVMIGDAlgorithm::transition(state, tuple); - } else { - LinearSVMIGDAlgorithm::transition(state, tuple); - L1::clipping(state.algo.incrModel, lambda, nTuples, state.task.stepsize); - } - // objective function and its gradient + L2::scaling(state.task.model, state.algo.incrModel, state.task.stepsize); + LinearSVMIGDAlgorithm::transition(state, tuple); + L1::clipping(state.algo.incrModel, state.task.stepsize); + // evaluate objective function and its gradient + // at the old model - state.task.model LinearSVMLossAlgorithm::transition(state, tuple); LinearSVMGradientAlgorithm::transition(state, tuple); @@ -152,7 +144,12 @@ linear_svm_igd_final::run(AnyType &args) { // Aggregates that haven't seen any data just return Null. 
if (state.algo.numRows == 0) { return Null(); } - + + state.algo.loss += L2::loss(state.task.model); + state.algo.loss += L1::loss(state.task.model); + L2::gradient(state.task.model, state.algo.gradient); + L1::gradient(state.task.model, state.algo.gradient); + elog(NOTICE, "loss and |gradient|: %e, %e\n", (double) state.algo.loss, state.algo.gradient.norm()); // finalizing LinearSVMIGDAlgorithm::final(state); diff --git a/src/modules/convex/task/l1.hpp b/src/modules/convex/task/l1.hpp index 31e3e969b..e0d68e442 100644 --- a/src/modules/convex/task/l1.hpp +++ b/src/modules/convex/task/l1.hpp @@ -23,33 +23,42 @@ class L1 { public: typedef Model model_type; + static double lambda; + + static int n_tuples; + static void gradient( - const model_type &model, - const double &lambda, - model_type &gradient); + const model_type &model, + model_type &gradient); static void clipping( - model_type &incrModel, - const double &lambda, - const int &n_tuples, - const double &stepsize); + model_type &incrModel, + const double &stepsize); static double loss( - const model_type &model, - const double &lambda); + const model_type &model); }; +template +double +L1::lambda = 0.; + +template +int +L1::n_tuples = 1; + template void L1::gradient( const model_type &model, - const double &lambda, model_type &gradient) { - for (Index i = 0; i < model.size(); i++) { - if (model(i) > 0) { - gradient(i) += lambda; - } else if (model(i) < 0) { - gradient(i) -= lambda; + if (lambda != 0.) { + for (Index i = 0; i < model.size(); i++) { + if (model(i) > 0) { + gradient(i) += lambda; + } else if (model(i) < 0) { + gradient(i) -= lambda; + } } } } @@ -58,29 +67,29 @@ template void L1::clipping( model_type &incrModel, - const double &lambda, - const int &n_tuples, const double &stepsize) { - // implement the Clipping method mentioned in Tsuruoka et al. 
2009 - double clip_boundry = lambda / n_tuples * stepsize; - for (Index i = 0; i < incrModel.size(); i++) { - if (incrModel(i) > clip_boundry) { - incrModel(i) -= clip_boundry; - } else if (incrModel(i) < - clip_boundry) { - incrModel(i) += clip_boundry; - } else { incrModel(i) = 0.; } + if (lambda != 0.) { + // implement the Clipping method mentioned in Tsuruoka et al. 2009 + double clip_boundry = lambda / n_tuples * stepsize; + for (Index i = 0; i < incrModel.size(); i++) { + if (incrModel(i) > clip_boundry) { + incrModel(i) -= clip_boundry; + } else if (incrModel(i) < - clip_boundry) { + incrModel(i) += clip_boundry; + } else { incrModel(i) = 0.; } + } } } template double L1::loss( - const model_type &model, - const double &lambda) { + const model_type &model) { double s = 0.; - for (Index i = 0; i < model.size(); i++) { - s += std::abs(model(i)); - } + if (lambda != 0.) + for (Index i = 0; i < model.size(); i++) { + s += std::abs(model(i)); + } return lambda * s; } diff --git a/src/modules/convex/task/l2.hpp b/src/modules/convex/task/l2.hpp index fa054c2ef..4f9cab783 100644 --- a/src/modules/convex/task/l2.hpp +++ b/src/modules/convex/task/l2.hpp @@ -23,45 +23,51 @@ class L2 { public: typedef Model model_type; typedef Hessian hessian_type; + + static double lambda; + static int n_tuples; static void gradient( const model_type &model, - const double &lambda, model_type &gradient); static void scaling( + const model_type &model, model_type &incrModel, - const double &lambda, - const int &n_tuples, const double &stepsize); static void hessian( const model_type &model, - const double &lambda, hessian_type &hessian); static double loss( - const model_type &model, - const double &lambda); + const model_type &model); }; + +template +double +L2::lambda = 0.; + +template +int +L2::n_tuples = 1; template void L2::gradient( const model_type &model, - const double &lambda, model_type &gradient) { - gradient += 2 * lambda * model; + // 1/2 * lambda * || w ||^2 + gradient += 
lambda * model; } template void L2::scaling( + const model_type &model, model_type &incrModel, - const double &lambda, - const int &n_tuples, const double &stepsize) { - double wscale = 1 - 2 * lambda / n_tuples * stepsize; + double wscale = 1 - lambda / n_tuples * stepsize; if (wscale > 0) { incrModel *= wscale; } else { incrModel.setZero(); } } @@ -70,18 +76,17 @@ template void L2::hessian( const model_type &model, - const double &lambda, hessian_type &hessian) { int n = model.size(); - hessian += 2 * lambda * hessian.Identity(n, n); + hessian += lambda * hessian.Identity(n, n); } template double L2::loss( - const model_type &model, - const double &lambda) { - return lambda * model.norm(); + const model_type &model) { + // 1/2 * lambda * || w ||^2 + return lambda * model.norm()*model.norm() / 2; } } // namespace convex diff --git a/src/modules/convex/task/linear_svm.hpp b/src/modules/convex/task/linear_svm.hpp index 8ecf98244..4a31c5b3f 100644 --- a/src/modules/convex/task/linear_svm.hpp +++ b/src/modules/convex/task/linear_svm.hpp @@ -15,6 +15,7 @@ namespace convex { // Use Eigen using namespace madlib::dbal::eigen_integration; + template class LinearSVM { @@ -25,6 +26,8 @@ class LinearSVM { independent_variables_type; typedef typename Tuple::dependent_variable_type dependent_variable_type; + static double epsilon; + static void gradient( const model_type &model, const independent_variables_type &x, @@ -47,6 +50,10 @@ class LinearSVM { const independent_variables_type &x); }; +template +double +LinearSVM::epsilon = 0.; + template void LinearSVM::gradient( @@ -55,10 +62,18 @@ LinearSVM::gradient( const dependent_variable_type &y, model_type &gradient) { double wx = dot(model, x); - if (1 - wx * y > 0) { - double c = -y; // minus for "-loglik" - gradient += c * x; - } else { gradient += 0. * x; } + if (epsilon == 0) { + if (1 - wx * y > 0) { + double c = -y; // minus for "-loglik" + gradient += c * x; + } else { gradient += 0. 
* x; } + } + else { + double wx_y = wx - y; + double c = wx_y > 0 ? 1. : -1.; + if (c*wx_y - epsilon > 0.) gradient += c * x; + else gradient += 0. * x; + } } template @@ -69,10 +84,17 @@ LinearSVM::gradientInPlace( const dependent_variable_type &y, const double &stepsize) { double wx = dot(model, x); - if (1. - wx * y > 0.) { - double c = -y; // minus for "-loglik" - model -= stepsize * c * x; - } else { } + if (epsilon == 0) { + if (1. - wx * y > 0.) { + double c = -y; // minus for "-loglik" + model -= stepsize * c * x; + } else { } + } + else { + double wx_y = wx - y; + double c = wx_y > 0 ? 1. : -1.; + if (c*wx_y - epsilon > 0.) model -= stepsize * c * x; + } } template @@ -82,7 +104,11 @@ LinearSVM::loss( const independent_variables_type &x, const dependent_variable_type &y) { double wx = dot(model, x); - double distance = 1. - wx * y; + double distance = 0.; + if (epsilon == 0) distance = 1. - wx * y; + else { + distance = fabs(wx - y) - epsilon; + } return distance > 0. ? distance : 0.; } diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 3ca1d7d45..276f5b1d0 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -32,19 +32,46 @@ def _compute_svm(args): """ {schema_madlib}.linear_svm_igd_step( ({col_ind_var})::FLOAT8[], - CASE WHEN ({col_dep_var}) IS NULL THEN NULL - WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN FALSE - ELSE TRUE - END, + ({col_dep_var_trans})::FLOAT8, {rel_state}.{col_grp_state}, {n_features}::INT4, {stepsize}::FLOAT8, {lambda}::FLOAT8, {is_l2}::BOOLEAN, - {col_n_tuples} + {col_n_tuples}, + {epsilon}::FLOAT8 ) """) - it.kwargs['stepsize'] *= it.kwargs['decay_factor'] + if kwargs['decay_factor'] > 0: + it.kwargs['stepsize'] = it.kwargs['stepsize'] * kwargs['decay_factor'] + else: + it.kwargs['stepsize'] = init_stepsize / (it.iteration + 1) + if args['verbose']: + res = plpy.execute(""" + SELECT + (result).loss AS loss, + 
(result).norm_of_gradient AS norm_of_gradient + FROM ( + SELECT {schema_madlib}.internal_linear_svm_igd_result( + _state) AS result + FROM {rel_state} + WHERE _iteration = {iteration}) subq + """.format(iteration=it.iteration, + schema_madlib=schema_madlib, + rel_state=it.kwargs['rel_state'])) + + loss = res[0]['loss'] + normg = res[0]['norm_of_gradient'] + epsilon = it.kwargs['epsilon'] + stepsize = it.kwargs['stepsize'] + iteration = it.iteration + plpy.notice("DEBUG: \ + iter = {iteration:5d}, \ + loss = {loss:.5e}, \ + |gradient| = {normg:.5e}, \ + stepsize = {stepsize:.5e}, \ + epsilon = {epsilon:.2e}".format(**locals())) + has_converged = it.test( """ {iteration} >= {max_iter} @@ -56,14 +83,12 @@ def _compute_svm(args): # --------------------------------------------------- -# Function to run the SVM classification algorithm -def svm_classification(schema_madlib, source_table, model_table, - dependent_varname, independent_varname, - kernel_func, kernel_params, grouping_col, - optim_params, reg_params, verbose, **kwargs): +def svm(schema_madlib, source_table, model_table, + dependent_varname, independent_varname, kernel_func, + kernel_params, grouping_col, optim_params, reg_params, + epsilon, is_svc, verbose, **kwargs): """ Executes the linear support vector classification algorithm. 
- """ # verbosing verbosity_level = "info" if verbose else "error" @@ -77,20 +102,44 @@ def svm_classification(schema_madlib, source_table, model_table, "SVM error: invalid independent_varname ('" + str(independent_varname) + "') for source_table (" + source_table + ")!") - # map dependent variables - dep_labels=plpy.execute(""" - SELECT {dep} AS y - FROM {tbl} - WHERE ({dep}) IS NOT NULL - GROUP BY ({dep}) - ORDER BY ({dep}) - """.format(dep=dependent_varname, tbl=source_table)) - _dep_var_mapping = ["'" + d['y'] + "'" if isinstance(d['y'], basestring) - else str(d['y']) for d in dep_labels] - if len(_dep_var_mapping) != 2: - plpy.error("SVM error: Only binary classification is supported. " - "Found more than two dependent variable values ({0})". - format(str(_dep_var_mapping))) + # transform col_dep_var to binary (1`or -1) if classification + args = {'col_dep_var_trans': dependent_varname, + 'mapping': 'NULL', + 'method': 'SVR'} + + if is_svc: + args['method'] = 'SVC' + # dependent variable mapping + dep_labels=plpy.execute(""" + SELECT {dependent_varname} AS y + FROM {source_table} + WHERE ({dependent_varname}) IS NOT NULL + GROUP BY ({dependent_varname}) + ORDER BY ({dependent_varname})""".format(**locals())) + dep_var_mapping = ["'" + d['y'] + "'" if isinstance(d['y'], basestring) else str(d['y']) for d in dep_labels] + if len(dep_var_mapping) != 2: + plpy.error("SVM error: Classification currently only supports binary output") + + col_dep_var_trans = ( + """ + CASE WHEN ({col_dep_var}) IS NULL THEN NULL + WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 + ELSE 1.0 + END + """ + .format(col_dep_var=dependent_varname, + mapped_value_for_negative=dep_var_mapping[0]) + ) + args.update({ + 'mapped_value_for_negative': dep_var_mapping[0], + 'col_dep_var_trans': col_dep_var_trans, + 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1]}) + elif epsilon is None: + # TODO: choose a better default value for epsilon + epsilon = 0.1 + elif epsilon < 0: + 
plpy.error("SVM error: epsilon cannot be less than 0!") + dep_type = get_expr_type(dependent_varname, source_table) if '[]' in dep_type: plpy.error("SVM error: dependent_varname cannot be of array type!") @@ -127,7 +176,7 @@ def svm_classification(schema_madlib, source_table, model_table, 'rel_source': source_table, 'col_ind_var': independent_varname, 'col_dep_var': dependent_varname, - 'mapped_value_for_negative': _dep_var_mapping[0] + 'mapped_value_for_negative': dep_var_mapping[0] } args.update(locals()) # variables defined above cannot be moved below this line @@ -173,7 +222,7 @@ def svm_classification(schema_madlib, source_table, model_table, n_iters_run = _compute_svm(args) # organizing results - args['mapping'] = _dep_var_mapping[0] + "," + _dep_var_mapping[1] + args['mapping'] = dep_var_mapping[0] + "," + dep_var_mapping[1] groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format(**args) if grouping_col else "" using_str = "USING ({col_grp_key})".format(**args) if grouping_col else "ON TRUE" model_table_query = """ @@ -188,8 +237,9 @@ def svm_classification(schema_madlib, source_table, model_table, n_tuples_including_nulls - (result).num_rows_processed AS num_rows_skipped, NULL AS __random_feature_data, - ARRAY[{mapping}]::{dep_type}[] AS _dep_var_mapping - FROM ( + ARRAY[{mapping}]::{dep_type}[] AS dep_var_mapping + FROM + ( SELECT {schema_madlib}.internal_linear_svm_igd_result( {col_grp_state} @@ -198,7 +248,8 @@ def svm_classification(schema_madlib, source_table, model_table, FROM {rel_state} WHERE {col_grp_iteration} = {n_iters_run} ) rel_state_subq - JOIN ( + JOIN + ( SELECT {grouping_str1} count(*) AS n_tuples_including_nulls, @@ -230,17 +281,18 @@ def svm_classification(schema_madlib, source_table, model_table, 'linear'::text AS kernel_func, NULL::text AS kernel_params, '{grouping_text}'::text AS grouping_col, - 'init_stepsize={init_stepsize}, ' || + 'init_stepsize={init_stepsize}, ' || 'decay_factor={decay_factor}, ' || - 'max_iter={max_iter}, 
' || + 'max_iter={max_iter}, ' || 'tolerance={tolerance}'::text AS optim_params, 'lambda={lambda_str}, ' || - 'norm={norm}, ' || + 'norm={norm}, ' || 'n_folds={n_folds}'::text AS reg_params, count(*)::integer AS num_all_groups, 0::integer AS num_failed_groups, sum(num_rows_processed)::bigint AS total_rows_processed, - sum(num_rows_skipped)::bigint AS total_rows_skipped + sum(num_rows_skipped)::bigint AS total_rows_skipped, + '{epsilon}'::double precision AS EPSILON FROM {model_table}; """.format(grouping_text="NULL" if not grouping_col else grouping_col, **args)) @@ -272,6 +324,7 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, # read necessary info from summary summary = plpy.execute(""" SELECT + method, dependent_varname, independent_varname, kernel_func, @@ -279,6 +332,7 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, grouping_col FROM {summary_table} """.format(**locals()))[0] + method = summary['method'] dependent_varname = summary['dependent_varname'] independent_varname = summary['independent_varname'] kernel_func = summary['kernel_func'] @@ -298,16 +352,28 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, "SVM error: id_col_name ('" + id_col_name + "') is invalid for new_data_table (" + new_data_table + ")!") output_tbl_valid(output_table, 'SVM') + if method == 'SVC': + pred_query = """ + CASE WHEN {schema_madlib}.array_dot(coef::double precision [], {independent_varname}::double precision []) >= 0 + THEN dep_var_mapping[2] + ELSE dep_var_mapping[1] + END + """.format(schema_madlib=schema_madlib, + independent_varname=independent_varname) + elif method == 'SVR': + pred_query = """ + {schema_madlib}.array_dot(coef::double precision [], {independent_varname}::double precision []) + """.format(schema_madlib=schema_madlib, + independent_varname=independent_varname) + else: + plpy.error("SVM error: method can only be SVC or SVR!") if grouping_col != "NULL": sql = """ CREATE TABLE 
{output_table} AS SELECT {id_col_name} AS id, - CASE WHEN {schema_madlib}.array_dot(coef, {independent_varname}) >= 0 - THEN _dep_var_mapping[2] - ELSE _dep_var_mapping[1] - END AS prediction, + {pred_query} AS prediction, {model_table}.{grouping_col} as grouping_col FROM {model_table} JOIN {new_data_table} @@ -320,28 +386,28 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, CREATE TABLE {output_table} AS SELECT {id_col_name} AS id, - CASE WHEN {schema_madlib}.array_dot(coef, {independent_varname}) >= 0 - THEN _dep_var_mapping[2] - ELSE _dep_var_mapping[1] - END AS prediction + {pred_query} as prediction FROM {model_table}, {new_data_table} WHERE not {schema_madlib}.array_contains_null({independent_varname}) """.format(**locals()) - plpy.execute(sql) def _extract_optim_params(schema_madlib, optim_params, module='SVM'): - default_dict = dict(init_stepsize=0.01, decay_factor=0.9, max_iter=100, tolerance=1e-3) - optim_params_types = dict(init_stepsize=float, decay_factor=float, max_iter=int, tolerance=float) + # NOTICE: the type of values in default_dict should be consistent with + # the types specified in optim_params_types + default_dict = dict(init_stepsize=0.01, decay_factor=0.9, + max_iter=100, tolerance=1e-3) + optim_params_types = dict(init_stepsize=float, decay_factor=float, + max_iter=int, tolerance=float) optim_params_dict = extract_keyvalue_params(optim_params, optim_params_types, default_dict) if optim_params_dict['init_stepsize'] <= 0: plpy.error("{0} error: init_stepsize must be positive!".format(module)) - if optim_params_dict['decay_factor'] <= 0 or optim_params_dict['decay_factor'] > 1: - plpy.error("{0} error: decay_factor must be in (0,1]!".format(module)) + if optim_params_dict['decay_factor'] > 1: + plpy.error("{0} error: decay_factor must be <= 1!".format(module)) if optim_params_dict['max_iter'] <= 0: plpy.error("{0} error: max_iter must be positive!".format(module)) if optim_params_dict['tolerance'] < 0: @@ -350,8 
+416,8 @@ def _extract_optim_params(schema_madlib, optim_params, module='SVM'): return optim_params_dict -def _extract_reg_params(schema_madlib, reg_params, module='SVM'): - default_dict = {'lambda': 0.01, 'norm': 'L2', 'n_folds': 0} +def __extract_reg_params(schema_madlib, reg_params, module='SVM'): + default_dict = {'lambda': 1.0, 'norm': 'L2', 'n_folds': 0} reg_params_types_lambda_scalar = {'lambda': float, 'norm': str, 'n_folds': int} reg_params_types_lambda_list = {'lambda': list, 'norm': str, 'n_folds': int} try: diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in index 166c8773d..a08ed267d 100644 --- a/src/ports/postgres/modules/svm/svm.sql_in +++ b/src/ports/postgres/modules/svm/svm.sql_in @@ -389,13 +389,14 @@ CREATE TYPE MADLIB_SCHEMA.linear_svm_result AS ( CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_transition( state double precision[], ind_var double precision[], - dep_var boolean, + dep_var double precision, previous_state double precision[], dimension integer, stepsize double precision, reg double precision, is_l2 boolean, - n_tuples integer) + n_tuples integer, + epsilon double precision) RETURNS double precision[] AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); @@ -413,6 +414,36 @@ RETURNS double precision[] AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); +/** + * @internal + * @brief Perform one iteration of the incremental gradient + * method for computing linear support vector machine + */ +DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_step_serial( + /*+ ind_var */ double precision[], + /*+ dep_var */ double precision, + /*+ previous_state */ double precision[], + /*+ dimension */ integer, + /*+ stepsize */ double precision, + /*+ reg */ double precision, + /*+ is_l2 */ boolean, + /*+ n_tuples */ integer, + /*+ epsilon */ double precision); +CREATE AGGREGATE 
MADLIB_SCHEMA.linear_svm_igd_step_serial( + /*+ ind_var */ double precision[], + /*+ dep_var */ double precision, + /*+ previous_state */ double precision[], + /*+ dimension */ integer, + /*+ stepsize */ double precision, + /*+ reg */ double precision, + /*+ is_l2 */ boolean, + /*+ n_tuples */ integer, + /*+ epsilon */ double precision) ( + STYPE=double precision[], + SFUNC=MADLIB_SCHEMA.linear_svm_igd_transition, + FINALFUNC=MADLIB_SCHEMA.linear_svm_igd_final, + INITCOND='{0,0,0,0,0,0,0}' +); /** * @internal @@ -421,22 +452,24 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); */ DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_step( /*+ ind_var */ double precision[], - /*+ dep_var */ boolean, + /*+ dep_var */ double precision, /*+ previous_state */ double precision[], /*+ dimension */ integer, /*+ stepsize */ double precision, /*+ reg */ double precision, /*+ is_l2 */ boolean, - /*+ n_tuples */ integer); + /*+ n_tuples */ integer, + /*+ epsilon */ double precision); CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_step( /*+ ind_var */ double precision[], - /*+ dep_var */ boolean, + /*+ dep_var */ double precision, /*+ previous_state */ double precision[], /*+ dimension */ integer, /*+ stepsize */ double precision, /*+ reg */ double precision, /*+ is_l2 */ boolean, - /*+ n_tuples */ integer) ( + /*+ n_tuples */ integer, + /*+ epsilon */ double precision) ( STYPE=double precision[], SFUNC=MADLIB_SCHEMA.linear_svm_igd_transition, m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linear_svm_igd_merge,') @@ -463,13 +496,130 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); +-- will change to classification if epsilon is set to 0 +-- C++ code does not differentiate between svm and svr with 0 epsilon. 
+-- one solution is to set epsilon to a small number in python +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text, + kernel_params text, + grouping_col text, + optim_params text, + reg_params text, + epsilon double precision, + verbose bool) +RETURNS void AS $$ + # indent according to PythonFunction + global is_svc + is_svc = False + PythonFunction(svm, svm, svm) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text, + kernel_params text, + grouping_col text, + optim_params text, + reg_params text, + epsilon double precision) +RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text, + kernel_params text, + grouping_col text, + optim_params text, + reg_params text) +RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); ------------------ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text, + kernel_params text, + grouping_col text, + optim_params text) +RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + + + +CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text, + kernel_params text, + grouping_col text) + RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text, + kernel_params text) + RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text, + kernel_func text) + RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( + source_table text, + model_table text, + dependent_varname text, + independent_varname text) + RETURNS void AS $$ + SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + +----------------- + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( source_table text, model_table text, @@ -482,7 +632,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( reg_params text, verbose bool) RETURNS void AS $$ -PythonFunction(svm, svm, svm_classification) + # indent according to PythonFunction + global is_svc, epsilon + is_svc = True + epsilon = 0.0 + PythonFunction(svm, svm, svm) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -579,9 +733,9 @@ 
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); * @param model_table Name of table where the learned model to be used is stored * @param new_data_table Name of table/view containing the data points to be scored * @param id_col Name of column in new_data_table containing the integer identifier of data points - * * - * + * + * * @param output_table Name of table to store the results * * @return Textual summary of the algorithm run diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index 5e8a4cc20..f5966cc1f 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -10,6 +10,13 @@ BEGIN END $$ LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION __svr_target_cl_func(ind float8[]) +RETURNS float8 AS $$ +BEGIN + RETURN 1*ind[1] + 2*ind[2]; +END +$$ LANGUAGE plpgsql; + CREATE OR REPLACE FUNCTION __svm_random_ind(d INT) RETURNS float8[] AS $$ DECLARE @@ -48,8 +55,95 @@ BEGIN END $$ LANGUAGE plpgsql; +CREATE OR REPLACE FUNCTION svr_generate_cls_data( + output_table text, num int, dim int) +RETURNS VOID AS $$ +DECLARE + temp_table text; +BEGIN + temp_table := 'madlib_temp_' || output_table; + EXECUTE ' + CREATE TABLE ' || temp_table || ' AS + SELECT + subq.val AS id, + __svm_random_ind(' || dim || ') AS ind + FROM + (SELECT generate_series(1, ' || num || ') AS val) subq'; + EXECUTE ' + CREATE TABLE ' || output_table || ' AS + SELECT id, ind, __svr_target_cl_func(ind) AS label + FROM ' || temp_table; +END +$$ LANGUAGE plpgsql; + SELECT svm_generate_cls_data('svm_train_data', 1000, 4); SELECT svm_generate_cls_data('svm_test_data', 1000, 4); +SELECT svr_generate_cls_data('svr_train_data', 1000, 4); +SELECT svr_generate_cls_data('svr_test_data', 1000, 4); + +-- check the default values +DROP TABLE IF EXISTS svr_model, svr_model_summary; +SELECT svm_regression( + 'svr_train_data', + 'svr_model', + 'label', + 'ind'); +\x on +SELECT 
* FROM svr_model; +SELECT * FROM svr_model_summary; +\x off +SELECT + assert( + norm1(coef) < 4, + 'optimal coef should be close to [1, 2, 0, 0]!') +FROM svr_model; + +-- check the use of l1 norm +DROP TABLE IF EXISTS svr_model, svr_model_summary; +SELECT svm_regression( + 'svr_train_data', + 'svr_model', + 'label', + 'ind', + NULL, + NULL, + NULL, + 'init_stepsize=0.01, max_iter=50', + 'lambda=2, norm=l1', + 0.01); +DROP TABLE IF EXISTS svr_test_result; +SELECT svm_predict('svr_model', 'svr_train_data', 'id', 'svr_test_result'); +\x on +SELECT * FROM svr_model; +\x off +SELECT + assert( + avg(subq.err) < 0.1, + 'prediction error is too large!') +FROM + ( + SELECT + train.id, + abs(train.label - test.prediction) AS err + FROM svr_train_data AS train, svr_test_result AS test + WHERE train.id = test.id + ) AS subq; + +-- by default using epsilon == 0.1 +DROP TABLE IF EXISTS svr_model, svr_model_summary; +SELECT svm_regression( + 'svr_train_data', + 'svr_model', + 'label', + 'ind', + NULL, + NULL, + NULL, + 'init_stepsize=1, max_iter=10', + 'lambda=2'); +SELECT + assert(epsilon > 0,'default epsilon is positive!') +FROM svr_model_summary; -- Example usage for LINEAR classification, replace the above by SELECT svm_classification( @@ -118,7 +212,7 @@ SELECT svm_classification( 'ind', NULL, -- kernel_func NULL, -- kernel_pararms - NULL, --grouping_col + NULL, -- grouping_col 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0', 'lambda=0', true -- verbose From bcc5b3430547e64c0bdeb411d85d3fc62c6a025e Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Tue, 20 Oct 2015 16:28:22 -0700 Subject: [PATCH 02/14] SVM: Fix minor bugs Install-check passed --- src/ports/postgres/modules/svm/svm.py_in | 46 ++++------------- .../utilities/in_mem_group_control.py_in | 51 +++++++++++++++++++ 2 files changed, 61 insertions(+), 36 deletions(-) diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 276f5b1d0..78132c64f 100644 --- 
a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -42,36 +42,11 @@ def _compute_svm(args): {epsilon}::FLOAT8 ) """) - if kwargs['decay_factor'] > 0: - it.kwargs['stepsize'] = it.kwargs['stepsize'] * kwargs['decay_factor'] + it.info() + if it.kwargs['decay_factor'] > 0: + it.kwargs['stepsize'] = it.kwargs['stepsize'] * it.kwargs['decay_factor'] else: it.kwargs['stepsize'] = init_stepsize / (it.iteration + 1) - if args['verbose']: - res = plpy.execute(""" - SELECT - (result).loss AS loss, - (result).norm_of_gradient AS norm_of_gradient - FROM ( - SELECT {schema_madlib}.internal_linear_svm_igd_result( - _state) AS result - FROM {rel_state} - WHERE _iteration = {iteration}) subq - """.format(iteration=it.iteration, - schema_madlib=schema_madlib, - rel_state=it.kwargs['rel_state'])) - - loss = res[0]['loss'] - normg = res[0]['norm_of_gradient'] - epsilon = it.kwargs['epsilon'] - stepsize = it.kwargs['stepsize'] - iteration = it.iteration - plpy.notice("DEBUG: \ - iter = {iteration:5d}, \ - loss = {loss:.5e}, \ - |gradient| = {normg:.5e}, \ - stepsize = {stepsize:.5e}, \ - epsilon = {epsilon:.2e}".format(**locals())) - has_converged = it.test( """ {iteration} >= {max_iter} @@ -166,7 +141,8 @@ def svm(schema_madlib, source_table, model_table, grouping_str1 = "" if not grouping_col else grouping_col + "," grouping_str2 = "1 = 1" if not grouping_col else grouping_col - args = {'rel_args': unique_string(), + args.update( + {'rel_args': unique_string(), 'rel_state': unique_string(), 'col_grp_iteration': unique_string(), 'col_grp_state': unique_string(), @@ -175,9 +151,8 @@ def svm(schema_madlib, source_table, model_table, 'state_type': "double precision[]", 'rel_source': source_table, 'col_ind_var': independent_varname, - 'col_dep_var': dependent_varname, - 'mapped_value_for_negative': dep_var_mapping[0] - } + 'col_dep_var': dependent_varname + }) args.update(locals()) # variables defined above cannot be moved below this line # 
------------------------------------------------------- @@ -222,7 +197,6 @@ def svm(schema_madlib, source_table, model_table, n_iters_run = _compute_svm(args) # organizing results - args['mapping'] = dep_var_mapping[0] + "," + dep_var_mapping[1] groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format(**args) if grouping_col else "" using_str = "USING ({col_grp_key})".format(**args) if grouping_col else "ON TRUE" model_table_query = """ @@ -272,7 +246,7 @@ def svm(schema_madlib, source_table, model_table, plpy.execute(""" CREATE TABLE {summary_table} AS SELECT - 'svm'::text AS method, + '{method}'::text AS method, '__MADLIB_VERSION__'::text AS version_number, '{source_table}'::text AS source_table, '{model_table}'::text AS model_table, @@ -397,7 +371,7 @@ def _extract_optim_params(schema_madlib, optim_params, module='SVM'): # NOTICE: the type of values in default_dict should be consistent with # the types specified in optim_params_types default_dict = dict(init_stepsize=0.01, decay_factor=0.9, - max_iter=100, tolerance=1e-3) + max_iter=100, tolerance=1e-10) optim_params_types = dict(init_stepsize=float, decay_factor=float, max_iter=int, tolerance=float) optim_params_dict = extract_keyvalue_params(optim_params, @@ -416,7 +390,7 @@ def _extract_optim_params(schema_madlib, optim_params, module='SVM'): return optim_params_dict -def __extract_reg_params(schema_madlib, reg_params, module='SVM'): +def _extract_reg_params(schema_madlib, reg_params, module='SVM'): default_dict = {'lambda': 1.0, 'norm': 'L2', 'n_folds': 0} reg_params_types_lambda_scalar = {'lambda': float, 'norm': str, 'n_folds': int} reg_params_types_lambda_list = {'lambda': list, 'norm': str, 'n_folds': int} diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in index 79a26c46a..5331de89b 100644 --- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in +++ 
b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in @@ -109,6 +109,57 @@ class GroupIterationController: self.in_with = True return self + def info(self): + """ Logging intermediate state information """ + if not self.kwargs['verbose']: + return + if self.is_state_type_bytea8: + unnest_str = "unnest($1) AS {col_grp_key}, unnest($2) AS {col_grp_state}".format(**self.kwargs) + grouped_state_type = "{schema_madlib}.bytea8[]".format(**self.kwargs) + else: + unnest_str = "grp_key AS {col_grp_key}, state AS {col_grp_state} FROM {schema_madlib}._gen_state($1, NULL, $2)".format(**self.kwargs) + grouped_state_type = "float8[]" + + plan = plpy.prepare( + """ + SELECT + {col_grp_key} AS group, + (result).loss AS loss, + (result).norm_of_gradient AS norm_of_gradient + FROM ( + SELECT + {schema_madlib}.internal_linear_svm_igd_result({col_grp_state}) + AS result, + {col_grp_key} + FROM ( SELECT {unnest_str} ) AS _rel_state + ) subq + """.format(unnest_str=unnest_str, + **self.kwargs), + ["text[]", grouped_state_type]) + + if self.is_state_type_bytea8: + res_tuples = plpy.execute(plan, [self.new_states.keys(), + self.new_states.values()]) + else: + flatten_agg_states = [c for state in self.new_states.values() for c in state] + res_tuples = plpy.execute(plan, [self.new_states.keys(), + flatten_agg_states]) + for t in res_tuples: + grp, loss, normg = t['group'], t['loss'], t['norm_of_gradient'] + iteration = self.iteration + output_str = "DEBUG: \ + grp = {grp:10s}, \ + iter = {iteration:5d}, \ + loss = {loss:.5e}, \ + |gradient| = {normg:.5e}, \ + stepsize = {stepsize:.5e}" + plpy.notice(output_str.format( + grp=grp, iteration=iteration, + loss=loss, normg=normg, + **self.kwargs)) + + + def final(self): """ Store the final converged state to a table for output """ if self.is_state_type_bytea8: From 96a7a874232dca9b028fc05a80247bfe66a2553b Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Thu, 22 Oct 2015 11:47:18 -0700 Subject: [PATCH 03/14] Refactoring 
GroupIterationController: Add desp to unique_string() Add _init_group_param() in GroupIterationController --- src/ports/postgres/modules/svm/svm.py_in | 12 +- .../utilities/in_mem_group_control.py_in | 112 +++++++++++------- .../modules/utilities/utilities.py_in | 3 +- 3 files changed, 76 insertions(+), 51 deletions(-) diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 78132c64f..6c4079c59 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -142,12 +142,12 @@ def svm(schema_madlib, source_table, model_table, grouping_str2 = "1 = 1" if not grouping_col else grouping_col args.update( - {'rel_args': unique_string(), - 'rel_state': unique_string(), - 'col_grp_iteration': unique_string(), - 'col_grp_state': unique_string(), - 'col_grp_key': unique_string(), - 'col_n_tuples': unique_string(), + {'rel_args': unique_string(desp='rel_args'), + 'rel_state': unique_string(desp='rel_state'), + 'col_grp_iteration': unique_string(desp='col_grp_iteration'), + 'col_grp_state': unique_string(desp='col_grp_state'), + 'col_grp_key': unique_string(desp='col_grp_key'), + 'col_n_tuples': unique_string(desp='col_n_tuples'), 'state_type': "double precision[]", 'rel_source': source_table, 'col_ind_var': independent_varname, diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in index 5331de89b..71dad0de5 100644 --- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in +++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in @@ -7,6 +7,7 @@ import plpy from control import MinWarning from utilities import unique_string +from collections import namedtuple class GroupIterationController: @@ -35,10 +36,13 @@ class GroupIterationController: self.kwargs = dict(arg_dict) self.kwargs.update( state_type=arg_dict.get('state_type', 'double precision[]').format(**arg_dict), - 
col_grp_null=unique_string(), - col_n_tuples=self.kwargs.get('col_n_tuples', unique_string()), - col_grp_key=self.kwargs.get('col_grp_key', unique_string()), - grouping_col=("NULL" if arg_dict["grouping_col"] is None + col_grp_null=unique_string(desp='col_grp_null'), + col_n_tuples=self.kwargs.get('col_n_tuples', + unique_string(desp='col_n_tuples')), + col_grp_key=self.kwargs.get('col_grp_key', + unique_string(desp='col_grp_key')), + grouping_col=("NULL" + if arg_dict["grouping_col"] is None else arg_dict["grouping_col"]), ) self.grp_to_n_tuples = {} @@ -56,8 +60,49 @@ class GroupIterationController: else: plpy.error("Internal error: unexpected state type!") + self.group_param = self._init_group_param() + + + def _init_group_param(self): + _as_string = 'AS _grp' + _grp_key = ("array_to_string(ARRAY[{grouping_str}], ',')" + .format(grouping_str=self.kwargs['grouping_str'])) + _unnest_str = ("grp_key AS {col_grp_key}," + "state AS {col_grp_state} " + "FROM {schema_madlib}._gen_state($1, NULL, $2)" + "".format(**self.kwargs)) + _n_tuples_unnest_str = ("unnest($3) AS {col_grp_key}, " + "unnest($4) AS {col_n_tuples}" + .format(**self.kwargs)) + _using_str="ON TRUE" + _grouped_state_type="float8[]" + _groupby_str="" + if not self.is_group_null: + _groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format( + **self.kwargs) + _using_str = "USING ({col_grp_key})".format(**self.kwargs) + _grp_key = self.kwargs['col_grp_key'] + _as_string = '' + if self.is_state_type_bytea8: + _unnest_str = "unnest($1) AS {col_grp_key}, unnest($2) AS {col_grp_state}".format(**self.kwargs) + _grouped_state_type = "{schema_madlib}.bytea8[]".format(**self.kwargs) + GroupParam = namedtuple('GroupParam', + 'groupby_str, using_str,' + 'unnest_str,grouped_state_type,' + 'as_string, grp_key,' + 'n_tuples_unnest_str') + return GroupParam(groupby_str=_groupby_str, + using_str=_using_str, + unnest_str=_unnest_str, + as_string=_as_string, + grp_key=_grp_key, + 
n_tuples_unnest_str=_n_tuples_unnest_str, + grouped_state_type=_grouped_state_type) + + def __enter__(self): - with MinWarning('warning'): + verbosity_level = self.kwargs.get('verbosity_level', 'warning') + with MinWarning(verbosity_level): ############################ # create state table # currently assuming that groups is passed as a valid array @@ -240,14 +285,6 @@ class GroupIterationController: # in the previous update return True - self.is_state_type_bytea8 = False - if self.kwargs['state_type'] == "{0}.bytea8".format(self.kwargs['schema_madlib']): - self.is_state_type_bytea8 = True - elif (self.kwargs['state_type'].lower() == "double precision[]" or - self.kwargs['state_type'].lower() == "float8[]"): - self.is_state_type_bytea8 = False - else: - plpy.error("Internal error: unexpected state type!") if self.is_state_type_bytea8: unnest_str_previous = "unnest($1) AS {col_grp_key}, unnest($2) AS _state_previous".format(**self.kwargs) unnest_str_current = "unnest($3) AS {col_grp_key}, unnest($4) AS _state_current".format(**self.kwargs) @@ -326,20 +363,8 @@ class GroupIterationController: newState = newState.format(**self.kwargs) self.iteration = self.iteration + 1 - - groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format( - **self.kwargs) if not self.is_group_null else "" - using_str = "USING ({col_grp_key})".format( - **self.kwargs) if not self.is_group_null else "ON TRUE" - if self.is_state_type_bytea8: - unnest_str = "unnest($1) AS {col_grp_key}, unnest($2) AS {col_grp_state}".format(**self.kwargs) - grouped_state_type = "{schema_madlib}.bytea8[]".format(**self.kwargs) - else: - unnest_str = "grp_key AS {col_grp_key}, state AS {col_grp_state} FROM {schema_madlib}._gen_state($1, NULL, $2)".format(**self.kwargs) - grouped_state_type = "float8[]" - - n_tuples_unnest_str = "unnest($3) AS {col_grp_key}, unnest($4) AS {col_n_tuples}".format(**self.kwargs) - + + group_param = self.group_param update_plan = plpy.prepare( """ SELECT @@ -364,18 +389,16 @@ class 
GroupIterationController: {using_str} {groupby_str} """.format( - iteration=self.iteration, - groupby_str=groupby_str, - using_str=using_str, - as_string='AS _grp' if self.is_group_null else '', - _grp_key=("array_to_string(ARRAY[{grouping_str}], " - "',')".format(**self.kwargs) if self.is_group_null - else self.kwargs['col_grp_key']), newState=newState, - unnest_str=unnest_str, - n_tuples_unnest_str=n_tuples_unnest_str, + iteration=self.iteration, + using_str=group_param.using_str, + groupby_str=group_param.groupby_str, + as_string=group_param.as_string, + _grp_key=group_param.grp_key, + unnest_str=group_param.unnest_str, + n_tuples_unnest_str=group_param.n_tuples_unnest_str, **self.kwargs), - ["text[]", grouped_state_type, "text[]", "integer[]"]) + ["text[]", group_param.grouped_state_type, "text[]", "integer[]"]) if self.is_state_type_bytea8: res_tuples = plpy.execute(update_plan, [self.new_states.keys(), @@ -392,11 +415,12 @@ class GroupIterationController: self.grp_to_n_tuples.keys(), self.grp_to_n_tuples.values()]) - self.old_states = self.new_states - self.new_states = {} + # self.old_states = self.new_states + self.old_states.update(self.new_states) + # self.new_states = {} + col_grp_state = self.kwargs['col_grp_state'] + col_grp_key = self.kwargs['col_grp_key'] for t in res_tuples: - if t[self.kwargs['col_grp_state']] is None: - self.failed_grp_keys.append(t[self.kwargs['col_grp_key']]) - else: - self.new_states[t[self.kwargs['col_grp_key']]] = \ - t[self.kwargs['col_grp_state']] + _grp_key, _grp_state = t[col_grp_key], t[col_grp_state] + if _grp_state is None: self.failed_grp_keys.append(_grp_key) + else: self.new_states[_grp_key] = _grp_state diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 2c848f5b2..014204d6c 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -49,10 +49,11 @@ def 
unique_string(**kwargs): Generate random remporary names for temp table and other names. It has a SQL interface so both SQL and Python functions can call it. """ + desp = kwargs.get('desp', '') r1 = random.randint(1, 100000000) r2 = int(time.time()) r3 = int(time.time()) % random.randint(1, 100000000) - u_string = "__madlib_temp_" + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" + u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" return u_string # ======================================================================== From f8f6d5ceba557e382ec6015b4ec299093f0f2fec Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Thu, 22 Oct 2015 22:35:50 -0700 Subject: [PATCH 04/14] Refactoring GroupIterationController: minor changes --- src/ports/postgres/modules/svm/svm.py_in | 7 +- .../utilities/in_mem_group_control.py_in | 108 ++++++++---------- 2 files changed, 51 insertions(+), 64 deletions(-) diff --git a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 6c4079c59..33e0fec6a 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -141,8 +141,8 @@ def svm(schema_madlib, source_table, model_table, grouping_str1 = "" if not grouping_col else grouping_col + "," grouping_str2 = "1 = 1" if not grouping_col else grouping_col - args.update( - {'rel_args': unique_string(desp='rel_args'), + args.update({ + 'rel_args': unique_string(desp='rel_args'), 'rel_state': unique_string(desp='rel_state'), 'col_grp_iteration': unique_string(desp='col_grp_iteration'), 'col_grp_state': unique_string(desp='col_grp_state'), @@ -151,8 +151,7 @@ def svm(schema_madlib, source_table, model_table, 'state_type': "double precision[]", 'rel_source': source_table, 'col_ind_var': independent_varname, - 'col_dep_var': dependent_varname - }) + 'col_dep_var': dependent_varname}) args.update(locals()) # variables defined above cannot be moved below this line # 
------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in index 71dad0de5..0915e2250 100644 --- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in +++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in @@ -9,6 +9,16 @@ from control import MinWarning from utilities import unique_string from collections import namedtuple +def _flatten(l, no=False): + if no: + return l + if l is None or not l: + return [] + if all(map(lambda x: x is None, l)): + return [] + return [s for x in l for s in x] + + class GroupIterationController: """ @@ -64,16 +74,17 @@ class GroupIterationController: def _init_group_param(self): - _as_string = 'AS _grp' _grp_key = ("array_to_string(ARRAY[{grouping_str}], ',')" .format(grouping_str=self.kwargs['grouping_str'])) - _unnest_str = ("grp_key AS {col_grp_key}," - "state AS {col_grp_state} " - "FROM {schema_madlib}._gen_state($1, NULL, $2)" - "".format(**self.kwargs)) - _n_tuples_unnest_str = ("unnest($3) AS {col_grp_key}, " - "unnest($4) AS {col_n_tuples}" - .format(**self.kwargs)) + _select_rel_state = ("SELECT " + "grp_key AS {col_grp_key}," + "state AS {col_grp_state} " + "FROM {schema_madlib}._gen_state($1, NULL, $2)" + .format(**self.kwargs)) + _select_n_tuples = ("SELECT " + "unnest($3) AS {col_grp_key}, " + "unnest($4) AS {col_n_tuples}" + .format(**self.kwargs)) _using_str="ON TRUE" _grouped_state_type="float8[]" _groupby_str="" @@ -82,21 +93,21 @@ class GroupIterationController: **self.kwargs) _using_str = "USING ({col_grp_key})".format(**self.kwargs) _grp_key = self.kwargs['col_grp_key'] - _as_string = '' if self.is_state_type_bytea8: - _unnest_str = "unnest($1) AS {col_grp_key}, unnest($2) AS {col_grp_state}".format(**self.kwargs) + _select_rel_state = ("SELECT " + "unnest($1) AS {col_grp_key}, " + "unnest($2) AS {col_grp_state}" + "".format(**self.kwargs)) 
_grouped_state_type = "{schema_madlib}.bytea8[]".format(**self.kwargs) GroupParam = namedtuple('GroupParam', 'groupby_str, using_str,' - 'unnest_str,grouped_state_type,' - 'as_string, grp_key,' - 'n_tuples_unnest_str') + 'select_rel_state,grouped_state_type,' + 'grp_key, select_n_tuples') return GroupParam(groupby_str=_groupby_str, using_str=_using_str, - unnest_str=_unnest_str, - as_string=_as_string, + select_rel_state=_select_rel_state, grp_key=_grp_key, - n_tuples_unnest_str=_n_tuples_unnest_str, + select_n_tuples=_select_n_tuples, grouped_state_type=_grouped_state_type) @@ -145,11 +156,13 @@ class GroupIterationController: ############################ # initialize states - ret_states = plpy.execute("SELECT * FROM " + self.kwargs['rel_state']) + rel_state_str = self.kwargs['rel_state'] + col_grp_key = self.kwargs['col_grp_key'] + col_n_tuples = self.kwargs['col_n_tuples'] + ret_states = plpy.execute("SELECT * FROM " + rel_state_str) for state in ret_states: - self.new_states[state[self.kwargs['col_grp_key']]] = None - self.grp_to_n_tuples[state[self.kwargs['col_grp_key']]] = \ - long(state[self.kwargs['col_n_tuples']]) + self.new_states[state[col_grp_key]] = None + self.grp_to_n_tuples[state[col_grp_key]] = long(state[col_n_tuples]) self.in_with = True return self @@ -158,13 +171,7 @@ class GroupIterationController: """ Logging intermediate state information """ if not self.kwargs['verbose']: return - if self.is_state_type_bytea8: - unnest_str = "unnest($1) AS {col_grp_key}, unnest($2) AS {col_grp_state}".format(**self.kwargs) - grouped_state_type = "{schema_madlib}.bytea8[]".format(**self.kwargs) - else: - unnest_str = "grp_key AS {col_grp_key}, state AS {col_grp_state} FROM {schema_madlib}._gen_state($1, NULL, $2)".format(**self.kwargs) - grouped_state_type = "float8[]" - + group_param = self.group_param plan = plpy.prepare( """ SELECT @@ -176,11 +183,11 @@ class GroupIterationController: {schema_madlib}.internal_linear_svm_igd_result({col_grp_state}) AS 
result, {col_grp_key} - FROM ( SELECT {unnest_str} ) AS _rel_state + FROM ( {select_rel_state} ) AS _rel_state ) subq - """.format(unnest_str=unnest_str, + """.format(select_rel_state=group_param.select_rel_state, **self.kwargs), - ["text[]", grouped_state_type]) + ["text[]", group_param.grouped_state_type]) if self.is_state_type_bytea8: res_tuples = plpy.execute(plan, [self.new_states.keys(), @@ -207,15 +214,7 @@ class GroupIterationController: def final(self): """ Store the final converged state to a table for output """ - if self.is_state_type_bytea8: - unnest_str = "unnest($1) AS {col_grp_key}, unnest($2) AS {col_grp_state}".format(**self.kwargs) - grouped_state_type = "{0}.bytea8[]".format(self.kwargs['schema_madlib']) - else: - unnest_str = ("grp_key AS {col_grp_key}, state AS {col_grp_state} " - "FROM {schema_madlib}._gen_state($1, NULL, $2)". - format(**self.kwargs)) - grouped_state_type = "float8[]" - + group_param = self.group_param insert_sql = """ INSERT INTO {rel_state} SELECT @@ -228,21 +227,17 @@ class GroupIterationController: SELECT {grouping_col}, {col_grp_key} FROM {rel_state} ) AS _src - JOIN ( - SELECT {unnest_str} - ) AS _rel_state + JOIN ( {select_rel_state} ) AS _rel_state USING ({col_grp_key}) - JOIN ( - SELECT unnest($3) AS {col_grp_key}, - unnest($4) AS {col_n_tuples} - ) AS _rel_n_tuples + JOIN ( {select_n_tuples} ) AS _rel_n_tuples USING ({col_grp_key}) """.format( iteration=self.iteration, - unnest_str=unnest_str, + select_rel_state=group_param.select_rel_state, + select_n_tuples=group_param.select_n_tuples, **self.kwargs) insert_plan = plpy.prepare(insert_sql, - ["text[]", grouped_state_type, + ["text[]", group_param.grouped_state_type, "text[]", "bigint[]"]) if self.is_state_type_bytea8: @@ -369,7 +364,7 @@ class GroupIterationController: """ SELECT {_grp_key} AS {col_grp_key}, - {grouping_col} {as_string}, + {grouping_col}, {iteration} AS {col_grp_iteration}, ({newState}) AS {col_grp_state} FROM ( @@ -378,14 +373,9 @@ class 
GroupIterationController: array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key} FROM {rel_source} ) AS _src - JOIN ( - SELECT {unnest_str} - ) AS {rel_state} + JOIN ( {select_rel_state} ) AS {rel_state} {using_str} - JOIN ( - SELECT - {n_tuples_unnest_str} - ) AS _rel_n_tuples + JOIN ( {select_n_tuples} ) AS _rel_n_tuples {using_str} {groupby_str} """.format( @@ -393,10 +383,9 @@ class GroupIterationController: iteration=self.iteration, using_str=group_param.using_str, groupby_str=group_param.groupby_str, - as_string=group_param.as_string, _grp_key=group_param.grp_key, - unnest_str=group_param.unnest_str, - n_tuples_unnest_str=group_param.n_tuples_unnest_str, + select_rel_state=group_param.select_rel_state, + select_n_tuples=group_param.select_n_tuples, **self.kwargs), ["text[]", group_param.grouped_state_type, "text[]", "integer[]"]) @@ -417,7 +406,6 @@ class GroupIterationController: # self.old_states = self.new_states self.old_states.update(self.new_states) - # self.new_states = {} col_grp_state = self.kwargs['col_grp_state'] col_grp_key = self.kwargs['col_grp_key'] for t in res_tuples: From a6a3043787d4c6aef7841adc544cfc1b1924139c Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Fri, 23 Oct 2015 16:25:57 -0700 Subject: [PATCH 05/14] Refactoring GroupIterationController: checkpoint --- .../utilities/in_mem_group_control.py_in | 271 ++++++++++++------ 1 file changed, 176 insertions(+), 95 deletions(-) diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in index 0915e2250..5cb0979ae 100644 --- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in +++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in @@ -8,16 +8,147 @@ import plpy from control import MinWarning from utilities import unique_string from collections import namedtuple +from collections import Iterable + + +class BaseState(object): + """@brief Abstraction for intermediate 
iteration state""" + def __init__(self, **kwargs): + self._state = {} + self._is_none = None + self._len = -1 + self.initialize(**kwargs) + + def __len__(self): + if self._len == -1: + self._len = len(self._state) + return self._len + + def __del__(self): + del self._state + self._len = 0 + + def __getitem__(self, k): + return self._state[k] + + def __setitem__(self, k, v): + self._state[k] = v + + @property + def keys(self): + return self._state.keys() + + @property + def values(self): + if self.is_none(): + return [] + return [s for x in self._state.values() for s in x] + + def delete(self, keys_to_remove): + for k in keys_to_remove: + try: + del self._state[k] + self._len -= 1 + except KeyError: + pass + self._is_none = None + + def initialize(self, + col_grp_key='', + col_grp_state='', + ret_states=None, **kwargs): + self.update(col_grp_key, col_grp_state, ret_states) + + def update(self, + col_grp_key, + col_grp_state, + ret_states): + failed_grp_keys = [] + if ret_states is None: + return failed_grp_keys + t0 = ret_states[0] + # no key column in table ret_states + if col_grp_key not in t0: + return failed_grp_keys + # initialize state to None + if col_grp_state == '': + self._is_none = True + for s in ret_states: + self._state[s[col_grp_key]] = None + return failed_grp_keys + for t in ret_states: + _grp_key, _grp_state = t[col_grp_key], t[col_grp_state] + if _grp_state is None: failed_grp_keys.append(_grp_key) + else: self._state[_grp_key] = _grp_state + # no need to update if all failed + if len(failed_grp_keys) < len(self): + self._is_none = False + return failed_grp_keys + + # entries in self not in other are kept + def update_from_state(self, other, keys=None): + if not isinstance(other, BaseState): + return + if keys is None: + self._state.update(other._state) + else: + for k in keys: + self[k] = other[k] + # reset cache + self._len = -1 + self._is_none = None + + def sync_from(self, other): + self._state = {} + self.update_from_state(other) + self._len 
= len(other) + self._is_none = other.is_none() + + def is_none(self): + if self._is_none is None: + self._is_none = True + for k, v in self._state.iteritems(): + if v is not None: + self._is_none = False + break + return self._is_none + + def interpret(self, schema_madlib, state_type, keys=None): + if keys is None: + keys = self.keys + elif isinstance(keys, str) or not isinstance(keys, Iterable): + keys = [keys] + s = dict.fromkeys(keys) + plan = plpy.prepare( + """ + SELECT + (result).loss AS loss, + (result).norm_of_gradient AS norm_of_gradient + FROM ( + SELECT + {schema_madlib}.internal_linear_svm_igd_result($1) + AS result + ) subq + """.format(schema_madlib=schema_madlib), [state_type]) + for k in keys: + s[k] = plpy.execute(plan, [self._state[k]])[0] + return s + + +class Bytea8State(BaseState): + """@brief bytea8 type state""" + def __init__(self, **kwargs): + super(Bytea8State, self).__init__(**kwargs) -def _flatten(l, no=False): - if no: - return l - if l is None or not l: - return [] - if all(map(lambda x: x is None, l)): - return [] - return [s for x in l for s in x] + @property + def values(self): + return self._state.values() +def state_factory(is_bytea8, **kwargs): + if is_bytea8: + return Bytea8State(**kwargs) + else: + return BaseState(**kwargs) class GroupIterationController: @@ -56,9 +187,6 @@ class GroupIterationController: else arg_dict["grouping_col"]), ) self.grp_to_n_tuples = {} - self.new_states = {} - self.old_states = {} - self.finished_states = {} self.failed_grp_keys = [] self.is_state_type_bytea8 = False @@ -70,6 +198,10 @@ class GroupIterationController: else: plpy.error("Internal error: unexpected state type!") + self.new_states = state_factory(self.is_state_type_bytea8) + self.old_states = state_factory(self.is_state_type_bytea8) + self.finished_states = state_factory(self.is_state_type_bytea8) + self.group_param = self._init_group_param() @@ -160,9 +292,11 @@ class GroupIterationController: col_grp_key = self.kwargs['col_grp_key'] 
col_n_tuples = self.kwargs['col_n_tuples'] ret_states = plpy.execute("SELECT * FROM " + rel_state_str) - for state in ret_states: - self.new_states[state[col_grp_key]] = None - self.grp_to_n_tuples[state[col_grp_key]] = long(state[col_n_tuples]) + self.new_states.initialize(col_grp_key=col_grp_key, + col_grp_state='', + ret_states=ret_states) + for s in ret_states: + self.grp_to_n_tuples[s[col_grp_key]] = long(s[col_n_tuples]) self.in_with = True return self @@ -172,32 +306,11 @@ class GroupIterationController: if not self.kwargs['verbose']: return group_param = self.group_param - plan = plpy.prepare( - """ - SELECT - {col_grp_key} AS group, - (result).loss AS loss, - (result).norm_of_gradient AS norm_of_gradient - FROM ( - SELECT - {schema_madlib}.internal_linear_svm_igd_result({col_grp_state}) - AS result, - {col_grp_key} - FROM ( {select_rel_state} ) AS _rel_state - ) subq - """.format(select_rel_state=group_param.select_rel_state, - **self.kwargs), - ["text[]", group_param.grouped_state_type]) - - if self.is_state_type_bytea8: - res_tuples = plpy.execute(plan, [self.new_states.keys(), - self.new_states.values()]) - else: - flatten_agg_states = [c for state in self.new_states.values() for c in state] - res_tuples = plpy.execute(plan, [self.new_states.keys(), - flatten_agg_states]) - for t in res_tuples: - grp, loss, normg = t['group'], t['loss'], t['norm_of_gradient'] + schema_madlib = self.kwargs['schema_madlib'] + res = self.new_states.interpret(schema_madlib, + group_param.grouped_state_type) + for grp, t in res.iteritems(): + loss, normg = t['loss'], t['norm_of_gradient'] iteration = self.iteration output_str = "DEBUG: \ grp = {grp:10s}, \ @@ -210,8 +323,6 @@ class GroupIterationController: loss=loss, normg=normg, **self.kwargs)) - - def final(self): """ Store the final converged state to a table for output """ group_param = self.group_param @@ -240,14 +351,8 @@ class GroupIterationController: ["text[]", group_param.grouped_state_type, "text[]", "bigint[]"]) 
- if self.is_state_type_bytea8: - insert_values = self.finished_states.values() - - else: - insert_values = [c for state in self.finished_states.values() - for c in state] - plpy.execute(insert_plan, [self.finished_states.keys(), - insert_values, + plpy.execute(insert_plan, [self.finished_states.keys, + self.finished_states.values, self.grp_to_n_tuples.keys(), self.grp_to_n_tuples.values()]) plpy.execute(insert_plan, [self.failed_grp_keys, @@ -275,7 +380,7 @@ class GroupIterationController: @return True if \c expression in all non-failed groups is True, otherwise False """ - if not self.new_states: + if len(self.new_states) == 0: # self.new_states can become empty if the last of the groups failed # in the previous update return True @@ -310,30 +415,17 @@ class GroupIterationController: unnest_str_previous=unnest_str_previous, **self.kwargs), ["text[]", grouped_state_type] * 2) - if self.is_state_type_bytea8: - ret_tuples = plpy.execute(eval_plan, - [self.old_states.keys(), - self.old_states.values(), - self.new_states.keys(), - self.new_states.values()]) - else: - if self.iteration == 1: - flatten_agg_states_old = [] - else: - flatten_agg_states_old = [c for state in self.old_states.values() for c in state] - ret_tuples = plpy.execute(eval_plan, - [self.old_states.keys(), - flatten_agg_states_old, - self.new_states.keys(), - [c for state in self.new_states.values() for c in state]]) - - self.old_states = {} - for t in ret_tuples: - if t['_expression']: - k = t[self.kwargs['col_grp_key']] - self.finished_states[k] = self.new_states[k] - del self.new_states[k] + res = plpy.execute(eval_plan, + [self.old_states.keys, + self.old_states.values, + self.new_states.keys, + self.new_states.values]) + col_grp_key = self.kwargs['col_grp_key'] + if_finished = lambda t: t['_expression'] + finished_keys = [t[col_grp_key] for t in filter(if_finished, res)] + self.finished_states.update_from_state(self.new_states, finished_keys) + self.new_states.delete(finished_keys) return 
len(self.new_states) == 0 def update(self, newState, **updateKwargs): @@ -389,26 +481,15 @@ class GroupIterationController: **self.kwargs), ["text[]", group_param.grouped_state_type, "text[]", "integer[]"]) - if self.is_state_type_bytea8: - res_tuples = plpy.execute(update_plan, [self.new_states.keys(), - self.new_states.values(), - self.grp_to_n_tuples.keys(), - self.grp_to_n_tuples.values()]) - else: - if self.iteration == 1: - flatten_agg_states = [] - else: - flatten_agg_states = [c for state in self.new_states.values() for c in state] - res_tuples = plpy.execute(update_plan, [self.new_states.keys(), - flatten_agg_states, - self.grp_to_n_tuples.keys(), - self.grp_to_n_tuples.values()]) - - # self.old_states = self.new_states - self.old_states.update(self.new_states) + res_tuples = plpy.execute(update_plan, [self.new_states.keys, + self.new_states.values, + self.grp_to_n_tuples.keys(), + self.grp_to_n_tuples.values()]) + col_grp_state = self.kwargs['col_grp_state'] col_grp_key = self.kwargs['col_grp_key'] - for t in res_tuples: - _grp_key, _grp_state = t[col_grp_key], t[col_grp_state] - if _grp_state is None: self.failed_grp_keys.append(_grp_key) - else: self.new_states[_grp_key] = _grp_state + self.old_states.sync_from(self.new_states) + self.failed_grp_keys.extend(self.new_states.update( + col_grp_key, + col_grp_state, + res_tuples)) From bc8e3c47f6a24a3f96b93ca017e444d332922c22 Mon Sep 17 00:00:00 2001 From: Rahul Iyer Date: Mon, 26 Oct 2015 15:09:39 -0700 Subject: [PATCH 06/14] Light cleanup in utilities --- .../modules/utilities/utilities.py_in | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 014204d6c..e73be2d90 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -44,18 +44,17 @@ def get_seg_number(): # 
------------------------------------------------------------ -def unique_string(**kwargs): +def unique_string(desp='', **kwargs): """ Generate random remporary names for temp table and other names. It has a SQL interface so both SQL and Python functions can call it. """ - desp = kwargs.get('desp', '') r1 = random.randint(1, 100000000) r2 = int(time.time()) r3 = int(time.time()) % random.randint(1, 100000000) u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" return u_string -# ======================================================================== +# ------------------------------------------------------------------------- def add_postfix(quoted_string, postfix): @@ -132,10 +131,9 @@ def _string_to_array(s): for i in range(len(elm)): elm[i] = elm[i].strip("\"") return elm +# ------------------------------------------------------------------------ -# ======================================================================== - def _string_to_array_with_quotes(s): """ Same as _string_to_array except the double quotes will be kept. 
@@ -144,8 +142,7 @@ def _string_to_array_with_quotes(s): for m in re.finditer(r"(\"(\\\"|[^\"])*\"|[^\",\s]+)", s): elm.append(m.group(1)) return elm - -# ======================================================================== +# ------------------------------------------------------------------------ def py_list_to_sql_string(array, array_type=None): @@ -167,7 +164,7 @@ def py_list_to_sql_string(array, array_type=None): else: array_str = "ARRAY[ {0} ]" if long_format else "'{{ {0} }}'" return (array_str + "::{1}").format(','.join(map(str, array)), array_type) -# ======================================================================== +# ------------------------------------------------------------------------ def _array_to_string(origin): @@ -175,7 +172,7 @@ def _array_to_string(origin): Convert an array to string """ return "{" + ",".join(map(str, origin)) + "}" -# ======================================================================== +# ------------------------------------------------------------------------ def set_client_min_messages(new_level): @@ -396,7 +393,7 @@ def _string_to_sql_array(schema_madlib, s, **kwargs): def current_user(): """Returns the user name of the current database user.""" return plpy.execute("SELECT current_user")[0]['current_user'] -# ======================================================================== +# ------------------------------------------------------------------------ def madlib_version(schema_madlib): @@ -405,7 +402,7 @@ def madlib_version(schema_madlib): SELECT {schema_madlib}.version() """.format(**locals()))[0]['version'] return raw.split(',')[0].split(' ')[-1] -# ======================================================================== +# ------------------------------------------------------------------------ def preprocess_keyvalue_params(input_params): @@ -435,7 +432,7 @@ def preprocess_keyvalue_params(input_params): ) )""", re.VERBOSE) return [m.group(1).strip() for m in pattern.finditer(input_params)] -# 
======================================================================== +# ------------------------------------------------------------------------ def extract_keyvalue_params(input_params, @@ -574,9 +571,6 @@ class UtilitiesTestCase(unittest.TestCase): self.optimizer_types = {'max_iter': int, 'optimizer': str, 'lambda': list, 'precision': float} - def tearDown(self): - pass - def test_preprocess_optimizer(self): self.assertEqual(preprocess_keyvalue_params(self.optimizer_params1), ['max_iter=10', 'optimizer="irls"', 'precision=1e-4']) From 00de2f5c45f7a1645bc64dc763e520dc83ba1a64 Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Fri, 23 Oct 2015 16:25:57 -0700 Subject: [PATCH 07/14] Refactoring GroupIterationController: checkpoint --- .../modules/utilities/in_mem_group_control.py_in | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in index 5cb0979ae..b0dcd4b04 100644 --- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in +++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in @@ -16,17 +16,13 @@ class BaseState(object): def __init__(self, **kwargs): self._state = {} self._is_none = None - self._len = -1 self.initialize(**kwargs) def __len__(self): - if self._len == -1: - self._len = len(self._state) - return self._len + return len(self._state) def __del__(self): del self._state - self._len = 0 def __getitem__(self, k): return self._state[k] @@ -48,7 +44,6 @@ class BaseState(object): for k in keys_to_remove: try: del self._state[k] - self._len -= 1 except KeyError: pass self._is_none = None @@ -95,13 +90,11 @@ class BaseState(object): for k in keys: self[k] = other[k] # reset cache - self._len = -1 self._is_none = None def sync_from(self, other): self._state = {} self.update_from_state(other) - self._len = len(other) self._is_none = other.is_none() def is_none(self): @@ -176,6 
+169,7 @@ class GroupIterationController: self.is_group_null = True if arg_dict["grouping_col"] is None else False self.kwargs = dict(arg_dict) self.kwargs.update( + as_rel_source=arg_dict.get('as_rel_source', '_src'), state_type=arg_dict.get('state_type', 'double precision[]').format(**arg_dict), col_grp_null=unique_string(desp='col_grp_null'), col_n_tuples=self.kwargs.get('col_n_tuples', @@ -464,7 +458,7 @@ class GroupIterationController: *, array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key} FROM {rel_source} - ) AS _src + ) AS {as_rel_source} JOIN ( {select_rel_state} ) AS {rel_state} {using_str} JOIN ( {select_n_tuples} ) AS _rel_n_tuples From 7c7ca3b2ddfc0aa915f110505599760595749fa3 Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Tue, 27 Oct 2015 15:26:09 -0700 Subject: [PATCH 08/14] SVM: grouping for regression Install checks added and passed --- src/modules/convex/linear_svm_igd.cpp | 6 +- src/ports/postgres/modules/svm/svm.py_in | 275 +++++++++++------- src/ports/postgres/modules/svm/svm.sql_in | 76 +---- .../modules/svm/test/linear_svm.sql_in | 274 +++++++++++++++-- 4 files changed, 450 insertions(+), 181 deletions(-) diff --git a/src/modules/convex/linear_svm_igd.cpp b/src/modules/convex/linear_svm_igd.cpp index b0ded7660..f4e0f3a84 100644 --- a/src/modules/convex/linear_svm_igd.cpp +++ b/src/modules/convex/linear_svm_igd.cpp @@ -149,9 +149,13 @@ linear_svm_igd_final::run(AnyType &args) { state.algo.loss += L1::loss(state.task.model); L2::gradient(state.task.model, state.algo.gradient); L1::gradient(state.task.model, state.algo.gradient); - elog(NOTICE, "loss and |gradient|: %e, %e\n", (double) state.algo.loss, state.algo.gradient.norm()); + // finalizing LinearSVMIGDAlgorithm::final(state); + elog(NOTICE, "loss = %e, |gradient| = %e, |model| = %e\n", + (double) state.algo.loss, + state.algo.gradient.norm(), + state.task.model.norm()); return state; } diff --git a/src/ports/postgres/modules/svm/svm.py_in 
b/src/ports/postgres/modules/svm/svm.py_in index 33e0fec6a..9b6e997e5 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -39,14 +39,14 @@ def _compute_svm(args): {lambda}::FLOAT8, {is_l2}::BOOLEAN, {col_n_tuples}, - {epsilon}::FLOAT8 + {select_epsilon}::FLOAT8 ) """) it.info() if it.kwargs['decay_factor'] > 0: it.kwargs['stepsize'] = it.kwargs['stepsize'] * it.kwargs['decay_factor'] else: - it.kwargs['stepsize'] = init_stepsize / (it.iteration + 1) + it.kwargs['stepsize'] = it.kwargs['init_stepsize'] / (it.iteration + 1) has_converged = it.test( """ {iteration} >= {max_iter} @@ -60,8 +60,8 @@ def _compute_svm(args): def svm(schema_madlib, source_table, model_table, dependent_varname, independent_varname, kernel_func, - kernel_params, grouping_col, optim_params, reg_params, - epsilon, is_svc, verbose, **kwargs): + kernel_params, grouping_col, params, is_svc, + verbose, **kwargs): """ Executes the linear support vector classification algorithm. 
""" @@ -77,44 +77,6 @@ def svm(schema_madlib, source_table, model_table, "SVM error: invalid independent_varname ('" + str(independent_varname) + "') for source_table (" + source_table + ")!") - # transform col_dep_var to binary (1`or -1) if classification - args = {'col_dep_var_trans': dependent_varname, - 'mapping': 'NULL', - 'method': 'SVR'} - - if is_svc: - args['method'] = 'SVC' - # dependent variable mapping - dep_labels=plpy.execute(""" - SELECT {dependent_varname} AS y - FROM {source_table} - WHERE ({dependent_varname}) IS NOT NULL - GROUP BY ({dependent_varname}) - ORDER BY ({dependent_varname})""".format(**locals())) - dep_var_mapping = ["'" + d['y'] + "'" if isinstance(d['y'], basestring) else str(d['y']) for d in dep_labels] - if len(dep_var_mapping) != 2: - plpy.error("SVM error: Classification currently only supports binary output") - - col_dep_var_trans = ( - """ - CASE WHEN ({col_dep_var}) IS NULL THEN NULL - WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 - ELSE 1.0 - END - """ - .format(col_dep_var=dependent_varname, - mapped_value_for_negative=dep_var_mapping[0]) - ) - args.update({ - 'mapped_value_for_negative': dep_var_mapping[0], - 'col_dep_var_trans': col_dep_var_trans, - 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1]}) - elif epsilon is None: - # TODO: choose a better default value for epsilon - epsilon = 0.1 - elif epsilon < 0: - plpy.error("SVM error: epsilon cannot be less than 0!") - dep_type = get_expr_type(dependent_varname, source_table) if '[]' in dep_type: plpy.error("SVM error: dependent_varname cannot be of array type!") @@ -141,7 +103,7 @@ def svm(schema_madlib, source_table, model_table, grouping_str1 = "" if not grouping_col else grouping_col + "," grouping_str2 = "1 = 1" if not grouping_col else grouping_col - args.update({ + args = { 'rel_args': unique_string(desp='rel_args'), 'rel_state': unique_string(desp='rel_state'), 'col_grp_iteration': unique_string(desp='col_grp_iteration'), @@ -151,7 +113,7 @@ 
def svm(schema_madlib, source_table, model_table, 'state_type': "double precision[]", 'rel_source': source_table, 'col_ind_var': independent_varname, - 'col_dep_var': dependent_varname}) + 'col_dep_var': dependent_varname} args.update(locals()) # variables defined above cannot be moved below this line # ------------------------------------------------------- @@ -183,10 +145,44 @@ def svm(schema_madlib, source_table, model_table, "Some predefined keyword(s) ({0}) are not allowed!".format( ', '.join(intersect))) - optim_params_dict = _extract_optim_params(schema_madlib, optim_params) - reg_params_dict = _extract_reg_params(schema_madlib, reg_params) - args.update(optim_params_dict) - args.update(reg_params_dict) + args.update(_extract_params(schema_madlib, params)) + args.update(_process_epsilon(is_svc, args)) + + if not is_svc: + # transform col_dep_var to binary (1`or -1) if classification + args.update({ + 'col_dep_var_trans': dependent_varname, + 'mapping': 'NULL', + 'method': 'SVR'}) + else: + # dependent variable mapping + dep_labels=plpy.execute(""" + SELECT {dependent_varname} AS y + FROM {source_table} + WHERE ({dependent_varname}) IS NOT NULL + GROUP BY ({dependent_varname}) + ORDER BY ({dependent_varname})""".format(**locals())) + dep_var_mapping = ["'" + d['y'] + "'" if isinstance(d['y'], basestring) else str(d['y']) for d in dep_labels] + if len(dep_var_mapping) != 2: + plpy.error("SVM error: Classification currently only supports binary output") + + col_dep_var_trans = ( + """ + CASE WHEN ({col_dep_var}) IS NULL THEN NULL + WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 + ELSE 1.0 + END + """ + .format(col_dep_var=dependent_varname, + mapped_value_for_negative=dep_var_mapping[0]) + ) + + args.update({ + 'mapped_value_for_negative': dep_var_mapping[0], + 'col_dep_var_trans': col_dep_var_trans, + 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1], + 'method': 'SVC'}) + args['stepsize'] = args['init_stepsize'] args['is_l2'] = True if 
args['norm'] == 'l2' else False @@ -242,6 +238,7 @@ def svm(schema_madlib, source_table, model_table, args['lambda_str'] = '{' + ','.join(str(e) for e in args['lambda']) + '}' else: args['lambda_str'] = str(args['lambda']) + plpy.execute(""" CREATE TABLE {summary_table} AS SELECT @@ -265,7 +262,8 @@ def svm(schema_madlib, source_table, model_table, 0::integer AS num_failed_groups, sum(num_rows_processed)::bigint AS total_rows_processed, sum(num_rows_skipped)::bigint AS total_rows_skipped, - '{epsilon}'::double precision AS EPSILON + '{epsilon}'::double precision AS epsilon, + '{eps_table}'::text AS eps_table FROM {model_table}; """.format(grouping_text="NULL" if not grouping_col else grouping_col, **args)) @@ -366,63 +364,144 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, plpy.execute(sql) -def _extract_optim_params(schema_madlib, optim_params, module='SVM'): - # NOTICE: the type of values in default_dict should be consistent with - # the types specified in optim_params_types - default_dict = dict(init_stepsize=0.01, decay_factor=0.9, - max_iter=100, tolerance=1e-10) - optim_params_types = dict(init_stepsize=float, decay_factor=float, - max_iter=int, tolerance=float) - optim_params_dict = extract_keyvalue_params(optim_params, - optim_params_types, - default_dict) +def _process_epsilon(is_svc, args): + eps_table = args['eps_table'] + grouping_col = args['grouping_col'] + grouping_str = args['grouping_str'] + col_grp_key = args['col_grp_key'] + rel_source = args['rel_source'] + rel_epsilon = '' + select_epsilon = '' + as_rel_source = '_src' + + epsilon = args['epsilon'] + # c code does SVR when epsilon is non-zero + if is_svc: epsilon = 0.0 + # c code does SVC if epsilon is zero + elif args['epsilon'] == 0: epsilon = 0.00001 + + if is_svc or not grouping_col or not eps_table: + if eps_table: + plpy.warning('SVM: ignore the input epsilon table!') + select_epsilon = '{epsilon}'.format(epsilon=epsilon) + else: + rel_epsilon = 
unique_string(desp='rel_epsilon') - if optim_params_dict['init_stepsize'] <= 0: - plpy.error("{0} error: init_stepsize must be positive!".format(module)) - if optim_params_dict['decay_factor'] > 1: - plpy.error("{0} error: decay_factor must be <= 1!".format(module)) - if optim_params_dict['max_iter'] <= 0: - plpy.error("{0} error: max_iter must be positive!".format(module)) - if optim_params_dict['tolerance'] < 0: - plpy.error("{0} error: tolerance must be non-negative!".format(module)) + # validate input + input_tbl_valid(eps_table, 'SVM') + + _assert(is_var_valid(eps_table, grouping_col), + "SVM error: invalid column names ('" + str(grouping_col) + + "') for eps_table (" + eps_table + ")!") - return optim_params_dict - - -def _extract_reg_params(schema_madlib, reg_params, module='SVM'): - default_dict = {'lambda': 1.0, 'norm': 'L2', 'n_folds': 0} - reg_params_types_lambda_scalar = {'lambda': float, 'norm': str, 'n_folds': int} - reg_params_types_lambda_list = {'lambda': list, 'norm': str, 'n_folds': int} - try: - reg_params_dict = extract_keyvalue_params(reg_params, - reg_params_types_lambda_scalar, - default_dict) - is_lambda_list = False - except ValueError: - reg_params_dict = extract_keyvalue_params(reg_params, - reg_params_types_lambda_list, - default_dict) - is_lambda_list = True - - if reg_params_dict['n_folds'] < 0: + plpy.execute(""" + DROP TABLE IF EXISTS {rel_epsilon}; + CREATE TEMPORARY TABLE {rel_epsilon} AS ( + SELECT + {col_grp_key}, + coalesce(epsilon, {epsilon}) AS epsilon + FROM ( + SELECT + array_to_string(ARRAY[{grouping_str}], ',') AS + {col_grp_key} + FROM + {rel_source} + GROUP BY {grouping_col} + ) q1 + LEFT JOIN + ( + SELECT + array_to_string(ARRAY[{grouping_str}], ',') AS + {col_grp_key}, + epsilon + FROM + {eps_table} + ) q2 + USING ({col_grp_key}) + ); + """.format(rel_epsilon=rel_epsilon, + col_grp_key=col_grp_key, + epsilon=epsilon, + grouping_str=grouping_str, + rel_source=rel_source, + grouping_col=grouping_col, + 
eps_table=eps_table)) + + select_epsilon = ( + """ + ( + SELECT epsilon + FROM + {rel_epsilon} + WHERE + {rel_epsilon}.{col_grp_key} = {as_rel_source}.{col_grp_key} + ) + """ + .format(rel_epsilon=rel_epsilon, + as_rel_source=as_rel_source, + col_grp_key=col_grp_key)) + + return {'select_epsilon': select_epsilon, + 'epsilon': epsilon, + 'rel_epsilon': rel_epsilon, + 'as_rel_source': as_rel_source} + + +def _extract_params(schema_madlib, params, module='SVM'): + # NOTICE: the type of values in params_default should be consistent with + # the types specified in params_types + params_default = { + 'init_stepsize': 0.01, + 'decay_factor': 0.9, + 'max_iter': 100, + 'tolerance': 1e-10, + 'lambda': 1.0, + 'norm': 'L2', + 'n_folds': 0, + 'epsilon': 0.01, + 'eps_table': ''} + + params_types = { + 'init_stepsize': float, + 'decay_factor': float, + 'max_iter': int, + 'tolerance': float, + 'lambda': list, + 'norm': str, + 'n_folds': int, + 'epsilon': float, + 'eps_table': str} + + params_vals = extract_keyvalue_params(params, + params_types, + params_default) + + if params_vals['n_folds'] < 0: plpy.error("{0} error: n_folds must be non-negative!".format(module)) # FIXME - if reg_params_dict['n_folds'] > 1: + if params_vals['n_folds'] > 1: plpy.error("{0} error: cross-validation not implemented!".format(module)) # validate lambda - if not is_lambda_list and reg_params_dict['lambda'] < 0: - plpy.error("{0} error: lambda must be non-negative!".format(module)) - if is_lambda_list: - if len(reg_params_dict['lambda']) != 1: + if hasattr(params_vals['lambda'], '__len__'): + if len(params_vals['lambda']) != 1: plpy.error("{0} error: lambda must be a scalar or of length 1 when n_folds is 0 or 1".format(module)) # good for only not CV - reg_params_dict['lambda'] = reg_params_dict['lambda'][0] - if reg_params_dict['lambda'] < 0: - plpy.error("{0} error: lambda must be non-negative!".format(module)) - - reg_params_dict['norm'] = reg_params_dict['norm'].lower() - if reg_params_dict['norm'] 
!= 'l1' and reg_params_dict['norm'] != 'l2': + params_vals['lambda'] = params_vals['lambda'][0] + if params_vals['lambda'] < 0: + plpy.error("{0} error: lambda must be non-negative!".format(module)) + params_vals['norm'] = params_vals['norm'].lower() + if params_vals['norm'] != 'l1' and params_vals['norm'] != 'l2': plpy.error("{0} error: norm must be either L1 or L2!".format(module)) + if params_vals['init_stepsize'] <= 0: + plpy.error("{0} error: init_stepsize must be positive!".format(module)) + if params_vals['decay_factor'] > 1: + plpy.error("{0} error: decay_factor must be <= 1!".format(module)) + if params_vals['max_iter'] <= 0: + plpy.error("{0} error: max_iter must be positive!".format(module)) + if params_vals['tolerance'] < 0: + plpy.error("{0} error: tolerance must be non-negative!".format(module)) + if params_vals['epsilon'] < 0: + plpy.error("{0} error: epsilon cannot be less than 0!".format(module)) - return reg_params_dict + return params_vals diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in index a08ed267d..24c4952d4 100644 --- a/src/ports/postgres/modules/svm/svm.sql_in +++ b/src/ports/postgres/modules/svm/svm.sql_in @@ -507,9 +507,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( kernel_func text, kernel_params text, grouping_col text, - optim_params text, - reg_params text, - epsilon double precision, + params text, verbose bool) RETURNS void AS $$ # indent according to PythonFunction @@ -527,47 +525,12 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( kernel_func text, kernel_params text, grouping_col text, - optim_params text, - reg_params text, - epsilon double precision) -RETURNS void AS $$ - SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, NULL); -$$ LANGUAGE sql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( - source_table text, - model_table text, - 
dependent_varname text, - independent_varname text, - kernel_func text, - kernel_params text, - grouping_col text, - optim_params text, - reg_params text) -RETURNS void AS $$ - SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL); -$$ LANGUAGE sql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); - - - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( - source_table text, - model_table text, - dependent_varname text, - independent_varname text, - kernel_func text, - kernel_params text, - grouping_col text, - optim_params text) + params text) RETURNS void AS $$ SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); - - CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( source_table text, model_table text, @@ -576,7 +539,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( kernel_func text, kernel_params text, grouping_col text) - RETURNS void AS $$ +RETURNS void AS $$ SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -590,12 +553,13 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( independent_varname text, kernel_func text, kernel_params text) - RETURNS void AS $$ +RETURNS void AS $$ SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( source_table text, model_table text, @@ -608,6 +572,7 @@ $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( source_table text, model_table text, @@ -617,7 +582,6 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, NULL); $$ LANGUAGE 
sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); - ----------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( @@ -628,14 +592,12 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( kernel_func text, kernel_params text, grouping_col text, - optim_params text, - reg_params text, + params text, verbose bool) RETURNS void AS $$ # indent according to PythonFunction - global is_svc, epsilon + global is_svc is_svc = True - epsilon = 0.0 PythonFunction(svm, svm, svm) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -649,24 +611,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( kernel_func text, kernel_params text, grouping_col text, - optim_params text, - reg_params text) -RETURNS void AS $$ - SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL); -$$ LANGUAGE sql VOLATILE -m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); - - - -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( - source_table text, - model_table text, - dependent_varname text, - independent_varname text, - kernel_func text, - kernel_params text, - grouping_col text, - optim_params text) + params text) RETURNS void AS $$ SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL); $$ LANGUAGE sql VOLATILE @@ -682,7 +627,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( kernel_func text, kernel_params text, grouping_col text) - RETURNS void AS $$ +RETURNS void AS $$ SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); @@ -702,6 +647,7 @@ $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( source_table text, model_table text, diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in 
b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index f5966cc1f..4d8cd0781 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -108,9 +108,8 @@ SELECT svm_regression( NULL, NULL, NULL, - 'init_stepsize=0.01, max_iter=50', - 'lambda=2, norm=l1', - 0.01); + 'init_stepsize=0.01, max_iter=50, lambda=2, norm=l1, epsilon=0.01', + false); DROP TABLE IF EXISTS svr_test_result; SELECT svm_predict('svr_model', 'svr_train_data', 'id', 'svr_test_result'); \x on @@ -139,8 +138,7 @@ SELECT svm_regression( NULL, NULL, NULL, - 'init_stepsize=1, max_iter=10', - 'lambda=2'); + 'init_stepsize=1, max_iter=10, lambda=2'); SELECT assert(epsilon > 0,'default epsilon is positive!') FROM svr_model_summary; @@ -213,8 +211,7 @@ SELECT svm_classification( NULL, -- kernel_func NULL, -- kernel_pararms NULL, -- grouping_col - 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0', - 'lambda=0', + 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0, lambda=0', true -- verbose ); \x on @@ -233,8 +230,7 @@ SELECT svm_classification( NULL, -- kernel_func NULL, -- kernel_pararms NULL, --grouping_col - 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0', - 'lambda=1' + 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0, lambda=1' ); \x on SELECT * FROM svm_model_small_norm2; @@ -258,8 +254,7 @@ SELECT svm_classification( NULL, -- kernel_func NULL, -- kernel_pararms NULL, --grouping_col - 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0', - 'lambda=1, norm=L1' + 'init_stepsize=0.03, decay_factor=1, max_iter=5, tolerance=0, lambda=1, norm=L1' ); \x on SELECT * FROM svm_model_very_sparse; @@ -299,8 +294,7 @@ SELECT svm_classification( NULL, -- kernel_func NULL, -- kernel_pararms NULL, --grouping_col - 'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0', - 'lambda={0.001}', + 'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0, lambda={0.001}', true 
-- verbose ); SELECT norm_of_gradient FROM svm_model; @@ -382,8 +376,7 @@ SELECT svm_classification( NULL, -- kernel_func NULL, -- kernel_pararms 'gid', --grouping_col - 'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0', - 'lambda=0.001', + 'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0, lambda=0.001', TRUE -- verbose ); \x on @@ -417,8 +410,7 @@ SELECT svm_classification( NULL, -- kernel_func NULL, -- kernel_pararms NULL, --grouping_col - 'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0', - 'lambda=0.001', + 'init_stepsize=0.03, decay_factor=0.9, max_iter=5, tolerance=0, lambda=0.001', true -- verbose ); \x on @@ -430,3 +422,251 @@ DROP TABLE IF EXISTS svm_test_predict CASCADE; SELECT svm_predict('svm_model_expression', 'svm_test_normalized', 'id', 'svm_test_predict'); SELECT * FROM svm_test_predict; +DROP TABLE IF EXISTS abalone_train_small; +CREATE TABLE abalone_train_small ( + id SERIAL NOT NULL, + sex TEXT, + length DOUBLE PRECISION, + diameter DOUBLE PRECISION, + height DOUBLE PRECISION, + whole DOUBLE PRECISION, + shucked DOUBLE PRECISION, + viscera DOUBLE PRECISION, + shell DOUBLE PRECISION, + rings INTEGER); + +INSERT INTO abalone_train_small(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings) VALUES +(1040,'F',0.66,0.475,0.18,1.3695,0.641,0.294,0.335,6), +(3160,'F',0.34,0.255,0.085,0.204,0.097,0.021,0.05,6), +(3984,'F',0.585,0.45,0.125,0.874,0.3545,0.2075,0.225,6), +(861,'F',0.595,0.475,0.16,1.1405,0.547,0.231,0.271,6), +(932,'F',0.445,0.335,0.11,0.4355,0.2025,0.1095,0.1195,6), +(1585,'F',0.515,0.375,0.11,0.6065,0.3005,0.131,0.15,6), +(3187,'F',0.47,0.36,0.11,0.4965,0.237,0.127,0.13,6), +(3202,'F',0.385,0.3,0.1,0.2725,0.1115,0.057,0.08,6), +(949,'F',0.475,0.36,0.12,0.5915,0.3245,0.11,0.127,6), +(2582,'F',0.53,0.42,0.17,0.828,0.41,0.208,0.1505,6), +(2551,'I',0.28,0.22,0.08,0.1315,0.066,0.024,0.03,5), +(1246,'I',0.385,0.28,0.09,0.228,0.1025,0.042,0.0655,5), 
+(819,'I',0.35,0.25,0.07,0.18,0.0655,0.048,0.054,6), +(297,'I',0.275,0.205,0.075,0.1105,0.045,0.0285,0.035,6), +(3630,'I',0.27,0.205,0.05,0.084,0.03,0.0185,0.029,6), +(2196,'I',0.26,0.215,0.08,0.099,0.037,0.0255,0.045,5), +(2343,'I',0.255,0.185,0.07,0.075,0.028,0.018,0.025,6), +(49,'I',0.325,0.245,0.07,0.161,0.0755,0.0255,0.045,6), +(2185,'I',0.32,0.235,0.08,0.1485,0.064,0.031,0.045,6), +(2154,'I',0.28,0.2,0.075,0.1225,0.0545,0.0115,0.035,5), +(1996,'I',0.32,0.24,0.07,0.133,0.0585,0.0255,0.041,6), +(126,'I',0.27,0.195,0.06,0.073,0.0285,0.0235,0.03,5), +(1227,'I',0.35,0.27,0.075,0.215,0.1,0.036,0.065,6), +(3969,'I',0.375,0.29,0.095,0.2875,0.123,0.0605,0.08,6), +(2505,'I',0.31,0.24,0.105,0.2885,0.118,0.065,0.083,6), +(2039,'I',0.28,0.215,0.08,0.132,0.072,0.022,0.033,5), +(829,'I',0.41,0.325,0.1,0.394,0.208,0.0655,0.106,6), +(3197,'I',0.325,0.245,0.075,0.1495,0.0605,0.033,0.045,5), +(1447,'I',0.44,0.34,0.105,0.369,0.164,0.08,0.1015,5), +(2821,'I',0.375,0.285,0.09,0.2545,0.119,0.0595,0.0675,6), +(1828,'I',0.34,0.26,0.085,0.1885,0.0815,0.0335,0.06,6), +(2002,'I',0.36,0.27,0.085,0.2185,0.1065,0.038,0.062,6), +(785,'I',0.215,0.155,0.06,0.0525,0.021,0.0165,0.015,5), +(2199,'I',0.27,0.19,0.08,0.081,0.0265,0.0195,0.03,6), +(3527,'I',0.335,0.26,0.085,0.192,0.097,0.03,0.054,6), +(466,'I',0.175,0.125,0.05,0.0235,0.008,0.0035,0.008,5), +(425,'I',0.26,0.2,0.07,0.092,0.037,0.02,0.03,6), +(1825,'I',0.185,0.135,0.04,0.027,0.0105,0.0055,0.009,5), +(3815,'I',0.38,0.275,0.095,0.2425,0.106,0.0485,0.21,6), +(2503,'I',0.285,0.21,0.07,0.109,0.044,0.0265,0.033,5), +(3998,'I',0.36,0.27,0.09,0.2075,0.098,0.039,0.062,6), +(333,'I',0.3,0.22,0.08,0.121,0.0475,0.042,0.035,5), +(1837,'I',0.415,0.31,0.09,0.2815,0.1245,0.0615,0.085,6), +(2813,'I',0.24,0.17,0.05,0.0545,0.0205,0.016,0.0155,5), +(930,'I',0.44,0.345,0.13,0.4495,0.209,0.0835,0.134,6), +(1436,'I',0.385,0.3,0.09,0.247,0.1225,0.044,0.0675,5), +(3972,'I',0.4,0.295,0.095,0.252,0.1105,0.0575,0.066,6), 
+(1433,'I',0.365,0.255,0.08,0.1985,0.0785,0.0345,0.053,5), +(1252,'I',0.405,0.285,0.09,0.2645,0.1265,0.0505,0.075,6), +(3439,'I',0.43,0.335,0.105,0.378,0.188,0.0785,0.09,6), +(1250,'I',0.395,0.27,0.1,0.2985,0.1445,0.061,0.082,5), +(2865,'I',0.31,0.23,0.07,0.1245,0.0505,0.0265,0.038,6), +(3411,'I',0.415,0.31,0.105,0.3595,0.167,0.083,0.0915,6), +(1539,'I',0.355,0.27,0.075,0.1775,0.079,0.0315,0.054,6), +(1990,'I',0.28,0.21,0.075,0.1195,0.053,0.0265,0.03,6), +(1771,'I',0.455,0.335,0.105,0.422,0.229,0.0865,0.1,6), +(2291,'I',0.325,0.27,0.1,0.185,0.08,0.0435,0.065,6), +(3381,'I',0.19,0.13,0.045,0.0265,0.009,0.005,0.009,5), +(1545,'I',0.37,0.27,0.095,0.2175,0.097,0.046,0.065,6), +(652,'I',0.335,0.245,0.09,0.1665,0.0595,0.04,0.06,6), +(3434,'I',0.365,0.27,0.105,0.2155,0.0915,0.0475,0.063,6), +(2004,'I',0.375,0.28,0.08,0.226,0.105,0.047,0.065,6), +(2000,'I',0.35,0.25,0.07,0.1605,0.0715,0.0335,0.046,6), +(3946,'I',0.235,0.175,0.065,0.0615,0.0205,0.02,0.019,6), +(177,'I',0.315,0.21,0.06,0.125,0.06,0.0375,0.035,5), +(920,'I',0.41,0.31,0.09,0.3335,0.1635,0.061,0.091,6), +(3437,'I',0.38,0.275,0.095,0.2505,0.0945,0.0655,0.075,6), +(2630,'I',0.33,0.24,0.075,0.163,0.0745,0.033,0.048,6), +(1092,'I',0.45,0.33,0.11,0.3685,0.16,0.0885,0.102,6), +(3476,'I',0.4,0.315,0.085,0.2675,0.116,0.0585,0.0765,6), +(3526,'I',0.33,0.23,0.085,0.1695,0.079,0.026,0.0505,6), +(1534,'I',0.295,0.215,0.07,0.121,0.047,0.0155,0.0405,6), +(921,'I',0.415,0.33,0.09,0.3595,0.17,0.081,0.09,6), +(2206,'I',0.275,0.22,0.08,0.1365,0.0565,0.0285,0.042,6), +(1218,'I',0.315,0.23,0.08,0.1375,0.0545,0.031,0.0445,5), +(1998,'I',0.335,0.25,0.08,0.1695,0.0695,0.044,0.0495,6), +(2455,'I',0.275,0.2,0.065,0.092,0.0385,0.0235,0.027,5), +(2548,'I',0.23,0.18,0.05,0.064,0.0215,0.0135,0.02,5), +(3996,'I',0.245,0.175,0.055,0.0785,0.04,0.018,0.02,5), +(3408,'I',0.35,0.265,0.08,0.192,0.081,0.0465,0.053,6), +(3907,'M',0.245,0.18,0.065,0.0635,0.0245,0.0135,0.02,4), +(3850,'M',0.385,0.3,0.115,0.3435,0.1645,0.085,0.1025,6), 
+(124,'M',0.37,0.265,0.075,0.214,0.09,0.051,0.07,6), +(2583,'M',0.53,0.41,0.14,0.681,0.3095,0.1415,0.1835,6), +(526,'M',0.175,0.125,0.04,0.024,0.0095,0.006,0.005,4), +(2184,'M',0.495,0.4,0.155,0.8085,0.2345,0.1155,0.35,6), +(2132,'M',0.32,0.24,0.08,0.18,0.08,0.0385,0.055,6), +(651,'M',0.255,0.18,0.065,0.079,0.034,0.014,0.025,5), +(612,'M',0.195,0.145,0.05,0.032,0.01,0.008,0.012,4), +(958,'M',0.5,0.39,0.135,0.6595,0.3145,0.1535,0.1565,6), +(3174,'M',0.35,0.265,0.09,0.2265,0.0995,0.0575,0.065,6), +(265,'M',0.27,0.2,0.08,0.1205,0.0465,0.028,0.04,6), +(519,'M',0.325,0.23,0.09,0.147,0.06,0.034,0.045,4), +(2382,'M',0.155,0.115,0.025,0.024,0.009,0.005,0.0075,5), +(698,'M',0.28,0.205,0.1,0.1165,0.0545,0.0285,0.03,5), +(2381,'M',0.175,0.135,0.04,0.0305,0.011,0.0075,0.01,5), +(516,'M',0.27,0.195,0.08,0.1,0.0385,0.0195,0.03,6), +(831,'M',0.415,0.305,0.1,0.325,0.156,0.0505,0.091,6), +(3359,'M',0.285,0.215,0.075,0.106,0.0415,0.023,0.035,5); + +DROP TABLE IF EXISTS abalone_train_f; +CREATE TABLE abalone_train_f AS ( + SELECT + * + FROM + abalone_train_small + WHERE sex = 'F' +); + +DROP TABLE IF EXISTS abalone_train_i; +CREATE TABLE abalone_train_i AS ( + SELECT + * + FROM + abalone_train_small + WHERE sex = 'I' +); + +DROP TABLE IF EXISTS abalone_train_m; +CREATE TABLE abalone_train_m AS ( + SELECT + * + FROM + abalone_train_small + WHERE sex = 'M' +); + +-- create epsilon input table + +DROP TABLE IF EXISTS abalone_eps; +CREATE TABLE abalone_eps AS ( +SELECT + sex, + NULL::double precision AS epsilon +FROM + abalone_train_small +GROUP BY sex); + +UPDATE abalone_eps SET epsilon = 0.2 +WHERE sex = 'I'; + +UPDATE abalone_eps SET epsilon = 0.05 +WHERE sex = 'M'; + +DELETE FROM abalone_eps +WHERE sex = 'F'; + +-- solve different groups individually with different epsilon + +DROP TABLE IF EXISTS svr_mdl_i, svr_mdl_i_summary; +SELECT madlib.svm_regression( + 'abalone_train_i', + 'svr_mdl_i', + 'rings', + 'ARRAY[1,diameter,shell,shucked,length]', + NULL,NULL,'sex', + 'max_iter=50, 
init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.2', + false); +DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary; +SELECT madlib.svm_regression( + 'abalone_train_m', + 'svr_mdl_m', + 'rings', + 'ARRAY[1,diameter,shell,shucked,length]', + NULL,NULL,'sex', + 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05', + false); +DROP TABLE IF EXISTS svr_mdl_f, svr_mdl_f_summary; +SELECT madlib.svm_regression( + 'abalone_train_f', + 'svr_mdl_f', + 'rings', + 'ARRAY[1,diameter,shell,shucked,length]', + NULL,NULL,'sex', + 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10', + false); + +-- solve it with grouping and table of epsilon as inputs + +DROP TABLE IF EXISTS svr_mdl, svr_mdl_summary; +SELECT madlib.svm_regression( + 'abalone_train_small', + 'svr_mdl', + 'rings', + 'ARRAY[1,diameter,shell,shucked,length]', + NULL,NULL,'sex', + 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10, eps_table=abalone_eps', + false); + +-- verify that the results are the same + +SELECT assert( + abs_err < 1e-5, + 'SVR with epsilon table input: Wrong results!') +FROM ( + SELECT + abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err + FROM svr_mdl_f AS t1 JOIN svr_mdl AS t2 USING (sex) +) AS q1; + +SELECT assert( + abs_err < 1e-5, + 'SVR with epsilon table input: Wrong results!') +FROM ( + SELECT + abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err + FROM svr_mdl_i AS t1 JOIN svr_mdl AS t2 USING (sex) +) AS q1; + +SELECT assert( + abs_err < 1e-5, + 'SVR with epsilon table input: Wrong results!') +FROM ( + SELECT + abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err + FROM svr_mdl_m AS t1 JOIN svr_mdl AS t2 USING (sex) +) AS q1; + + +DROP TABLE IF EXISTS svr_mdl_i, svr_mdl_i_summary; +SELECT madlib.svm_regression( + 'abalone_train_i', + 'svr_mdl_i', + 'rings', + 'ARRAY[1,diameter,shell,shucked,length]', + NULL,NULL,NULL, + 'max_iter=50, init_stepsize=1, decay_factor=0.9, 
tolerance=1e-16, epsilon = 0.2', + false); + +SELECT assert( + relative_error(t1.loss, t2.loss) < 1e-3, + 'SVR with epsilon table input: Wrong results!') +FROM svr_mdl_i AS t1, svr_mdl AS t2 where sex = 'I' + From 12c16842a28702ac2c4357d384c53ba608a44f72 Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Wed, 4 Nov 2015 12:44:05 -0800 Subject: [PATCH 09/14] minor fixes --- .../modules/svm/test/linear_svm.sql_in | 52 ++++++++----------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index 4d8cd0781..169fe6594 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -422,8 +422,8 @@ DROP TABLE IF EXISTS svm_test_predict CASCADE; SELECT svm_predict('svm_model_expression', 'svm_test_normalized', 'id', 'svm_test_predict'); SELECT * FROM svm_test_predict; -DROP TABLE IF EXISTS abalone_train_small; -CREATE TABLE abalone_train_small ( +DROP TABLE IF EXISTS abalone_train_small_tmp; +CREATE TABLE abalone_train_small_tmp ( id SERIAL NOT NULL, sex TEXT, length DOUBLE PRECISION, @@ -435,7 +435,7 @@ CREATE TABLE abalone_train_small ( shell DOUBLE PRECISION, rings INTEGER); -INSERT INTO abalone_train_small(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings) VALUES +INSERT INTO abalone_train_small_tmp(id,sex,length,diameter,height,whole,shucked,viscera,shell,rings) VALUES (1040,'F',0.66,0.475,0.18,1.3695,0.641,0.294,0.335,6), (3160,'F',0.34,0.255,0.085,0.204,0.097,0.021,0.05,6), (3984,'F',0.585,0.45,0.125,0.874,0.3545,0.2075,0.225,6), @@ -536,30 +536,30 @@ INSERT INTO abalone_train_small(id,sex,length,diameter,height,whole,shucked,visc (831,'M',0.415,0.305,0.1,0.325,0.156,0.0505,0.091,6), (3359,'M',0.285,0.215,0.075,0.106,0.0415,0.023,0.035,5); +DROP TABLE IF EXISTS abalone_train_small; +CREATE TABLE abalone_train_small AS ( + SELECT * FROM abalone_train_small_tmp + ORDER 
BY sex, id +); + +DROP TABLE IF EXISTS abalone_train_small_tmp; +SELECT * FROM abalone_train_small; + DROP TABLE IF EXISTS abalone_train_f; CREATE TABLE abalone_train_f AS ( - SELECT - * - FROM - abalone_train_small + SELECT * FROM abalone_train_small WHERE sex = 'F' ); DROP TABLE IF EXISTS abalone_train_i; CREATE TABLE abalone_train_i AS ( - SELECT - * - FROM - abalone_train_small + SELECT * FROM abalone_train_small WHERE sex = 'I' ); DROP TABLE IF EXISTS abalone_train_m; CREATE TABLE abalone_train_m AS ( - SELECT - * - FROM - abalone_train_small + SELECT * FROM abalone_train_small WHERE sex = 'M' ); @@ -594,6 +594,8 @@ SELECT madlib.svm_regression( NULL,NULL,'sex', 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.2', false); +SELECT * FROM svr_mdl_i; + DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary; SELECT madlib.svm_regression( 'abalone_train_m', @@ -603,6 +605,8 @@ SELECT madlib.svm_regression( NULL,NULL,'sex', 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05', false); +SELECT * FROM svr_mdl_m; + DROP TABLE IF EXISTS svr_mdl_f, svr_mdl_f_summary; SELECT madlib.svm_regression( 'abalone_train_f', @@ -612,6 +616,7 @@ SELECT madlib.svm_regression( NULL,NULL,'sex', 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10', false); +SELECT * FROM svr_mdl_f; -- solve it with grouping and table of epsilon as inputs @@ -624,6 +629,7 @@ SELECT madlib.svm_regression( NULL,NULL,'sex', 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10, eps_table=abalone_eps', false); +SELECT * FROM svr_mdl; -- verify that the results are the same @@ -654,19 +660,3 @@ FROM ( FROM svr_mdl_m AS t1 JOIN svr_mdl AS t2 USING (sex) ) AS q1; - -DROP TABLE IF EXISTS svr_mdl_i, svr_mdl_i_summary; -SELECT madlib.svm_regression( - 'abalone_train_i', - 'svr_mdl_i', - 'rings', - 'ARRAY[1,diameter,shell,shucked,length]', - NULL,NULL,NULL, - 'max_iter=50, init_stepsize=1, decay_factor=0.9, 
tolerance=1e-16, epsilon = 0.2', - false); - -SELECT assert( - relative_error(t1.loss, t2.loss) < 1e-3, - 'SVR with epsilon table input: Wrong results!') -FROM svr_mdl_i AS t1, svr_mdl AS t2 where sex = 'I' - From 70afde68bfe239fe744511f9905003eb58ab4695 Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Wed, 4 Nov 2015 14:06:41 -0800 Subject: [PATCH 10/14] minor fixes --- .../modules/svm/test/linear_svm.sql_in | 58 ++++++------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index 169fe6594..bbe7a09ec 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -539,28 +539,6 @@ INSERT INTO abalone_train_small_tmp(id,sex,length,diameter,height,whole,shucked, DROP TABLE IF EXISTS abalone_train_small; CREATE TABLE abalone_train_small AS ( SELECT * FROM abalone_train_small_tmp - ORDER BY sex, id -); - -DROP TABLE IF EXISTS abalone_train_small_tmp; -SELECT * FROM abalone_train_small; - -DROP TABLE IF EXISTS abalone_train_f; -CREATE TABLE abalone_train_f AS ( - SELECT * FROM abalone_train_small - WHERE sex = 'F' -); - -DROP TABLE IF EXISTS abalone_train_i; -CREATE TABLE abalone_train_i AS ( - SELECT * FROM abalone_train_small - WHERE sex = 'I' -); - -DROP TABLE IF EXISTS abalone_train_m; -CREATE TABLE abalone_train_m AS ( - SELECT * FROM abalone_train_small - WHERE sex = 'M' ); -- create epsilon input table @@ -583,11 +561,23 @@ WHERE sex = 'M'; DELETE FROM abalone_eps WHERE sex = 'F'; --- solve different groups individually with different epsilon +-- solve it with grouping and table of epsilon as inputs + +DROP TABLE IF EXISTS svr_mdl, svr_mdl_summary; +SELECT madlib.svm_regression( + 'abalone_train_small', + 'svr_mdl', + 'rings', + 'ARRAY[1,diameter,shell,shucked,length]', + NULL,NULL,'sex', + 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 
10, eps_table=abalone_eps', + false); +SELECT * FROM svr_mdl; + DROP TABLE IF EXISTS svr_mdl_i, svr_mdl_i_summary; SELECT madlib.svm_regression( - 'abalone_train_i', + 'abalone_train_small', 'svr_mdl_i', 'rings', 'ARRAY[1,diameter,shell,shucked,length]', @@ -598,7 +588,7 @@ SELECT * FROM svr_mdl_i; DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary; SELECT madlib.svm_regression( - 'abalone_train_m', + 'abalone_train_small', 'svr_mdl_m', 'rings', 'ARRAY[1,diameter,shell,shucked,length]', @@ -609,7 +599,7 @@ SELECT * FROM svr_mdl_m; DROP TABLE IF EXISTS svr_mdl_f, svr_mdl_f_summary; SELECT madlib.svm_regression( - 'abalone_train_f', + 'abalone_train_small', 'svr_mdl_f', 'rings', 'ARRAY[1,diameter,shell,shucked,length]', @@ -618,19 +608,6 @@ SELECT madlib.svm_regression( false); SELECT * FROM svr_mdl_f; --- solve it with grouping and table of epsilon as inputs - -DROP TABLE IF EXISTS svr_mdl, svr_mdl_summary; -SELECT madlib.svm_regression( - 'abalone_train_small', - 'svr_mdl', - 'rings', - 'ARRAY[1,diameter,shell,shucked,length]', - NULL,NULL,'sex', - 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10, eps_table=abalone_eps', - false); -SELECT * FROM svr_mdl; - -- verify that the results are the same SELECT assert( @@ -640,6 +617,7 @@ FROM ( SELECT abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err FROM svr_mdl_f AS t1 JOIN svr_mdl AS t2 USING (sex) + where sex = 'F' ) AS q1; SELECT assert( @@ -649,6 +627,7 @@ FROM ( SELECT abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err FROM svr_mdl_i AS t1 JOIN svr_mdl AS t2 USING (sex) + where sex = 'I' ) AS q1; SELECT assert( @@ -658,5 +637,6 @@ FROM ( SELECT abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err FROM svr_mdl_m AS t1 JOIN svr_mdl AS t2 USING (sex) + where sex = 'M' ) AS q1; From 8a1d5ca1be20be4f12b16d01ad218aa832cabe86 Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Wed, 4 Nov 2015 15:12:24 -0800 Subject: [PATCH 11/14] minor fixes --- 
.../modules/svm/test/linear_svm.sql_in | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index bbe7a09ec..e4a2be2db 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -544,22 +544,13 @@ CREATE TABLE abalone_train_small AS ( -- create epsilon input table DROP TABLE IF EXISTS abalone_eps; -CREATE TABLE abalone_eps AS ( -SELECT - sex, - NULL::double precision AS epsilon -FROM - abalone_train_small -GROUP BY sex); - -UPDATE abalone_eps SET epsilon = 0.2 -WHERE sex = 'I'; - -UPDATE abalone_eps SET epsilon = 0.05 -WHERE sex = 'M'; +CREATE TABLE abalone_eps ( + sex TEXT, + epsilon DOUBLE PRECISION); -DELETE FROM abalone_eps -WHERE sex = 'F'; +INSERT INTO abalone_eps(sex, epsilon) VALUES +('I', 0.2), +('M', 0.05); -- solve it with grouping and table of epsilon as inputs From 90683be0a89bb3d06b008279ce2d1254fae34e9a Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Wed, 4 Nov 2015 16:16:29 -0800 Subject: [PATCH 12/14] minor fixes --- .../modules/svm/test/linear_svm.sql_in | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index e4a2be2db..cd8d7a2e6 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -561,7 +561,7 @@ SELECT madlib.svm_regression( 'rings', 'ARRAY[1,diameter,shell,shucked,length]', NULL,NULL,'sex', - 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10, eps_table=abalone_eps', + 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 50, eps_table=abalone_eps', false); SELECT * FROM svr_mdl; @@ -575,7 +575,7 @@ SELECT madlib.svm_regression( NULL,NULL,'sex', 'max_iter=50, 
init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.2', false); -SELECT * FROM svr_mdl_i; +SELECT * FROM svr_mdl_i where sex = 'I'; DROP TABLE IF EXISTS svr_mdl_m, svr_mdl_m_summary; SELECT madlib.svm_regression( @@ -586,7 +586,7 @@ SELECT madlib.svm_regression( NULL,NULL,'sex', 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 0.05', false); -SELECT * FROM svr_mdl_m; +SELECT * FROM svr_mdl_m where sex = 'M'; DROP TABLE IF EXISTS svr_mdl_f, svr_mdl_f_summary; SELECT madlib.svm_regression( @@ -595,9 +595,9 @@ SELECT madlib.svm_regression( 'rings', 'ARRAY[1,diameter,shell,shucked,length]', NULL,NULL,'sex', - 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 10', + 'max_iter=50, init_stepsize=1, decay_factor=0.9, tolerance=1e-16, epsilon = 50', false); -SELECT * FROM svr_mdl_f; +SELECT * FROM svr_mdl_f where sex = 'F'; -- verify that the results are the same @@ -612,21 +612,21 @@ FROM ( ) AS q1; SELECT assert( - abs_err < 1e-5, + rel_err < 1e-1, 'SVR with epsilon table input: Wrong results!') FROM ( SELECT - abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err + relative_error(t1.norm_of_gradient, t2.norm_of_gradient) AS rel_err FROM svr_mdl_i AS t1 JOIN svr_mdl AS t2 USING (sex) where sex = 'I' ) AS q1; SELECT assert( - abs_err < 1e-5, + rel_err < 1e-1, 'SVR with epsilon table input: Wrong results!') FROM ( SELECT - abs(t1.norm_of_gradient - t2.norm_of_gradient) AS abs_err + relative_error(t1.norm_of_gradient, t2.norm_of_gradient) AS rel_err FROM svr_mdl_m AS t1 JOIN svr_mdl AS t2 USING (sex) where sex = 'M' ) AS q1; From c9f1582cfb1f912383226009072c106aad57afdb Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Wed, 4 Nov 2015 16:35:22 -0800 Subject: [PATCH 13/14] minor fixes --- src/ports/postgres/modules/svm/test/linear_svm.sql_in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in 
b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index cd8d7a2e6..0253c77b0 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -424,8 +424,8 @@ SELECT * FROM svm_test_predict; DROP TABLE IF EXISTS abalone_train_small_tmp; CREATE TABLE abalone_train_small_tmp ( - id SERIAL NOT NULL, sex TEXT, + id SERIAL NOT NULL, length DOUBLE PRECISION, diameter DOUBLE PRECISION, height DOUBLE PRECISION, From b105d1c42168bc669bf4ebeae1861361b4d16eac Mon Sep 17 00:00:00 2001 From: Xiaocheng Tang Date: Thu, 29 Oct 2015 14:28:52 -0700 Subject: [PATCH 14/14] SVM: Add cross validation support and generic CrossValidator class JIRA: MADLIB-915 Authors: Xiaocheng Tang Rahul Iyer Changes: - Add cross validation support on lambda, epsilon, init_stepsize, max_iter, and decay_factor - Add support for optionally writing validation results to a sql table - Add support for lazy-generation of cv datasets - Add internal generic CrossValidator class which is used for implementing this issue - Refactoring SVM for better modularity - Ignore cv on epsilon when it is classification - Cross validation now works when independent variables are queries - Fixed "zero length field name in format" error in python < 2.7 - Fix error messages so that it pass the svm input tests --- src/ports/postgres/modules/svm/svm.py_in | 649 +++++++++++------- .../modules/svm/test/linear_svm.sql_in | 79 ++- .../utilities/in_mem_group_control.py_in | 8 +- .../modules/utilities/utilities.py_in | 11 + .../modules/validation/cross_validation.py_in | 18 +- .../modules/validation/cv_utils.py_in | 1 + .../validation/internal/__init__.py_in | 0 .../internal/cross_validation.py_in | 420 ++++++++++++ 8 files changed, 923 insertions(+), 263 deletions(-) create mode 100644 src/ports/postgres/modules/validation/internal/__init__.py_in create mode 100644 src/ports/postgres/modules/validation/internal/cross_validation.py_in diff --git 
a/src/ports/postgres/modules/svm/svm.py_in b/src/ports/postgres/modules/svm/svm.py_in index 9b6e997e5..e073ebe72 100644 --- a/src/ports/postgres/modules/svm/svm.py_in +++ b/src/ports/postgres/modules/svm/svm.py_in @@ -1,3 +1,5 @@ +from __future__ import division + import plpy from utilities.control import MinWarning @@ -9,6 +11,7 @@ from utilities.utilities import add_postfix from utilities.utilities import _string_to_array_with_quotes from utilities.utilities import _string_to_array from utilities.utilities import _assert +from utilities.utilities import num_features from utilities.validate_args import cols_in_tbl_valid from utilities.validate_args import input_tbl_valid @@ -16,6 +19,14 @@ from utilities.validate_args import output_tbl_valid from utilities.validate_args import is_var_valid from utilities.validate_args import get_expr_type +from validation.internal.cross_validation import CrossValidator + +from collections import namedtuple +from functools import partial +from operator import itemgetter +from operator import attrgetter +from itertools import product, repeat, imap + def _compute_svm(args): """ @@ -23,6 +34,7 @@ def _compute_svm(args): @return Number of iterations that has been run """ + args['stepsize'] = args['init_stepsize'] iterationCtrl = GroupIterationController(args) with iterationCtrl as it: it.iteration = 0 @@ -58,216 +70,299 @@ def _compute_svm(args): # --------------------------------------------------- +def _verify_table(source_table, model_table, dependent_varname, + independent_varname, **kwargs): + # validate input + input_tbl_valid(source_table, 'SVM') + _assert(is_var_valid(source_table, dependent_varname), + "SVM error: invalid dependent_varname " + "('{dependent_varname}') for source_table " + "({source_table})!".format(dependent_varname=dependent_varname, + source_table=source_table)) + _assert(is_var_valid(source_table, independent_varname), + "SVM error: invalid independent_varname " + "('{independent_varname}') for source_table 
" + "({source_table})!".format(independent_varname=independent_varname, + source_table=source_table)) + + dep_type = get_expr_type(dependent_varname, source_table) + if '[]' in dep_type: + plpy.error("SVM error: dependent_varname cannot be of array type!") + + # validate output tables + output_tbl_valid(model_table, 'SVM') + summary_table = add_postfix(model_table, "_summary") + output_tbl_valid(summary_table, 'SVM') + + +def _verify_grouping(schema_madlib, source_table, grouping_col): + if grouping_col: + grouping_list = [i + "::text" + for i in explicit_bool_to_text( + source_table, + _string_to_array_with_quotes(grouping_col), + schema_madlib)] + grouping_str = ','.join(grouping_list) + else: + grouping_str = "Null" + + if grouping_col: + cols_in_tbl_valid(source_table, _string_to_array_with_quotes(grouping_col), 'SVM') + intersect = frozenset(_string_to_array(grouping_col)).intersection( + frozenset( + ('coef', '__random_feature_data', + '__random_feature_data', 'loss', + 'num_rows_processed', 'num_rows_skipped', + 'norm_of_gradient', 'num_iterations'))) + if len(intersect) > 0: + plpy.error("SVM error: Conflicting grouping column name.\n" + "Some predefined keyword(s) ({0}) are not allowed!".format( + ', '.join(intersect))) + return grouping_str + + +def _verify_kernel(kernel_func): + kernel_func = 'linear' if not kernel_func else kernel_func.lower() + # Add non-linear kernels below after implementing them. + supported_kernels = ['linear'] + try: + # allow user to specify a prefix substring of + # supported kernel function names. This works because the supported + # kernel functions have unique prefixes. + kernel_func = next(x for x in supported_kernels if x.startswith(kernel_func)) + except StopIteration: + # next() returns a StopIteration if no element found + plpy.error("SVM Error: Invalid kernel function: {0}. 
Supported kernel functions are ({1})" + .format(kernel_func, ','.join(sorted(supported_kernels)))) + return kernel_func + + +def _verify_params_dict(params_dict): + if hasattr(params_dict['lambda'], '__len__'): + plpy.error("SVM Error: lambda should not be a list " + "after cross validation!") + if hasattr(params_dict['epsilon'], '__len__'): + plpy.error("SVM Error: epsilon should not be a list " + "after cross validation!") + return params_dict + + +def _summary(n_iters_run, model_table, args): + grouping_col = args['grouping_col'] + dependent_varname = args['col_dep_var'] + independent_varname = args['col_ind_var'] + source_table = args['rel_source'] + col_grp_key = args['col_grp_key'] + groupby_str, grouping_str1, using_str = "", "", "ON TRUE" + if grouping_col: + groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format(grouping_col=grouping_col, col_grp_key=col_grp_key) + grouping_str1 = grouping_col + "," + using_str = "USING ({col_grp_key})".format(col_grp_key=col_grp_key) + # organizing results + dep_type = get_expr_type(dependent_varname, source_table) + model_table_query = """ + CREATE TABLE {model_table} AS + SELECT + {grouping_str1} + (result).coefficients AS coef, + (result).loss AS loss, + (result).norm_of_gradient AS norm_of_gradient, + {n_iters_run} AS num_iterations, + (result).num_rows_processed AS num_rows_processed, + n_tuples_including_nulls - (result).num_rows_processed + AS num_rows_skipped, + NULL AS __random_feature_data, + ARRAY[{mapping}]::{dep_type}[] AS dep_var_mapping + FROM + ( + SELECT + {schema_madlib}.internal_linear_svm_igd_result( + {col_grp_state} + ) AS result, + {col_grp_key} + FROM {rel_state} + WHERE {col_grp_iteration} = {n_iters_run} + ) rel_state_subq + JOIN + ( + SELECT + {grouping_str1} + count(*) AS n_tuples_including_nulls, + array_to_string(ARRAY[{grouping_str}], + ',' + ) AS {col_grp_key} + FROM {source_table} + {groupby_str} + ) n_tuples_including_nulls_subq + {using_str} + """.format(n_iters_run=n_iters_run, + 
groupby_str=groupby_str, + grouping_str1=grouping_str1, + using_str=using_str, + source_table=source_table, + model_table=model_table, + dep_type=dep_type, **args) + plpy.execute(model_table_query) + + if type(args['lambda']) is list: + args['lambda_str'] = '{' + ','.join(str(e) for e in args['lambda']) + '}' + else: + args['lambda_str'] = str(args['lambda']) + summary_table = add_postfix(model_table, "_summary") + grouping_text = "NULL" if not grouping_col else grouping_col + plpy.execute(""" + CREATE TABLE {summary_table} AS + SELECT + '{method}'::text AS method, + '__MADLIB_VERSION__'::text AS version_number, + '{source_table}'::text AS source_table, + '{model_table}'::text AS model_table, + '{dependent_varname}'::text AS dependent_varname, + '{independent_varname}'::text AS independent_varname, + 'linear'::text AS kernel_func, + NULL::text AS kernel_params, + '{grouping_text}'::text AS grouping_col, + 'init_stepsize={init_stepsize}, ' || + 'decay_factor={decay_factor}, ' || + 'max_iter={max_iter}, ' || + 'tolerance={tolerance}'::text AS optim_params, + 'lambda={lambda_str}, ' || + 'norm={norm}, ' || + 'n_folds={n_folds}'::text AS reg_params, + count(*)::integer AS num_all_groups, + 0::integer AS num_failed_groups, + sum(num_rows_processed)::bigint AS total_rows_processed, + sum(num_rows_skipped)::bigint AS total_rows_skipped, + '{epsilon}'::double precision AS epsilon, + '{eps_table}'::text AS eps_table + FROM {model_table}; + """.format(grouping_text=grouping_text, + summary_table=summary_table, + source_table=source_table, + model_table=model_table, + dependent_varname=dependent_varname, + independent_varname=independent_varname, + **args)) + + def svm(schema_madlib, source_table, model_table, dependent_varname, independent_varname, kernel_func, - kernel_params, grouping_col, params, is_svc, + kernel_params, grouping_col, params, is_svc, verbose, **kwargs): """ Executes the linear support vector classification algorithm. 
""" - # verbosing + # verbosing verbosity_level = "info" if verbose else "error" with MinWarning(verbosity_level): - # validate input - input_tbl_valid(source_table, 'SVM') - _assert(is_var_valid(source_table, dependent_varname), - "SVM error: invalid dependent_varname ('" + str(dependent_varname) + - "') for source_table (" + source_table + ")!") - _assert(is_var_valid(source_table, independent_varname), - "SVM error: invalid independent_varname ('" + str(independent_varname) + - "') for source_table (" + source_table + ")!") - - dep_type = get_expr_type(dependent_varname, source_table) - if '[]' in dep_type: - plpy.error("SVM error: dependent_varname cannot be of array type!") - - # validate output tables - output_tbl_valid(model_table, 'SVM') - summary_table = add_postfix(model_table, "_summary") - output_tbl_valid(summary_table, 'SVM') - - # arguments for iterating - n_features = plpy.execute("SELECT array_upper({0}, 1) AS dim " - "FROM {1} LIMIT 1". - format(independent_varname, source_table) - )[0]['dim'] - if grouping_col: - grouping_list = [i + "::text" - for i in explicit_bool_to_text( - source_table, - _string_to_array_with_quotes(grouping_col), - schema_madlib)] - grouping_str = ','.join(grouping_list) - else: - grouping_str = "Null" - grouping_str1 = "" if not grouping_col else grouping_col + "," - grouping_str2 = "1 = 1" if not grouping_col else grouping_col - - args = { - 'rel_args': unique_string(desp='rel_args'), - 'rel_state': unique_string(desp='rel_state'), - 'col_grp_iteration': unique_string(desp='col_grp_iteration'), - 'col_grp_state': unique_string(desp='col_grp_state'), - 'col_grp_key': unique_string(desp='col_grp_key'), - 'col_n_tuples': unique_string(desp='col_n_tuples'), - 'state_type': "double precision[]", - 'rel_source': source_table, - 'col_ind_var': independent_varname, - 'col_dep_var': dependent_varname} - args.update(locals()) - # variables defined above cannot be moved below this line - # 
------------------------------------------------------- - - # other params - kernel_func = 'linear' if not kernel_func else kernel_func.lower() - # Add non-linear kernels below after implementing them. - supported_kernels = ['linear'] - try: - # allow user to specify a prefix substring of - # supported kernel function names. This works because the supported - # kernel functions have unique prefixes. - kernel_func = next(x for x in supported_kernels if x.startswith(kernel_func)) - except StopIteration: - # next() returns a StopIteration if no element found - plpy.error("SVM Error: Invalid kernel function: {0}. Supported kernel functions are ({1})" - .format(kernel_func, ','.join(sorted(supported_kernels)))) - - if grouping_col: - cols_in_tbl_valid(source_table, _string_to_array_with_quotes(grouping_col), 'SVM') - intersect = frozenset(_string_to_array(grouping_col)).intersection( - frozenset( - ('coef', '__random_feature_data', - '__random_feature_data', 'loss' - 'num_rows_processed', 'num_rows_skipped', - 'norm_of_gradient', 'num_iterations'))) - if len(intersect) > 0: - plpy.error("SVM error: Conflicting grouping column name.\n" - "Some predefined keyword(s) ({0}) are not allowed!".format( - ', '.join(intersect))) - - args.update(_extract_params(schema_madlib, params)) - args.update(_process_epsilon(is_svc, args)) - - if not is_svc: - # transform col_dep_var to binary (1`or -1) if classification - args.update({ - 'col_dep_var_trans': dependent_varname, - 'mapping': 'NULL', - 'method': 'SVR'}) - else: - # dependent variable mapping - dep_labels=plpy.execute(""" - SELECT {dependent_varname} AS y - FROM {source_table} - WHERE ({dependent_varname}) IS NOT NULL - GROUP BY ({dependent_varname}) - ORDER BY ({dependent_varname})""".format(**locals())) - dep_var_mapping = ["'" + d['y'] + "'" if isinstance(d['y'], basestring) else str(d['y']) for d in dep_labels] - if len(dep_var_mapping) != 2: - plpy.error("SVM error: Classification currently only supports binary output") 
- - col_dep_var_trans = ( - """ - CASE WHEN ({col_dep_var}) IS NULL THEN NULL - WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 - ELSE 1.0 - END - """ - .format(col_dep_var=dependent_varname, - mapped_value_for_negative=dep_var_mapping[0]) - ) - - args.update({ - 'mapped_value_for_negative': dep_var_mapping[0], - 'col_dep_var_trans': col_dep_var_trans, - 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1], - 'method': 'SVC'}) - - args['stepsize'] = args['init_stepsize'] - args['is_l2'] = True if args['norm'] == 'l2' else False - - # place holder for compatibility - plpy.execute("CREATE TABLE pg_temp.{0} AS SELECT 1".format(args['rel_args'])) - # actual iterative algorithm computation - n_iters_run = _compute_svm(args) - - # organizing results - groupby_str = "GROUP BY {grouping_col}, {col_grp_key}".format(**args) if grouping_col else "" - using_str = "USING ({col_grp_key})".format(**args) if grouping_col else "ON TRUE" - model_table_query = """ - CREATE TABLE {model_table} AS - SELECT - {grouping_str1} - (result).coefficients AS coef, - (result).loss AS loss, - (result).norm_of_gradient AS norm_of_gradient, - {n_iters_run} AS num_iterations, - (result).num_rows_processed AS num_rows_processed, - n_tuples_including_nulls - (result).num_rows_processed - AS num_rows_skipped, - NULL AS __random_feature_data, - ARRAY[{mapping}]::{dep_type}[] AS dep_var_mapping - FROM - ( - SELECT - {schema_madlib}.internal_linear_svm_igd_result( - {col_grp_state} - ) AS result, - {col_grp_key} - FROM {rel_state} - WHERE {col_grp_iteration} = {n_iters_run} - ) rel_state_subq - JOIN - ( - SELECT - {grouping_str1} - count(*) AS n_tuples_including_nulls, - array_to_string(ARRAY[{grouping_str}], - ',' - ) AS {col_grp_key} - FROM {source_table} - {groupby_str} - ) n_tuples_including_nulls_subq - {using_str} - """.format(n_iters_run=n_iters_run, - groupby_str=groupby_str, - using_str=using_str, **args) - plpy.execute(model_table_query) - - if type(args['lambda']) is list: - 
args['lambda_str'] = '{' + ','.join(str(e) for e in args['lambda']) + '}' - else: - args['lambda_str'] = str(args['lambda']) - - plpy.execute(""" - CREATE TABLE {summary_table} AS - SELECT - '{method}'::text AS method, - '__MADLIB_VERSION__'::text AS version_number, - '{source_table}'::text AS source_table, - '{model_table}'::text AS model_table, - '{dependent_varname}'::text AS dependent_varname, - '{independent_varname}'::text AS independent_varname, - 'linear'::text AS kernel_func, - NULL::text AS kernel_params, - '{grouping_text}'::text AS grouping_col, - 'init_stepsize={init_stepsize}, ' || - 'decay_factor={decay_factor}, ' || - 'max_iter={max_iter}, ' || - 'tolerance={tolerance}'::text AS optim_params, - 'lambda={lambda_str}, ' || - 'norm={norm}, ' || - 'n_folds={n_folds}'::text AS reg_params, - count(*)::integer AS num_all_groups, - 0::integer AS num_failed_groups, - sum(num_rows_processed)::bigint AS total_rows_processed, - sum(num_rows_skipped)::bigint AS total_rows_skipped, - '{epsilon}'::double precision AS epsilon, - '{eps_table}'::text AS eps_table - FROM {model_table}; - """.format(grouping_text="NULL" if not grouping_col else grouping_col, - **args)) -# ------------------------------------------------------------------------------ + _verify_table(source_table, + model_table, + dependent_varname, + independent_varname) + args = locals() + args['params_dict'] = _extract_params(schema_madlib, params) + _cross_validate_svm(args) + _svm_parsed_params(**args) + + +def _cross_validate_svm(args): + # updating params_dict will also update + # also update args['params_dict'] + params_dict = args['params_dict'] + + if params_dict['n_folds'] > 1 and args['grouping_col']: + plpy.error('SVM error: cross validation ' + 'with grouping is not supported!') + + # currently only support cross validation + # on lambda and epsilon + cv_params = {} + if len(params_dict['lambda']) > 1: + cv_params['lambda'] = params_dict['lambda'] + else: + params_dict['lambda'] = 
params_dict['lambda'][0] + if len(params_dict['epsilon']) > 1 and not args['is_svc']: + cv_params['epsilon'] = params_dict['epsilon'] + else: + params_dict['epsilon'] = params_dict['epsilon'][0] + if len(params_dict['init_stepsize']) > 1: + cv_params['init_stepsize'] = params_dict['init_stepsize'] + else: + params_dict['init_stepsize'] = params_dict['init_stepsize'][0] + if len(params_dict['max_iter']) > 1: + cv_params['max_iter'] = params_dict['max_iter'] + else: + params_dict['max_iter'] = params_dict['max_iter'][0] + if len(params_dict['decay_factor']) > 1: + cv_params['decay_factor'] = params_dict['decay_factor'] + else: + params_dict['decay_factor'] = params_dict['decay_factor'][0] + + if not cv_params and params_dict['n_folds'] <= 1: + return + + if cv_params and params_dict['n_folds'] <= 1: + plpy.error("SVM Error: parameters must be a scalar " + "or of length 1 when n_folds is 0 or 1") + return + + if not cv_params and params_dict['n_folds'] > 1: + plpy.warning('SVM Warning: no cross validate params provided! ' + 'Ignore {0}-folds cross validation request.' + .format(params_dict['n_folds'])) + return + + scorer = 'classification' if args['is_svc'] else 'regression' + sub_args = {'params_dict':cv_params} + cv = CrossValidator(_svm_parsed_params,svm_predict,scorer,args) + val_res = cv.validate(sub_args, params_dict['n_folds']).sorted() + val_res.output_tbl(params_dict['validation_result']) + params_dict.update(val_res.first('sub_args')['params_dict']) + + +def _svm_parsed_params(schema_madlib, source_table, model_table, + dependent_varname, independent_varname, kernel_func, + kernel_params, grouping_col, params_dict, is_svc, + verbose, **kwargs): + """ + Executes the linear support vector classification algorithm. 
+ """ + grouping_str = _verify_grouping(schema_madlib, + source_table, + grouping_col) + + kernel_func = _verify_kernel(kernel_func) + + # arguments for iterating + n_features = num_features(source_table, + independent_varname) + + args = { + 'rel_args': unique_string(desp='rel_args'), + 'rel_state': unique_string(desp='rel_state'), + 'col_grp_iteration': unique_string(desp='col_grp_iteration'), + 'col_grp_state': unique_string(desp='col_grp_state'), + 'col_grp_key': unique_string(desp='col_grp_key'), + 'col_n_tuples': unique_string(desp='col_n_tuples'), + 'state_type': "double precision[]", + 'n_features': n_features, + 'verbose': verbose, + 'schema_madlib': schema_madlib, + 'grouping_str': grouping_str, + 'grouping_col': grouping_col, + 'rel_source': source_table, + 'col_ind_var': independent_varname, + 'col_dep_var': dependent_varname} + + args.update(_verify_params_dict(params_dict)) + args.update(_process_epsilon(is_svc, args)) + args.update(_svc_or_svr(is_svc, source_table, dependent_varname)) + + # place holder for compatibility + plpy.execute("CREATE TABLE pg_temp.{0} AS SELECT 1".format(args['rel_args'])) + # actual iterative algorithm computation + n_iters_run = _compute_svm(args) + _summary(n_iters_run, model_table, args) def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, @@ -343,20 +438,20 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, sql = """ CREATE TABLE {output_table} AS SELECT - {id_col_name} AS id, + {id_col_name} AS {id_col_name}, {pred_query} AS prediction, {model_table}.{grouping_col} as grouping_col FROM {model_table} JOIN {new_data_table} ON {model_table}.{grouping_col} = {new_data_table}.{grouping_col} WHERE not {schema_madlib}.array_contains_null({independent_varname}) - ORDER BY grouping_col, id + ORDER BY grouping_col, {id_col_name} """.format(**locals()) else: sql=""" CREATE TABLE {output_table} AS SELECT - {id_col_name} AS id, + {id_col_name} AS {id_col_name}, {pred_query} as 
prediction FROM {model_table}, {new_data_table} WHERE not {schema_madlib}.array_contains_null({independent_varname}) @@ -364,6 +459,48 @@ def svm_predict(schema_madlib, model_table, new_data_table, id_col_name, plpy.execute(sql) +def _svc_or_svr(is_svc, source_table, dependent_varname): + # transform col_dep_var to binary (1`or -1) if classification + _args = {'col_dep_var_trans': dependent_varname, + 'mapping': 'NULL', + 'method': 'SVR'} + + if is_svc: + # dependent variable mapping + dep_labels=plpy.execute(""" + SELECT {dependent_varname} AS y + FROM {source_table} + WHERE ({dependent_varname}) IS NOT NULL + GROUP BY ({dependent_varname}) + ORDER BY ({dependent_varname}) + """.format(source_table=source_table, + dependent_varname=dependent_varname)) + + dep_var_mapping = ["'" + d['y'] + "'" if isinstance(d['y'], basestring) else str(d['y']) for d in dep_labels] + + if len(dep_var_mapping) != 2: + plpy.error("SVM error: Classification currently only supports binary output") + + col_dep_var_trans = ( + """ + CASE WHEN ({col_dep_var}) IS NULL THEN NULL + WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 + ELSE 1.0 + END + """ + .format(col_dep_var=dependent_varname, + mapped_value_for_negative=dep_var_mapping[0]) + ) + + _args.update({ + 'mapped_value_for_negative': dep_var_mapping[0], + 'col_dep_var_trans': col_dep_var_trans, + 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1], + 'method': 'SVC'}) + + return _args + + def _process_epsilon(is_svc, args): eps_table = args['eps_table'] grouping_col = args['grouping_col'] @@ -373,18 +510,22 @@ def _process_epsilon(is_svc, args): rel_epsilon = '' select_epsilon = '' as_rel_source = '_src' - - epsilon = args['epsilon'] - # c code does SVR when epsilon is non-zero - if is_svc: epsilon = 0.0 - # c code does SVC if epsilon is zero - elif args['epsilon'] == 0: epsilon = 0.00001 - if is_svc or not grouping_col or not eps_table: + epsilon = args['epsilon'] + # c code does SVR when epsilon is non-zero + if 
is_svc: + epsilon = 0.0 + select_epsilon = '{epsilon}'.format(epsilon=epsilon) + elif not grouping_col or not eps_table: if eps_table: - plpy.warning('SVM: ignore the input epsilon table!') + plpy.warning('SVM warning: no grouping and ' + ' ignore the input epsilon table!') + # c code does SVC if epsilon is zero + if epsilon == 0: epsilon = 0.00001 select_epsilon = '{epsilon}'.format(epsilon=epsilon) else: + # c code does SVC if epsilon is zero + if epsilon == 0: epsilon = 0.00001 rel_epsilon = unique_string(desp='rel_epsilon') # validate input @@ -401,17 +542,17 @@ def _process_epsilon(is_svc, args): {col_grp_key}, coalesce(epsilon, {epsilon}) AS epsilon FROM ( - SELECT - array_to_string(ARRAY[{grouping_str}], ',') AS + SELECT + array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key} - FROM + FROM {rel_source} GROUP BY {grouping_col} ) q1 - LEFT JOIN + LEFT JOIN ( - SELECT - array_to_string(ARRAY[{grouping_str}], ',') AS + SELECT + array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key}, epsilon FROM @@ -430,10 +571,10 @@ def _process_epsilon(is_svc, args): select_epsilon = ( """ ( - SELECT epsilon + SELECT epsilon FROM {rel_epsilon} - WHERE + WHERE {rel_epsilon}.{col_grp_key} = {as_rel_source}.{col_grp_key} ) """ @@ -451,57 +592,67 @@ def _extract_params(schema_madlib, params, module='SVM'): # NOTICE: the type of values in params_default should be consistent with # the types specified in params_types params_default = { - 'init_stepsize': 0.01, - 'decay_factor': 0.9, - 'max_iter': 100, + 'init_stepsize': [0.01], + 'decay_factor': [0.9], + 'max_iter': [100], 'tolerance': 1e-10, - 'lambda': 1.0, + 'lambda': [0.01], 'norm': 'L2', 'n_folds': 0, - 'epsilon': 0.01, + 'validation_result': '', + 'epsilon': [0.01], 'eps_table': ''} params_types = { - 'init_stepsize': float, - 'decay_factor': float, - 'max_iter': int, + 'init_stepsize': list, + 'decay_factor': list, + 'max_iter': list, 'tolerance': float, 'lambda': list, 'norm': str, 'n_folds': int, - 'epsilon': 
float, + 'validation_result': str, + 'epsilon': list, 'eps_table': str} params_vals = extract_keyvalue_params(params, params_types, params_default) - if params_vals['n_folds'] < 0: plpy.error("{0} error: n_folds must be non-negative!".format(module)) - # FIXME - if params_vals['n_folds'] > 1: - plpy.error("{0} error: cross-validation not implemented!".format(module)) # validate lambda - if hasattr(params_vals['lambda'], '__len__'): - if len(params_vals['lambda']) != 1: - plpy.error("{0} error: lambda must be a scalar or of length 1 when n_folds is 0 or 1".format(module)) - # good for only not CV - params_vals['lambda'] = params_vals['lambda'][0] - if params_vals['lambda'] < 0: - plpy.error("{0} error: lambda must be non-negative!".format(module)) + params_vals['lambda'] = map(float, params_vals['lambda']) + if [lmd for lmd in params_vals['lambda'] if lmd < 0]: + plpy.error("{0} error: lambda must be " + "non-negative!".format(module)) + # validate epsilon + params_vals['epsilon'] = map(float, params_vals['epsilon']) + if [e for e in params_vals['epsilon'] if e < 0]: + plpy.error("{0} error: epsilon must be " + "non-negative!".format(module)) + # validating cross validation is delegated to _cross_validate_svm() + params_vals['init_stepsize'] = map(float, params_vals['init_stepsize']) + if [e for e in params_vals['init_stepsize'] if e < 0]: + plpy.error("{0} error: init_stepsize must be positive!".format(module)) + + params_vals['max_iter'] = map(int, params_vals['max_iter']) + if [e for e in params_vals['max_iter'] if e < 0]: + plpy.error("{0} error: max_iter must be positive!".format(module)) + + params_vals['decay_factor'] = map(float, params_vals['decay_factor']) + if [e for e in params_vals['decay_factor'] if e > 1]: + plpy.error("{0} error: decay_factor must be <= 1!".format(module)) + + if params_vals['validation_result']: + output_tbl_valid(params_vals['validation_result'], 'SVM') + params_vals['norm'] = params_vals['norm'].lower() if params_vals['norm'] != 
'l1' and params_vals['norm'] != 'l2': plpy.error("{0} error: norm must be either L1 or L2!".format(module)) - if params_vals['init_stepsize'] <= 0: - plpy.error("{0} error: init_stepsize must be positive!".format(module)) - if params_vals['decay_factor'] > 1: - plpy.error("{0} error: decay_factor must be <= 1!".format(module)) - if params_vals['max_iter'] <= 0: - plpy.error("{0} error: max_iter must be positive!".format(module)) if params_vals['tolerance'] < 0: plpy.error("{0} error: tolerance must be non-negative!".format(module)) - if params_vals['epsilon'] < 0: - plpy.error("{0} error: epsilon cannot be less than 0!".format(module)) + + params_vals['is_l2'] = True if params_vals['norm'] == 'l2' else False return params_vals diff --git a/src/ports/postgres/modules/svm/test/linear_svm.sql_in b/src/ports/postgres/modules/svm/test/linear_svm.sql_in index 0253c77b0..bba65673c 100644 --- a/src/ports/postgres/modules/svm/test/linear_svm.sql_in +++ b/src/ports/postgres/modules/svm/test/linear_svm.sql_in @@ -108,7 +108,7 @@ SELECT svm_regression( NULL, NULL, NULL, - 'init_stepsize=0.01, max_iter=50, lambda=2, norm=l1, epsilon=0.01', + 'init_stepsize=0.01, max_iter=50, lambda=2, norm=l2, epsilon=0.01', false); DROP TABLE IF EXISTS svr_test_result; SELECT svm_predict('svr_model', 'svr_train_data', 'id', 'svr_test_result'); @@ -631,3 +631,80 @@ FROM ( where sex = 'M' ) AS q1; +DROP TABLE IF EXISTS m1, m1_summary; +SELECT svm_regression( + 'svr_train_data', + 'm1', + 'label', + 'ind', + NULL,NULL,NULL, + 'init_stepsize=0.01, max_iter=3, lambda=[0.0002, 0.2], ' + 'n_folds=3, epsilon = [0.003, 0.2]', + true); + +DROP TABLE IF EXISTS m1, m1_summary; +SELECT svm_regression( + 'svr_train_data', + 'm1', + 'label', + 'ind', + NULL,NULL,NULL, + 'init_stepsize=0.01, max_iter=2, lambda=[0.0002, 0.2], n_folds=3', + false); +-- check which lambda is selected +SELECT reg_params FROM m1_summary; + +-- epsilon values are ignored +-- the validation table only contains +-- init_stepsize and 
lambda +DROP TABLE IF EXISTS m1, m1_summary, val_res; +SELECT svm_classification( + 'svm_train_data', + 'm1', + 'label', + 'ind', + NULL,NULL,NULL, + 'init_stepsize=[0.01, 1], max_iter=3, lambda=[20, 0.0002, 0.02], ' + 'n_folds=3, epsilon=[0.1, 1], validation_result=val_res'); +SELECT * FROM val_res; + +DROP TABLE IF EXISTS m1, m1_summary, val_res; +SELECT svm_classification( + 'svm_train_data', + 'm1', + 'label', + 'ind', + NULL,NULL,NULL, + 'init_stepsize=0.01, max_iter=20, lambda=[20, 0.0002, 0.02], ' + 'n_folds=3, validation_result=val_res'); +SELECT * FROM val_res; +-- check which lambda is selected +SELECT reg_params FROM m1_summary; +DROP TABLE IF EXISTS svm_test_predict CASCADE; +SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict'); +-- accuracy with cv +SELECT + count(*) AS misclassification_count +FROM svm_test_predict NATURAL JOIN svm_test_data +WHERE prediction <> label; + +DROP TABLE IF EXISTS m1, m1_summary; +SELECT svm_classification( + 'svm_train_data', + 'm1', + 'label', + 'ind', + NULL,NULL,NULL, + 'init_stepsize=0.01, max_iter=20, lambda=0.000002'); +DROP TABLE IF EXISTS svm_test_predict CASCADE; +SELECT svm_predict('m1','svm_test_data', 'id', 'svm_test_predict'); +-- accuracy without cv +SELECT + count(*) AS misclassification_count +FROM svm_test_predict NATURAL JOIN svm_test_data +WHERE prediction <> label; + + + + + diff --git a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in index b0dcd4b04..ea7868229 100644 --- a/src/ports/postgres/modules/utilities/in_mem_group_control.py_in +++ b/src/ports/postgres/modules/utilities/in_mem_group_control.py_in @@ -238,7 +238,7 @@ class GroupIterationController: def __enter__(self): - verbosity_level = self.kwargs.get('verbosity_level', 'warning') + verbosity_level = "info" if self.kwargs['verbose'] else "error" with MinWarning(verbosity_level): ############################ # create state table @@ -446,8 +446,7 @@ 
class GroupIterationController: self.iteration = self.iteration + 1 group_param = self.group_param - update_plan = plpy.prepare( - """ + run_sql = """ SELECT {_grp_key} AS {col_grp_key}, {grouping_col}, @@ -472,7 +471,8 @@ class GroupIterationController: _grp_key=group_param.grp_key, select_rel_state=group_param.select_rel_state, select_n_tuples=group_param.select_n_tuples, - **self.kwargs), + **self.kwargs) + update_plan = plpy.prepare(run_sql, ["text[]", group_param.grouped_state_type, "text[]", "integer[]"]) res_tuples = plpy.execute(update_plan, [self.new_states.keys, diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index e73be2d90..31ba6dab8 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -43,6 +43,17 @@ def get_seg_number(): !>) # ------------------------------------------------------------ +def num_features(source_table, independent_varname): + return plpy.execute("SELECT array_upper({0}, 1) AS dim " + "FROM {1} LIMIT 1" + .format(independent_varname, + source_table))[0]['dim'] +# ------------------------------------------------------------ + +def num_samples(source_table): + return plpy.execute("SELECT count(*) AS n FROM {0}" + .format(source_table))[0]['n'] +# ------------------------------------------------------------ def unique_string(desp='', **kwargs): """ diff --git a/src/ports/postgres/modules/validation/cross_validation.py_in b/src/ports/postgres/modules/validation/cross_validation.py_in index fce391d1b..7b39c90fb 100644 --- a/src/ports/postgres/modules/validation/cross_validation.py_in +++ b/src/ports/postgres/modules/validation/cross_validation.py_in @@ -317,15 +317,15 @@ def cross_validation_general( explore_type_str = "" if not explore_type else "::" + str(explore_type) # all temporary names - tbl_all_data = unique_string() - tbl_train = unique_string() - tbl_valid = unique_string() - 
col_random_id = unique_string() - tbl_random_id = unique_string() - tbl_output_model = "pg_temp." + unique_string() - tbl_output_pred = "pg_temp." + unique_string() - tbl_output_error = "pg_temp." + unique_string() - tbl_accum_error = unique_string() + tbl_all_data = unique_string(desp='tbl_all_data') + tbl_train = unique_string(desp='tbl_train') + tbl_valid = unique_string(desp='tbl_valid') + col_random_id = unique_string(desp='col_random_id') + tbl_random_id = unique_string(desp='tbl_random_id') + tbl_output_model = "pg_temp." + unique_string(desp='output_model') + tbl_output_pred = "pg_temp." + unique_string(desp='output_pred') + tbl_output_error = "pg_temp." + unique_string(desp='output_error') + tbl_accum_error = unique_string(desp='accum_error') tbl_used, col_random_id = _create_data_tbl_id(**locals()) diff --git a/src/ports/postgres/modules/validation/cv_utils.py_in b/src/ports/postgres/modules/validation/cv_utils.py_in index afd71e83e..b6eb60148 100644 --- a/src/ports/postgres/modules/validation/cv_utils.py_in +++ b/src/ports/postgres/modules/validation/cv_utils.py_in @@ -45,6 +45,7 @@ def __cv_copy_data_with_id(rel_origin, col_data, rel_copied, random_id): def __cv_copy_data_with_id_compute(rel_origin, col_string, rel_copied, random_id): plpy.execute(""" + select setseed(0.5); drop table if exists {rel_copied}; create temp table {rel_copied} as select diff --git a/src/ports/postgres/modules/validation/internal/__init__.py_in b/src/ports/postgres/modules/validation/internal/__init__.py_in new file mode 100644 index 000000000..e69de29bb diff --git a/src/ports/postgres/modules/validation/internal/cross_validation.py_in b/src/ports/postgres/modules/validation/internal/cross_validation.py_in new file mode 100644 index 000000000..0ff1bee44 --- /dev/null +++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in @@ -0,0 +1,420 @@ +import plpy +from validation.cv_utils import __cv_produce_col_name_string as _cv_col_string +from validation.cv_utils 
import __cv_validation_rows as _cv_validation_rows +from utilities.utilities import __mad_version +from utilities.utilities import unique_string +from utilities.utilities import num_samples + +from math import sqrt +from collections import namedtuple +from functools import partial +from operator import itemgetter +from operator import attrgetter +from itertools import product, repeat, imap, chain + +version_wrapper = __mad_version() +mad_vec = version_wrapper.select_vecfunc() + + +def _extract_data_cols(y, x): + """ Extract independent data columns from ARRAY[...] and combine it with dependent data column + + Parameters + ========== + y : string + The dependent data column + + x : string + A string of such form: ARRAY[1,?...] where ? indicate 0 or 1 repetitions of the preceding sequence + + Returns + ======= + columns : a list of strings corresponding to column names. + """ + if not x.startswith('ARRAY'): + return [y] + x.split(',') + import re + m = re.search(r'\[((1,)?(?P.+))\]', x) + if not m: + plpy.error("SVM error: invalid ({0}) " + "for cross validation!".format(x)) + return [y] + m.group('cols').split(',') + + +class ValidationResult(object): + """Wrapper class for cross validation results + + Parameters + ========== + cv_history : list, optional + List of dictionaries. 
+ Each dictionary contains the following three keys: + + - mean: float, average of scores using sub_args + - std: float, standard deviation of scores using sub_args + - sub_args: dict, the values of arguments being validated + """ + def __init__(self, cv_history=None): + if cv_history is None: + cv_history = [] + self._cv_history = cv_history + + def _get_leafs(self, sub_args): + def _run(sub_args): + a = [] + for k, v in sub_args.iteritems(): + if isinstance(v, dict): + a.extend(_run(v)) + else: + a.append((k, v)) + return a + return _run(sub_args) + + def _flatten(self): + a = [] + for h in self._cv_history: + h = dict(h) + sub_args = h.pop('sub_args') + h.update(dict(self._get_leafs(sub_args))) + a.append(h) + return a + + def add_one(self, mean, std, sub_args): + """Add one record to the history""" + record = dict(mean=mean, std=std, sub_args=sub_args) + self._cv_history.append(record) + + def sorted(self): + """Sort the history w.r.t. mean value and return a new ValidationResult object""" + ch = sorted(self._cv_history, reverse=True, key=itemgetter('mean')) + return ValidationResult(ch) + + def first(self, attr=None): + """Return the attribute of the first record of history + + Parameters + ========== + attr : string, optional + Any string in {'mean', 'std', 'sub_args'} or None + + Returns + ======= + record : dict or float. + The return value is either the first record, or the value of the corresponding attr in the first record. + """ + if attr is None: + return self._cv_history[0] + else: + return self._cv_history[0].get(attr) + + def top(self, attr=None): + """Return the first after sort""" + svr = self.sorted() + return svr.first(attr) + + # output cv results as a SQL table + def output_tbl(self, tbl_name): + """Create a table tbl_name that contains the history + + The columns of tbl_name are mean, std and the leaf keys in sub_args. + All column types are assumed to be double precision. 
+ """ + if tbl_name == '' or tbl_name is None: + return + + cv_history_f = self._flatten() + header = cv_history_f[0].keys() + # assuming all keys are string + header_str = ','.join(header) + # assuming all values are double precision + header_with_type_str = ','.join([c + ' double precision' + for c in header]) + plpy.execute(""" + DROP TABLE IF EXISTS {tbl_name}; + CREATE TABLE {tbl_name} ({header}) + """.format(tbl_name=tbl_name, + header=header_with_type_str)) + + data = [] + for h in cv_history_f: + values = ','.join([str(h[k]) for k in header]) + data.append("({0})".format(values)) + data = ','.join(data) + + plpy.execute(""" + INSERT INTO {tbl_name}({header}) VALUES + {data}""".format(data=data, + header=header_str, + tbl_name=tbl_name)) + + +class _ValidationArgs(object): + """docstring for _ValidationArgs""" + def __init__(self, args): + self._args = args + + @classmethod + def grid(cls, sub_args): + def comb_dict(dicts): + return dict(chain.from_iterable(d.iteritems() for d in dicts)) + def make_dicts(k, vs): + return [dict([t]) for t in zip(repeat(k), vs)] + + a = [] + for k, v in sub_args.iteritems(): + if isinstance(v, list): + a.append(make_dicts(k, v)) + elif isinstance(v, dict): + a.append(make_dicts(k, cls.grid(v))) + tuples = product(*a) + return map(comb_dict, tuples) + + def make_from(self, **kwargs): + def _update_dict(d1, d2): + if not isinstance(d1, dict): + raise TypeError("{0} is not dict".format(type(d1))) + if not isinstance(d2, dict): + raise TypeError("{0} is not dict".format(type(d2))) + for k, v in d2.iteritems(): + if isinstance(v, dict) and isinstance(d1[k], dict): + _update_dict(d1[k], v) + else: + d1[k] = v + args = dict(self._args) + _update_dict(args, kwargs) + return args + + +def _cv_copy_data(rel_origin, dependent_varname, + independent_varname, rel_copied, random_id): + """ + """ + target_col, features_col = 'y', 'x' + plpy.execute(""" + select setseed(0.5); + drop table if exists {rel_copied}; + create temp table 
{rel_copied} as + select + row_number() over (order by random()) as {random_id}, + {dependent_varname} as {target_col}, + {independent_varname} as {features_col} + from {rel_origin} + """.format(rel_origin=rel_origin, + rel_copied=rel_copied, + random_id=random_id, + dependent_varname=dependent_varname, + independent_varname=independent_varname, + target_col=target_col, features_col=features_col)) + return target_col, features_col +## ======================================================================== + + +def _cv_split_data(rel_source, col_data, col_id, row_num, + rel_train, rel_valid, n_folds, which_fold): + """ + """ + col_string = _cv_col_string(rel_source, col_data, [col_id]) + + (start_row, end_row) = _cv_validation_rows(row_num, n_folds, which_fold) + kwargs = dict(rel_train=rel_train, rel_source=rel_source, + col_id=col_id, start_row=start_row, + rel_valid=rel_valid, + end_row=end_row, col_string=col_string) + # Extract the training part of data, + # which corresponds to rows outside of [start_row, end_row). + # Extract the validation part of data, + # which corresponds to rows inside of [start_row, end_row). 
+ sql = """ + drop view if exists {rel_train}; + create temp view {rel_train} as + select {col_id}, {col_string} from {rel_source} + where {col_id} < {start_row} + or {col_id} >= {end_row}; + + drop view if exists {rel_valid}; + create temp view {rel_valid} as + select {col_id}, {col_string} from {rel_source} + where {col_id} >= {start_row} + and {col_id} < {end_row} + """.format(**kwargs) + plpy.execute(sql) + return None +# ======================================================================== + + +class CrossValidator(object): + """ + Cross validation for estimator + + Parameters + ========== + estimator : estimator function + The arguments to estimator are contained in args, e.g., estimator(**args) + + predictor : predictor function + Arguments: + + - schema_madlib: see args for details + - model_table: see args for details + - rel_valid: name of data table to be tested + - col_id: columns containing unique id for each data point + - output_table: table created for the testing results + + scorer : string, either 'classification' or 'regression' + Indicate the scoring method to be used. 
+ + args : dict (recursive) + Contains all the arguments to run estimator and the data to be used for validation: + + - source_table: the data table + - independent_varname: the column for features + - dependent_varname: the column for target + - schema_madlib: the schema where madlib is installed + - model_table: table created for the trained model + + """ + def __init__(self, estimator, predictor, scorer, args): + self._cv_args = _ValidationArgs(args) + self._estimator = estimator + self._predictor = predictor + self._scorer = scorer + self._target_col = None + self._features_col = None + self._set_data(**args) + + def _set_data(self, source_table, + independent_varname, + dependent_varname, **kwargs): + self._col_data = _extract_data_cols(dependent_varname, + independent_varname) + self._col_id = unique_string(desp='col_id') + self._rel_copied = unique_string(desp='rel_copied') + self._row_num = num_samples(source_table) + y, x = _cv_copy_data(source_table, dependent_varname, + independent_varname, + self._rel_copied, self._col_id) + self._target_col, self._features_col = y, x + + def _gen_split_data(self, n_folds): + rel_copied = self._rel_copied + col_data = [self._target_col, self._features_col] + col_id = self._col_id + row_num = self._row_num + SplitData = namedtuple('SplitData', 'rel_train, rel_valid') + for k in range(n_folds): + rel_train = unique_string(desp='cv_train_{0}'.format(k)) + rel_valid = unique_string(desp='cv_valid_{0}'.format(k)) + _cv_split_data(rel_copied, col_data, col_id, row_num, + rel_train, rel_valid, n_folds, k+1) + yield SplitData(rel_train=rel_train, rel_valid=rel_valid) + + def _test_one_fold(self, split_data, sub_args): + col_id = self._col_id + estimator = self._estimator + predictor = self._predictor + scorer = self._scorer + rel_train = split_data.rel_train + rel_valid = split_data.rel_valid + + args = self._cv_args.make_from(source_table=rel_train, + dependent_varname=self._target_col, + 
independent_varname=self._features_col, + **sub_args) + estimator(**args) + + schema_madlib = args['schema_madlib'] + output_table = unique_string(desp='output_table') + model_table = args['model_table'] + + predictor(schema_madlib, model_table, + rel_valid, col_id, output_table) + score = self._score(output_table, rel_valid, scorer) + plpy.execute(""" + DROP TABLE IF EXISTS {model_table}, {model_table}_summary; + """.format(model_table=model_table)) + plpy.execute(""" + DROP TABLE IF EXISTS {output_table}; + """.format(output_table=output_table)) + return score + + def _score(self, pred, orig, method): + target = self._target_col + col_id = self._col_id + if method == 'regression': + return plpy.execute( + """ + SELECT + -avg(({target}-prediction)^2) AS accuracy + FROM {pred} JOIN {orig} + ON {pred}.{id} = {orig}.{id} + """.format(pred=pred, + orig=orig, + id=col_id, + target=target))[0]['accuracy'] + elif method == 'classification': + return plpy.execute( + """ + SELECT (1 - miss / total) AS accuracy + FROM + ( + SELECT count(*)::float8 AS miss + FROM {pred} JOIN {orig} + ON {pred}.{id} = {orig}.{id} + WHERE prediction <> {target}) s, + ( + SELECT count(*)::float8 AS total + FROM {orig}) r; + """.format(pred=pred, + orig=orig, + id=col_id, + target=target))[0]['accuracy'] + else: + plpy.error("Cross Validation Error: invalid method value ({0})! " + "Need to be either classification " + "or regression!".format(method)) + + def validate(self, sub_args, n_folds=3): + """Returns the results of cross validation + + Parameters + ========== + sub_args : dict (recursive) + Arguments to be validated. Multiple values are provided in a list, e.g., + + sub_args = { + 'params_dict': + { + 'lambda': [0.1, 1, 10], + 'epsilon': [0.1, 1, 10] + } + } + + Before running estimator, sub_args_single is generated from sub_args replacing the lists with single value for each argument and args is updated recursively using sub_args_single. + + n_folds : int, default=3 + Number of folds. 
Must be at least 2 + + Returns + ======= + validation_result : object + See class ValidationResult for more details + """ + def _stats(nums): + n = len(nums) + # no need to check against 0 division + # because n_folds is larger than 1 + a = sum(nums) / n + s = sqrt(sum([(x - a)**2 for x in nums]) / (n - 1)) + return a, s + + if not sub_args: + return [] + + cv_history = ValidationResult() + split_data = list(self._gen_split_data(n_folds)) + for sa in _ValidationArgs.grid(sub_args): + _test = partial(self._test_one_fold, sub_args=sa) + scores = map(_test, split_data) + a, s = _stats(scores) + cv_history.add_one(mean=a, std=s, sub_args=sa) + return cv_history \ No newline at end of file