From 12afd7ba0000eec0de05ba6f59aa780c518f89e0 Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Mon, 24 Apr 2017 09:46:03 -0700
Subject: [PATCH 1/2] Bugfix: Elastic net gives inconsistent result

JIRA: MADLIB-1092
- Elastic net used to take the number of rows as the total number of
  rows in the table even when grouping was used. This fix changes that
  to use the number of rows in each group while computing IGD.
- Elastic net used to compute the mean and standard deviation of both
  the independent and dependent variables over the entire table even
  when grouping was used. These are now computed per group, and are
  used to compute the scaled data when standardize=TRUE for Gaussian
  IGD.
- One approximation still remains. During gradient computation (C++),
  every value in the independent variable (for each dimension) has the
  mean computed over the entire table, rather than the group mean,
  subtracted from it. This approximation was adopted since it is messy
  to pass group-specific mean values for every row in the table to the
  C++ layer.
---
 .../modules/convex/utils_regularization.py_in     | 131 +++++++++-
 .../elastic_net_generate_result.py_in             |  85 ++++--
 .../elastic_net_optimizer_fista.py_in             |  20 +-
 .../elastic_net_optimizer_igd.py_in               | 106 ++++----
 .../elastic_net/elastic_net_utils.py_in           | 242 ++++++------------
 5 files changed, 337 insertions(+), 247 deletions(-)

diff --git a/src/ports/postgres/modules/convex/utils_regularization.py_in b/src/ports/postgres/modules/convex/utils_regularization.py_in
index 64204aaf3..0acd54f08 100644
--- a/src/ports/postgres/modules/convex/utils_regularization.py_in
+++ b/src/ports/postgres/modules/convex/utils_regularization.py_in
@@ -6,13 +6,12 @@ from validation.cv_utils import __cv_split_data_using_id_col_compute
 from validation.cv_utils import __cv_split_data_using_id_tbl_compute
 from validation.cv_utils import __cv_generate_random_id
 from utilities.utilities import __mad_version
+from utilities.utilities import split_quoted_delimited_str
 
 version_wrapper = __mad_version()
 mad_vec = version_wrapper.select_vecfunc()
 
 # ========================================================================
-
-
 def __utils_ind_var_scales(**kwargs):
     """
     The mean and standard deviation for each dimension of an array stored in a column.
@@ -38,6 +37,50 @@ def __utils_ind_var_scales(**kwargs):
     return x_scales
 # ========================================================================
 
+def __utils_ind_var_scales_grouping(**kwargs):
+    """
+    The mean and standard deviation for each dimension of an array stored in a column.
+
+    Returns:
+        Dictionary with keys 'mean' and 'std' each with a value of an array of
+        the mean and std. dev values respectively.
+
+    """
+    group_col = _cast_if_null(kwargs.get('grouping_col', None), unique_string('grp_col'))
+    x_scales = plpy.execute(
+        """
+        CREATE TABLE {x_mean_table} AS
+        SELECT (f).*, {group_col}
+        FROM (
+            SELECT {group_col},
+                   {schema_madlib}.__utils_var_scales_result(
+                       {schema_madlib}.utils_var_scales({col_ind_var}, {dimension})) as f
+            FROM
+                {tbl_data}
+            GROUP BY {group_col}
+        ) q2
+        """.format(group_col=group_col, **kwargs))
+# ========================================================================
+
+def __utils_dep_var_scale_grouping_binomial(**kwargs):
+    """
+    The mean and standard deviation for each element of the dependent variable,
+    which is a scalar in ridge and lasso. For Binomial, the mean and std are
+    set to 0 and 1 respectively for each group.
+ """ + group_col = _cast_if_null(kwargs.get('grouping_col', None), + unique_string('grp_col')) + plpy.execute( + """ + CREATE TABLE {y_mean_table} AS + SELECT {group_col}, + 0 as mean, + 1 as std + from {tbl_data} + GROUP BY {group_col} + """.format(group_col=group_col, **kwargs)) +# ======================================================================== + def __utils_dep_var_scale(**kwargs): """ The mean and standard deviation for each element of the dependent variable, @@ -53,7 +96,6 @@ def __utils_dep_var_scale(**kwargs): col_ind_var -- independent variables column col_dep_var -- dependent variable column """ - y_scale = plpy.execute( """ select @@ -61,10 +103,85 @@ def __utils_dep_var_scale(**kwargs): 1 as std from {tbl_data} """.format(**kwargs))[0] - return y_scale # ======================================================================== +def __utils_dep_var_scale_grouping(**kwargs): + """ + The mean and standard deviation for each element of the dependent variable, + which is a scalar in ridge and lasso. + + The output will be stored in a temp table: a mean array and a std array + + This function is also used in lasso. + + Parameters: + schema_madlib -- madlib schema + tbl_data -- original data + col_ind_var -- independent variables column + col_dep_var -- dependent variable column + """ + group_col = _cast_if_null(kwargs.get('grouping_col', None), + unique_string('grp_col')) + plpy.execute( + """ + CREATE TABLE {y_mean_table} AS + SELECT {group_col}, + avg(case when not {schema_madlib}.array_contains_null({col_ind_var}) then {col_dep_var} end) as mean, + 1 as std + from {tbl_data} + GROUP BY {group_col} + """.format(group_col=group_col, **kwargs)) +# ======================================================================== + +def __utils_normalize_data_grouping(y_decenter=True, **kwargs): + """ + Normalize the independent and dependent variables using the calculated mean's and std's + in __utils_ind_var_scales and __utils_dep_var_scale. + + Compute the scaled variables by: scaled_value = (origin_value - mean) / std, and special + care is needed if std is zero. + + The output is a table with scaled independent and dependent variables. + + This function is also used in lasso. + + Parameters: + tbl_data -- original data + col_ind_var -- independent variables column + dimension -- length of independent variable array + col_dep_var -- dependent variable column + tbl_ind_scales -- independent variables scales array + tbl_dep_scale -- dependent variable scale + tbl_data_scaled -- scaled data result + col_ind_var_norm_new -- create a new name for the scaled array + to be compatible with array[...] 
expressions + """ + group_col = kwargs.get('grouping_col') + group_col_list = split_quoted_delimited_str(group_col) + group_where_x = ' AND '.join(['{tbl_data}.{grp}=__x__.{grp}'.format(grp=grp, + **kwargs) for grp in group_col_list]) + group_where_y = ' AND '.join(['{tbl_data}.{grp}=__y__.{grp}'.format(grp=grp, + **kwargs) for grp in group_col_list]) + ydecenter_str = "- __y__.mean".format(**kwargs) if y_decenter else "" + plpy.execute( + """ + create temp table {tbl_data_scaled} as + select + ({schema_madlib}.utils_normalize_data({col_ind_var}, + __x__.mean::double precision[], + __x__.std::double precision[])) + as {col_ind_var_norm_new}, + ({col_dep_var} {ydecenter_str}) as {col_dep_var_norm_new}, + {tbl_data}.{group_col} + from {tbl_data} + INNER JOIN {x_mean_table} AS __x__ ON {group_where_x} + INNER JOIN {y_mean_table} AS __y__ ON {group_where_y} + """.format(ydecenter_str=ydecenter_str, group_col=group_col, + group_where_x=group_where_x, group_where_y=group_where_y, **kwargs)) + return None +# ======================================================================== + def __utils_normalize_data(y_decenter=True, **kwargs): """ Normalize the independent and dependent variables using the calculated mean's and std's @@ -88,7 +205,6 @@ def __utils_normalize_data(y_decenter=True, **kwargs): col_ind_var_norm_new -- create a new name for the scaled array to be compatible with array[...] expressions """ - group_col = _cast_if_null(kwargs.get('grouping_col', None), unique_string('grp_col')) ydecenter_str = "- {y_mean}".format(**kwargs) if y_decenter else "" plpy.execute( """ @@ -98,10 +214,9 @@ def __utils_normalize_data(y_decenter=True, **kwargs): '{x_mean_str}'::double precision[], '{x_std_str}'::double precision[])) as {col_ind_var_norm_new}, - ({col_dep_var} {ydecenter_str}) as {col_dep_var_norm_new}, - {group_col} + ({col_dep_var} {ydecenter_str}) as {col_dep_var_norm_new} from {tbl_data} - """.format(ydecenter_str=ydecenter_str, group_col=group_col, **kwargs)) + """.format(ydecenter_str=ydecenter_str, **kwargs)) return None # ======================================================================== diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in index 6246ed9e1..15dda32c6 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in @@ -2,7 +2,7 @@ import plpy from elastic_net_utils import _process_results from elastic_net_utils import _compute_log_likelihood from utilities.validate_args import get_cols_and_types - +from utilities.utilities import split_quoted_delimited_str def _elastic_net_generate_result(optimizer, iteration_run, **args): """ @@ -10,6 +10,10 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): """ standardize_flag = "True" if args["normalization"] else "False" source_table = args["rel_source"] + data_scaled = False + if args["normalization"] or optimizer == "igd": + # x_mean_table and y_mean_table are created only in these conditions. 
+ data_scaled = True if optimizer == "fista": result_func = "__gaussian_fista_result({0})".format(args["col_grp_state"]) elif optimizer == "igd": @@ -30,20 +34,39 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): grouping_str = args['grouping_str'] cols_types = dict(get_cols_and_types(args["tbl_source"])) grouping_str1 = grouping_column + "," - select_grouping_info = ','.join([grp_col.strip() + "\t" + cols_types[grp_col.strip()] - for grp_col in grouping_column.split(',')]) + "," + + select_x_mean = '' + select_x_std = '' + select_y_mean = '' + inner_join_x = '' + inner_join_y = '' + if data_scaled: + grouping_cols_list = split_quoted_delimited_str(grouping_column) + select_grouping_info = ','.join([ + grp_col.strip()+"\t"+cols_types[grp_col.strip()] + for grp_col in grouping_column.split(',')]) + "," + select_grp = ','.join(['n_tuples_including_nulls_subq.'+str(grp) + for grp in grouping_cols_list]) + ',' + x_grp_cols = ' AND '.join([ + 'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp, + args["x_mean_table"], grp) for grp in grouping_cols_list]) + y_grp_cols = ' AND '.join([ + 'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp, + args["y_mean_table"], grp) for grp in grouping_cols_list]) + select_x_mean = ' {0}.mean AS x_mean, '.format(args["x_mean_table"]) + select_y_mean = ' {0}.mean AS y_mean, '.format(args["y_mean_table"]) + select_x_std = ' {0}.std AS x_std, '.format(args["x_mean_table"]) + inner_join_x = ' INNER JOIN {0} ON {1} '.format( + args["x_mean_table"], x_grp_cols) + inner_join_y = ' INNER JOIN {0} ON {1} '.format( + args["y_mean_table"], y_grp_cols) out_table_qstr = """ SELECT - {grouping_str1} + {select_grp} + {select_x_mean} {select_x_std} {select_y_mean} (result).coefficients AS coef, (result).intercept AS intercept FROM - ( - SELECT {schema_madlib}.{result_func} AS result, {col_grp_key} - FROM {tbl_state} - WHERE {col_grp_iteration} = {iteration_run} - ) t - JOIN ( SELECT {grouping_str1} @@ -51,7 +74,14 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): FROM {source_table} GROUP BY {grouping_col}, {col_grp_key} ) n_tuples_including_nulls_subq - USING ({col_grp_key}) + INNER JOIN + ( + SELECT {schema_madlib}.{result_func} AS result, {col_grp_key} + FROM {tbl_state} + WHERE {col_grp_iteration} = {iteration_run} + ) t USING ({col_grp_key}) + {inner_join_x} + {inner_join_y} """.format(result_func=result_func, tbl_state=tbl_state, grouping_col=grouping_column, @@ -61,7 +91,13 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): grouping_str=grouping_str, col_grp_key=col_grp_key, source_table=source_table, - schema_madlib=args["schema_madlib"]) + schema_madlib=args["schema_madlib"], + select_grp=select_grp, + select_x_mean=select_x_mean, + select_y_mean=select_y_mean, + select_x_std=select_x_std, + inner_join_x=inner_join_x, + inner_join_y=inner_join_y) else: # It's a much simpler query when there is no grouping. 
grouping_str1 = "" @@ -139,7 +175,12 @@ def build_output_table(res, grouping_column, grouping_str1, r_coef = res["coef"] if r_coef: if args["normalization"]: - (coef, intercept) = _restore_scale(r_coef, res["intercept"], args) + if grouping_column: + (coef, intercept) = _restore_scale(r_coef, res["intercept"], + args, res["x_mean"], res["x_std"], res["y_mean"]) + else: + (coef, intercept) = _restore_scale(r_coef, + res["intercept"], args) else: coef = r_coef intercept = res["intercept"] @@ -167,20 +208,22 @@ def build_output_table(res, grouping_column, grouping_str1, **args) plpy.execute(fquery) # ------------------------------------------------------------------------ - - -def _restore_scale(coef, intercept, args): +def _restore_scale(coef, intercept, args, + x_mean=None, x_std=None, y_mean=None): """ Restore the original scales """ + if x_mean == None and x_std == None and y_mean == None: + x_mean = args["x_scales"]["mean"] + y_mean = args["y_scale"]["mean"] + x_std = args["x_scales"]["std"] rcoef = [0] * len(coef) if args["family"] == "gaussian": - rintercept = float(args["y_scale"]["mean"]) + rintercept = float(y_mean) elif args["family"] == "binomial": rintercept = float(intercept) for i in range(len(coef)): - if args["x_scales"]["std"][i] != 0: - rcoef[i] = coef[i] / args["x_scales"]["std"][i] - rintercept -= (coef[i] * args["x_scales"]["mean"][i] / - args["x_scales"]["std"][i]) + if x_std[i] != 0: + rcoef[i] = coef[i] / x_std[i] + rintercept -= (coef[i] * x_mean[i] / x_std[i]) return (rcoef, rintercept) diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in index f50a21479..a6ef699b3 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in @@ -136,6 +136,8 @@ def _fista_cleanup_temp_tbls(**kwargs): drop table if exists {tbl_data_scaled}; drop table if exists {tbl_fista_args}; drop table if exists pg_temp.{tbl_fista_state}; + drop table if exists {x_mean_table}; + drop table if exists {y_mean_table}; """.format(**kwargs)) return None @@ -209,7 +211,8 @@ def _elastic_net_fista_train_compute(schema_madlib, func_step_aggregate, lambda_value, tolerance, schema_madlib)) - + args.update({'x_mean_table':unique_string(desp='x_mean_table')}) + args.update({'y_mean_table':unique_string(desp='y_mean_table')}) args.update({'grouping_col': grouping_col}) # use normalized data or not if normalization: @@ -226,19 +229,15 @@ def _elastic_net_fista_train_compute(schema_madlib, func_step_aggregate, if args["warmup_lambdas"] is not None: args["warm_no"] = len(args["warmup_lambdas"]) - args["warmup_lambdas"] = args["warmup_lambdas"] if args["warmup"] and args["warmup_lambdas"] is None: # average squares of each feature # used to estimate the largest lambda value args["sq"] = _compute_average_sq(**args) args["warmup_lambdas"] = \ - _generate_warmup_lambda_sequence( - tbl_used, args["col_ind_var_new"], args["col_dep_var_new"], - dimension, row_num, lambda_value, alpha, - args["warmup_lambda_no"], args["sq"]) + _generate_warmup_lambda_sequence(lambda_value, + args["warmup_lambda_no"]) args["warm_no"] = len(args["warmup_lambdas"]) - args["warmup_lambdas"] = args["warmup_lambdas"] elif args["warmup"] is False: args["warm_no"] = 1 args["warmup_lambdas"] = [lambda_value] # only one value @@ -340,6 +339,11 @@ def _compute_fista(schema_madlib, func_step_aggregate, func_state_diff, if 
(it.kwargs["lambda_count"] > len(args.get('lambda_name'))): break it.kwargs["warmup_lambda_value"] = args.get('lambda_name')[it.kwargs["lambda_count"] - 1] + # Fix for JIRA MADLIB-1092 + # 'col_n_tuples' is supposed to refer to the number of rows in the + # table, or the number of rows in a group. col_n_tuples gets + # the right value in in_mem_group_control, so using this instead + # of row_num (which was used hitherto). it.update(""" {schema_madlib}.{func_step_aggregate}( ({col_ind_var})::double precision[], @@ -348,7 +352,7 @@ def _compute_fista(schema_madlib, func_step_aggregate, func_state_diff, ({warmup_lambda_value})::double precision, ({alpha})::double precision, ({dimension})::integer, - ({row_num})::integer, + ({col_n_tuples})::integer, ({max_stepsize})::double precision, ({eta})::double precision, ({use_active_set})::integer, diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in index 091aefb78..d73a75423 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in @@ -144,6 +144,8 @@ def _igd_cleanup_temp_tbls(**args): drop table if exists {tbl_data_scaled}; drop table if exists {tbl_igd_args}; drop table if exists pg_temp.{tbl_igd_state}; + drop table if exists {x_mean_table}; + drop table if exists {y_mean_table}; """.format(**args)) return None # ------------------------------------------------------------------------ @@ -193,17 +195,19 @@ def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate, is '{arg = value, ...}'::varchar[] """ with MinWarning('error'): - (dimension, row_num) = _tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var) + (dimension, row_num) = _tbl_dimension_rownum(schema_madlib, + tbl_source, col_ind_var) # generate a full dict to ease the following string format # including several temporary table names - args = _igd_construct_dict(schema_madlib, family, tbl_source, col_ind_var, - col_dep_var, tbl_result, - dimension, row_num, lambda_value, alpha, normalization, - max_iter, tolerance, outstr_array, - _igd_params_parser(optimizer_params, lambda_value, - tolerance, schema_madlib)) - + args = _igd_construct_dict(schema_madlib, family, tbl_source, + col_ind_var, col_dep_var, tbl_result, dimension, row_num, + lambda_value, alpha, normalization, max_iter, tolerance, + outstr_array, _igd_params_parser(optimizer_params, lambda_value, + tolerance, schema_madlib)) + + args.update({'x_mean_table':unique_string(desp='x_mean_table')}) + args.update({'y_mean_table':unique_string(desp='y_mean_table')}) args.update({'grouping_col': grouping_col}) # use normalized data or not if normalization: @@ -216,9 +220,10 @@ def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate, tbl_used = tbl_source args["col_ind_var_new"] = col_ind_var args["col_dep_var_new"] = col_dep_var - args["tbl_used"] = tbl_used + # parameter values required by the IGD optimizer + (xmean, ymean) = _compute_means(args) # average squares of each feature # used to estimate the largest lambda value # also used to screen out tiny values, so order is needed @@ -227,23 +232,16 @@ def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate, if args["warmup_lambdas"] is not None: args["warm_no"] = len(args["warmup_lambdas"]) - args["warmup_lambdas"] = args["warmup_lambdas"] if args["warmup"] and args["warmup_lambdas"] is None: args["warmup_lambdas"] = \ - 
_generate_warmup_lambda_sequence( - args["tbl_used"], args["col_ind_var_new"], args["col_dep_var_new"], - dimension, row_num, lambda_value, alpha, - args["warmup_lambda_no"], args["sq"]) + _generate_warmup_lambda_sequence(lambda_value, + args["warmup_lambda_no"]) args["warm_no"] = len(args["warmup_lambdas"]) - args["warmup_lambdas"] = args["warmup_lambdas"] elif args["warmup"] is False: args["warm_no"] = 1 args["warmup_lambdas"] = [lambda_value] # only one value - # parameter values required by the IGD optimizer - (xmean, ymean) = _compute_means(**args) - args.update({ 'rel_args': args["tbl_igd_args"], 'rel_state': args["tbl_igd_state"], @@ -263,38 +261,39 @@ def _elastic_net_igd_train_compute(schema_madlib, func_step_aggregate, if not args.get('parallel'): func_step_aggregate += "_single_seg" # perform the actual calculation - iteration_run = _compute_igd(schema_madlib, - func_step_aggregate, - func_state_diff, - args["tbl_igd_args"], - args["tbl_igd_state"], - tbl_used, - args["col_ind_var_new"], - args["col_dep_var_new"], - grouping_str, - grouping_col, - start_iter=0, - max_iter=args["max_iter"], - tolerance=args["tolerance"], - warmup_tolerance=args["warmup_tolerance"], - warm_no=args["warm_no"], - step_decay=args["step_decay"], - dimension=args["dimension"], - stepsize=args["stepsize"], - lambda_name=args["warmup_lambdas"], - warmup_lambda_value=args.get('warmup_lambdas')[args["lambda_count"]-1], - alpha=args["alpha"], - row_num=args["row_num"], - xmean_val=args["xmean_val"], - ymean_val=args["ymean_val"], - lambda_count=args["lambda_count"], - rel_state=args["tbl_igd_state"], - col_grp_iteration=args["col_grp_iteration"], - col_grp_state=args["col_grp_state"], - col_grp_key=args["col_grp_key"], - col_n_tuples=args["col_n_tuples"], - rel_source=args["rel_source"], - state_type=args["state_type"],) + iteration_run = _compute_igd( + schema_madlib, + func_step_aggregate, + func_state_diff, + args["tbl_igd_args"], + args["tbl_igd_state"], + tbl_used, + args["col_ind_var_new"], + args["col_dep_var_new"], + grouping_str, + grouping_col, + start_iter=0, + max_iter=args["max_iter"], + tolerance=args["tolerance"], + warmup_tolerance=args["warmup_tolerance"], + warm_no=args["warm_no"], + step_decay=args["step_decay"], + dimension=args["dimension"], + stepsize=args["stepsize"], + lambda_name=args["warmup_lambdas"], + warmup_lambda_value=args.get('warmup_lambdas')[args["lambda_count"]-1], + alpha=args["alpha"], + row_num=args["row_num"], + xmean_val=args["xmean_val"], + ymean_val=args["ymean_val"], + lambda_count=args["lambda_count"], + rel_state=args["tbl_igd_state"], + col_grp_iteration=args["col_grp_iteration"], + col_grp_state=args["col_grp_state"], + col_grp_key=args["col_grp_key"], + col_n_tuples=args["col_n_tuples"], + rel_source=args["rel_source"], + state_type=args["state_type"]) _elastic_net_generate_result("igd", iteration_run, **args) @@ -341,6 +340,11 @@ def _compute_igd(schema_madlib, func_step_aggregate, func_state_diff, if (it.kwargs["lambda_count"] > len(args.get('lambda_name'))): break it.kwargs["warmup_lambda_value"] = args.get('lambda_name')[it.kwargs["lambda_count"] - 1] + # Fix for JIRA MADLIB-1092 + # 'col_n_tuples' is supposed to refer to the number of rows in the + # table, or the number of rows in a group. col_n_tuples gets + # the right value in in_mem_group_control, so using this instead + # of row_num (which was used hitherto). 
it.update(""" {schema_madlib}.{func_step_aggregate}( ({col_ind_var})::double precision[], @@ -350,7 +354,7 @@ def _compute_igd(schema_madlib, func_step_aggregate, func_state_diff, ({alpha})::double precision, ({dimension})::integer, ({stepsize})::double precision, - ({row_num})::integer, + ({col_n_tuples})::integer, ('{xmean_val}')::double precision[], ({ymean_val})::double precision, ({step_decay})::double precision diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in index ce6b28020..a15fe5d34 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in @@ -6,8 +6,11 @@ from utilities.utilities import _array_to_string from convex.utils_regularization import __utils_ind_var_scales from convex.utils_regularization import __utils_dep_var_scale from convex.utils_regularization import __utils_normalize_data +from convex.utils_regularization import __utils_ind_var_scales_grouping +from convex.utils_regularization import __utils_dep_var_scale_grouping +from convex.utils_regularization import __utils_normalize_data_grouping +from convex.utils_regularization import __utils_dep_var_scale_grouping_binomial from utilities.validate_args import table_exists -from utilities.control import IterationController2S from collections import namedtuple @@ -108,7 +111,6 @@ def _generate_warmup_lambda_sequence(lambda_value, n_steps): return seq # ------------------------------------------------------------------------ - def _compute_average_sq(**args): """ Compute the average squares of all features, used to estimtae the largest lambda @@ -192,16 +194,38 @@ def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, return None # ------------------------------------------------------------------------ +def _compute_data_scales_grouping(args): + __utils_ind_var_scales_grouping(tbl_data=args["tbl_source"], + col_ind_var=args["col_ind_var"], dimension=args["dimension"], + schema_madlib=args["schema_madlib"], + grouping_col=args["grouping_col"], + x_mean_table=args["x_mean_table"]) + if args["family"] == "binomial": + # set mean and std to 0 and 1 respectively, for each group. 
+ __utils_dep_var_scale_grouping_binomial( + tbl_data=args["tbl_source"], + grouping_col=args["grouping_col"], + y_mean_table=args["y_mean_table"]) + else: + __utils_dep_var_scale_grouping(schema_madlib=args["schema_madlib"], + tbl_data=args["tbl_source"], + col_ind_var=args["col_ind_var"], + col_dep_var=args["col_dep_var"], + grouping_col=args["grouping_col"], + y_mean_table=args["y_mean_table"]) def _compute_data_scales(args): - args["x_scales"] = __utils_ind_var_scales(tbl_data=args["tbl_source"], col_ind_var=args["col_ind_var"], - dimension=args["dimension"], schema_madlib=args["schema_madlib"]) + args["x_scales"] = __utils_ind_var_scales(tbl_data=args["tbl_source"], + col_ind_var=args["col_ind_var"], dimension=args["dimension"], + schema_madlib=args["schema_madlib"]) if args["family"] == "binomial": args["y_scale"] = dict(mean=0, std=1) else: - args["y_scale"] = __utils_dep_var_scale(schema_madlib=args["schema_madlib"], tbl_data=args["tbl_source"], - col_ind_var=args["col_ind_var"], col_dep_var=args["col_dep_var"]) + args["y_scale"] = __utils_dep_var_scale( + schema_madlib=args["schema_madlib"], + tbl_data=args["tbl_source"], col_ind_var=args["col_ind_var"], + col_dep_var=args["col_dep_var"]) args["xmean_str"] = _array_to_string(args["x_scales"]["mean"]) # ------------------------------------------------------------------------ @@ -214,23 +238,35 @@ def _normalize_data(args): The output is stored in tbl_data_scaled """ - _compute_data_scales(args) - y_decenter = True if args["family"] == "gaussian" else False - - __utils_normalize_data(y_decenter=y_decenter, - tbl_data=args["tbl_source"], - col_ind_var=args["col_ind_var"], - col_dep_var=args["col_dep_var"], - tbl_data_scaled=args["tbl_data_scaled"], - col_ind_var_norm_new=args["col_ind_var_norm_new"], - col_dep_var_norm_new=args["col_dep_var_norm_new"], - schema_madlib=args["schema_madlib"], - x_mean_str=args["xmean_str"], - x_std_str=_array_to_string(args["x_scales"]["std"]), - y_mean=args["y_scale"]["mean"], - y_std=args["y_scale"]["std"], - grouping_col=args["grouping_col"]) + if args["grouping_col"]: + _compute_data_scales_grouping(args) + __utils_normalize_data_grouping(y_decenter=y_decenter, + tbl_data=args["tbl_source"], + col_ind_var=args["col_ind_var"], + col_dep_var=args["col_dep_var"], + tbl_data_scaled=args["tbl_data_scaled"], + col_ind_var_norm_new=args["col_ind_var_norm_new"], + col_dep_var_norm_new=args["col_dep_var_norm_new"], + schema_madlib=args["schema_madlib"], + x_mean_table=args["x_mean_table"], + y_mean_table=args["y_mean_table"], + grouping_col=args["grouping_col"]) + else: + _compute_data_scales(args) + __utils_normalize_data(y_decenter=y_decenter, + tbl_data=args["tbl_source"], + col_ind_var=args["col_ind_var"], + col_dep_var=args["col_dep_var"], + tbl_data_scaled=args["tbl_data_scaled"], + col_ind_var_norm_new=args["col_ind_var_norm_new"], + col_dep_var_norm_new=args["col_dep_var_norm_new"], + schema_madlib=args["schema_madlib"], + x_mean_str=args["xmean_str"], + x_std_str=_array_to_string(args["x_scales"]["std"]), + y_mean=args["y_scale"]["mean"], + y_std=args["y_scale"]["std"], + grouping_col=args["grouping_col"]) return None # ------------------------------------------------------------------------ @@ -242,27 +278,27 @@ def _tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var): """ # independent variable array length dimension = plpy.execute(""" - select array_upper({col_ind_var},1) as dimension - from {tbl_source} limit 1 - """.format(tbl_source=tbl_source, - 
col_ind_var=col_ind_var))[0]["dimension"] + SELECT array_upper({col_ind_var},1) AS dimension + FROM {tbl_source} LIMIT 1 + """.format(tbl_source=tbl_source, + col_ind_var=col_ind_var))[0]["dimension"] # total row number of data source table - # The WHERE clause here ignores rows in the table that contain one or more NULLs in the - # independent variable (x). There is no NULL check made for the dependent variable (y), - # since one of the hard requirements/assumptions of the input data to elastic_net is that the - # dependent variable cannot be NULL. + # The WHERE clause here ignores rows in the table that contain one or more + # NULLs in the independent variable (x). There is no NULL check made for + # the dependent variable (y), since one of the hard assumptions of the + # input data to elastic_net is that the dependent variable cannot be NULL. row_num = plpy.execute(""" - select count(*) from {tbl_source} - WHERE not {schema_madlib}.array_contains_null({col_ind_var}) - """.format(tbl_source=tbl_source, - schema_madlib=schema_madlib, - col_ind_var=col_ind_var))[0]["count"] + SELECT COUNT(*) FROM {tbl_source} + WHERE NOT {schema_madlib}.array_contains_null({col_ind_var}) + """.format(tbl_source=tbl_source, + schema_madlib=schema_madlib, + col_ind_var=col_ind_var))[0]["count"] return (dimension, row_num) # ------------------------------------------------------------------------ -def _compute_means(**args): +def _compute_means(args): """ Compute the averages of dependent (y) and independent (x) variables """ @@ -270,127 +306,15 @@ def _compute_means(**args): xmean_str = _array_to_string([0] * args["dimension"]) ymean = 0 return (xmean_str, ymean) - else: - return (args["xmean_str"], args["y_scale"]["mean"]) -# ------------------------------------------------------------------------ - - -class IterationControllerNoTableDrop (IterationController2S): - - """ - IterationController but without table dropping - - Useful if one wants to use it in cross validation - where dropping tables in a loop would use up all the locks - and get "out of memory" error - """ - # ------------------------------------------------------------------------ - - def __init__(self, rel_args, rel_state, stateType, - temporaryTables=True, - truncAfterIteration=False, - schema_madlib="MADLIB_SCHEMA_MISSING", - verbose=False, - **kwargs): - # Need to call super class's init method to initialize - # member fields - super(IterationControllerNoTableDrop, self).__init__( - self, rel_args, rel_state, stateType, temporaryTables, - truncAfterIteration, schema_madlib, verbose, **kwargs) - # self.kwargs["rel_state"] = "pg_temp" + rel_state, but for testing - # the existence of a table, schema name should be used together - self.state_exists = plpy.execute( - "select count(*) from information_schema.tables " - "where table_name = '{0}' and table_schema = 'pg_temp'". - format(rel_state))[0]['count'] == 1 - # The current total row number of rel_state table - if self.state_exists: - self.state_row_num = plpy.execute("select count(*) from {rel_state}". - format(**self.kwargs))[0]["count"] - - # ------------------------------------------------------------------------ - - def update(self, newState): - """ - Update state of calculation. - """ - newState = newState.format(iteration=self.iteration, **self.kwargs) - self.iteration += 1 - if self.state_exists and self.iteration <= self.state_row_num: - # If the rel_state table already exists, and - # iteration number is smaller than total row number, - # use UPDATE instead of append. 
UPDATE does not use - # extra locks. - self.runSQL(""" - update {rel_state} set _state = ({newState}) - where _iteration = {iteration} - """.format(iteration=self.iteration, - newState=newState, - **self.kwargs)) - else: - # rel_state table is newly created, and - # append data to this table - self.runSQL(""" - INSERT INTO {rel_state} - SELECT - {iteration}, - ({newState}) - """.format(iteration=self.iteration, - newState=newState, - **self.kwargs)) - # ------------------------------------------------------------------------ - - def __enter__(self): - """ - __enter__ and __exit__ methods are special. They are automatically called - when using "with" block. - """ - if self.state_exists is False: - # create rel_state table when it does not already exist - super(IterationControllerNoTableDrop, self).__enter__(self) - self.inWith = True - return self + if args["grouping_col"]: + # We can use the mean of the entire table instead of groups here. + # The absolute correct thing to do is to use group specific + # mean, but we will need to add a new column and change the input + # table contents to do that (it has to be accessed by the group + # iteration controller, C++ code). That is a lot more messier, + # so living with this approximation for now. + _compute_data_scales(args) + # If there is no grouping_col, note that _compute_data_scales() was + # already called, so we don't have to call it again. + return (args["xmean_str"], args["y_scale"]["mean"]) # ------------------------------------------------------------------------ - - -class IterationControllerTableAppend (IterationControllerNoTableDrop): - - def __init__(self, rel_args, rel_state, stateType, - temporaryTables=True, - truncAfterIteration=False, - schema_madlib="MADLIB_SCHEMA_MISSING", - verbose=False, - **kwargs): - self.kwargs = kwargs - self.kwargs.update( - rel_args=rel_args, - rel_state=rel_state, - stateType=stateType.format(schema_madlib=schema_madlib), - schema_madlib=schema_madlib) - self.temporaryTables = temporaryTables - self.truncAfterIteration = truncAfterIteration - self.verbose = verbose - self.inWith = False - self.iteration = -1 - - self.state_exists = plpy.execute(""" - select count(*) - from information_schema.tables - where table_name = '{rel_state}' - """.format(**self.kwargs))[0]['count'] == 1 - # ------------------------------------------------------------------------ - - def update(self, newState): - """ - Update state of calculation. 
- """ - newState = newState.format(iteration=self.iteration, **self.kwargs) - self.iteration += 1 - self.runSQL(""" - INSERT INTO {rel_state} - SELECT - {iteration}, - ({newState}) - """.format(iteration=self.iteration, - newState=newState, - **self.kwargs)) From 6dd17e31aeab2214b4e80397b7f276d06a092ec7 Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Fri, 28 Apr 2017 17:44:40 -0700 Subject: [PATCH 2/2] Minor code changes --- .../modules/convex/utils_regularization.py_in | 134 +++++++++--------- .../elastic_net_generate_result.py_in | 34 ++--- .../elastic_net/elastic_net_utils.py_in | 46 +++--- 3 files changed, 96 insertions(+), 118 deletions(-) diff --git a/src/ports/postgres/modules/convex/utils_regularization.py_in b/src/ports/postgres/modules/convex/utils_regularization.py_in index 0acd54f08..879c0d6ad 100644 --- a/src/ports/postgres/modules/convex/utils_regularization.py_in +++ b/src/ports/postgres/modules/convex/utils_regularization.py_in @@ -12,9 +12,11 @@ version_wrapper = __mad_version() mad_vec = version_wrapper.select_vecfunc() # ======================================================================== -def __utils_ind_var_scales(**kwargs): + +def __utils_ind_var_scales(tbl_data, col_ind_var, dimension, schema_madlib): """ - The mean and standard deviation for each dimension of an array stored in a column. + The mean and standard deviation for each dimension of an array stored + in a column. Returns: Dictionary with keys 'mean' and 'std' each with a value of an array of @@ -31,25 +33,23 @@ def __utils_ind_var_scales(**kwargs): FROM {tbl_data} ) q2 - """.format(**kwargs))[0] + """.format(**locals()))[0] x_scales["mean"] = mad_vec(x_scales["mean"], text=False) x_scales["std"] = mad_vec(x_scales["std"], text=False) return x_scales # ======================================================================== -def __utils_ind_var_scales_grouping(**kwargs): +def __utils_ind_var_scales_grouping(tbl_data, col_ind_var, dimension, + schema_madlib, grouping_col, x_mean_table): """ - The mean and standard deviation for each dimension of an array stored in a column. - - Returns: - Dictionary with keys 'mean' and 'std' each with a value of an array of - the mean and std. dev values respectively. - + The mean and standard deviation for each dimension of an array stored in + a column. Creates a table containing the mean (array) and std of each + dimension of the independent variable, for each group. """ - group_col = _cast_if_null(kwargs.get('grouping_col', None), unique_string('grp_col')) + group_col = _cast_if_null(grouping_col, unique_string('grp_col')) x_scales = plpy.execute( """ - CREATE TABLE {x_mean_table} AS + CREATE TEMP TABLE {x_mean_table} AS SELECT (f).*, {group_col} FROM ( SELECT {group_col}, @@ -59,35 +59,15 @@ def __utils_ind_var_scales_grouping(**kwargs): {tbl_data} GROUP BY {group_col} ) q2 - """.format(group_col=group_col, **kwargs)) -# ======================================================================== - -def __utils_dep_var_scale_grouping_binomial(**kwargs): - """ - The mean and standard deviation for each element of the dependent variable, - which is a scalar in ridge and lasso. For Binomial, the mean and std are - set to 0 and 1 respectively for each group. 
- """ - group_col = _cast_if_null(kwargs.get('grouping_col', None), - unique_string('grp_col')) - plpy.execute( - """ - CREATE TABLE {y_mean_table} AS - SELECT {group_col}, - 0 as mean, - 1 as std - from {tbl_data} - GROUP BY {group_col} - """.format(group_col=group_col, **kwargs)) + """.format(**locals())) # ======================================================================== -def __utils_dep_var_scale(**kwargs): +def __utils_dep_var_scale(schema_madlib, tbl_data, col_ind_var, + col_dep_var): """ The mean and standard deviation for each element of the dependent variable, which is a scalar in ridge and lasso. - The output will be stored in a temp table: a mean array and a std array - This function is also used in lasso. Parameters: @@ -98,53 +78,67 @@ def __utils_dep_var_scale(**kwargs): """ y_scale = plpy.execute( """ - select - avg(case when not {schema_madlib}.array_contains_null({col_ind_var}) then {col_dep_var} end) as mean, - 1 as std - from {tbl_data} - """.format(**kwargs))[0] + SELECT + avg(CASE WHEN NOT {schema_madlib}.array_contains_null({col_ind_var}) THEN {col_dep_var} END) AS mean, + 1 AS std + FROM {tbl_data} + """.format(**locals()))[0] return y_scale # ======================================================================== -def __utils_dep_var_scale_grouping(**kwargs): +def __utils_dep_var_scale_grouping(y_mean_table, tbl_data, grouping_col, + family, schema_madlib=None, col_ind_var=None, col_dep_var=None): """ The mean and standard deviation for each element of the dependent variable, - which is a scalar in ridge and lasso. + w.r.t a group, which is a scalar in ridge and lasso. - The output will be stored in a temp table: a mean array and a std array + The output will be stored in a temp table: a mean array and a std array, + for each group. + If the family is Binomial, mean and std for each group is set to 0 and 1 + respectively. This function is also used in lasso. Parameters: + y_mean_table -- name of the output table to write into + tbl_data -- input table + grouping_col -- the columns to group the data on + family -- if family is Gaussian, ALL following parameters must be defined schema_madlib -- madlib schema - tbl_data -- original data col_ind_var -- independent variables column col_dep_var -- dependent variable column """ - group_col = _cast_if_null(kwargs.get('grouping_col', None), - unique_string('grp_col')) + group_col = _cast_if_null(grouping_col, unique_string('grp_col')) + if family == 'binomial': + mean_str = '0' + else: + # If the family is Gaussian, schema_madlib, col_ind_var and + # col_dep_var must be passed along. 
+ if schema_madlib is None or col_ind_var is None or col_dep_var is None: + plpy.error("Schema name, indpendent column and dependent column names required.") + mean_str = ' avg(CASE WHEN NOT {0}.array_contains_null({1}) THEN {2} END) '.format( + schema_madlib, col_ind_var, col_dep_var) plpy.execute( """ - CREATE TABLE {y_mean_table} AS + CREATE TEMP TABLE {y_mean_table} AS SELECT {group_col}, - avg(case when not {schema_madlib}.array_contains_null({col_ind_var}) then {col_dep_var} end) as mean, - 1 as std - from {tbl_data} + {mean_str} AS mean, + 1 AS std + FROM {tbl_data} GROUP BY {group_col} - """.format(group_col=group_col, **kwargs)) + """.format(**locals())) # ======================================================================== def __utils_normalize_data_grouping(y_decenter=True, **kwargs): """ - Normalize the independent and dependent variables using the calculated mean's and std's - in __utils_ind_var_scales and __utils_dep_var_scale. - - Compute the scaled variables by: scaled_value = (origin_value - mean) / std, and special - care is needed if std is zero. + Normalize the independent and dependent variables using the calculated + mean's and std's in __utils_ind_var_scales and __utils_dep_var_scale. - The output is a table with scaled independent and dependent variables. + Compute the scaled variables by: scaled_value = (origin_value - mean) / std, + and special care is needed if std is zero. - This function is also used in lasso. + The output is a table with scaled independent and dependent variables, + based on mean and std for each group. This function is also used in lasso. Parameters: tbl_data -- original data @@ -156,6 +150,9 @@ def __utils_normalize_data_grouping(y_decenter=True, **kwargs): tbl_data_scaled -- scaled data result col_ind_var_norm_new -- create a new name for the scaled array to be compatible with array[...] 
expressions + x_mean_table -- name of the table containing mean of 'x' for each group + y_mean_table -- name of the table containing mean of 'y' for each group + grouping_col -- columns to group the data on """ group_col = kwargs.get('grouping_col') group_col_list = split_quoted_delimited_str(group_col) @@ -166,15 +163,15 @@ def __utils_normalize_data_grouping(y_decenter=True, **kwargs): ydecenter_str = "- __y__.mean".format(**kwargs) if y_decenter else "" plpy.execute( """ - create temp table {tbl_data_scaled} as - select + CREATE TEMP TABLE {tbl_data_scaled} AS + SELECT ({schema_madlib}.utils_normalize_data({col_ind_var}, __x__.mean::double precision[], __x__.std::double precision[])) - as {col_ind_var_norm_new}, - ({col_dep_var} {ydecenter_str}) as {col_dep_var_norm_new}, + AS {col_ind_var_norm_new}, + ({col_dep_var} {ydecenter_str}) AS {col_dep_var_norm_new}, {tbl_data}.{group_col} - from {tbl_data} + FROM {tbl_data} INNER JOIN {x_mean_table} AS __x__ ON {group_where_x} INNER JOIN {y_mean_table} AS __y__ ON {group_where_y} """.format(ydecenter_str=ydecenter_str, group_col=group_col, @@ -208,20 +205,19 @@ def __utils_normalize_data(y_decenter=True, **kwargs): ydecenter_str = "- {y_mean}".format(**kwargs) if y_decenter else "" plpy.execute( """ - create temp table {tbl_data_scaled} as - select + CREATE TEMP TABLE {tbl_data_scaled} AS + SELECT ({schema_madlib}.utils_normalize_data({col_ind_var}, '{x_mean_str}'::double precision[], '{x_std_str}'::double precision[])) - as {col_ind_var_norm_new}, - ({col_dep_var} {ydecenter_str}) as {col_dep_var_norm_new} - from {tbl_data} + AS {col_ind_var_norm_new}, + ({col_dep_var} {ydecenter_str}) AS {col_dep_var_norm_new} + FROM {tbl_data} """.format(ydecenter_str=ydecenter_str, **kwargs)) return None # ======================================================================== - def __utils_cv_preprocess(kwargs): """ Some common processes used in both ridge and lasso cross validation functions: diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in index 15dda32c6..df5489f00 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in @@ -35,9 +35,7 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): cols_types = dict(get_cols_and_types(args["tbl_source"])) grouping_str1 = grouping_column + "," - select_x_mean = '' - select_x_std = '' - select_y_mean = '' + select_mean_and_std = '' inner_join_x = '' inner_join_y = '' if data_scaled: @@ -53,9 +51,9 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): y_grp_cols = ' AND '.join([ 'n_tuples_including_nulls_subq.{0}={1}.{2}'.format(grp, args["y_mean_table"], grp) for grp in grouping_cols_list]) - select_x_mean = ' {0}.mean AS x_mean, '.format(args["x_mean_table"]) - select_y_mean = ' {0}.mean AS y_mean, '.format(args["y_mean_table"]) - select_x_std = ' {0}.std AS x_std, '.format(args["x_mean_table"]) + select_mean_and_std = ' {0}.mean AS x_mean, '.format(args["x_mean_table"]) +\ + ' {0}.mean AS y_mean, '.format(args["y_mean_table"]) +\ + ' {0}.std AS x_std, '.format(args["x_mean_table"]) inner_join_x = ' INNER JOIN {0} ON {1} '.format( args["x_mean_table"], x_grp_cols) inner_join_y = ' INNER JOIN {0} ON {1} '.format( @@ -63,7 +61,7 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): out_table_qstr = """ SELECT {select_grp} - {select_x_mean} 
{select_x_std} {select_y_mean} + {select_mean_and_std} (result).coefficients AS coef, (result).intercept AS intercept FROM @@ -72,7 +70,7 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): {grouping_str1} array_to_string(ARRAY[{grouping_str}], ',') AS {col_grp_key} FROM {source_table} - GROUP BY {grouping_col}, {col_grp_key} + GROUP BY {grouping_column}, {col_grp_key} ) n_tuples_including_nulls_subq INNER JOIN ( @@ -82,22 +80,8 @@ def _elastic_net_generate_result(optimizer, iteration_run, **args): ) t USING ({col_grp_key}) {inner_join_x} {inner_join_y} - """.format(result_func=result_func, - tbl_state=tbl_state, - grouping_col=grouping_column, - col_grp_iteration=args["col_grp_iteration"], - iteration_run=iteration_run, - grouping_str1=grouping_str1, - grouping_str=grouping_str, - col_grp_key=col_grp_key, - source_table=source_table, - schema_madlib=args["schema_madlib"], - select_grp=select_grp, - select_x_mean=select_x_mean, - select_y_mean=select_y_mean, - select_x_std=select_x_std, - inner_join_x=inner_join_x, - inner_join_y=inner_join_y) + """.format(schema_madlib=args["schema_madlib"], + col_grp_iteration=args["col_grp_iteration"], **locals()) else: # It's a much simpler query when there is no grouping. grouping_str1 = "" @@ -213,7 +197,7 @@ def _restore_scale(coef, intercept, args, """ Restore the original scales """ - if x_mean == None and x_std == None and y_mean == None: + if x_mean is None and x_std is None and y_mean is None: x_mean = args["x_scales"]["mean"] y_mean = args["y_scale"]["mean"] x_std = args["x_scales"]["std"] diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in index a15fe5d34..b2f2505ab 100644 --- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in +++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in @@ -9,7 +9,6 @@ from convex.utils_regularization import __utils_normalize_data from convex.utils_regularization import __utils_ind_var_scales_grouping from convex.utils_regularization import __utils_dep_var_scale_grouping from convex.utils_regularization import __utils_normalize_data_grouping -from convex.utils_regularization import __utils_dep_var_scale_grouping_binomial from utilities.validate_args import table_exists from collections import namedtuple @@ -195,37 +194,26 @@ def _elastic_net_validate_args(tbl_source, col_ind_var, col_dep_var, # ------------------------------------------------------------------------ def _compute_data_scales_grouping(args): - __utils_ind_var_scales_grouping(tbl_data=args["tbl_source"], - col_ind_var=args["col_ind_var"], dimension=args["dimension"], - schema_madlib=args["schema_madlib"], - grouping_col=args["grouping_col"], - x_mean_table=args["x_mean_table"]) + __utils_ind_var_scales_grouping(args["tbl_source"], args["col_ind_var"], + args["dimension"], args["schema_madlib"], args["grouping_col"], + args["x_mean_table"]) if args["family"] == "binomial": # set mean and std to 0 and 1 respectively, for each group. 
- __utils_dep_var_scale_grouping_binomial( - tbl_data=args["tbl_source"], - grouping_col=args["grouping_col"], - y_mean_table=args["y_mean_table"]) + __utils_dep_var_scale_grouping(args["y_mean_table"], + args["tbl_source"], args["grouping_col"], args["family"]) else: - __utils_dep_var_scale_grouping(schema_madlib=args["schema_madlib"], - tbl_data=args["tbl_source"], - col_ind_var=args["col_ind_var"], - col_dep_var=args["col_dep_var"], - grouping_col=args["grouping_col"], - y_mean_table=args["y_mean_table"]) + __utils_dep_var_scale_grouping(args["y_mean_table"], + args["tbl_source"], args["grouping_col"], args["family"], + args["schema_madlib"], args["col_ind_var"], args["col_dep_var"]) def _compute_data_scales(args): - args["x_scales"] = __utils_ind_var_scales(tbl_data=args["tbl_source"], - col_ind_var=args["col_ind_var"], dimension=args["dimension"], - schema_madlib=args["schema_madlib"]) - + args["x_scales"] = __utils_ind_var_scales(args["tbl_source"], + args["col_ind_var"], args["dimension"], args["schema_madlib"]) if args["family"] == "binomial": args["y_scale"] = dict(mean=0, std=1) else: - args["y_scale"] = __utils_dep_var_scale( - schema_madlib=args["schema_madlib"], - tbl_data=args["tbl_source"], col_ind_var=args["col_ind_var"], - col_dep_var=args["col_dep_var"]) + args["y_scale"] = __utils_dep_var_scale(args["schema_madlib"], + args["tbl_source"], args["col_ind_var"], args["col_dep_var"]) args["xmean_str"] = _array_to_string(args["x_scales"]["mean"]) # ------------------------------------------------------------------------ @@ -240,7 +228,14 @@ def _normalize_data(args): """ y_decenter = True if args["family"] == "gaussian" else False if args["grouping_col"]: + # When grouping_col is defined, we must find an array containing + # the mean of every dimension in the independent variable (x), the + # mean of dependent variable (y) and the standard deviation for them + # specific to groups. Store these results in temp tables x_mean_table + # and y_mean_table. _compute_data_scales_grouping(args) + # __utils_normalize_data_grouping reads the various means and stds + # from the tables. __utils_normalize_data_grouping(y_decenter=y_decenter, tbl_data=args["tbl_source"], col_ind_var=args["col_ind_var"], @@ -253,6 +248,9 @@ def _normalize_data(args): y_mean_table=args["y_mean_table"], grouping_col=args["grouping_col"]) else: + # When no grouping_col is defined, the mean and std for both 'x' and + # 'y' can be defined using strings, stored in x_mean_str, x_std_str + # etc. We don't need a table like how we needed for grouping. _compute_data_scales(args) __utils_normalize_data(y_decenter=y_decenter, tbl_data=args["tbl_source"],