From 29fe759ac2511024f1afab1f7da767e8a62ffd8c Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Tue, 20 Mar 2018 15:43:25 -0700
Subject: [PATCH 1/2] MLP: Allow one-hot encoded dependent var for
 classification

JIRA:MADLIB-1222

MLP automatically one-hot encodes categorical dependent variables for
classification, but mlp_classification does not accept dependent
variables that are already encoded as arrays. This commit lets users
pass an already encoded array as the dependent variable and train a
model on it.

Closes #250
---
 src/modules/convex/mlp_igd.cpp                |   6 +-
 src/modules/convex/task/mlp.hpp               |  20 ++-
 src/ports/postgres/modules/convex/mlp.sql_in  |   3 +-
 .../postgres/modules/convex/mlp_igd.py_in     | 146 +++++++++++-------
 .../postgres/modules/convex/test/mlp.sql_in   |  48 +++++-
 5 files changed, 156 insertions(+), 67 deletions(-)

diff --git a/src/modules/convex/mlp_igd.cpp b/src/modules/convex/mlp_igd.cpp
index b8738d43c..e914c41de 100644
--- a/src/modules/convex/mlp_igd.cpp
+++ b/src/modules/convex/mlp_igd.cpp
@@ -384,7 +384,8 @@ internal_predict_mlp::run(AnyType &args) {
     size_t numberOfStages = layerSizes.size()-1;
     double is_classification = args[2].getAs<double>();
     double activation = args[3].getAs<double>();
-    bool get_class = is_classification && is_response;
+    int is_dep_var_array_for_classification = args[8].getAs<int>();
+    bool is_classification_response = is_classification && is_response;

     model.rebind(&is_classification, &activation, &coeff.data()[0],
                  numberOfStages, &layerSizes.data()[0]);
@@ -393,7 +394,8 @@ internal_predict_mlp::run(AnyType &args) {
     } catch (const ArrayWithNullException &e) {
         return args[0];
     }
-    ColumnVector prediction = MLPTask::predict(model, indVar, get_class);
+    ColumnVector prediction = MLPTask::predict(model, indVar, is_classification_response,
+                                               is_dep_var_array_for_classification);

     return prediction;
 }
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index 8a68aaa56..adf87a762 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -66,7 +66,8 @@ class MLP {
     static ColumnVector predict(
             const model_type &model,
             const independent_variables_type &x,
-            const bool get_class);
+            const bool is_classification_response,
+            const bool is_dep_var_array_for_classification);

     const static int RELU = 0;
     const static int SIGMOID = 1;
@@ -219,17 +220,26 @@ ColumnVector
 MLP<Model, Tuple>::predict(
         const model_type &model,
         const independent_variables_type &x,
-        const bool get_class) {
+        const bool is_classification_response,
+        const bool is_dep_var_array_for_classification) {
     std::vector<ColumnVector> net, o;

     feedForward(model, x, net, o);
     ColumnVector output = o.back();
-    if(get_class){ // Return a length 1 array with the predicted index
+    if(is_classification_response){
         int max_idx;
         output.maxCoeff(&max_idx);
-        output.resize(1);
-        output[0] = (double) max_idx;
+        if(is_dep_var_array_for_classification) {
+            // Return the entire array, but with 1 for the class level with
+            // largest probability and 0s for the rest.
+            output.setZero();
+            output[max_idx] = 1;
+        } else {
+            // Return a length 1 array with the predicted index
+            output.resize(1);
+            output[0] = (double) max_idx;
+        }
     }
     return output;
 }
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index 739007e50..f153722eb 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -1669,7 +1669,8 @@ CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
         layer_sizes DOUBLE PRECISION[],
         is_response INTEGER,
         x_means DOUBLE PRECISION[],
-        x_stds DOUBLE PRECISION[]
+        x_stds DOUBLE PRECISION[],
+        array_dep_var_for_classification INTEGER
     )
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 4a1416cce..800ec29d8 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -130,12 +130,14 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         num_input_nodes = get_col_dimension(source_table, independent_varname,
                                             dim=2)
         if is_classification:
-            _assert(pp_summary_dict["class_values"],
-                    "MLP Error: The pre-processed table created using"
-                    " madlib.minibatch_preprocessor was probably run"
-                    " without casting depedent variable to ::TEXT.")
-            classes = pp_summary_dict["class_values"]
-            num_output_nodes = len(classes)
+            if pp_summary_dict["class_values"]:
+                classes = pp_summary_dict["class_values"]
+                num_output_nodes = len(classes)
+            else:
+                # Assume that the dependent variable is already one-hot-encoded
+                num_output_nodes = get_col_dimension(source_table,
+                                                     dependent_varname,
+                                                     dim=2)
         else:
             num_output_nodes = get_col_dimension(source_table, dependent_varname,
                                                  dim=2)
@@ -156,18 +158,27 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         dependent_type = get_expr_type(dependent_varname, source_table)

         if is_classification:
-            labels = plpy.execute("SELECT DISTINCT {0} FROM {1}".
-                                  format(dependent_varname, source_table))
-            num_output_nodes = len(labels)
-            for label_obj in labels:
-                label = _format_label(label_obj[dependent_varname])
-                classes.append(label)
-            classes.sort()
-            level_vals_str = ','.join(["{0}={1}".format(
-                                      col_dep_var_norm_new, str(c))
-                                      for c in classes])
-            # dependent_varname should be replaced with one-hot encoded varname
-            dependent_varname = "ARRAY[{0}]::integer[]".format(level_vals_str)
+            # If the dependent variable is an array during classification, assume
+            # that it is already one-hot-encoded.
+            if "[]" in dependent_type:
+                # We are now using tbl_data_scaled, so change the dependent
+                # varname accordingly.
+                dependent_varname = col_dep_var_norm_new
+                num_output_nodes = get_col_dimension(tbl_data_scaled,
+                                                     dependent_varname)
+            else:
+                labels = plpy.execute("SELECT DISTINCT {0} FROM {1}".
+                                      format(dependent_varname, source_table))
+                num_output_nodes = len(labels)
+                for label_obj in labels:
+                    label = _format_label(label_obj[dependent_varname])
+                    classes.append(label)
+                classes.sort()
+                level_vals_str = ','.join(["{0}={1}".format(
+                                          col_dep_var_norm_new, str(c))
+                                          for c in classes])
+                # dependent_varname should be replaced with one-hot encoded varname
+                dependent_varname = "ARRAY[{0}]::integer[]".format(level_vals_str)
         else:
             if "[]" not in dependent_type:
                 dependent_varname = "ARRAY[" + col_dep_var_norm_new + "]"
@@ -665,15 +676,26 @@ def _validate_dependent_var(source_table, dependent_varname,
                 "Dependent variable column should be of numeric type.")
     else:
         if is_classification:
-            # Currently, classification doesn't accept an
-            # array for dep type in IGD
-            _assert("[]" not in expr_type and expr_type in classification_types,
-                    "Dependent variable column should be of type: "
-                    "{0}".format(classification_types))
+            _assert(("[]" in expr_type \
+                        and is_psql_numeric_type(expr_type[:-2]) \
+                        and not _is_dep_var_multi_dim(dependent_varname, source_table) \
+                    ) \
+                    or expr_type in classification_types,
+                    "Dependent variable column should either be a numeric 1-D"
+                    " array, or be of type: {0}".format(classification_types))
         else:
             _assert("[]" in expr_type or is_psql_numeric_type(expr_type),
                     "Dependent variable column should be of numeric type.")

+def _is_dep_var_multi_dim(dependent_varname, source_table):
+    # Check if the dependent variable is an array of dimension two or higher.
+    dep_array_sec_dim = plpy.execute("""
+        SELECT array_upper({0}, 2) AS n_y
+        FROM {1}
+        LIMIT 1
+        """.format(dependent_varname, source_table))
+    return bool(dep_array_sec_dim[0]['n_y'])
+
 def _validate_params_based_on_minibatch(source_table, independent_varname,
                                         dependent_varname, weights,
                                         is_classification,
@@ -856,8 +878,16 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
     activation = _get_activation_index(summary['activation'])
     layer_sizes = PY2SQL(
         summary['layer_sizes'], array_type="DOUBLE PRECISION")
-    is_classification = int(summary["is_classification"])
     is_response = int(pred_type == 'response')
+    is_classification = int(summary["is_classification"])
+    classes = summary['classes']
+    # Set a flag to indicate that this is a classification model whose
+    # dependent var is an array. The only scenario where classification
+    # allows an array dep var is when the user provided a one-hot encoded
+    # dep var during training, in which case mlp_classification did not
+    # one-hot encode (and the classes column in the summary table is NULL).
+    is_dep_var_an_array_for_classification = int(is_classification and not classes)
+
     # Fix to ensure that 1.12 models run on 1.13 or higher.
     # As a result of adding grouping support in 1.13, some changes were
     # made wrt standardization.
@@ -922,7 +952,6 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
     else:
         # if not grouping, then directly read out the coeff, mean
         # and std values from the model and standardization tables.
-
         if is_pre_113_model:
             # Get mean and std from the summary table
             standardization = plpy.execute("""
@@ -952,37 +981,22 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
             {layer_sizes},
             {is_response},
             {mean_col},
-            {std_col}
+            {std_col},
+            {is_dep_var_an_array_for_classification}
         )
         """.format(**locals())
-    if not is_classification:
-        dependent_type = get_expr_type(dependent_varname, source_table)
-        unnest_if_not_array = ""
-        # Return the same type as the user provided. Internally we always
-        # use an array, but if they provided a scalar, unnest it for
-        # the user
-        if "[]" not in dependent_type:
-            unnest_if_not_array = "UNNEST"
-        sql = header + """
-            SELECT {grouping_col_comma}
-                    {id_col_name},
-                    {unnest_if_not_array}({predict_uda_query}) AS {pred_name}
-            FROM {data_table}
-            {join_str}
-            {group_by_predict_str}
-            """
-    else:
-        summary_query = """
-        SELECT classes FROM {0}
-        """.format(summary_table)
-        classes = plpy.execute(summary_query)[0]['classes']
+    if is_classification:
         if pred_type == "response":
-            classes_with_index_table = unique_string()
-            classes_table = unique_string()
+            if classes:
+                prediction_select_clause = "(ARRAY{0})[pred_idx[1]+1] AS {1}".format(classes, pred_name)
+            else:
+                # Case when the training step did not have to one-hot encode
+                # the dependent var.
+                prediction_select_clause = "pred_idx AS {0}".format(pred_name)
             sql = header + """
                 SELECT {select_grouping_col}
                     q.{id_col_name},
-                    (ARRAY{classes})[pred_idx[1]+1] as {pred_name}
+                    {prediction_select_clause}
                 FROM (
                     SELECT {grouping_col_comma}
                             {id_col_name},
@@ -994,11 +1008,16 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
                 """
         else:
             intermediate_col = unique_string()
-            score_format = ',\n'.join([
-                'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
-                format(j=i + 1, c_str=str(c).strip(' "'),
-                       interim=intermediate_col)
-                for i, c in enumerate(classes)])
+            if classes:
+                score_format = ',\n'.join([
+                    'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
+                    format(j=i + 1, c_str=str(c).strip(' "'),
+                           interim=intermediate_col)
+                    for i, c in enumerate(classes)])
+            else:
+                # Case when the training step did not have to one-hot encode
+                # the dependent var.
+                score_format = '{0} AS estimated_prob'.format(intermediate_col)
             sql = header + """
                 SELECT {select_grouping_col}
                     {id_col_name},
@@ -1012,6 +1031,23 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
                 {group_by_predict_str}
             ) q
             """
+    else:
+        # Regression
+        dependent_type = get_expr_type(dependent_varname, source_table)
+        unnest_if_not_array = ""
+        # Return the same type as the user provided. Internally we always
+        # use an array, but if they provided a scalar, unnest it for
+        # the user
+        if "[]" not in dependent_type:
+            unnest_if_not_array = "UNNEST"
+        sql = header + """
+            SELECT {grouping_col_comma}
+                    {id_col_name},
+                    {unnest_if_not_array}({predict_uda_query}) AS {pred_name}
+            FROM {data_table}
+            {join_str}
+            {group_by_predict_str}
+            """
     sql = sql.format(**locals())
     plpy.execute(sql)

diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index da4117204..8a2c92b69 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -215,7 +215,6 @@ CREATE TABLE iris_data_batch_summary(
 );
 INSERT INTO iris_data_batch_summary VALUES
 ('iris_data','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
-
 -- Create the corresponding standardization table for preprocessed data
 CREATE TABLE iris_data_batch_standardization(
     grp text,
@@ -227,7 +226,6 @@ INSERT INTO iris_data_batch_standardization VALUES
 ('1',ARRAY[5.74893617021,3.02482269504,3.6865248227,1.18014184397],ARRAY[0.785472439601,0.396287027644,1.68671151195,0.750245336531]),
 ('2',ARRAY[5.74893617021,3.02482269504,3.6865248227,1.18014184397],ARRAY[0.785472439601,0.396287027644,1.68671151195,0.750245336531]);

--- without minibatch without grouping and without warm start
 DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
 SELECT mlp_classification(
     'iris_data',    -- Source table
@@ -246,7 +244,7 @@ SELECT mlp_classification(
     False,
     'grp'
 );
-DROP TABLE IF EXISTS mlp_prediction_output, mlp_prediction_output_summary, mlp_prediction_output_standardization;;
+DROP TABLE IF EXISTS mlp_prediction_output;
 SELECT mlp_predict(
     'mlp_class',
     'iris_data',
@@ -301,7 +299,9 @@ SELECT mlp_classification(
     False,
     'grp'
 );
-DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_batch_output_summary, mlp_prediction_batch_output_standardization;
+
+DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_output;
+-- See prediction accuracy for training data
 SELECT mlp_predict(
     'mlp_class_batch',
     'iris_data',
@@ -336,6 +336,46 @@ SELECT mlp_predict(
     'response');

------------------------------------------------ Regression ------------------------------------------------------------
+DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
+
+-- Set the class_values column to NULL so that encoding info is not captured,
+-- to test the case where the dependent variable is an array for classification.
+UPDATE iris_data_batch_summary SET class_values = NULL WHERE source_table='iris_data';
+SELECT mlp_classification(
+    'iris_data_batch',         -- Source table
+    'mlp_class_batch',         -- Destination table
+    'independent_varname',     -- Input features
+    'dependent_varname',       -- Label
+    ARRAY[5],                  -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=5,
+    tolerance=0,
+    n_epochs=20',
+    'sigmoid',
+    '',
+    False,
+    False
+);
+DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_output;
+SELECT mlp_predict(
+    'mlp_class_batch',
+    'iris_data',
+    'id',
+    'mlp_prediction_batch_output',
+    'response');
+SELECT * FROM mlp_prediction_batch_output;
+DROP TABLE IF EXISTS mlp_prediction_batch_output;
+SELECT mlp_predict(
+    'mlp_class_batch',
+    'iris_data',
+    'id',
+    'mlp_prediction_batch_output',
+    'prob');
+SELECT * FROM mlp_prediction_batch_output;
+DROP TABLE IF EXISTS mlp_prediction_batch_output;
+DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
+
 DROP TABLE IF EXISTS lin_housing_wi CASCADE;
 CREATE TABLE lin_housing_wi (id serial, x float8[], grp int, y float8);

From 1670923bf155395caf51cd7b07edbe9533a6908b Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Tue, 3 Apr 2018 14:32:03 -0700
Subject: [PATCH 2/2] MLP: Remove minibatch training dependency on original
 source

The original source table used by the minibatch preprocessor is stored
in a column named 'original_source_table' in the summary table. Ideally,
that table no longer needs to exist once the preprocessed table has been
created, but the current MLP training code depended on it and would fail
if it was deleted. This commit removes that dependency.

Co-authored-by: Nikhil Kak
---
 .../postgres/modules/convex/mlp_igd.py_in     | 22 ++++++++++++------
 .../postgres/modules/convex/test/mlp.sql_in   |  9 ++++++++-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 800ec29d8..687011cca 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -141,9 +141,14 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         else:
             num_output_nodes = get_col_dimension(source_table, dependent_varname,
                                                  dim=2)
-        # Get the type of the original source table's dependent variable column.
-        dependent_type = get_expr_type(pp_summary_dict['dependent_varname'],
-                                       pp_summary_dict['source_table'])
+
+        # This variable is used to create the classes_str column in the model
+        # summary table. We append [] to the type when we create that column
+        # in the create-summary-table command, so strip any [] suffix here.
+        dependent_type = get_expr_type(mlp_preprocessor.CLASS_VALUES,
+                                       mlp_preprocessor.summary_table)
+        if dependent_type[-2:] == '[]':
+            dependent_type = dependent_type[:-2]
     else:
         x_mean_table = unique_string(desp='x_mean_table')
         tbl_data_scaled = unique_string(desp="tbl_data_scaled")
@@ -184,6 +189,7 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
                 dependent_varname = "ARRAY[" + col_dep_var_norm_new + "]"
             num_output_nodes = get_col_dimension(tbl_data_scaled,
                                                  dependent_varname, dim=1)
+
     # Need layers sizes before validating for warm_start
     layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes]
     col_grp_key = unique_string(desp='col_grp_key')
@@ -451,8 +457,10 @@ def _create_summary_table(args):

     if args['warm_start']:
         plpy.execute("DROP TABLE IF EXISTS {0}".format(args['summary_table']))
+
     classes_str = PY2SQL([strip_end_quotes(cl, "'") for cl in args['classes']],
                          array_type=args['dependent_type'])
+
     minibatch_summary_col_names = ''
     minibatch_summary_col_vals = ''
     if args['is_minibatch_enabled']:
@@ -678,7 +686,7 @@ def _validate_dependent_var(source_table, dependent_varname,
         if is_classification:
             _assert(("[]" in expr_type \
                         and is_psql_numeric_type(expr_type[:-2]) \
-                        and not _is_dep_var_multi_dim(dependent_varname, source_table) \
+                        and not _get_dep_var_second_dim(dependent_varname, source_table) \
                     ) \
                     or expr_type in classification_types,
                     "Dependent variable column should either be a numeric 1-D"
@@ -687,14 +695,16 @@ def _validate_dependent_var(source_table, dependent_varname,
             _assert("[]" in expr_type or is_psql_numeric_type(expr_type),
                     "Dependent variable column should be of numeric type.")

-def _is_dep_var_multi_dim(dependent_varname, source_table):
+def _get_dep_var_second_dim(dependent_varname, source_table):
     # Check if the dependent variable is an array of dimension two or higher.
+    # Returns the value of the second dimension, or None if the array is less
+    # than 2-D.
     dep_array_sec_dim = plpy.execute("""
         SELECT array_upper({0}, 2) AS n_y
         FROM {1}
         LIMIT 1
         """.format(dependent_varname, source_table))
-    return bool(dep_array_sec_dim[0]['n_y'])
+    return dep_array_sec_dim[0]['n_y']

 def _validate_params_based_on_minibatch(source_table, independent_varname,
                                         dependent_varname, weights,
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 8a2c92b69..a40d35a98 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -213,8 +213,15 @@ CREATE TABLE iris_data_batch_summary(
     num_rows_skipped integer,
     grouping_cols text
 );
+-- The availability of the original source table should not be a condition for
+-- MLP to work correctly. It should work fine even if the original source table
+-- is deleted (this basically ensures that all the necessary info is captured
+-- in the summary table). So name the original source table
+-- 'iris_data_does_not_exist' instead of the original 'iris_data', to mimic the
+-- scenario where the original source table is deleted and MLP is trained with
+-- the preprocessed table.
 INSERT INTO iris_data_batch_summary VALUES
-('iris_data','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
+('iris_data_does_not_exist','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
 -- Create the corresponding standardization table for preprocessed data
 CREATE TABLE iris_data_batch_standardization(
     grp text,
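
A minimal usage sketch of what these two patches enable, for reviewers. The
table and column names ('iris_encoded', 'attributes', 'class_one_hot', 'id')
are hypothetical stand-ins, not part of the patches; the call signatures follow
the ones exercised in test/mlp.sql_in above. When the dependent variable is
already a one-hot encoded 1-D integer array, training skips the encoding step
(the classes column in the model's summary table stays NULL), 'response'
prediction returns the one-hot array itself, and 'prob' returns a single
estimated_prob array.

-- Train directly on a pre-encoded dependent variable (hypothetical table).
DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
SELECT madlib.mlp_classification(
    'iris_encoded',          -- Source table; 'class_one_hot' holds e.g. ARRAY[0,1,0]
    'mlp_model',             -- Destination table
    'attributes',            -- Input features: DOUBLE PRECISION[]
    'class_one_hot',         -- Label: already one-hot encoded integer array
    ARRAY[5],                -- Number of units per layer
    'learning_rate_init=0.1,
    n_iterations=20,
    tolerance=0',
    'sigmoid'                -- Activation
);

-- 'response' returns the predicted one-hot array; 'prob' would instead return
-- a single estimated_prob array, since no class values were recorded.
DROP TABLE IF EXISTS mlp_prediction_output;
SELECT madlib.mlp_predict(
    'mlp_model',             -- Model table from the training step above
    'iris_encoded',          -- Table whose rows we predict on
    'id',                    -- Id column
    'mlp_prediction_output', -- Output table
    'response');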