From 29fe759ac2511024f1afab1f7da767e8a62ffd8c Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Tue, 20 Mar 2018 15:43:25 -0700
Subject: [PATCH 1/2] MLP: Allow one-hot encoded dependent var for
 classification

JIRA:MADLIB-1222

MLP automatically one-hot encodes categorical dependent variables for
classification, but mlp_classification does not accept dependent
variables that are already encoded as arrays. This commit lets users
pass an already encoded array as the dependent variable and train a
model on it.

Closes #250
---
 src/modules/convex/mlp_igd.cpp                |   6 +-
 src/modules/convex/task/mlp.hpp               |  20 ++-
 src/ports/postgres/modules/convex/mlp.sql_in  |   3 +-
 .../postgres/modules/convex/mlp_igd.py_in     | 146 +++++++++++-------
 .../postgres/modules/convex/test/mlp.sql_in   |  48 +++++-
 5 files changed, 156 insertions(+), 67 deletions(-)

diff --git a/src/modules/convex/mlp_igd.cpp b/src/modules/convex/mlp_igd.cpp
index b8738d43c..e914c41de 100644
--- a/src/modules/convex/mlp_igd.cpp
+++ b/src/modules/convex/mlp_igd.cpp
@@ -384,7 +384,8 @@ internal_predict_mlp::run(AnyType &args) {
     size_t numberOfStages = layerSizes.size()-1;
     double is_classification = args[2].getAs<double>();
     double activation = args[3].getAs<double>();
-    bool get_class = is_classification && is_response;
+    int is_dep_var_array_for_classification = args[8].getAs<int>();
+    bool is_classification_response = is_classification && is_response;

     model.rebind(&is_classification, &activation, &coeff.data()[0],
                  numberOfStages, &layerSizes.data()[0]);
@@ -393,7 +394,8 @@ internal_predict_mlp::run(AnyType &args) {
     } catch (const ArrayWithNullException &e) {
         return args[0];
     }
-    ColumnVector prediction = MLPTask::predict(model, indVar, get_class);
+    ColumnVector prediction = MLPTask::predict(model, indVar, is_classification_response,
+                                               is_dep_var_array_for_classification);

     return prediction;
 }
diff --git a/src/modules/convex/task/mlp.hpp b/src/modules/convex/task/mlp.hpp
index 8a68aaa56..adf87a762 100644
--- a/src/modules/convex/task/mlp.hpp
+++ b/src/modules/convex/task/mlp.hpp
@@ -66,7 +66,8 @@ class MLP {
     static ColumnVector predict(
             const model_type &model,
             const independent_variables_type &x,
-            const bool get_class);
+            const bool is_classification_response,
+            const bool is_dep_var_array_for_classification);

     const static int RELU = 0;
     const static int SIGMOID = 1;
@@ -219,17 +220,26 @@ ColumnVector
 MLP<Model, Tuple>::predict(
         const model_type &model,
         const independent_variables_type &x,
-        const bool get_class) {
+        const bool is_classification_response,
+        const bool is_dep_var_array_for_classification) {
     std::vector<ColumnVector> net, o;

     feedForward(model, x, net, o);
     ColumnVector output = o.back();
-    if(get_class){ // Return a length 1 array with the predicted index
+    if(is_classification_response){
         int max_idx;
         output.maxCoeff(&max_idx);
-        output.resize(1);
-        output[0] = (double) max_idx;
+        if(is_dep_var_array_for_classification) {
+            // Return the entire array, but with 1 for the class level with
+            // largest probability and 0s for the rest.
+            output.setZero();
+            output[max_idx] = 1;
+        } else {
+            // Return a length 1 array with the predicted index
+            output.resize(1);
+            output[0] = (double) max_idx;
+        }
     }
     return output;
 }
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index 739007e50..f153722eb 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -1669,7 +1669,8 @@ CREATE FUNCTION MADLIB_SCHEMA.internal_predict_mlp(
         layer_sizes DOUBLE PRECISION[],
         is_response INTEGER,
         x_means DOUBLE PRECISION[],
-        x_stds DOUBLE PRECISION[]
+        x_stds DOUBLE PRECISION[],
+        array_dep_var_for_classification INTEGER
     )
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 4a1416cce..800ec29d8 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -130,12 +130,14 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         num_input_nodes = get_col_dimension(source_table, independent_varname,
                                             dim=2)
         if is_classification:
-            _assert(pp_summary_dict["class_values"],
-                    "MLP Error: The pre-processed table created using"
-                    " madlib.minibatch_preprocessor was probably run"
-                    " without casting depedent variable to ::TEXT.")
-            classes = pp_summary_dict["class_values"]
-            num_output_nodes = len(classes)
+            if pp_summary_dict["class_values"]:
+                classes = pp_summary_dict["class_values"]
+                num_output_nodes = len(classes)
+            else:
+                # Assume that the dependent variable is already one-hot-encoded
+                num_output_nodes = get_col_dimension(source_table,
+                                                     dependent_varname,
+                                                     dim=2)
         else:
             num_output_nodes = get_col_dimension(source_table, dependent_varname,
                                                  dim=2)
@@ -156,18 +158,27 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         dependent_type = get_expr_type(dependent_varname, source_table)

         if is_classification:
-            labels = plpy.execute("SELECT DISTINCT {0} FROM {1}".
-                                  format(dependent_varname, source_table))
-            num_output_nodes = len(labels)
-            for label_obj in labels:
-                label = _format_label(label_obj[dependent_varname])
-                classes.append(label)
-            classes.sort()
-            level_vals_str = ','.join(["{0}={1}".format(
-                                      col_dep_var_norm_new, str(c))
-                                      for c in classes])
-            # dependent_varname should be replaced with one-hot encoded varname
-            dependent_varname = "ARRAY[{0}]::integer[]".format(level_vals_str)
+            # If the dependent variable is an array during classification, assume
+            # that it is already one-hot-encoded.
+            if "[]" in dependent_type:
+                # We are now using tbl_data_scaled, so change the dependent
+                # varname accordingly.
+                dependent_varname = col_dep_var_norm_new
+                num_output_nodes = get_col_dimension(tbl_data_scaled,
+                                                     dependent_varname)
+            else:
+                labels = plpy.execute("SELECT DISTINCT {0} FROM {1}".
+                                      format(dependent_varname, source_table))
+                num_output_nodes = len(labels)
+                for label_obj in labels:
+                    label = _format_label(label_obj[dependent_varname])
+                    classes.append(label)
+                classes.sort()
+                level_vals_str = ','.join(["{0}={1}".format(
+                                          col_dep_var_norm_new, str(c))
+                                          for c in classes])
+                # dependent_varname should be replaced with one-hot encoded varname
+                dependent_varname = "ARRAY[{0}]::integer[]".format(level_vals_str)
         else:
             if "[]" not in dependent_type:
                 dependent_varname = "ARRAY[" + col_dep_var_norm_new + "]"
@@ -665,15 +676,26 @@ def _validate_dependent_var(source_table, dependent_varname,
                 "Dependent variable column should be of numeric type.")
     else:
         if is_classification:
-            # Currently, classification doesn't accept an
-            # array for dep type in IGD
-            _assert("[]" not in expr_type and expr_type in classification_types,
-                    "Dependent variable column should be of type: "
-                    "{0}".format(classification_types))
+            _assert(("[]" in expr_type \
+                        and is_psql_numeric_type(expr_type[:-2]) \
+                        and not _is_dep_var_multi_dim(dependent_varname, source_table) \
+                    ) \
+                    or expr_type in classification_types,
+                    "Dependent variable column should either be a numeric 1-D"
+                    " array, or be of type: {0}".format(classification_types))
         else:
             _assert("[]" in expr_type or is_psql_numeric_type(expr_type),
                     "Dependent variable column should be of numeric type.")

+def _is_dep_var_multi_dim(dependent_varname, source_table):
+    # Check if the dependent variable is an array of dimension two or higher.
+    dep_array_sec_dim = plpy.execute("""
+        SELECT array_upper({0}, 2) AS n_y
+        FROM {1}
+        LIMIT 1
+        """.format(dependent_varname, source_table))
+    return bool(dep_array_sec_dim[0]['n_y'])
+
 def _validate_params_based_on_minibatch(source_table, independent_varname,
                                         dependent_varname, weights,
                                         is_classification,
@@ -856,8 +878,16 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
     activation = _get_activation_index(summary['activation'])
     layer_sizes = PY2SQL(
         summary['layer_sizes'], array_type="DOUBLE PRECISION")
-    is_classification = int(summary["is_classification"])
     is_response = int(pred_type == 'response')
+    is_classification = int(summary["is_classification"])
+    classes = summary['classes']
+    # Set a flag to indicate that this is a classification model whose
+    # dependent var is an array. The only scenario where classification
+    # allows an array dep var is when the user provided a one-hot encoded
+    # dep var during training, in which case mlp_classification did not
+    # one-hot encode (and the classes column in the summary table is NULL).
+    is_dep_var_an_array_for_classification = int(is_classification and not classes)
+
     # Fix to ensure that 1.12 models run on 1.13 or higher.
     # As a result of adding grouping support in 1.13, some changes were
     # made wrt standardization.
@@ -922,7 +952,6 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
     else:
         # if not grouping, then directly read out the coeff, mean
         # and std values from the model and standardization tables.
-
         if is_pre_113_model:
             # Get mean and std from the summary table
             standardization = plpy.execute("""
@@ -952,37 +981,22 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
             {layer_sizes},
             {is_response},
             {mean_col},
-            {std_col}
+            {std_col},
+            {is_dep_var_an_array_for_classification}
         )
         """.format(**locals())
-    if not is_classification:
-        dependent_type = get_expr_type(dependent_varname, source_table)
-        unnest_if_not_array = ""
-        # Return the same type as the user provided. Internally we always
-        # use an array, but if they provided a scalar, unnest it for
-        # the user
-        if "[]" not in dependent_type:
-            unnest_if_not_array = "UNNEST"
-        sql = header + """
-            SELECT {grouping_col_comma}
-                    {id_col_name},
-                    {unnest_if_not_array}({predict_uda_query}) AS {pred_name}
-            FROM {data_table}
-            {join_str}
-            {group_by_predict_str}
-            """
-    else:
-        summary_query = """
-        SELECT classes FROM {0}
-        """.format(summary_table)
-        classes = plpy.execute(summary_query)[0]['classes']
+    if is_classification:
         if pred_type == "response":
-            classes_with_index_table = unique_string()
-            classes_table = unique_string()
+            if classes:
+                prediction_select_clause = "(ARRAY{0})[pred_idx[1]+1] AS {1}".format(classes, pred_name)
+            else:
+                # Case when the training step did not have to one-hot encode
+                # the dependent var.
+                prediction_select_clause = "pred_idx AS {0}".format(pred_name)
             sql = header + """
                 SELECT {select_grouping_col}
                     q.{id_col_name},
-                    (ARRAY{classes})[pred_idx[1]+1] as {pred_name}
+                    {prediction_select_clause}
                 FROM (
                     SELECT {grouping_col_comma}
                             {id_col_name},
@@ -994,11 +1008,16 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
                 """
         else:
             intermediate_col = unique_string()
-            score_format = ',\n'.join([
-                'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
-                format(j=i + 1, c_str=str(c).strip(' "'),
-                       interim=intermediate_col)
-                for i, c in enumerate(classes)])
+            if classes:
+                score_format = ',\n'.join([
+                    'CAST({interim}[{j}] as DOUBLE PRECISION) as "estimated_prob_{c_str}"'.
+                    format(j=i + 1, c_str=str(c).strip(' "'),
+                           interim=intermediate_col)
+                    for i, c in enumerate(classes)])
+            else:
+                # Case when the training step did not have to one-hot encode
+                # the dependent var.
+                score_format = '{0} AS estimated_prob'.format(intermediate_col)
             sql = header + """
                 SELECT {select_grouping_col}
                     {id_col_name},
@@ -1012,6 +1031,23 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name,
                 {group_by_predict_str}
             ) q
             """
+    else:
+        # Regression
+        dependent_type = get_expr_type(dependent_varname, source_table)
+        unnest_if_not_array = ""
+        # Return the same type as the user provided. Internally we always
+        # use an array, but if they provided a scalar, unnest it for
+        # the user
+        if "[]" not in dependent_type:
+            unnest_if_not_array = "UNNEST"
+        sql = header + """
+            SELECT {grouping_col_comma}
+                    {id_col_name},
+                    {unnest_if_not_array}({predict_uda_query}) AS {pred_name}
+            FROM {data_table}
+            {join_str}
+            {group_by_predict_str}
+            """
     sql = sql.format(**locals())
     plpy.execute(sql)

diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index da4117204..8a2c92b69 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -215,7 +215,6 @@ CREATE TABLE iris_data_batch_summary(
 );
 INSERT INTO iris_data_batch_summary VALUES
 ('iris_data','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
-
 -- Create the corresponding standardization table for preprocessed data
 CREATE TABLE iris_data_batch_standardization(
     grp text,
@@ -227,7 +226,6 @@ INSERT INTO iris_data_batch_standardization VALUES
 ('1',ARRAY[5.74893617021,3.02482269504,3.6865248227,1.18014184397],ARRAY[0.785472439601,0.396287027644,1.68671151195,0.750245336531]),
 ('2',ARRAY[5.74893617021,3.02482269504,3.6865248227,1.18014184397],ARRAY[0.785472439601,0.396287027644,1.68671151195,0.750245336531]);

--- without minibatch without grouping and without warm start
 DROP TABLE IF EXISTS mlp_class, mlp_class_summary, mlp_class_standardization;
 SELECT mlp_classification(
     'iris_data',    -- Source table
@@ -246,7 +244,7 @@ SELECT mlp_classification(
     False,
     'grp'
 );
-DROP TABLE IF EXISTS mlp_prediction_output, mlp_prediction_output_summary, mlp_prediction_output_standardization;;
+DROP TABLE IF EXISTS mlp_prediction_output;
 SELECT mlp_predict(
     'mlp_class',
     'iris_data',
@@ -301,7 +299,9 @@ SELECT mlp_classification(
     False,
     'grp'
 );
-DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_batch_output_summary, mlp_prediction_batch_output_standardization;
+
+DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_output;
+-- See prediction accuracy for training data
 SELECT mlp_predict(
     'mlp_class_batch',
     'iris_data',
@@ -336,6 +336,46 @@ SELECT mlp_predict(
     'response');

------------------------------------------------ Regression ------------------------------------------------------------
+DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
+
+-- Set the class_values column to NULL so that encoding info is not captured,
+-- to test the case where the dependent variable is an array for classification.
+UPDATE iris_data_batch_summary SET class_values = NULL WHERE source_table='iris_data';
+SELECT mlp_classification(
+    'iris_data_batch',         -- Source table
+    'mlp_class_batch',         -- Destination table
+    'independent_varname',     -- Input features
+    'dependent_varname',       -- Label
+    ARRAY[5],                  -- Number of units per layer
+    'learning_rate_init=0.1,
+    learning_rate_policy=constant,
+    n_iterations=5,
+    tolerance=0,
+    n_epochs=20',
+    'sigmoid',
+    '',
+    False,
+    False
+);
+DROP TABLE IF EXISTS mlp_prediction_batch_output, mlp_prediction_output;
+SELECT mlp_predict(
+    'mlp_class_batch',
+    'iris_data',
+    'id',
+    'mlp_prediction_batch_output',
+    'response');
+SELECT * FROM mlp_prediction_batch_output;
+DROP TABLE IF EXISTS mlp_prediction_batch_output;
+SELECT mlp_predict(
+    'mlp_class_batch',
+    'iris_data',
+    'id',
+    'mlp_prediction_batch_output',
+    'prob');
+SELECT * FROM mlp_prediction_batch_output;
+DROP TABLE IF EXISTS mlp_prediction_batch_output;
+DROP TABLE IF EXISTS mlp_class_batch, mlp_class_batch_summary, mlp_class_batch_standardization;
+
 DROP TABLE IF EXISTS lin_housing_wi CASCADE;
 CREATE TABLE lin_housing_wi (id serial, x float8[], grp int, y float8);

From 1670923bf155395caf51cd7b07edbe9533a6908b Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Tue, 3 Apr 2018 14:32:03 -0700
Subject: [PATCH 2/2] MLP: Remove minibatch training dependency on original
 source

The original source table used by the minibatch preprocessor is stored
in a column named 'original_source_table' in the summary table. Ideally,
that table no longer needs to exist once the preprocessed table has been
created, but the current MLP training code depended on it and would fail
if it was deleted. This commit removes that dependency.

Co-authored-by: Nikhil Kak
---
 .../postgres/modules/convex/mlp_igd.py_in     | 22 ++++++++++++------
 .../postgres/modules/convex/test/mlp.sql_in   |  9 ++++++++-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 800ec29d8..687011cca 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -141,9 +141,14 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         else:
             num_output_nodes = get_col_dimension(source_table, dependent_varname,
                                                  dim=2)
-        # Get the type of the original source table's dependent variable column.
-        dependent_type = get_expr_type(pp_summary_dict['dependent_varname'],
-                                       pp_summary_dict['source_table'])
+
+        # This variable is used to create the classes_str column in the model
+        # summary table. We append [] to the type when we create that column
+        # in the create-summary-table command, so strip any [] suffix here.
+        dependent_type = get_expr_type(mlp_preprocessor.CLASS_VALUES,
+                                       mlp_preprocessor.summary_table)
+        if dependent_type[-2:] == '[]':
+            dependent_type = dependent_type[:-2]
     else:
         x_mean_table = unique_string(desp='x_mean_table')
         tbl_data_scaled = unique_string(desp="tbl_data_scaled")
@@ -184,6 +189,7 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
                 dependent_varname = "ARRAY[" + col_dep_var_norm_new + "]"
             num_output_nodes = get_col_dimension(tbl_data_scaled,
                                                  dependent_varname, dim=1)
+
     # Need layers sizes before validating for warm_start
     layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes]
     col_grp_key = unique_string(desp='col_grp_key')
@@ -451,8 +457,10 @@ def _create_summary_table(args):

     if args['warm_start']:
         plpy.execute("DROP TABLE IF EXISTS {0}".format(args['summary_table']))
+
     classes_str = PY2SQL([strip_end_quotes(cl, "'") for cl in args['classes']],
                          array_type=args['dependent_type'])
+
     minibatch_summary_col_names = ''
     minibatch_summary_col_vals = ''
     if args['is_minibatch_enabled']:
@@ -678,7 +686,7 @@ def _validate_dependent_var(source_table, dependent_varname,
         if is_classification:
             _assert(("[]" in expr_type \
                         and is_psql_numeric_type(expr_type[:-2]) \
-                        and not _is_dep_var_multi_dim(dependent_varname, source_table) \
+                        and not _get_dep_var_second_dim(dependent_varname, source_table) \
                     ) \
                     or expr_type in classification_types,
                     "Dependent variable column should either be a numeric 1-D"
@@ -687,14 +695,16 @@ def _validate_dependent_var(source_table, dependent_varname,
             _assert("[]" in expr_type or is_psql_numeric_type(expr_type),
                     "Dependent variable column should be of numeric type.")

-def _is_dep_var_multi_dim(dependent_varname, source_table):
+def _get_dep_var_second_dim(dependent_varname, source_table):
     # Check if the dependent variable is an array of dimension two or higher.
+    # Returns the value of the second dimension, or None if the array is less
+    # than 2-D.
     dep_array_sec_dim = plpy.execute("""
         SELECT array_upper({0}, 2) AS n_y
         FROM {1}
         LIMIT 1
         """.format(dependent_varname, source_table))
-    return bool(dep_array_sec_dim[0]['n_y'])
+    return dep_array_sec_dim[0]['n_y']

 def _validate_params_based_on_minibatch(source_table, independent_varname,
                                         dependent_varname, weights,
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 8a2c92b69..a40d35a98 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -213,8 +213,15 @@ CREATE TABLE iris_data_batch_summary(
     num_rows_skipped integer,
     grouping_cols text
 );
+-- The availability of the original source table should not be a condition for
+-- MLP to work correctly. It should work fine even if the original source table
+-- is deleted (this basically ensures that all the necessary info is captured
+-- in the summary table). So name the original source table
+-- 'iris_data_does_not_exist' instead of the original 'iris_data', to mimic the
+-- scenario where the original source table is deleted and MLP is trained with
+-- the preprocessed table.
 INSERT INTO iris_data_batch_summary VALUES
-('iris_data','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
+('iris_data_does_not_exist','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
 -- Create the corresponding standardization table for preprocessed data
 CREATE TABLE iris_data_batch_standardization(
     grp text,
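
A minimal usage sketch of what these two patches enable, for reviewers. The
table and column names ('iris_encoded', 'attributes', 'class_one_hot', 'id')
are hypothetical stand-ins, not part of the patches; the call signatures follow
the ones exercised in test/mlp.sql_in above. When the dependent variable is
already a one-hot encoded 1-D integer array, training skips the encoding step
(the classes column in the model's summary table stays NULL), 'response'
prediction returns the one-hot array itself, and 'prob' returns a single
estimated_prob array.

-- Train directly on a pre-encoded dependent variable (hypothetical table).
DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
SELECT madlib.mlp_classification(
    'iris_encoded',          -- Source table; 'class_one_hot' holds e.g. ARRAY[0,1,0]
    'mlp_model',             -- Destination table
    'attributes',            -- Input features: DOUBLE PRECISION[]
    'class_one_hot',         -- Label: already one-hot encoded integer array
    ARRAY[5],                -- Number of units per layer
    'learning_rate_init=0.1,
    n_iterations=20,
    tolerance=0',
    'sigmoid'                -- Activation
);

-- 'response' returns the predicted one-hot array; 'prob' would instead return
-- a single estimated_prob array, since no class values were recorded.
DROP TABLE IF EXISTS mlp_prediction_output;
SELECT madlib.mlp_predict(
    'mlp_model',             -- Model table from the training step above
    'iris_encoded',          -- Table whose rows we predict on
    'id',                    -- Id column
    'mlp_prediction_output', -- Output table
    'response');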