From 5100dfc76f192812d8ba7ede3fc78c8aa65563cd Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Tue, 5 Apr 2016 16:39:38 -0700
Subject: [PATCH 1/2] Elastic Net: Skip arrays with NULL values in train

Jira: MADLIB-978

Having NULL values in the input array of the training data led to an
unhandled exception. This fix catches the exception and skips such input
arrays.

The fix also modifies the code that normalizes the input data (independent
variable) so that it ignores arrays with NULLs while normalizing. The number
of rows in the input table is used while normalizing the data; the query
used to get the number of rows now counts only those rows whose arrays
contain no NULL values.

The mean of the dependent variable was still computed by an SQL command that
did not ignore independent-variable arrays with NULLs. Such rows are now
ignored while computing the mean.
---
 src/modules/convex/utils_regularization.cpp   | 14 ++++-
 .../elastic_net_optimizer_fista.hpp           | 28 +++++-----
 .../elastic_net/elastic_net_optimizer_igd.hpp | 52 +++++++++++--------
 src/modules/elastic_net/elastic_net_utils.cpp |  4 +-
 .../modules/convex/utils_regularization.py_in | 11 +++-
 .../elastic_net_gaussian_fista.py_in          |  2 +-
 .../elastic_net_gaussian_igd.py_in            |  2 +-
 .../elastic_net_optimizer_fista.py_in         |  2 +-
 .../elastic_net_optimizer_igd.py_in           |  2 +-
 .../elastic_net/elastic_net_utils.py_in       | 10 ++--
 .../test/elastic_net_install_check.sql_in     |  7 +++
 11 files changed, 87 insertions(+), 47 deletions(-)

diff --git a/src/modules/convex/utils_regularization.cpp b/src/modules/convex/utils_regularization.cpp
index 7d49b8f6e..845e9cbd6 100644
--- a/src/modules/convex/utils_regularization.cpp
+++ b/src/modules/convex/utils_regularization.cpp
@@ -79,8 +79,13 @@ class ScalesState

 AnyType utils_var_scales_transition::run (AnyType& args)
 {
     ScalesState<MutableArrayHandle<double> > state = args[0];
+    try {
+        args[1].getAs<MappedColumnVector>();
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array contains NULL values, skipping it for mean and std computations");
+        return state;
+    }
     MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-
     if (state.numRows == 0) {
         int dimension = args[2].getAs<int>();
@@ -99,7 +104,6 @@
     }
     state.numRows++;
-
     return state;
 }
@@ -153,6 +157,12 @@ AnyType __utils_var_scales_result::run (AnyType& args)

 AnyType utils_normalize_data::run (AnyType& args)
 {
+    try {
+        args[0].getAs<MutableMappedColumnVector>();
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array contains NULL values, skipping this array normalization");
+        return Null();
+    }
     MutableMappedColumnVector x = args[0].getAs<MutableMappedColumnVector>();
     MappedColumnVector mean = args[1].getAs<MappedColumnVector>();
     MappedColumnVector std = args[2].getAs<MappedColumnVector>();
diff --git a/src/modules/elastic_net/elastic_net_optimizer_fista.hpp b/src/modules/elastic_net/elastic_net_optimizer_fista.hpp
index daae10cda..a82eed81f 100644
--- a/src/modules/elastic_net/elastic_net_optimizer_fista.hpp
+++ b/src/modules/elastic_net/elastic_net_optimizer_fista.hpp
@@ -130,19 +130,23 @@ AnyType Fista<Model>::fista_transition (AnyType& args, const Allocator& inAllocator)
         state.is_active = args[11].getAs<int>();
     }

-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-    double y;
-
-    Model::get_y(y, args);
-
-    if (state.use_active_set == 1 && state.is_active == 1)
-        Model::active_transition(state, x, y);
-    else
-        Model::normal_transition(state, x, y);
-
-    Model::update_loglikelihood(state, x, y);
-    state.numRows++;
+
+    try {
+        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
+        double y;
+        Model::get_y(y, args);
+        if (state.use_active_set == 1 && state.is_active == 1)
+            Model::active_transition(state, x, y);
+        else
+            Model::normal_transition(state, x, y);
+
+        Model::update_loglikelihood(state, x, y);
+        state.numRows++;
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array most likely contains NULL, skipping this array.");
+    } catch(const std::invalid_argument &ia) {
+        //warning("Input array is invalid (with NULL values), skipping this array.");
+    }

     return state;
 }
diff --git a/src/modules/elastic_net/elastic_net_optimizer_igd.hpp b/src/modules/elastic_net/elastic_net_optimizer_igd.hpp
index 6441231b7..d75006e00 100644
--- a/src/modules/elastic_net/elastic_net_optimizer_igd.hpp
+++ b/src/modules/elastic_net/elastic_net_optimizer_igd.hpp
@@ -89,35 +89,41 @@ AnyType Igd<Model>::igd_transition (AnyType& args, const Allocator& inAllocator)

     state.stepsize = state.stepsize / exp(state.step_decay);

-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-    double y;
+    try {
+        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
+        double y;

-    Model::get_y(y, args);
+        Model::get_y(y, args);

-    ColumnVector gradient(state.dimension); // gradient for coef only, not for intercept
-    Model::compute_gradient(gradient, state, x, y);
+        ColumnVector gradient(state.dimension); // gradient for coef only, not for intercept
+        Model::compute_gradient(gradient, state, x, y);

-    double a = state.stepsize / static_cast<double>(state.totalRows);
-    double b = state.stepsize * state.alpha * state.lambda
-        / static_cast<double>(state.totalRows);
-    for (uint32_t i = 0; i < state.dimension; i++)
-    {
-        // step 1
-        state.theta(i) -= a * gradient(i);
-        double step1_sign = sign(state.theta(i));
-        // step 2
-        state.theta(i) -= b * sign(state.theta(i));
-        // set to 0 if the value crossed zero during the two steps
-        if (step1_sign != sign(state.theta(i))) state.theta(i) = 0;
-    }
+        double a = state.stepsize / static_cast<double>(state.totalRows);
+        double b = state.stepsize * state.alpha * state.lambda
+            / static_cast<double>(state.totalRows);
+        for (uint32_t i = 0; i < state.dimension; i++)
+        {
+            // step 1
+            state.theta(i) -= a * gradient(i);
+            double step1_sign = sign(state.theta(i));
+            // step 2
+            state.theta(i) -= b * sign(state.theta(i));
+            // set to 0 if the value crossed zero during the two steps
+            if (step1_sign != sign(state.theta(i))) state.theta(i) = 0;
+        }

-    link_fn(state.theta, state.coef, state.p);
+        link_fn(state.theta, state.coef, state.p);

-    Model::update_intercept(state, x, y); // intercept is updated separately
+        Model::update_intercept(state, x, y); // intercept is updated separately

-    Model::update_loglikelihood(state, x, y);
-    // state.loss += r * r / 2.;
-    state.numRows ++;
+        Model::update_loglikelihood(state, x, y);
+        // state.loss += r * r / 2.;
+        state.numRows ++;
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array most likely contains NULL, skipping this array.");
+    } catch(const std::invalid_argument &ia) {
+        //warning("Input array is invalid (with NULL values), skipping this array.");
+    }

     return state;
 }
diff --git a/src/modules/elastic_net/elastic_net_utils.cpp b/src/modules/elastic_net/elastic_net_utils.cpp
index 354c21664..ee1640a2a 100644
--- a/src/modules/elastic_net/elastic_net_utils.cpp
+++ b/src/modules/elastic_net/elastic_net_utils.cpp
@@ -30,13 +30,15 @@ AnyType __elastic_net_gaussian_predict::run (AnyType& args)
     }

     MappedColumnVector coef = args[0].getAs<MappedColumnVector>();
-    double intercept = args[1].getAs<double>();
     MappedColumnVector x = args[2].getAs<MappedColumnVector>();
+    double intercept = args[1].getAs<double>();

     double predict = intercept + sparse_dot(coef, x);

     return predict;
 }
+
+
 // ------------------------------------------------------------------------

 /**
diff --git a/src/ports/postgres/modules/convex/utils_regularization.py_in b/src/ports/postgres/modules/convex/utils_regularization.py_in
index 72c293cce..f28ab60ad 100644
--- a/src/ports/postgres/modules/convex/utils_regularization.py_in
+++ b/src/ports/postgres/modules/convex/utils_regularization.py_in
@@ -38,7 +38,6 @@ def __utils_ind_var_scales(**kwargs):
     return x_scales

 # ========================================================================
-
 def __utils_dep_var_scale(**kwargs):
     """
     The mean and standard deviation for each element of the dependent variable,
@@ -47,14 +46,22 @@ def __utils_dep_var_scale(**kwargs):
     The output will be stored in a temp table: a mean array and a std array

     This function is also used in lasso.
+
+    Parameters:
+    schema_madlib -- madlib schema
+    tbl_data -- original data
+    col_ind_var -- independent variables column
+    col_dep_var -- dependent variable column
     """
+
     y_scale = plpy.execute(
         """
         select
-            avg({col_dep_var}) as mean,
+            avg(case when not {schema_madlib}.array_contains_null({col_ind_var}) then {col_dep_var} end) as mean,
             1 as std
         from {tbl_data}
         """.format(**kwargs))[0]
+
     return y_scale

 # ========================================================================
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
index 3c0c20be1..fd56fa01b 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
@@ -282,7 +282,7 @@ def __elastic_net_gaussian_fista_train_compute(
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
index cbf8c7cb2..d9f9c2d5e 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
@@ -268,7 +268,7 @@ def __elastic_net_gaussian_igd_train_compute(schema_madlib, tbl_source, col_ind_
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
index e3e59778f..86604cbe5 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
@@ -326,7 +326,7 @@ def __elastic_net_fista_train_compute(schema_madlib, func_step_aggregate,
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
index 1ef3553e0..7371d5e9b 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
@@ -298,7 +298,7 @@ def __elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index df97d47ae..aa7d49387 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -212,7 +212,8 @@ def __compute_data_scales(args):
     if args["family"] == "binomial":
         args["y_scale"] = dict(mean=0, std=1)
     else:
-        args["y_scale"] = __utils_dep_var_scale(tbl_data=args["tbl_source"], col_dep_var=args["col_dep_var"])
+        args["y_scale"] = __utils_dep_var_scale(schema_madlib=args["schema_madlib"], tbl_data=args["tbl_source"],
+                                                col_ind_var=args["col_ind_var"], col_dep_var=args["col_dep_var"])

     args["xmean_str"] = _array_to_string(args["x_scales"]["mean"])

 # ========================================================================
@@ -246,7 +247,7 @@ def __normalize_data(args):

 # ========================================================================

-def __tbl_dimension_rownum(tbl_source, col_ind_var):
+def __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
     """
     Measure the dimension and row number of source data table
     """
@@ -259,7 +260,10 @@ def __tbl_dimension_rownum(tbl_source, col_ind_var):
     # total row number of data source table
     row_num = plpy.execute("""
                            select count(*) from {tbl_source}
-                           """.format(tbl_source=tbl_source))[0]["count"]
+                           WHERE not {schema_madlib}.array_contains_null({col_ind_var})
+                           """.format(tbl_source=tbl_source,
+                                      schema_madlib=schema_madlib,
+                                      col_ind_var=col_ind_var))[0]["count"]

     return (dimension, row_num)

 # ========================================================================
diff --git a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
index a3ddf347b..87c1e3436 100644
--- a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
+++ b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
@@ -6,6 +6,7 @@ DROP TABLE IF EXISTS lin_housing_wi;
 CREATE TABLE lin_housing_wi (id serial, x float8[],y float8);
 COPY lin_housing_wi (x, y) FROM STDIN NULL '?';
+{1,0.00632,18.00,2.310,NULL,0.5380,6.5750,NULL,4.0900,1,296.0,15.30,396.90,4.98} 24.00
 {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98} 24.00
 {1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14} 21.60
 {1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03} 34.70
@@ -512,6 +513,9 @@ COPY lin_housing_wi (x, y) FROM STDIN NULL '?';
 {1,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,5.64} 23.90
 {1,0.10959,0.00,11.930,0,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,393.45,6.48} 22.00
 {1,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88} 11.90
+{1,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,NULL} 23.90
+{1,0.10959,0.00,11.930,NULL,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,NULL,6.48} 22.00
+{NULL,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88} 11.90
 \.

 DROP TABLE IF EXISTS elastic_type_src;
@@ -632,6 +636,7 @@ begin
                 'id',
                 'house_test_null_gaussian');

+    EXECUTE 'drop table if exists house_en';
     PERFORM elastic_net_train(
                 'lin_housing_wi',
@@ -666,6 +671,7 @@ begin
                 'id',
                 'house_test_null_binomial');

+    EXECUTE 'drop table if exists house_en';
     PERFORM elastic_net_train(
                 'lin_housing_wi',
@@ -700,6 +706,7 @@ begin
                 'id',
                 'house_test_null_binomial');

+    EXECUTE 'DROP TABLE IF EXISTS elastic_type_res';
     PERFORM elastic_net_train('elastic_type_src',
                 'elastic_type_res',

From 488ef3c58509de9fc8d5b78e08d3816010c6342b Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Fri, 8 Apr 2016 11:02:30 -0700
Subject: [PATCH 2/2] This commit incorporates the feedback obtained for the
 previous pull request.

- A more efficient way of initializing x.
- Added a comment indicating a strong assumption made about the input data.
  There is no NULL check made for the dependent variable (y) while computing
  the mean and std for the independent variable (x), since one of the hard
  assumptions made about the input data to elastic_net is that the dependent
  variable should not be NULL.
---
 src/modules/convex/utils_regularization.cpp               | 8 ++++----
 .../postgres/modules/elastic_net/elastic_net_utils.py_in  | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/modules/convex/utils_regularization.cpp b/src/modules/convex/utils_regularization.cpp
index 845e9cbd6..fc7e44d8a 100644
--- a/src/modules/convex/utils_regularization.cpp
+++ b/src/modules/convex/utils_regularization.cpp
@@ -79,13 +79,13 @@ class ScalesState

 AnyType utils_var_scales_transition::run (AnyType& args)
 {
     ScalesState<MutableArrayHandle<double> > state = args[0];
+    MappedColumnVector x(NULL);
     try {
-        args[1].getAs<MappedColumnVector>();
+        new (&x) MappedColumnVector(args[1].getAs<MappedColumnVector>());
     } catch(const ArrayWithNullException &e) {
         // warning("Input array contains NULL values, skipping it for mean and std computations");
         return state;
     }
-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
     if (state.numRows == 0) {
         int dimension = args[2].getAs<int>();
@@ -157,13 +157,13 @@ AnyType __utils_var_scales_result::run (AnyType& args)

 AnyType utils_normalize_data::run (AnyType& args)
 {
+    MutableMappedColumnVector x(NULL);
     try {
-        args[0].getAs<MutableMappedColumnVector>();
+        new (&x) MutableMappedColumnVector(args[0].getAs<MutableMappedColumnVector>());
     } catch(const ArrayWithNullException &e) {
         // warning("Input array contains NULL values, skipping this array normalization");
         return Null();
     }
-    MutableMappedColumnVector x = args[0].getAs<MutableMappedColumnVector>();
     MappedColumnVector mean = args[1].getAs<MappedColumnVector>();
     MappedColumnVector std = args[2].getAs<MappedColumnVector>();
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index aa7d49387..144fe2772 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -258,6 +258,10 @@ def __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
                     """.format(tbl_source=tbl_source,
                                col_ind_var=col_ind_var))[0]["dimension"]
     # total row number of data source table
+    # The WHERE clause here ignores rows in the table that contain one or more NULLs in the
+    # independent variable (x). There is no NULL check made for the dependent variable (y),
+    # since one of the hard requirements/assumptions of the input data to elastic_net is that the
+    # dependent variable cannot be NULL.
     row_num = plpy.execute("""
                            select count(*) from {tbl_source}
                            WHERE not {schema_madlib}.array_contains_null({col_ind_var})
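
For illustration only (not part of either patch): a minimal SQL sketch of the row-skipping rule these changes implement, using the same madlib.array_contains_null() helper that the row-count and mean queries above rely on. The table name houses and its columns x (independent-variable array) and y (dependent variable) are assumptions made for this example.

    -- Rows whose independent-variable array contains a NULL; these are the rows
    -- the training aggregates and the row-count/mean queries now skip.
    SELECT count(*) AS rows_skipped
    FROM houses
    WHERE madlib.array_contains_null(x);

    -- Rows that remain available for training after this change.
    SELECT count(*) AS rows_used_in_training
    FROM houses
    WHERE NOT madlib.array_contains_null(x);

Only the independent-variable array is checked; as noted in the comment above, the dependent variable is assumed to be non-NULL.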