From 5100dfc76f192812d8ba7ede3fc78c8aa65563cd Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Tue, 5 Apr 2016 16:39:38 -0700
Subject: [PATCH 1/2] Elastic Net: Skip arrays with NULL values in train

Jira: MADLIB-978

Having NULL values in the input array of the training data led to an
unhandled exception. This fix catches the exception and skips such input
arrays.

The fix also modifies the code that normalizes the input data (independent
variable) so that it ignores arrays with NULLs while normalizing. The number
of rows in the input table is used while normalizing the data; the query
used to get the number of rows now counts only those rows whose arrays
contain no NULL values.

The mean of the dependent variable was still computed by an SQL command that
did not ignore independent-variable arrays with NULLs. Such rows are now
ignored while computing the mean.
---
 src/modules/convex/utils_regularization.cpp   | 14 ++++-
 .../elastic_net_optimizer_fista.hpp           | 28 +++++-----
 .../elastic_net/elastic_net_optimizer_igd.hpp | 52 +++++++++++--------
 src/modules/elastic_net/elastic_net_utils.cpp |  4 +-
 .../modules/convex/utils_regularization.py_in | 11 +++-
 .../elastic_net_gaussian_fista.py_in          |  2 +-
 .../elastic_net_gaussian_igd.py_in            |  2 +-
 .../elastic_net_optimizer_fista.py_in         |  2 +-
 .../elastic_net_optimizer_igd.py_in           |  2 +-
 .../elastic_net/elastic_net_utils.py_in       | 10 ++--
 .../test/elastic_net_install_check.sql_in     |  7 +++
 11 files changed, 87 insertions(+), 47 deletions(-)

diff --git a/src/modules/convex/utils_regularization.cpp b/src/modules/convex/utils_regularization.cpp
index 7d49b8f6e..845e9cbd6 100644
--- a/src/modules/convex/utils_regularization.cpp
+++ b/src/modules/convex/utils_regularization.cpp
@@ -79,8 +79,13 @@ class ScalesState

 AnyType utils_var_scales_transition::run (AnyType& args)
 {
     ScalesState<MutableArrayHandle<double> > state = args[0];
+    try {
+        args[1].getAs<MappedColumnVector>();
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array contains NULL values, skipping it for mean and std computations");
+        return state;
+    }
     MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-
     if (state.numRows == 0) {
         int dimension = args[2].getAs<int>();
@@ -99,7 +104,6 @@
     }
     state.numRows++;
-
     return state;
 }
@@ -153,6 +157,12 @@ AnyType __utils_var_scales_result::run (AnyType& args)

 AnyType utils_normalize_data::run (AnyType& args)
 {
+    try {
+        args[0].getAs<MutableMappedColumnVector>();
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array contains NULL values, skipping this array normalization");
+        return Null();
+    }
     MutableMappedColumnVector x = args[0].getAs<MutableMappedColumnVector>();
     MappedColumnVector mean = args[1].getAs<MappedColumnVector>();
     MappedColumnVector std = args[2].getAs<MappedColumnVector>();
diff --git a/src/modules/elastic_net/elastic_net_optimizer_fista.hpp b/src/modules/elastic_net/elastic_net_optimizer_fista.hpp
index daae10cda..a82eed81f 100644
--- a/src/modules/elastic_net/elastic_net_optimizer_fista.hpp
+++ b/src/modules/elastic_net/elastic_net_optimizer_fista.hpp
@@ -130,19 +130,23 @@ AnyType Fista<Model>::fista_transition (AnyType& args, const Allocator& inAllocator)
         state.is_active = args[11].getAs<int>();
     }

-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-    double y;
-
-    Model::get_y(y, args);
-
-    if (state.use_active_set == 1 && state.is_active == 1)
-        Model::active_transition(state, x, y);
-    else
-        Model::normal_transition(state, x, y);
-
-    Model::update_loglikelihood(state, x, y);
-    state.numRows++;
+
+    try {
+        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
+        double y;
+        Model::get_y(y, args);
+        if (state.use_active_set == 1 && state.is_active == 1)
+            Model::active_transition(state, x, y);
+        else
+            Model::normal_transition(state, x, y);
+
+        Model::update_loglikelihood(state, x, y);
+        state.numRows++;
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array most likely contains NULL, skipping this array.");
+    } catch(const std::invalid_argument &ia) {
+        //warning("Input array is invalid (with NULL values), skipping this array.");
+    }

     return state;
 }
diff --git a/src/modules/elastic_net/elastic_net_optimizer_igd.hpp b/src/modules/elastic_net/elastic_net_optimizer_igd.hpp
index 6441231b7..d75006e00 100644
--- a/src/modules/elastic_net/elastic_net_optimizer_igd.hpp
+++ b/src/modules/elastic_net/elastic_net_optimizer_igd.hpp
@@ -89,35 +89,41 @@ AnyType Igd<Model>::igd_transition (AnyType& args, const Allocator& inAllocator)

     state.stepsize = state.stepsize / exp(state.step_decay);

-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
-    double y;
+    try {
+        MappedColumnVector x = args[1].getAs<MappedColumnVector>();
+        double y;

-    Model::get_y(y, args);
+        Model::get_y(y, args);

-    ColumnVector gradient(state.dimension); // gradient for coef only, not for intercept
-    Model::compute_gradient(gradient, state, x, y);
+        ColumnVector gradient(state.dimension); // gradient for coef only, not for intercept
+        Model::compute_gradient(gradient, state, x, y);

-    double a = state.stepsize / static_cast<double>(state.totalRows);
-    double b = state.stepsize * state.alpha * state.lambda
-        / static_cast<double>(state.totalRows);
-    for (uint32_t i = 0; i < state.dimension; i++)
-    {
-        // step 1
-        state.theta(i) -= a * gradient(i);
-        double step1_sign = sign(state.theta(i));
-        // step 2
-        state.theta(i) -= b * sign(state.theta(i));
-        // set to 0 if the value crossed zero during the two steps
-        if (step1_sign != sign(state.theta(i))) state.theta(i) = 0;
-    }
+        double a = state.stepsize / static_cast<double>(state.totalRows);
+        double b = state.stepsize * state.alpha * state.lambda
+            / static_cast<double>(state.totalRows);
+        for (uint32_t i = 0; i < state.dimension; i++)
+        {
+            // step 1
+            state.theta(i) -= a * gradient(i);
+            double step1_sign = sign(state.theta(i));
+            // step 2
+            state.theta(i) -= b * sign(state.theta(i));
+            // set to 0 if the value crossed zero during the two steps
+            if (step1_sign != sign(state.theta(i))) state.theta(i) = 0;
+        }

-    link_fn(state.theta, state.coef, state.p);
+        link_fn(state.theta, state.coef, state.p);

-    Model::update_intercept(state, x, y); // intercept is updated separately
+        Model::update_intercept(state, x, y); // intercept is updated separately

-    Model::update_loglikelihood(state, x, y);
-    // state.loss += r * r / 2.;
-    state.numRows ++;
+        Model::update_loglikelihood(state, x, y);
+        // state.loss += r * r / 2.;
+        state.numRows ++;
+    } catch(const ArrayWithNullException &e) {
+        // warning("Input array most likely contains NULL, skipping this array.");
+    } catch(const std::invalid_argument &ia) {
+        //warning("Input array is invalid (with NULL values), skipping this array.");
+    }

     return state;
 }
diff --git a/src/modules/elastic_net/elastic_net_utils.cpp b/src/modules/elastic_net/elastic_net_utils.cpp
index 354c21664..ee1640a2a 100644
--- a/src/modules/elastic_net/elastic_net_utils.cpp
+++ b/src/modules/elastic_net/elastic_net_utils.cpp
@@ -30,13 +30,15 @@ AnyType __elastic_net_gaussian_predict::run (AnyType& args)
     }

     MappedColumnVector coef = args[0].getAs<MappedColumnVector>();
-    double intercept = args[1].getAs<double>();
     MappedColumnVector x = args[2].getAs<MappedColumnVector>();
+    double intercept = args[1].getAs<double>();

     double predict = intercept + sparse_dot(coef, x);

     return predict;
 }
+
+
 // ------------------------------------------------------------------------

 /**
diff --git a/src/ports/postgres/modules/convex/utils_regularization.py_in b/src/ports/postgres/modules/convex/utils_regularization.py_in
index 72c293cce..f28ab60ad 100644
--- a/src/ports/postgres/modules/convex/utils_regularization.py_in
+++ b/src/ports/postgres/modules/convex/utils_regularization.py_in
@@ -38,7 +38,6 @@ def __utils_ind_var_scales(**kwargs):
     return x_scales

 # ========================================================================
-
 def __utils_dep_var_scale(**kwargs):
     """
     The mean and standard deviation for each element of the dependent variable,
@@ -47,14 +46,22 @@ def __utils_dep_var_scale(**kwargs):
     The output will be stored in a temp table: a mean array and a std array

     This function is also used in lasso.
+
+    Parameters:
+    schema_madlib -- madlib schema
+    tbl_data -- original data
+    col_ind_var -- independent variables column
+    col_dep_var -- dependent variable column
     """
+
     y_scale = plpy.execute(
         """
         select
-            avg({col_dep_var}) as mean,
+            avg(case when not {schema_madlib}.array_contains_null({col_ind_var}) then {col_dep_var} end) as mean,
             1 as std
         from {tbl_data}
         """.format(**kwargs))[0]
+
     return y_scale

 # ========================================================================
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
index 3c0c20be1..fd56fa01b 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_fista.py_in
@@ -282,7 +282,7 @@ def __elastic_net_gaussian_fista_train_compute(
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
index cbf8c7cb2..d9f9c2d5e 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_gaussian_igd.py_in
@@ -268,7 +268,7 @@ def __elastic_net_gaussian_igd_train_compute(schema_madlib, tbl_source, col_ind_
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
index e3e59778f..86604cbe5 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_fista.py_in
@@ -326,7 +326,7 @@ def __elastic_net_fista_train_compute(schema_madlib, func_step_aggregate,
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
index 1ef3553e0..7371d5e9b 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_optimizer_igd.py_in
@@ -298,7 +298,7 @@ def __elastic_net_igd_train_compute(schema_madlib, func_step_aggregate,
         """)[0]['setting']
     plpy.execute("set client_min_messages to error")

-    (dimension, row_num) = __tbl_dimension_rownum(tbl_source, col_ind_var)
+    (dimension, row_num) = __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var)

     # generate a full dict to ease the following string format
     # including several temporary table names
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index df97d47ae..aa7d49387 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -212,7 +212,8 @@ def __compute_data_scales(args):
     if args["family"] == "binomial":
         args["y_scale"] = dict(mean=0, std=1)
     else:
-        args["y_scale"] = __utils_dep_var_scale(tbl_data=args["tbl_source"], col_dep_var=args["col_dep_var"])
+        args["y_scale"] = __utils_dep_var_scale(schema_madlib=args["schema_madlib"], tbl_data=args["tbl_source"],
+                                                col_ind_var=args["col_ind_var"], col_dep_var=args["col_dep_var"])

     args["xmean_str"] = _array_to_string(args["x_scales"]["mean"])

 # ========================================================================
@@ -246,7 +247,7 @@ def __normalize_data(args):

 # ========================================================================

-def __tbl_dimension_rownum(tbl_source, col_ind_var):
+def __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
     """
     Measure the dimension and row number of source data table
     """
@@ -259,7 +260,10 @@ def __tbl_dimension_rownum(tbl_source, col_ind_var):
     # total row number of data source table
     row_num = plpy.execute("""
                            select count(*) from {tbl_source}
-                           """.format(tbl_source=tbl_source))[0]["count"]
+                           WHERE not {schema_madlib}.array_contains_null({col_ind_var})
+                           """.format(tbl_source=tbl_source,
+                                      schema_madlib=schema_madlib,
+                                      col_ind_var=col_ind_var))[0]["count"]

     return (dimension, row_num)

 # ========================================================================
diff --git a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
index a3ddf347b..87c1e3436 100644
--- a/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
+++ b/src/ports/postgres/modules/elastic_net/test/elastic_net_install_check.sql_in
@@ -6,6 +6,7 @@ DROP TABLE IF EXISTS lin_housing_wi;
 CREATE TABLE lin_housing_wi (id serial, x float8[],y float8);
 COPY lin_housing_wi (x, y) FROM STDIN NULL '?';
+{1,0.00632,18.00,2.310,NULL,0.5380,6.5750,NULL,4.0900,1,296.0,15.30,396.90,4.98} 24.00
 {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98} 24.00
 {1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14} 21.60
 {1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03} 34.70
@@ -512,6 +513,9 @@ COPY lin_housing_wi (x, y) FROM STDIN NULL '?';
 {1,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,5.64} 23.90
 {1,0.10959,0.00,11.930,0,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,393.45,6.48} 22.00
 {1,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88} 11.90
+{1,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,NULL} 23.90
+{1,0.10959,0.00,11.930,NULL,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,NULL,6.48} 22.00
+{NULL,0.04741,0.00,11.930,0,0.5730,6.0300,80.80,2.5050,1,273.0,21.00,396.90,7.88} 11.90
 \.

 DROP TABLE IF EXISTS elastic_type_src;
@@ -632,6 +636,7 @@ begin
                 'id',
                 'house_test_null_gaussian');

+    EXECUTE 'drop table if exists house_en';
     PERFORM elastic_net_train(
                 'lin_housing_wi',
@@ -666,6 +671,7 @@ begin
                 'id',
                 'house_test_null_binomial');

+    EXECUTE 'drop table if exists house_en';
     PERFORM elastic_net_train(
                 'lin_housing_wi',
@@ -700,6 +706,7 @@ begin
                 'id',
                 'house_test_null_binomial');

+    EXECUTE 'DROP TABLE IF EXISTS elastic_type_res';
     PERFORM elastic_net_train('elastic_type_src',
                 'elastic_type_res',

From 488ef3c58509de9fc8d5b78e08d3816010c6342b Mon Sep 17 00:00:00 2001
From: Nandish Jayaram
Date: Fri, 8 Apr 2016 11:02:30 -0700
Subject: [PATCH 2/2] This commit incorporates the feedback obtained for the
 previous pull request.

- A more efficient way of initializing x.
- Added a comment indicating a strong assumption made about the input data.
  There is no NULL check made for the dependent variable (y) while computing
  the mean and std for the independent variable (x), since one of the hard
  assumptions made about the input data to elastic_net is that the dependent
  variable should not be NULL.
---
 src/modules/convex/utils_regularization.cpp               | 8 ++++----
 .../postgres/modules/elastic_net/elastic_net_utils.py_in  | 4 ++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/modules/convex/utils_regularization.cpp b/src/modules/convex/utils_regularization.cpp
index 845e9cbd6..fc7e44d8a 100644
--- a/src/modules/convex/utils_regularization.cpp
+++ b/src/modules/convex/utils_regularization.cpp
@@ -79,13 +79,13 @@ class ScalesState

 AnyType utils_var_scales_transition::run (AnyType& args)
 {
     ScalesState<MutableArrayHandle<double> > state = args[0];
+    MappedColumnVector x(NULL);
     try {
-        args[1].getAs<MappedColumnVector>();
+        new (&x) MappedColumnVector(args[1].getAs<MappedColumnVector>());
     } catch(const ArrayWithNullException &e) {
         // warning("Input array contains NULL values, skipping it for mean and std computations");
         return state;
     }
-    MappedColumnVector x = args[1].getAs<MappedColumnVector>();
     if (state.numRows == 0) {
         int dimension = args[2].getAs<int>();
@@ -157,13 +157,13 @@ AnyType __utils_var_scales_result::run (AnyType& args)

 AnyType utils_normalize_data::run (AnyType& args)
 {
+    MutableMappedColumnVector x(NULL);
     try {
-        args[0].getAs<MutableMappedColumnVector>();
+        new (&x) MutableMappedColumnVector(args[0].getAs<MutableMappedColumnVector>());
     } catch(const ArrayWithNullException &e) {
         // warning("Input array contains NULL values, skipping this array normalization");
         return Null();
     }
-    MutableMappedColumnVector x = args[0].getAs<MutableMappedColumnVector>();
     MappedColumnVector mean = args[1].getAs<MappedColumnVector>();
     MappedColumnVector std = args[2].getAs<MappedColumnVector>();
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
index aa7d49387..144fe2772 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_utils.py_in
@@ -258,6 +258,10 @@ def __tbl_dimension_rownum(schema_madlib, tbl_source, col_ind_var):
                     """.format(tbl_source=tbl_source,
                                col_ind_var=col_ind_var))[0]["dimension"]
     # total row number of data source table
+    # The WHERE clause here ignores rows in the table that contain one or more NULLs in the
+    # independent variable (x). There is no NULL check made for the dependent variable (y),
+    # since one of the hard requirements/assumptions of the input data to elastic_net is that the
+    # dependent variable cannot be NULL.
     row_num = plpy.execute("""
                            select count(*) from {tbl_source}
                            WHERE not {schema_madlib}.array_contains_null({col_ind_var})
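
For illustration only (not part of either patch): a minimal SQL sketch of the row-skipping rule these changes implement, using the same madlib.array_contains_null() helper that the row-count and mean queries above rely on. The table name houses and its columns x (independent-variable array) and y (dependent variable) are assumptions made for this example.

    -- Rows whose independent-variable array contains a NULL; these are the rows
    -- the training aggregates and the row-count/mean queries now skip.
    SELECT count(*) AS rows_skipped
    FROM houses
    WHERE madlib.array_contains_null(x);

    -- Rows that remain available for training after this change.
    SELECT count(*) AS rows_used_in_training
    FROM houses
    WHERE NOT madlib.array_contains_null(x);

Only the independent-variable array is checked; as noted in the comment above, the dependent variable is assumed to be non-NULL.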