From e07d833d3aafe6ba100c90f1e2a1f592ecabee84 Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Mon, 17 Dec 2018 17:54:42 -0800 Subject: [PATCH 1/6] Minibatch Preprocessor for Deep learning The minibatch preprocessor we currently have in MADlib is bloated for DL tasks. This feature adds a simplified version of creating buffers, and divides each element of the independent array by a normalizing constant for standardization (which is 255.0 by default). This is standard practice with image data. Co-authored-by: Arvind Sridhar Co-authored-by: Domino Valdano --- doc/mainpage.dox.in | 6 + .../utilities/minibatch_preprocessing.py_in | 177 ++++++++- .../minibatch_preprocessing_dl.sql_in | 376 ++++++++++++++++++ .../test/minibatch_preprocessing_dl.sql_in | 67 ++++ .../modules/utilities/utilities.py_in | 4 +- 5 files changed, 626 insertions(+), 4 deletions(-) create mode 100644 src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in create mode 100644 src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index aab37afad..5568da6b6 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -285,6 +285,12 @@ complete matrix stored as a distributed table. Interface and implementation are subject to change. @{ @defgroup grp_cg Conjugate Gradient + @defgroup grp_dl Deep Learning + @brief A collection of deep learning interfaces. + @details A collection of deep learning interfaces. 
+ @{ + @defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for Deep Learning + @} @defgroup grp_knn k-Nearest Neighbors @defgroup grp_bayes Naive Bayes Classification @defgroup grp_sample Random Sampling diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 88433c937..12a16ac9c 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -39,7 +39,7 @@ from utilities import py_list_to_sql_string from utilities import split_quoted_delimited_str from utilities import unique_string from utilities import validate_module_input_params -from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY +from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY, ONLY_ARRAY from mean_std_dev_calculator import MeanStdDevCalculator from validate_args import get_expr_type @@ -51,6 +51,100 @@ m4_changequote(`') MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" +class MiniBatchPreProcessorDL: + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, **kwargs): + self.schema_madlib = schema_madlib + self.source_table = source_table + self.output_table = output_table + self.dependent_varname = dependent_varname + self.independent_varname = independent_varname + self.buffer_size = buffer_size + self.normalizing_const = normalizing_const + self.module_name = "minibatch_preprocessor_DL" + self.output_summary_table = add_postfix(self.output_table, "_summary") + self._validate_args() + self.num_of_buffers = self._get_num_buffers() + + def minibatch_preprocessor_dl(self): + norm_tbl = unique_string(desp='normalized') + # Create a temp table that has independent var normalized. 
+ scalar_mult_sql = """ + CREATE TEMP TABLE {norm_tbl} AS + SELECT {self.schema_madlib}.array_scalar_mult( + {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, + {self.dependent_varname} AS y, + row_number() over() AS row_id + FROM {self.source_table} + """.format(**locals()) + plpy.execute(scalar_mult_sql) + # Create the mini-batched output table + sql = """ + CREATE TABLE {self.output_table} AS + SELECT * FROM + ( + SELECT {self.schema_madlib}.agg_array_concat( + ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x}, + array_agg({norm_tbl}.y) AS {y}, + ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id + FROM {norm_tbl} + GROUP BY buffer_id + ) b + DISTRIBUTED BY (buffer_id) + """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME, + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, **locals()) + plpy.execute(sql) + plpy.execute("DROP TABLE {}".format(norm_tbl)) + # Create summary table + self._create_output_summary_table() + + def _create_output_summary_table(self): + query = """ + CREATE TABLE {self.output_summary_table} AS + SELECT + $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table, + $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table, + $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS dependent_varname, + $__madlib__${self.independent_varname}$__madlib__$::TEXT AS independent_varname, + $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS dependent_vartype, + {self.buffer_size} AS buffer_size + """.format(self=self) + plpy.execute(query) + + def _validate_args(self): + validate_module_input_params( + self.source_table, self.output_table, self.independent_varname, + self.dependent_varname, self.module_name, None, + [self.output_summary_table]) + self.independent_vartype = get_expr_type( + self.independent_varname, self.source_table) + _assert(is_valid_psql_type(self.independent_vartype, + NUMERIC | ONLY_ARRAY), + "Invalid independent variable type, should be an array of " \ + "one of 
{0}".format(','.join(NUMERIC))) + self.dependent_vartype = get_expr_type( + self.dependent_varname, self.source_table) + dep_valid_types = NUMERIC | TEXT | BOOLEAN + _assert(is_valid_psql_type(self.dependent_vartype, dep_valid_types), + "Invalid dependent variable type, should be one of {0}". + format(','.join(dep_valid_types))) + if self.buffer_size is not None: + _assert(self.buffer_size > 0, + "minibatch_preprocessor_dl: The buffer size has to be a " \ + "positive integer or NULL.") + + def _get_num_buffers(self): + num_rows_in_tbl = plpy.execute(""" + SELECT count(*) AS cnt FROM {} + """.format(self.source_table))[0]['cnt'] + buffer_size_calculator = MiniBatchBufferSizeCalculator() + indepdent_var_dim = _tbl_dimension_rownum( + self.schema_madlib, self.source_table, self.independent_varname, + skip_row_count=True) + self.buffer_size = buffer_size_calculator.calculate_default_buffer_size( + self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0]) + return ceil((1.0*num_rows_in_tbl)/self.buffer_size) class MiniBatchPreProcessor: """ @@ -509,7 +603,7 @@ class MiniBatchDocumentation: SELECT {schema_madlib}.{method}( source_table, -- TEXT. Name of the table containing input data. Can also be a view - output_table , -- TEXT. Name of the output table for + output_table, -- TEXT. Name of the output table for mini-batching dependent_varname, -- TEXT. Name of the dependent variable column independent_varname, -- TEXT. Name of the independent variable @@ -580,3 +674,82 @@ class MiniBatchDocumentation: for help. 
""".format(**locals()) # --------------------------------------------------------------------- + @staticmethod + def minibatch_preprocessor_dl_help(schema_madlib, message): + method = "minibatch_preprocessor_dl" + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + For Deep Learning based techniques such as Convolutional Neural Nets, + the input data is mostly images. These images can be represented as an + array of numbers where all elements are between 0 and 255 in value. + It is standard practice to divide each of these numbers by 255.0 to + normalize the image data. minibatch_preprocessor() is for general + use-cases, but for deep learning based use-cases we provide + minibatch_preprocessor_dl() that is light-weight and is + specific to image datasets. + + The normalizing constant is parameterized, and can be specified based + on the kind of image data used. + + For more details on function usage: + SELECT {schema_madlib}.{method}('usage') + """.format(**locals()) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.{method}( + source_table, -- TEXT. Name of the table containing input + data. Can also be a view + output_table, -- TEXT. Name of the output table for + mini-batching + dependent_varname, -- TEXT. Name of the dependent variable column + independent_varname, -- TEXT. Name of the independent variable + column + buffer_size -- INTEGER. Default computed automatically. + Number of source input rows to pack into a buffer + normalizing_const -- DOUBLE PRECISON. Default 255.0. The + normalizing constant to use for + standardizing arrays in independent_varname. 
+ ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The output table produced by MiniBatch Preprocessor contains the + following columns: + + buffer_id -- INTEGER. Unique id for packed table. + dependent_varname -- FLOAT8[]. Packed array of dependent variables. + independent_varname -- FLOAT8[]. Packed array of independent + variables. + + --------------------------------------------------------------------------- + The algorithm also creates a summary table named _summary + that has the following columns: + + source_table -- Source table name. + output_table -- Output table name from preprocessor. + dependent_varname -- Dependent variable from the original table. + independent_varname -- Independent variables from the original + table. + dependent_vartype -- Type of the dependent variable from the + original table. + buffer_size -- Buffer size used in preprocessing step. + + --------------------------------------------------------------------------- + """.format(**locals()) + + + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + return """ + No such option. Use "SELECT {schema_madlib}.minibatch_preprocessor_dl()" + for help. + """.format(**locals()) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in new file mode 100644 index 000000000..ed85a217a --- /dev/null +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -0,0 +1,376 @@ +/* ----------------------------------------------------------------------- */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * @file minibatch_preprocessing_dl.sql_in + * @brief TODO + * @date December 2018 + * + */ +/* ----------------------------------------------------------------------- */ + +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_minibatch_preprocessing_dl + + + +For Deep Learning based techniques such as Convolutional Neural Nets, the input +data is mostly images. These images can be represented as an array of numbers +where all elements are between 0 and 255 in value. It is standard practice +to divide each of these numbers by 255.0 to normalize the image data. +minibatch_preprocessor() is for general use-cases, but for deep learning based +use-cases we provide minibatch_preprocessor_dl() that is light-weight and is +specific to image datasets. The normalizing constant is parameterized, and can +be specified based on the kind of image data used. +
+minibatch_preprocessor_dl(source_table,
+                        output_table,
+                        dependent_varname,
+                        independent_varname,
+                        buffer_size,
+                        normalizing_const
+                        )
+
+ +\b Arguments +
+
source_table
+
TEXT. Name of the table containing input data. Can also be a view. +
+ +
output_table
+
TEXT. Name of the output table from the preprocessor which + will be used as input to algorithms that support mini-batching. + Note that the arrays packed into the output table are randomized + and normalized (by dividing each element in the independent variable array + by the normalizing_const), so they will not match up in an obvious way with + the rows in the source table. +
+ +
dependent_varname
+
TEXT. Name of the dependent variable column. +
+ +
independent_varname
+
TEXT. Name of the independent variable column. The column must be of + a numeric array type. +
+ +
buffer_size (optional)
+
INTEGER, default: computed. Buffer size is the + number of rows from the + source table that are packed into one row of the preprocessor + output table. The default value is computed considering size of + the source table, number of independent variables, number of groups, + and number of segments in the database cluster. For larger data sets, + the computed buffer size will typically be a value in the millions. +
+ +
normalizing_const (optional)
+
DOUBLE PRECISION, default: 255.0. The normalizing constant to divide + each value in the independent_varname array by. +
+
+ +Output tables +
+ The output table produced by the mini-batch preprocessor contains the following columns: + + + + + + + + + + + + + +
buffer_idINTEGER. Unique id for packed table. +
dependent_varnameANYARRAY[]. Packed array of dependent variables. The type + of the array is the same as the type of the dependent variable from + the source table. +
independent_varnameREAL[]. Packed array of independent variables. +
+ +A summary table named \_summary is also created, which has the following columns: + + + + + + + + + + + + + + + + + + + + + + + + + +
source_tableName of the source table.
output_tableName of output table generated by preprocessor.
dependent_varnameDependent variable from the source table.
independent_varnameIndependent variable from the source table.
dependent_vartypeType of the dependent varialbe from the source table.
buffer_sizeBuffer size used in preprocessing step.
+ +@anchor example +@par Examples +-# Create an input data set based on the well known iris data set: +
+DROP TABLE IF EXISTS iris_data;
+CREATE TABLE iris_data(
+    id serial,
+    attributes numeric[],
+    class_text varchar,
+    class integer,
+    state varchar
+);
+INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
+(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
+(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
+(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
+(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
+(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
+(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
+(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
+(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
+(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
+(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
+(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
+(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
+(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
+(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
+(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
+(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
+(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
+(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
+(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
+(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
+(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
+(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
+(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
+(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
+(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
+(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
+(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
+(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
+(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
+(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
+(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
+(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
+(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
+(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
+(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
+(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
+(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
+(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
+(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
+(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
+(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
+
+ +-# Run the preprocessor: +
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent variables
+                                     NULL,                -- buffer size
+                                     2                    -- normalizing constant
+                                     );
+
+For small datasets like in this example, buffer size is mainly +determined by the number of segments in the database. +This example is run on a Greenplum database with 3 segments, +so there are 3 rows with a buffer size of 18. +For PostgresSQL, there would be only one row with a buffer +size of 52 since it is a single node database. +For larger data sets, other factors go into +computing buffers size besides number of segments. +Also, note that the dependent variable has +been one-hot encoded since it is categorical. +Here is a sample of the packed output table: +
+\\x on
+SELECT * FROM iris_data_packed;
+
+
+-[ RECORD 1 ]-------+-------------------------------------
+independent_varname | {{2.55,1.7,0.75,0.1},{2.55,1.9,0.95,0.2},{2.9,1.35,1.95,0.6},{3.35,1.55,2.2,0.7},{2.85,1.3,1.75,0.5},{3.1,1.1,2.25,0.75},...}}
+dependent_varname   | {Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id           | 0
+-[ RECORD 2 ]-------+-------------------------------------
+independent_varname | {{2.4,1.5,0.7,0.15},{2.2,1.6,0.65,0.1},{2.8,1.45,1.8,0.65},{2.9,1.3,2,0.6},{2.2,1.45,0.7,0.1},{2.85,1.4,2.25,0.65},{2.8,1.25,1.95,0.55},...}}
+dependent_varname   | {Iris_setosa,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_versicolor,...}
+buffer_id           | 1
+-[ RECORD 3 ]-------+-------------------------------------
+independent_varname | {{2.4,1.7,0.8,0.1},{2.9,1.35,2.05,0.5},{3.15,1.65,2.35,0.8},{2.7,1.95,0.65,0.2},{2.75,1.3,2.2,0.6},{3,1.1,2,0.5},{2.55,1.9,0.8,0.1},...}}
+dependent_varname   | {Iris_versicolor,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id           | 2
+
+Review the output summary table: +
+\\x on
+SELECT * FROM iris_data_packed_summary;
+
+
+-[ RECORD 1 ]-------+------------------
+source_table        | iris_data
+output_table        | iris_data_packed
+dependent_varname   | class_text
+independent_varname | attributes
+dependent_vartype   | character varying
+buffer_size         | 18
+
+ +-# Generally the default buffer size will work well, +but if you have occasion to change it: +
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent variables
+                                     10                   -- Buffer size
+                                     );
+
+Review the number of buffers in the output table: +
+SELECT COUNT(*) FROM iris_data_packed;
+
+
+-[ RECORD 1 ]
+count | 6
+
+Review the output summary table: +
+\\x on
+SELECT * FROM iris_data_packed_summary;
+
+
+-[ RECORD 1 ]-------+------------------
+source_table        | iris_data
+output_table        | iris_data_packed
+dependent_varname   | class_text
+independent_varname | attributes
+dependent_vartype   | character varying
+buffer_size         | 10
+
+ +*/ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER, + normalizing_const DOUBLE PRECISION +) RETURNS VOID AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + from utilities.control import MinWarning + with AOControl(False): + with MinWarning('error'): + minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessorDL(**globals()) + minibatch_preprocessor_obj.minibatch_preprocessor_dl() +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, message) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl() +RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + return 
minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, '') +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray, anyarray) + RETURNS anyarray + AS 'select $1 || $2' + LANGUAGE SQL + IMMUTABLE; + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray, anyarray) + RETURNS anyarray + AS 'select $1 || $2' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray); +CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) ( + SFUNC = MADLIB_SCHEMA.agg_array_concat_transition, + STYPE = anyarray, + PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge + ); diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in new file mode 100644 index 000000000..347ba2dee --- /dev/null +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -0,0 +1,67 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; +CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], label TEXT); +INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES +(ARRAY[1,2,3,4,5,6], 'a'), +(ARRAY[11,2,3,4,5,6], 'a'), +(ARRAY[11,22,33,4,5,6], 'a'), +(ARRAY[11,22,33,44,5,6], 'a'), +(ARRAY[11,22,33,44,65,6], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,144,65,56], 'a'), +(ARRAY[11,22,233,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'b'), +(ARRAY[11,22,33,44,65,56], 'b'), +(ARRAY[11,22,33,44,65,56], 'b'); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'id', + 'x', + 5); + +SELECT assert(count(*)=4, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch.') +FROM minibatch_preprocessor_dl_batch; + +SELECT assert(array_upper(independent_varname, 1)=5, 'Incorrect buffer size.') +FROM minibatch_preprocessor_dl_batch WHERE buffer_id=1; + +SELECT assert(array_upper(independent_varname, 1)=2, 'Incorrect buffer size.') +FROM minibatch_preprocessor_dl_batch WHERE buffer_id=4; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'label', + 'x'); +SELECT assert(count(*)=3, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch with default buffer size.') +FROM minibatch_preprocessor_dl_batch; diff 
--git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 50c426b74..89907b391 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -875,8 +875,8 @@ def collate_plpy_result(plpy_result_rows): def validate_module_input_params(source_table, output_table, independent_varname, - dependent_varname, module_name, grouping_cols, - other_output_tables=None): + dependent_varname, module_name, + grouping_cols=None, other_output_tables=None): """ This function is supposed to be used for validating params for supervised learning like algos, e.g. linear regression, mlp, etc. since all From 9a43a4f77520eced28d68c7447e8d5699dc6da8a Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Thu, 20 Dec 2018 15:11:20 -0800 Subject: [PATCH 2/6] Do not use distributed by for postgres. --- .../modules/utilities/minibatch_preprocessing.py_in | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 12a16ac9c..fff3061da 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -80,6 +80,10 @@ class MiniBatchPreProcessorDL: """.format(**locals()) plpy.execute(scalar_mult_sql) # Create the mini-batched output table + if is_platform_pg(): + distributed_by_clause = '' + else: + distributed_by_clause= ' DISTRIBUTED BY (buffer_id) ' sql = """ CREATE TABLE {self.output_table} AS SELECT * FROM @@ -91,11 +95,12 @@ class MiniBatchPreProcessorDL: FROM {norm_tbl} GROUP BY buffer_id ) b - DISTRIBUTED BY (buffer_id) + {distributed_by_clause} """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME, - y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, **locals()) + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, + **locals()) plpy.execute(sql) 
- plpy.execute("DROP TABLE {}".format(norm_tbl)) + plpy.execute("DROP TABLE {0}".format(norm_tbl)) # Create summary table self._create_output_summary_table() @@ -136,7 +141,7 @@ class MiniBatchPreProcessorDL: def _get_num_buffers(self): num_rows_in_tbl = plpy.execute(""" - SELECT count(*) AS cnt FROM {} + SELECT count(*) AS cnt FROM {0} """.format(self.source_table))[0]['cnt'] buffer_size_calculator = MiniBatchBufferSizeCalculator() indepdent_var_dim = _tbl_dimension_rownum( From c582d8c75344f2de1dbb3f9c00be140028173125 Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Thu, 20 Dec 2018 17:29:37 -0800 Subject: [PATCH 3/6] Remove greenplum specific assert in dev check --- .../modules/utilities/test/minibatch_preprocessing_dl.sql_in | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in index 347ba2dee..1dc93dc5a 100644 --- a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -63,5 +63,3 @@ SELECT minibatch_preprocessor_dl( 'minibatch_preprocessor_dl_batch', 'label', 'x'); -SELECT assert(count(*)=3, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch with default buffer size.') -FROM minibatch_preprocessor_dl_batch; From 89b3e1e07c8bb35c54c06aba6d818f428545079a Mon Sep 17 00:00:00 2001 From: Domino Valdano Date: Thu, 10 Jan 2019 15:19:35 -0800 Subject: [PATCH 4/6] Add dependent_offset optional param for shifting dependent variable values. 
--- .../utilities/minibatch_preprocessing.py_in | 18 +++++++++--- .../minibatch_preprocessing_dl.sql_in | 28 ++++++++++++++++--- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index fff3061da..b39add2a5 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -54,7 +54,7 @@ MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" class MiniBatchPreProcessorDL: def __init__(self, schema_madlib, source_table, output_table, dependent_varname, independent_varname, buffer_size, - normalizing_const, **kwargs): + normalizing_const, dependent_offset, **kwargs): self.schema_madlib = schema_madlib self.source_table = source_table self.output_table = output_table @@ -62,6 +62,7 @@ class MiniBatchPreProcessorDL: self.independent_varname = independent_varname self.buffer_size = buffer_size self.normalizing_const = normalizing_const + self.dependent_offset = dependent_offset self.module_name = "minibatch_preprocessor_DL" self.output_summary_table = add_postfix(self.output_table, "_summary") self._validate_args() @@ -70,11 +71,16 @@ class MiniBatchPreProcessorDL: def minibatch_preprocessor_dl(self): norm_tbl = unique_string(desp='normalized') # Create a temp table that has independent var normalized. 
+ + dependent_varname_with_offset = self.dependent_varname + if self.dependent_offset: + dependent_varname_with_offset = '{0} + {1}'.format(self.dependent_varname, self.dependent_offset) + scalar_mult_sql = """ CREATE TEMP TABLE {norm_tbl} AS SELECT {self.schema_madlib}.array_scalar_mult( {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, - {self.dependent_varname} AS y, + {dependent_varname_with_offset} AS y, row_number() over() AS row_id FROM {self.source_table} """.format(**locals()) @@ -719,6 +725,9 @@ class MiniBatchDocumentation: normalizing_const -- DOUBLE PRECISON. Default 255.0. The normalizing constant to use for standardizing arrays in independent_varname. + dependent_offset -- INTEGER. If non-zero, shifts all dependent + variable values by this number (should + only be used for numeric types). ); @@ -739,8 +748,9 @@ class MiniBatchDocumentation: source_table -- Source table name. output_table -- Output table name from preprocessor. - dependent_varname -- Dependent variable from the original table. - independent_varname -- Independent variables from the original + dependent_varname -- Dependent variable values from the original table + (shifted by dependent_offset, if specified). + independent_varname -- Independent variable values from the original table. dependent_vartype -- Type of the dependent variable from the original table. diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in index ed85a217a..bcd81c617 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -48,7 +48,8 @@ minibatch_preprocessor_dl(source_table, dependent_varname, independent_varname, buffer_size, - normalizing_const + normalizing_const, + dependent_offset ) @@ -90,6 +91,12 @@ minibatch_preprocessor_dl(source_table,
DOUBLE PRECISION, default: 255.0. The normalizing constant to divide each value in the independent_varname array by.
+ +
dependent_offset (optional)
+
INTEGER, default: 0. If non-zero, shifts all dependent + variable values by this number (should only be used for numeric types). +
+ Output tables @@ -307,7 +314,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR, buffer_size INTEGER, - normalizing_const DOUBLE PRECISION + normalizing_const DOUBLE PRECISION, + dependent_offset INTEGER ) RETURNS VOID AS $$ PythonFunctionBodyOnly(utilities, minibatch_preprocessing) from utilities.control import MinWarning @@ -318,6 +326,18 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER, + normalizing_const DOUBLE PRECISION +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, 0); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( source_table VARCHAR, output_table VARCHAR, @@ -325,7 +345,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( independent_varname VARCHAR, buffer_size INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, 0); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -335,7 +355,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, 0); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); From 
da84debc29207588d489d3e6ec9543d2022cc703 Mon Sep 17 00:00:00 2001 From: Domino Valdano Date: Thu, 10 Jan 2019 17:04:41 -0800 Subject: [PATCH 5/6] Fix documentation in .py_in to match sql_in --- .../postgres/modules/utilities/minibatch_preprocessing.py_in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index b39add2a5..6d761528a 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -738,8 +738,8 @@ class MiniBatchDocumentation: following columns: buffer_id -- INTEGER. Unique id for packed table. - dependent_varname -- FLOAT8[]. Packed array of dependent variables. - independent_varname -- FLOAT8[]. Packed array of independent + dependent_varname -- ANYARRAY[]. Packed array of dependent variables. + independent_varname -- REAL[]. Packed array of independent variables. --------------------------------------------------------------------------- From 5a025fc38269d9ceb22a06587667dd398751a437 Mon Sep 17 00:00:00 2001 From: Domino Valdano Date: Thu, 24 Jan 2019 16:33:14 -0800 Subject: [PATCH 6/6] Added test cases for dependent_offset optional param Co-authored-by: Jingyi Mei --- .../utilities/minibatch_preprocessing.py_in | 2 +- .../minibatch_preprocessing_dl.sql_in | 8 +-- .../test/minibatch_preprocessing_dl.sql_in | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 6d761528a..be568a930 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -725,7 +725,7 @@ class MiniBatchDocumentation: normalizing_const -- DOUBLE PRECISON. Default 255.0. 
The normalizing constant to use for standardizing arrays in independent_varname. - dependent_offset -- INTEGER. If non-zero, shifts all dependent + dependent_offset -- INTEGER. If specified, shifts all dependent variable values by this number (should only be used for numeric types). ); diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in index bcd81c617..994e458cc 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -93,7 +93,7 @@ minibatch_preprocessor_dl(source_table,
dependent_offset (optional)
-
INTEGER, default: 0. If non-zero, shifts all dependent +
INTEGER, default: NULL. If specified, shifts all dependent variable values by this number (should only be used for numeric types).
@@ -334,7 +334,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( buffer_size INTEGER, normalizing_const DOUBLE PRECISION ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, 0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -345,7 +345,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( independent_varname VARCHAR, buffer_size INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, 0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -355,7 +355,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, 0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in index 1dc93dc5a..dd9b0d537 100644 --- a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -63,3 +63,63 @@ SELECT minibatch_preprocessor_dl( 'minibatch_preprocessor_dl_batch', 'label', 'x'); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; +CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], y INTEGER); +INSERT INTO minibatch_preprocessor_dl_input(x, y) VALUES +(ARRAY[1,2,3,4,5,6], 4), +(ARRAY[11,2,3,4,5,6], 3), +(ARRAY[11,22,33,4,5,6], 
8), +(ARRAY[11,22,33,44,5,6], 2), +(ARRAY[11,22,33,44,65,6], 5), +(ARRAY[11,22,33,44,65,56], 6), +(ARRAY[11,22,33,44,65,56], 2), +(ARRAY[11,22,33,44,65,56], 10), +(ARRAY[11,22,33,44,65,56], 3), +(ARRAY[11,22,33,44,65,56], 7), +(ARRAY[11,22,33,44,65,56], 6), +(ARRAY[11,22,33,44,65,56], -6), +(ARRAY[11,22,33,144,65,56], 9), +(ARRAY[11,22,233,44,65,56], 0), +(ARRAY[11,22,33,44,65,56], 12), +(ARRAY[11,22,33,44,65,56], -3), +(ARRAY[11,22,33,44,65,56], -1); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5); + +-- Test that independent vars get divided by 5, by verifying min value goes from 1 to 0.2, and max value from 233 to 46.6 +SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a; +SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5, + 6); + +-- Test that dependent vars get shifted by +6, by verifying minimum value goes from -6 to 0 +SELECT assert(abs(MIN(y))<0.00001, 'Dependent var not shifted properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5, + -6); + +-- Test that dependent vars get shifted by -6, by verifying minimum value goes 
from -6 to -12 +SELECT assert(relative_error(MIN(y), -12)<0.00001, 'Dependent var not shifted properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a; \ No newline at end of file