From e07d833d3aafe6ba100c90f1e2a1f592ecabee84 Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Mon, 17 Dec 2018 17:54:42 -0800 Subject: [PATCH 1/6] Minibatch Preprocessor for Deep learning The minibatch preprocessor we currently have in MADlib is bloated for DL tasks. This feature adds a simplified version of creating buffers, and divides each element of the independent array by a normalizing constant for standardization (which is 255.0 by default). This is standard practice with image data. Co-authored-by: Arvind Sridhar Co-authored-by: Domino Valdano --- doc/mainpage.dox.in | 6 + .../utilities/minibatch_preprocessing.py_in | 177 ++++++++- .../minibatch_preprocessing_dl.sql_in | 376 ++++++++++++++++++ .../test/minibatch_preprocessing_dl.sql_in | 67 ++++ .../modules/utilities/utilities.py_in | 4 +- 5 files changed, 626 insertions(+), 4 deletions(-) create mode 100644 src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in create mode 100644 src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index aab37afad..5568da6b6 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -285,6 +285,12 @@ complete matrix stored as a distributed table. Interface and implementation are subject to change. @{ @defgroup grp_cg Conjugate Gradient + @defgroup grp_dl Deep Learning + @brief A collection of deep learning interfaces. + @details A collection of deep learning interfaces. 
+ @{ + @defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for Deep Learning + @} @defgroup grp_knn k-Nearest Neighbors @defgroup grp_bayes Naive Bayes Classification @defgroup grp_sample Random Sampling diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 88433c937..12a16ac9c 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -39,7 +39,7 @@ from utilities import py_list_to_sql_string from utilities import split_quoted_delimited_str from utilities import unique_string from utilities import validate_module_input_params -from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY +from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY, ONLY_ARRAY from mean_std_dev_calculator import MeanStdDevCalculator from validate_args import get_expr_type @@ -51,6 +51,100 @@ m4_changequote(`') MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" +class MiniBatchPreProcessorDL: + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, **kwargs): + self.schema_madlib = schema_madlib + self.source_table = source_table + self.output_table = output_table + self.dependent_varname = dependent_varname + self.independent_varname = independent_varname + self.buffer_size = buffer_size + self.normalizing_const = normalizing_const + self.module_name = "minibatch_preprocessor_DL" + self.output_summary_table = add_postfix(self.output_table, "_summary") + self._validate_args() + self.num_of_buffers = self._get_num_buffers() + + def minibatch_preprocessor_dl(self): + norm_tbl = unique_string(desp='normalized') + # Create a temp table that has independent var normalized. 
+ scalar_mult_sql = """ + CREATE TEMP TABLE {norm_tbl} AS + SELECT {self.schema_madlib}.array_scalar_mult( + {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, + {self.dependent_varname} AS y, + row_number() over() AS row_id + FROM {self.source_table} + """.format(**locals()) + plpy.execute(scalar_mult_sql) + # Create the mini-batched output table + sql = """ + CREATE TABLE {self.output_table} AS + SELECT * FROM + ( + SELECT {self.schema_madlib}.agg_array_concat( + ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x}, + array_agg({norm_tbl}.y) AS {y}, + ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id + FROM {norm_tbl} + GROUP BY buffer_id + ) b + DISTRIBUTED BY (buffer_id) + """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME, + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, **locals()) + plpy.execute(sql) + plpy.execute("DROP TABLE {}".format(norm_tbl)) + # Create summary table + self._create_output_summary_table() + + def _create_output_summary_table(self): + query = """ + CREATE TABLE {self.output_summary_table} AS + SELECT + $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table, + $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table, + $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS dependent_varname, + $__madlib__${self.independent_varname}$__madlib__$::TEXT AS independent_varname, + $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS dependent_vartype, + {self.buffer_size} AS buffer_size + """.format(self=self) + plpy.execute(query) + + def _validate_args(self): + validate_module_input_params( + self.source_table, self.output_table, self.independent_varname, + self.dependent_varname, self.module_name, None, + [self.output_summary_table]) + self.independent_vartype = get_expr_type( + self.independent_varname, self.source_table) + _assert(is_valid_psql_type(self.independent_vartype, + NUMERIC | ONLY_ARRAY), + "Invalid independent variable type, should be an array of " \ + "one of 
{0}".format(','.join(NUMERIC))) + self.dependent_vartype = get_expr_type( + self.dependent_varname, self.source_table) + dep_valid_types = NUMERIC | TEXT | BOOLEAN + _assert(is_valid_psql_type(self.dependent_vartype, dep_valid_types), + "Invalid dependent variable type, should be one of {0}". + format(','.join(dep_valid_types))) + if self.buffer_size is not None: + _assert(self.buffer_size > 0, + "minibatch_preprocessor_dl: The buffer size has to be a " \ + "positive integer or NULL.") + + def _get_num_buffers(self): + num_rows_in_tbl = plpy.execute(""" + SELECT count(*) AS cnt FROM {} + """.format(self.source_table))[0]['cnt'] + buffer_size_calculator = MiniBatchBufferSizeCalculator() + indepdent_var_dim = _tbl_dimension_rownum( + self.schema_madlib, self.source_table, self.independent_varname, + skip_row_count=True) + self.buffer_size = buffer_size_calculator.calculate_default_buffer_size( + self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0]) + return ceil((1.0*num_rows_in_tbl)/self.buffer_size) class MiniBatchPreProcessor: """ @@ -509,7 +603,7 @@ class MiniBatchDocumentation: SELECT {schema_madlib}.{method}( source_table, -- TEXT. Name of the table containing input data. Can also be a view - output_table , -- TEXT. Name of the output table for + output_table, -- TEXT. Name of the output table for mini-batching dependent_varname, -- TEXT. Name of the dependent variable column independent_varname, -- TEXT. Name of the independent variable @@ -580,3 +674,82 @@ class MiniBatchDocumentation: for help. 
""".format(**locals()) # --------------------------------------------------------------------- + @staticmethod + def minibatch_preprocessor_dl_help(schema_madlib, message): + method = "minibatch_preprocessor_dl" + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + For Deep Learning based techniques such as Convolutional Neural Nets, + the input data is mostly images. These images can be represented as an + array of numbers where all elements are between 0 and 255 in value. + It is standard practice to divide each of these numbers by 255.0 to + normalize the image data. minibatch_preprocessor() is for general + use-cases, but for deep learning based use-cases we provide + minibatch_preprocessor_dl() that is light-weight and is + specific to image datasets. + + The normalizing constant is parameterized, and can be specified based + on the kind of image data used. + + For more details on function usage: + SELECT {schema_madlib}.{method}('usage') + """.format(**locals()) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.{method}( + source_table, -- TEXT. Name of the table containing input + data. Can also be a view + output_table, -- TEXT. Name of the output table for + mini-batching + dependent_varname, -- TEXT. Name of the dependent variable column + independent_varname, -- TEXT. Name of the independent variable + column + buffer_size -- INTEGER. Default computed automatically. + Number of source input rows to pack into a buffer + normalizing_const -- DOUBLE PRECISON. Default 255.0. The + normalizing constant to use for + standardizing arrays in independent_varname. 
+ ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The output table produced by MiniBatch Preprocessor contains the + following columns: + + buffer_id -- INTEGER. Unique id for packed table. + dependent_varname -- FLOAT8[]. Packed array of dependent variables. + independent_varname -- FLOAT8[]. Packed array of independent + variables. + + --------------------------------------------------------------------------- + The algorithm also creates a summary table named _summary + that has the following columns: + + source_table -- Source table name. + output_table -- Output table name from preprocessor. + dependent_varname -- Dependent variable from the original table. + independent_varname -- Independent variables from the original + table. + dependent_vartype -- Type of the dependent variable from the + original table. + buffer_size -- Buffer size used in preprocessing step. + + --------------------------------------------------------------------------- + """.format(**locals()) + + + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + return """ + No such option. Use "SELECT {schema_madlib}.minibatch_preprocessor_dl()" + for help. + """.format(**locals()) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in new file mode 100644 index 000000000..ed85a217a --- /dev/null +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -0,0 +1,376 @@ +/* ----------------------------------------------------------------------- */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * @file minibatch_preprocessing_dl.sql_in + * @brief TODO + * @date December 2018 + * + */ +/* ----------------------------------------------------------------------- */ + +m4_include(`SQLCommon.m4') + +/** +@addtogroup grp_minibatch_preprocessing_dl + + + +For Deep Learning based techniques such as Convolutional Neural Nets, the input +data is mostly images. These images can be represented as an array of numbers +where all elements are between 0 and 255 in value. It is standard practice +to divide each of these numbers by 255.0 to normalize the image data. +minibatch_preprocessor() is for general use-cases, but for deep learning based +use-cases we provide minibatch_preprocessor_dl() that is light-weight and is +specific to image datasets. The normalizing constant is parameterized, and can +be specified based on the kind of image data used. +
+minibatch_preprocessor_dl(source_table,
+                        output_table,
+                        dependent_varname,
+                        independent_varname,
+                        buffer_size,
+                        normalizing_const
+                        )
+
+ +\b Arguments +
+
source_table
+
TEXT. Name of the table containing input data. Can also be a view. +
+ +
output_table
+
TEXT. Name of the output table from the preprocessor which + will be used as input to algorithms that support mini-batching. + Note that the arrays packed into the output table are randomized + and normalized (by dividing each element in the independent variable array + by the normalizing_const), so they will not match up in an obvious way with + the rows in the source table. +
+ +
dependent_varname
+
TEXT. Name of the dependent variable column. +
+ +
independent_varname
+
TEXT. Name of the independent variable column. The column must be of + a numeric array type. +
+ +
buffer_size (optional)
+
INTEGER, default: computed. Buffer size is the + number of rows from the + source table that are packed into one row of the preprocessor + output table. The default value is computed considering size of + the source table, number of independent variables, number of groups, + and number of segments in the database cluster. For larger data sets, + the computed buffer size will typically be a value in the millions. +
+ +
normalizing_const (optional)
+
DOUBLE PRECISION, default: 255.0. The normalizing constant to divide + each value in the independent_varname array by. +
+
+ +Output tables +
+ The output table produced by the mini-batch preprocessor contains the following columns: + + + + + + + + + + + + + +
buffer_idINTEGER. Unique id for packed table. +
dependent_varnameANYARRAY[]. Packed array of dependent variables. The type + of the array is the same as the type of the dependent variable from + the source table. +
independent_varnameREAL[]. Packed array of independent variables. +
+ +A summary table named \_summary is also created, which has the following columns: + + + + + + + + + + + + + + + + + + + + + + + + + +
source_tableName of the source table.
output_tableName of output table generated by preprocessor.
dependent_varnameDependent variable from the source table.
independent_varnameIndependent variable from the source table.
dependent_vartypeType of the dependent varialbe from the source table.
buffer_sizeBuffer size used in preprocessing step.
+ +@anchor example +@par Examples +-# Create an input data set based on the well known iris data set: +
+DROP TABLE IF EXISTS iris_data;
+CREATE TABLE iris_data(
+    id serial,
+    attributes numeric[],
+    class_text varchar,
+    class integer,
+    state varchar
+);
+INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
+(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
+(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
+(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
+(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
+(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
+(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
+(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
+(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
+(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
+(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
+(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
+(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
+(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
+(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
+(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
+(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
+(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
+(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
+(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
+(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
+(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
+(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
+(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
+(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
+(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
+(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
+(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
+(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
+(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
+(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
+(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
+(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
+(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
+(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
+(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
+(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
+(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
+(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
+(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
+(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
+(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
+
+ +-# Run the preprocessor: +
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent variables
+                                     NULL,                -- buffer size
+                                     2                    -- normalizing constant
+                                     );
+
+For small datasets like in this example, buffer size is mainly +determined by the number of segments in the database. +This example is run on a Greenplum database with 3 segments, +so there are 3 rows with a buffer size of 18. +For PostgresSQL, there would be only one row with a buffer +size of 52 since it is a single node database. +For larger data sets, other factors go into +computing buffers size besides number of segments. +Also, note that the dependent variable has +been one-hot encoded since it is categorical. +Here is a sample of the packed output table: +
+\\x on
+SELECT * FROM iris_data_packed;
+
+
+-[ RECORD 1 ]-------+-------------------------------------
+independent_varname | {{2.55,1.7,0.75,0.1},{2.55,1.9,0.95,0.2},{2.9,1.35,1.95,0.6},{3.35,1.55,2.2,0.7},{2.85,1.3,1.75,0.5},{3.1,1.1,2.25,0.75},...}}
+dependent_varname   | {Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id           | 0
+-[ RECORD 2 ]-------+-------------------------------------
+independent_varname | {{2.4,1.5,0.7,0.15},{2.2,1.6,0.65,0.1},{2.8,1.45,1.8,0.65},{2.9,1.3,2,0.6},{2.2,1.45,0.7,0.1},{2.85,1.4,2.25,0.65},{2.8,1.25,1.95,0.55},...}}
+dependent_varname   | {Iris_setosa,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_versicolor,...}
+buffer_id           | 1
+-[ RECORD 3 ]-------+-------------------------------------
+independent_varname | {{2.4,1.7,0.8,0.1},{2.9,1.35,2.05,0.5},{3.15,1.65,2.35,0.8},{2.7,1.95,0.65,0.2},{2.75,1.3,2.2,0.6},{3,1.1,2,0.5},{2.55,1.9,0.8,0.1},...}}
+dependent_varname   | {Iris_versicolor,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id           | 2
+
+Review the output summary table: +
+\\x on
+SELECT * FROM iris_data_packed_summary;
+
+
+-[ RECORD 1 ]-------+------------------
+source_table        | iris_data
+output_table        | iris_data_packed
+dependent_varname   | class_text
+independent_varname | attributes
+dependent_vartype   | character varying
+buffer_size         | 18
+
+ +-# Generally the default buffer size will work well, +but if you have occasion to change it: +
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent variables
+                                     10                   -- Buffer size
+                                     );
+
+Review the number of buffers in the output table: +
+SELECT COUNT(*) FROM iris_data_packed;
+
+
+-[ RECORD 1 ]
+count | 6
+
+Review the output summary table: +
+\\x on
+SELECT * FROM iris_data_packed_summary;
+
+
+-[ RECORD 1 ]-------+------------------
+source_table        | iris_data
+output_table        | iris_data_packed
+dependent_varname   | class_text
+independent_varname | attributes
+dependent_vartype   | character varying
+buffer_size         | 10
+
+ +*/ + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER, + normalizing_const DOUBLE PRECISION +) RETURNS VOID AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + from utilities.control import MinWarning + with AOControl(False): + with MinWarning('error'): + minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessorDL(**globals()) + minibatch_preprocessor_obj.minibatch_preprocessor_dl() +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, message) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl() +RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + return 
minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, '') +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray, anyarray) + RETURNS anyarray + AS 'select $1 || $2' + LANGUAGE SQL + IMMUTABLE; + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray, anyarray) + RETURNS anyarray + AS 'select $1 || $2' + LANGUAGE SQL + IMMUTABLE + RETURNS NULL ON NULL INPUT; + +DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray); +CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) ( + SFUNC = MADLIB_SCHEMA.agg_array_concat_transition, + STYPE = anyarray, + PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge + ); diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in new file mode 100644 index 000000000..347ba2dee --- /dev/null +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -0,0 +1,67 @@ +/* ----------------------------------------------------------------------- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + * + *//* ----------------------------------------------------------------------- */ + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; +CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], label TEXT); +INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES +(ARRAY[1,2,3,4,5,6], 'a'), +(ARRAY[11,2,3,4,5,6], 'a'), +(ARRAY[11,22,33,4,5,6], 'a'), +(ARRAY[11,22,33,44,5,6], 'a'), +(ARRAY[11,22,33,44,65,6], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'a'), +(ARRAY[11,22,33,144,65,56], 'a'), +(ARRAY[11,22,233,44,65,56], 'a'), +(ARRAY[11,22,33,44,65,56], 'b'), +(ARRAY[11,22,33,44,65,56], 'b'), +(ARRAY[11,22,33,44,65,56], 'b'); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'id', + 'x', + 5); + +SELECT assert(count(*)=4, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch.') +FROM minibatch_preprocessor_dl_batch; + +SELECT assert(array_upper(independent_varname, 1)=5, 'Incorrect buffer size.') +FROM minibatch_preprocessor_dl_batch WHERE buffer_id=1; + +SELECT assert(array_upper(independent_varname, 1)=2, 'Incorrect buffer size.') +FROM minibatch_preprocessor_dl_batch WHERE buffer_id=4; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'label', + 'x'); +SELECT assert(count(*)=3, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch with default buffer size.') +FROM minibatch_preprocessor_dl_batch; diff 
--git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 50c426b74..89907b391 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -875,8 +875,8 @@ def collate_plpy_result(plpy_result_rows): def validate_module_input_params(source_table, output_table, independent_varname, - dependent_varname, module_name, grouping_cols, - other_output_tables=None): + dependent_varname, module_name, + grouping_cols=None, other_output_tables=None): """ This function is supposed to be used for validating params for supervised learning like algos, e.g. linear regression, mlp, etc. since all From 9a43a4f77520eced28d68c7447e8d5699dc6da8a Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Thu, 20 Dec 2018 15:11:20 -0800 Subject: [PATCH 2/6] Do not use distributed by for postgres. --- .../modules/utilities/minibatch_preprocessing.py_in | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 12a16ac9c..fff3061da 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -80,6 +80,10 @@ class MiniBatchPreProcessorDL: """.format(**locals()) plpy.execute(scalar_mult_sql) # Create the mini-batched output table + if is_platform_pg(): + distributed_by_clause = '' + else: + distributed_by_clause= ' DISTRIBUTED BY (buffer_id) ' sql = """ CREATE TABLE {self.output_table} AS SELECT * FROM @@ -91,11 +95,12 @@ class MiniBatchPreProcessorDL: FROM {norm_tbl} GROUP BY buffer_id ) b - DISTRIBUTED BY (buffer_id) + {distributed_by_clause} """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME, - y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, **locals()) + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME, + **locals()) plpy.execute(sql) 
- plpy.execute("DROP TABLE {}".format(norm_tbl)) + plpy.execute("DROP TABLE {0}".format(norm_tbl)) # Create summary table self._create_output_summary_table() @@ -136,7 +141,7 @@ class MiniBatchPreProcessorDL: def _get_num_buffers(self): num_rows_in_tbl = plpy.execute(""" - SELECT count(*) AS cnt FROM {} + SELECT count(*) AS cnt FROM {0} """.format(self.source_table))[0]['cnt'] buffer_size_calculator = MiniBatchBufferSizeCalculator() indepdent_var_dim = _tbl_dimension_rownum( From c582d8c75344f2de1dbb3f9c00be140028173125 Mon Sep 17 00:00:00 2001 From: Nandish Jayaram Date: Thu, 20 Dec 2018 17:29:37 -0800 Subject: [PATCH 3/6] Remove greenplum specific assert in dev check --- .../modules/utilities/test/minibatch_preprocessing_dl.sql_in | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in index 347ba2dee..1dc93dc5a 100644 --- a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -63,5 +63,3 @@ SELECT minibatch_preprocessor_dl( 'minibatch_preprocessor_dl_batch', 'label', 'x'); -SELECT assert(count(*)=3, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch with default buffer size.') -FROM minibatch_preprocessor_dl_batch; From 89b3e1e07c8bb35c54c06aba6d818f428545079a Mon Sep 17 00:00:00 2001 From: Domino Valdano Date: Thu, 10 Jan 2019 15:19:35 -0800 Subject: [PATCH 4/6] Add dependent_offset optional param for shifting dependent variable values. 
--- .../utilities/minibatch_preprocessing.py_in | 18 +++++++++--- .../minibatch_preprocessing_dl.sql_in | 28 ++++++++++++++++--- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index fff3061da..b39add2a5 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -54,7 +54,7 @@ MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" class MiniBatchPreProcessorDL: def __init__(self, schema_madlib, source_table, output_table, dependent_varname, independent_varname, buffer_size, - normalizing_const, **kwargs): + normalizing_const, dependent_offset, **kwargs): self.schema_madlib = schema_madlib self.source_table = source_table self.output_table = output_table @@ -62,6 +62,7 @@ class MiniBatchPreProcessorDL: self.independent_varname = independent_varname self.buffer_size = buffer_size self.normalizing_const = normalizing_const + self.dependent_offset = dependent_offset self.module_name = "minibatch_preprocessor_DL" self.output_summary_table = add_postfix(self.output_table, "_summary") self._validate_args() @@ -70,11 +71,16 @@ class MiniBatchPreProcessorDL: def minibatch_preprocessor_dl(self): norm_tbl = unique_string(desp='normalized') # Create a temp table that has independent var normalized. 
+ + dependent_varname_with_offset = self.dependent_varname + if self.dependent_offset: + dependent_varname_with_offset = '{0} + {1}'.format(self.dependent_varname, self.dependent_offset) + scalar_mult_sql = """ CREATE TEMP TABLE {norm_tbl} AS SELECT {self.schema_madlib}.array_scalar_mult( {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, - {self.dependent_varname} AS y, + {dependent_varname_with_offset} AS y, row_number() over() AS row_id FROM {self.source_table} """.format(**locals()) @@ -719,6 +725,9 @@ class MiniBatchDocumentation: normalizing_const -- DOUBLE PRECISON. Default 255.0. The normalizing constant to use for standardizing arrays in independent_varname. + dependent_offset -- INTEGER. If non-zero, shifts all dependent + variable values by this number (should + only be used for numeric types). ); @@ -739,8 +748,9 @@ class MiniBatchDocumentation: source_table -- Source table name. output_table -- Output table name from preprocessor. - dependent_varname -- Dependent variable from the original table. - independent_varname -- Independent variables from the original + dependent_varname -- Dependent variable values from the original table + (shifted by dependent_offset, if specified). + independent_varname -- Independent variable values from the original table. dependent_vartype -- Type of the dependent variable from the original table. diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in index ed85a217a..bcd81c617 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -48,7 +48,8 @@ minibatch_preprocessor_dl(source_table, dependent_varname, independent_varname, buffer_size, - normalizing_const + normalizing_const, + dependent_offset ) @@ -90,6 +91,12 @@ minibatch_preprocessor_dl(source_table,
DOUBLE PRECISION, default: 255.0. The normalizing constant to divide each value in the independent_varname array by.
+ +
dependent_offset (optional)
+
INTEGER, default: 0. If non-zero, shifts all dependent + variable values by this number (should only be used for numeric types). +
+ Output tables @@ -307,7 +314,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR, buffer_size INTEGER, - normalizing_const DOUBLE PRECISION + normalizing_const DOUBLE PRECISION, + dependent_offset INTEGER ) RETURNS VOID AS $$ PythonFunctionBodyOnly(utilities, minibatch_preprocessing) from utilities.control import MinWarning @@ -318,6 +326,18 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + buffer_size INTEGER, + normalizing_const DOUBLE PRECISION +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, 0); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( source_table VARCHAR, output_table VARCHAR, @@ -325,7 +345,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( independent_varname VARCHAR, buffer_size INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, 0); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -335,7 +355,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, 0); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); From 
da84debc29207588d489d3e6ec9543d2022cc703 Mon Sep 17 00:00:00 2001 From: Domino Valdano Date: Thu, 10 Jan 2019 17:04:41 -0800 Subject: [PATCH 5/6] Fix documentation in .py_in to match sql_in --- .../postgres/modules/utilities/minibatch_preprocessing.py_in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index b39add2a5..6d761528a 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -738,8 +738,8 @@ class MiniBatchDocumentation: following columns: buffer_id -- INTEGER. Unique id for packed table. - dependent_varname -- FLOAT8[]. Packed array of dependent variables. - independent_varname -- FLOAT8[]. Packed array of independent + dependent_varname -- ANYARRAY[]. Packed array of dependent variables. + independent_varname -- REAL[]. Packed array of independent variables. --------------------------------------------------------------------------- From 5a025fc38269d9ceb22a06587667dd398751a437 Mon Sep 17 00:00:00 2001 From: Domino Valdano Date: Thu, 24 Jan 2019 16:33:14 -0800 Subject: [PATCH 6/6] Added test cases for dependent_offset optional param Co-authored-by: Jingyi Mei --- .../utilities/minibatch_preprocessing.py_in | 2 +- .../minibatch_preprocessing_dl.sql_in | 8 +-- .../test/minibatch_preprocessing_dl.sql_in | 60 +++++++++++++++++++ 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 6d761528a..be568a930 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -725,7 +725,7 @@ class MiniBatchDocumentation: normalizing_const -- DOUBLE PRECISON. Default 255.0. 
The normalizing constant to use for standardizing arrays in independent_varname. - dependent_offset -- INTEGER. If non-zero, shifts all dependent + dependent_offset -- INTEGER. If specified, shifts all dependent variable values by this number (should only be used for numeric types). ); diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in index bcd81c617..994e458cc 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -93,7 +93,7 @@ minibatch_preprocessor_dl(source_table,
dependent_offset (optional)
-
INTEGER, default: 0. If non-zero, shifts all dependent +
INTEGER, default: NULL. If specified, shifts all dependent variable values by this number (should only be used for numeric types).
@@ -334,7 +334,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( buffer_size INTEGER, normalizing_const DOUBLE PRECISION ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, 0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -345,7 +345,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( independent_varname VARCHAR, buffer_size INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, 0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -355,7 +355,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, 0); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in index 1dc93dc5a..dd9b0d537 100644 --- a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in @@ -63,3 +63,63 @@ SELECT minibatch_preprocessor_dl( 'minibatch_preprocessor_dl_batch', 'label', 'x'); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; +CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], y INTEGER); +INSERT INTO minibatch_preprocessor_dl_input(x, y) VALUES +(ARRAY[1,2,3,4,5,6], 4), +(ARRAY[11,2,3,4,5,6], 3), +(ARRAY[11,22,33,4,5,6], 
8), +(ARRAY[11,22,33,44,5,6], 2), +(ARRAY[11,22,33,44,65,6], 5), +(ARRAY[11,22,33,44,65,56], 6), +(ARRAY[11,22,33,44,65,56], 2), +(ARRAY[11,22,33,44,65,56], 10), +(ARRAY[11,22,33,44,65,56], 3), +(ARRAY[11,22,33,44,65,56], 7), +(ARRAY[11,22,33,44,65,56], 6), +(ARRAY[11,22,33,44,65,56], -6), +(ARRAY[11,22,33,144,65,56], 9), +(ARRAY[11,22,233,44,65,56], 0), +(ARRAY[11,22,33,44,65,56], 12), +(ARRAY[11,22,33,44,65,56], -3), +(ARRAY[11,22,33,44,65,56], -1); + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5); + +-- Test that independent vars get divided by 5, by verifying min value goes from 1 to 0.2, and max value from 233 to 46.6 +SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a; +SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM minibatch_preprocessor_dl_batch) a; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5, + 6); + +-- Test that dependent vars get shifted by +6, by verifying minimum value goes from -6 to 0 +SELECT assert(abs(MIN(y))<0.00001, 'Dependent var not shifted properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a; + +DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; +SELECT minibatch_preprocessor_dl( + 'minibatch_preprocessor_dl_input', + 'minibatch_preprocessor_dl_batch', + 'y', + 'x', + 4, + 5, + -6); + +-- Test that dependent vars get shifted by -6, by verifying minimum value goes 
from -6 to -12 +SELECT assert(relative_error(MIN(y), -12)<0.00001, 'Dependent var not shifted properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a; \ No newline at end of file