From e384c1fc7bb27b7c2401b17b6049cee1374fee1a Mon Sep 17 00:00:00 2001
From: Orhan Kislal <okislal@pivotal.io>
Date: Mon, 23 Jan 2017 15:45:08 -0800
Subject: [PATCH 1/5] RF: Fixes the online help and example

---
 .../random_forest.py_in                       | 83 +++++++++++--------
 1 file changed, 47 insertions(+), 36 deletions(-)

diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index e006a3468..0eb5985c6 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -103,6 +103,10 @@ SELECT {schema_madlib}.forest_train(
                                 is an positive integer with the default 0.
     verbose,                -- Boolean, whether to print more info,
                               default is False
+    sample_ratio            -- Double precision, in the range of (0, 1], default: 1
+                                If sample_ratio is less than 1, a bootstrap sample
+                                size smaller than the data table is expected to be
+                                used for training each tree in the forest.
 );
 
 ------------------------------------------------------------
@@ -175,44 +179,51 @@ it has the following columns:
 ------------------------------------------------------------
                         EXAMPLE
 ------------------------------------------------------------
-DROP TABLE IF EXISTS dummy_dt_con_src CASCADE;
-CREATE TABLE dummy_dt_con_src (
-    id  INTEGER,
-    cat INTEGER[],
-    con FLOAT8[],
-    y   FLOAT8
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    windy text,
+    class text
 );
 
-INSERT INTO dummy_dt_src VALUES
-(1, '{0}'::INTEGER[], ARRAY[0], 0.5),
-(2, '{0}'::INTEGER[], ARRAY[1], 0.5),
-(3, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(4, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(5, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(6, '{0}'::INTEGER[], ARRAY[5], 0.1),
-(7, '{0}'::INTEGER[], ARRAY[6], 0.1),
-(8, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(9, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(10, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(11, '{1}'::INTEGER[], ARRAY[9], 0.1);
-
-DROP TABLE IF EXISTS forest_out, forest_out_summary;
-SELECT madlib.forest_train(
-    'dummy_dt_src',
-    'forest_out',
-    'id',
-    'y',
-    'cat, con',
-    '',
-    'mse',
-    NULL::Text,
-    NULL::Text,
-    3,
-    2,
-    1,
-    5);
-
-SELECT madlib.forest_display('forest_out');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
+(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
+(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Don''t Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Don''t Play');
+
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('dt_golf',         -- source table
+    'train_output',    -- output model table
+    'id',              -- id column
+    'class',           -- response
+    '"OUTLOOK", temperature, humidity, windy',   -- features
+    NULL,              -- exclude columns
+    NULL,              -- grouping columns
+    20::integer,       -- number of trees
+    2::integer,        -- number of random features
+    TRUE::boolean,     -- variable importance
+    1::integer,        -- num_permutations
+    8::integer,        -- max depth
+    3::integer,        -- min split
+    1::integer,        -- min bucket
+    10::integer        -- number of splits per continuous variable
+);
+SELECT madlib.get_tree('train_output',1,2,FALSE);
+
         """
     else:
         help_string = "No such option. Use {schema_madlib}.forest_train('usage')"

From dcf27d4dc32d643ee15cd8e00b054426d07b3626 Mon Sep 17 00:00:00 2001
From: Nandish Jayaram <njayaram2@users.noreply.github.com>
Date: Fri, 13 Jan 2017 09:41:43 -0800
Subject: [PATCH 2/5] Association Rules: Add rule counts to assoc_rules results

JIRA: MADLIB-1044

The output column currently lists support, along with other columns.
This commit now lists the count too, which is essentially
support*number_of_transactions for any rule. This can certainly be
post-processed too, but listing it explicitly might be useful.
---
 .../modules/assoc_rules/assoc_rules.py_in     |  5 +-
 .../modules/assoc_rules/assoc_rules.sql_in    | 48 ++++++++++++-------
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
index d5eee487d..356e70e9c 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
@@ -113,6 +113,7 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
             ruleId      INT,
             pre         TEXT[],
             post        TEXT[],
+            count       INT,
             support     FLOAT8,
             confidence  FLOAT8,
             lift        FLOAT8,
@@ -459,14 +460,14 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
 
         plpy.execute("""
              INSERT INTO {0}.assoc_rules
-             SELECT t1.ruleId, t2.pre, t3.post, support,
+             SELECT t1.ruleId, t2.pre, t3.post, t1.support*{1}::INT AS count, support,
                     confidence, lift, conviction
              FROM
                 assoc_rules_aux_tmp t1,
                 pre_tmp_table t2,
                 post_tmp_table t3
              WHERE t1.ruleId = t2.ruleId AND t1.ruleId = t3.ruleId
-             """.format(output_schema)
+             """.format(output_schema, num_tranx)
              );
 
         # if in verbose mode, we will keep all the intermediate tables
diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index 99b8f00da..846e3e044 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -81,18 +81,28 @@ Transactions with multiple items will span multiple rows with one row per item.
 @par Rules
 
 Association rules take the form "If X, then Y", where X and Y are non-empty
-itemsets. X and Y are called the antecedent and consequent, or the left-hand-side 
+itemsets. X and Y are called the antecedent and consequent, or the left-hand-side
 and right-hand-side, of the rule respectively. Using our previous example,
 the association rule may state "If {diapers}, then {beer}" with .2 support and
 .85 confidence.
 
-Given any association rule "If X, then Y", the association rules function will
-also calculate the following metrics:
+The following metrics are defined for any given itemset "X".
+- Count: The number of transactions that contain X
+
 - Support: The ratio of transactions that contain X to all transactions, T
 \f[
 S (X) = \frac{Total X}{Total transactions}
 \f]
 
+Given any association rule "If X, then Y", the association rules function will
+also calculate the following metrics:
+- Count: The number of transactions that contain X U Y
+
+- Support: The ratio of transactions that contain X U Y to all transactions, T
+\f[
+S (X \Rightarrow Y) = \frac{Total(X \cup Y)}{Total transactions}
+\f]
+
 - Confidence: The ratio of transactions that contain \f$ X,Y \f$ to
 transactions that contain \f$ X \f$. One could view this metric as the
 conditional probability of \f$ Y \f$ , given \f$ X \f$ . \f$ P(Y|X) \f$
@@ -199,7 +209,7 @@ This generates all association rules that satisfy the specified minimum
   The schema must be created before calling the function.  Alternatively, use
   <tt>NULL</tt> to output to the current schema.
 
-  The results containing the rules, support, confidence, lift, and
+  The results containing the rules, support, count, confidence, lift, and
   conviction are stored in the table \c assoc_rules in the schema
   specified by \c output_schema.
 
@@ -221,6 +231,10 @@ This generates all association rules that satisfy the specified minimum
         <th>support</th>
         <td>double</td>
       </tr>
+      <tr>
+        <th>count</th>
+        <td>integer</td>
+      </tr>
       <tr>
         <th>confidence</th>
         <td>double</td>
@@ -306,15 +320,15 @@ ORDER BY support DESC, confidence DESC;
 </pre>
 Result:
 <pre class="result">
- ruleid |       pre       |      post      |      support      |    confidence     |       lift        |    conviction     
---------+-----------------+----------------+-------------------+-------------------+-------------------+-------------------
-      4 | {diapers}       | {beer}         | 0.714285714285714 |                 1 |                 1 |                 0
-      3 | {beer}          | {diapers}      | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
-      1 | {chips}         | {beer}         | 0.428571428571429 |                 1 |                 1 |                 0
-      7 | {diapers,chips} | {beer}         | 0.285714285714286 |                 1 |                 1 |                 0
-      2 | {chips}         | {diapers}      | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
-      5 | {chips}         | {beer,diapers} | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
-      6 | {beer,chips}    | {diapers}      | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+ ruleid |       pre       |      post      | count |      support      |    confidence     |       lift        |    conviction
+--------+-----------------+----------------+-------+-------------------+-------------------+-------------------+-------------------
+      4 | {diapers}       | {beer}         |     5 | 0.714285714285714 |                 1 |                 1 |                 0
+      3 | {beer}          | {diapers}      |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
+      1 | {chips}         | {beer}         |     3 | 0.428571428571429 |                 1 |                 1 |                 0
+      7 | {diapers,chips} | {beer}         |     2 | 0.285714285714286 |                 1 |                 1 |                 0
+      2 | {chips}         | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      5 | {chips}         | {beer,diapers} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      6 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
 (7 rows)
 </pre>
 
@@ -326,10 +340,10 @@ SELECT * FROM assoc_rules WHERE array_upper(pre,1) = 1 AND post = array['beer'];
 </pre>
 Result:
 <pre class="result">
- ruleid |    pre    |  post  |      support      | confidence | lift | conviction 
---------+-----------+--------+-------------------+------------+------+------------
-      1 | {chips}   | {beer} | 0.428571428571429 |          1 |    1 |          0
-      4 | {diapers} | {beer} | 0.714285714285714 |          1 |    1 |          0
+ ruleid |    pre    |  post  | count |      support      | confidence | lift | conviction
+--------+-----------+--------+-------+-------------------+------------+------+------------
+      1 | {chips}   | {beer} |     3 | 0.428571428571429 |          1 |    1 |          0
+      4 | {diapers} | {beer} |     5 | 0.714285714285714 |          1 |    1 |          0
 (2 rows)
 </pre>
 

From fcde22226a2db6772dcb9c7f4f0007c2836a02e2 Mon Sep 17 00:00:00 2001
From: Nandish Jayaram <njayaram2@users.noreply.github.com>
Date: Wed, 18 Jan 2017 10:25:51 -0800
Subject: [PATCH 3/5] Add parameter to limit itemset size in Apriori

JIRA: MADLIB-1031

- Add a new optional parameter named max_itemset_size that determines
the maximum size of frequent itemsets that is to be considered for
generating association rules from. Value lower than 2 will throw
and error, and the default behavior is to generate rules from all
frequent itemsets found.
- This commit also includes improved online and user documentation
with examples of usage.
---
 .../modules/assoc_rules/assoc_rules.py_in     | 128 +++++++++++++++++-
 .../modules/assoc_rules/assoc_rules.sql_in    | 101 +++++++++++++-
 .../assoc_rules/test/assoc_rules.sql_in       |   9 ++
 3 files changed, 229 insertions(+), 9 deletions(-)

diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
index 356e70e9c..c6eff3589 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
@@ -7,7 +7,6 @@
 
 @namespace assoc_rules
 """
-
 import time
 import plpy
 from utilities.validate_args import columns_exist_in_table
@@ -55,13 +54,19 @@ def __float_le(val1, val2):
 @param input_table     name of the table where the data is stored
 @param output_schema   name of the schema where the final results will be stored
 @param verbose         determining if output contains comments
+@param max_itemset_size determines the maximum size of frequent itemsets allowed
+                        to generate association rules from
 """
 def assoc_rules(madlib_schema, support, confidence, tid_col,
-                item_col, input_table, output_schema, verbose):
+                item_col, input_table, output_schema, verbose, max_itemset_size):
 
     begin_func_exec = time.time();
     begin_step_exec = time.time();
     cal_itemsets_time = 0;
+    if max_itemset_size is None:
+        max_itemset_size = float('inf')
+    elif max_itemset_size <= 1:
+        plpy.error("ERROR: max_itemset_size has to be greater than 1.")
 
     #check parameters
     __assert(
@@ -305,7 +310,7 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
 
     iter = 0;
 
-    while num_item_loop > 0 :
+    while num_item_loop > 0 and iter < max_itemset_size:
         begin_step_exec = time.time();
         iter = iter + 1;
 
@@ -505,3 +510,120 @@ def assoc_rules(madlib_schema, support, confidence, tid_col,
             total_rules,
             time.time() - begin_func_exec
            );
+
+def assoc_rules_help_message(schema_madlib, message=None, **kwargs):
+    """
+    Given a help string, provide usage information
+
+    Args:
+        @param schema_madlib Name of the MADlib schema
+        @param message  Helper message to print
+
+    Returns:
+        None
+    """
+    if message is not None and \
+            message.lower() in ("usage", "help", "?"):
+        return """
+-----------------------------------------------------------------------
+                                USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.assoc_rules(
+    support,            -- FLOAT8, minimum level of support needed for each itemset to be included in result
+    confidence,         -- FLOAT8, minimum level of confidence needed for each rule to be included in result
+    tid_col,            -- TEXT, name of the column storing the transaction ids
+    item_col,           -- TEXT, name of the column storing the products
+    input_table,        -- TEXT, name of the table containing the input data
+    output_schema,      -- TEXT, name of the schema where the final results will be stored.
+                                The schema must be created before calling the function.  Alternatively, use
+                                <tt>NULL</tt> to output to the current schema.
+    verbose,            -- BOOLEAN, (optional, default: False) determines if details are printed for each
+                                iteration as the algorithm progresses
+    max_itemset_size    -- INTEGER, (optional, default: itemsets of all sizes) determines the maximum size of frequent
+                                itemsets allowed that are used for generating association rules. Value less
+                                than 2 throws an error.
+);
+-------------------------------------------------------------------------
+                                OUTPUT TABLES
+-------------------------------------------------------------------------
+The output table "assoc_rules" in the "output_schema" contains a unique rule of the form "If X, then Y
+(i.e., X => Y)" in each row. X and Y are non-empty itemsets, called the antecedent and consequent, or
+the left-hand-side (LHS) and right-hand-side (LHS), of the rule respectively.
+
+in each row, with the following columns:
+    ruleid,     -- INTEGER, row number
+    pre,        -- TEXT, specifies the antecedent, or the LHS of the rule
+    post,       -- DOUBLE, specifies the consequent, or the RHS of the rule
+    support,    -- DOUBLE, support of the frequent itemset X,Y
+    count,      -- INTEGER, number of transactions in the input table that contain X,Y
+    confidence, -- DOUBLE, the ratio of number of transactions that contain X,Y to the number of transactions
+                        that contain X
+    lift,       -- DOUBLE, the ratio of observed support of X,Y to the expected support of X,Y, assuming X and
+                        Y are independent.
+    conviction  -- DOUBLE, the ratio of expected support of X occurring without Y assuming X and Y are
+                        independent, to the observed support of X occuring without Y
+        """.format(schema_madlib=schema_madlib)
+    else:
+        if message.lower() in ("example", "examples"):
+            return """
+------------------------------------------------------------------------
+                                EXAMPLES
+------------------------------------------------------------------------
+DROP TABLE IF EXISTS test_data;
+CREATE TABLE test_data (
+    trans_id INT,
+    product TEXT
+);
+INSERT INTO test_data VALUES (1, 'beer');
+INSERT INTO test_data VALUES (1, 'diapers');
+INSERT INTO test_data VALUES (1, 'chips');
+INSERT INTO test_data VALUES (2, 'beer');
+INSERT INTO test_data VALUES (2, 'diapers');
+INSERT INTO test_data VALUES (3, 'beer');
+INSERT INTO test_data VALUES (3, 'diapers');
+INSERT INTO test_data VALUES (4, 'beer');
+INSERT INTO test_data VALUES (4, 'chips');
+INSERT INTO test_data VALUES (5, 'beer');
+INSERT INTO test_data VALUES (6, 'beer');
+INSERT INTO test_data VALUES (6, 'diapers');
+INSERT INTO test_data VALUES (6, 'chips');
+INSERT INTO test_data VALUES (7, 'beer');
+INSERT INTO test_data VALUES (7, 'diapers');
+\.
+
+Find all association rules with a support and threshold value of
+at least 0.25 and 0.5 respectively.
+
+SELECT * FROM {schema_madlib}.assoc_rules( .25,
+                          .5,
+                          'trans_id',
+                          'product',
+                          'test_data',
+                          NULL,
+                          TRUE
+                        );
+
+View output results:
+SELECT * FROM assoc_rules;
+
+Find association rules generated from itemsets of size at most 2,
+and a support and threshold value of at least 0.25 and 0.5 respectively.
+
+SELECT * FROM {schema_madlib}.assoc_rules( .25,
+                          .5,
+                          'trans_id',
+                          'product',
+                          'test_data',
+                          NULL,
+                          TRUE,
+                          2
+                        );
+
+View output results:
+SELECT * FROM assoc_rules;
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
+For an overview on usage, run: SELECT {schema_madlib}.assoc_rules('usage');
+For an example of using assoc_rules, run: SELECT {schema_madlib}.assoc_rules('example');
+            """.format(schema_madlib=schema_madlib)
diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index 846e3e044..5bfd8130f 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -96,9 +96,9 @@ S (X) = \frac{Total X}{Total transactions}
 
 Given any association rule "If X, then Y", the association rules function will
 also calculate the following metrics:
-- Count: The number of transactions that contain X U Y
+- Count: The number of transactions that contain X,Y
 
-- Support: The ratio of transactions that contain X U Y to all transactions, T
+- Support: The ratio of transactions that contain X,Y to all transactions, T
 \f[
 S (X \Rightarrow Y) = \frac{Total(X \cup Y)}{Total transactions}
 \f]
@@ -257,8 +257,13 @@ This generates all association rules that satisfy the specified minimum
   </dd>
 
   <dt>verbose</dt>
-  <dd>BOOLEAN, default FALSE. Determines if details are printed for each iteration
+  <dd>BOOLEAN, default: FALSE. Determines if details are printed for each iteration
   as the algorithm progresses.</dd>
+
+  <dt>max_itemset_size</dt>
+  <dd>INTEGER, default: itemsets of ALL sizes. Determines the maximum size of frequent
+  itemsets allowed that are used for generating association rules. Value less
+  than 2 throws an error..</dd>
 </dl>
 
 
@@ -331,6 +336,39 @@ Result:
       6 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
 (7 rows)
 </pre>
+Let us now find association rules generated from itemsets of size at most 2:
+<pre class="example">
+SELECT * FROM madlib.assoc_rules( .25,
+                                  .5,
+                                  'trans_id',
+                                  'product',
+                                  'test_data',
+                                  NULL,
+                                  TRUE, 2
+                                );
+</pre>
+Result (iteration details not shown):
+<pre class="result">
+ output_schema | output_table | total_rules |   total_time
+---------------+--------------+-------------+-----------------
+ public        | assoc_rules  |           4 | 00:00:00.053899
+(1 row)
+</pre>
+The association rules are again stored in the assoc_rules table:
+<pre class="example">
+SELECT * FROM assoc_rules
+ORDER BY support DESC, confidence DESC;
+</pre>
+Result:
+<pre class="result">
+ ruleid |    pre    |   post    | count |      support      |    confidence     |       lift        |    conviction
+--------+-----------+-----------+-------+-------------------+-------------------+-------------------+-------------------
+      4 | {diapers} | {beer}    |     5 | 0.714285714285714 |                 1 |                 1 |                 0
+      3 | {beer}    | {diapers} |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
+      1 | {chips}   | {beer}    |     3 | 0.428571428571429 |                 1 |                 1 |                 0
+      2 | {chips}   | {diapers} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+(4 rows)
+</pre>
 
 -# Post-processing can now be done on the output table in the case that
 you want to filter the results.  For example, if you want any single item on the left hand side 
@@ -447,7 +485,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules
     item_col TEXT,
     input_table TEXT,
     output_schema TEXT,
-    verbose BOOLEAN
+    verbose BOOLEAN,
+    max_itemset_size INTEGER
    )
 RETURNS MADLIB_SCHEMA.assoc_rules_results
 AS $$
@@ -465,7 +504,8 @@ AS $$
         item_col,
         input_table,
         output_schema,
-        verbose
+        verbose,
+        max_itemset_size
         );
 
 $$ LANGUAGE plpythonu
@@ -502,8 +542,57 @@ AS $$
         item_col,
         input_table,
         output_schema,
-        False
+        False,
+        'NULL'
+        );
+
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules
+    (
+    support FLOAT8,
+    confidence FLOAT8,
+    tid_col TEXT,
+    item_col TEXT,
+    input_table TEXT,
+    output_schema TEXT,
+    verbose BOOLEAN
+    )
+RETURNS MADLIB_SCHEMA.assoc_rules_results
+AS $$
+
+    PythonFunctionBodyOnly(`assoc_rules', `assoc_rules')
+
+    plpy.execute("SET client_min_messages = error;")
+
+    # schema_madlib comes from PythonFunctionBodyOnly
+    return assoc_rules.assoc_rules(
+        schema_madlib,
+        support,
+        confidence,
+        tid_col,
+        item_col,
+        input_table,
+        output_schema,
+        verbose,
+        'NULL'
         );
 
 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+--------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules(message TEXT)
+RETURNS text AS $$
+PythonFunction(assoc_rules, assoc_rules, assoc_rules_help_message)
+$$ language plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.assoc_rules()
+RETURNS text AS $$
+    SELECT MADLIB_SCHEMA.assoc_rules('');
+$$ language SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
diff --git a/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
index 59aba05f4..5a6275ece 100644
--- a/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/test/assoc_rules.sql_in
@@ -28,6 +28,7 @@ declare
     result1        TEXT;
     result2        TEXT;
     result3        TEXT;
+    result_maxiter TEXT;
     res            MADLIB_SCHEMA.assoc_rules_results;
     output_schema  TEXT;
     output_table   TEXT;
@@ -137,6 +138,10 @@ begin
           abs(t1.support - t2.support) < 1E-10 AND
           abs(t1.confidence - t2.confidence) < 1E-10;
 
+    PERFORM MADLIB_SCHEMA.assoc_rules (.1, .5, 'trans_id', 'product', 'test_data2','madlib_installcheck_assoc_rules', false, 2);
+    SELECT INTO result_maxiter CASE WHEN count(*) = 4 then 'PASS' ELSE 'FAIL' END
+    FROM assoc_rules;
+
     DROP TABLE IF EXISTS test_data1;
     DROP TABLE IF EXISTS test_data2;
     DROP TABLE IF EXISTS test2_exp_result;
@@ -150,6 +155,10 @@ begin
         RAISE EXCEPTION 'Association rules mining failed. No results were returned.';
     END IF;
 
+    IF result_maxiter = 'FAIL' THEN
+        RAISE EXCEPTION 'Association rules mining error when max_iter parameter specified.';
+    END IF;
+
     RAISE INFO 'Association rules install check passed.';
     RETURN;
 

From 9532c69cb8677aea558ab5118cbd6733a116973e Mon Sep 17 00:00:00 2001
From: Nandish Jayaram <njayaram2@users.noreply.github.com>
Date: Fri, 20 Jan 2017 13:19:56 -0800
Subject: [PATCH 4/5] Remove an unwanted symbol from example string.

---
 src/ports/postgres/modules/assoc_rules/assoc_rules.py_in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
index c6eff3589..abc8b5080 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.py_in
@@ -589,7 +589,6 @@ INSERT INTO test_data VALUES (6, 'diapers');
 INSERT INTO test_data VALUES (6, 'chips');
 INSERT INTO test_data VALUES (7, 'beer');
 INSERT INTO test_data VALUES (7, 'diapers');
-\.
 
 Find all association rules with a support and threshold value of
 at least 0.25 and 0.5 respectively.

From a8acfe8a4947038bd427e30501cc10a4157f6efd Mon Sep 17 00:00:00 2001
From: Nandish Jayaram <njayaram2@users.noreply.github.com>
Date: Wed, 25 Jan 2017 15:55:08 -0800
Subject: [PATCH 5/5] doc changes

---
 .../modules/assoc_rules/assoc_rules.sql_in    | 107 +++++++++---------
 1 file changed, 55 insertions(+), 52 deletions(-)

diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index 5bfd8130f..fc87b1673 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -43,7 +43,7 @@ support and confidence values, this function generates all single and
 multidimensional association rules that meet the minimum thresholds.
 
 Association rule mining is a widely used technique for discovering relationships
-between variables in a large data set (e.g items in a store that are commonly
+between variables in a large data set (e.g., items in a store that are commonly
 purchased together). The classic market basket analysis example using
 association rules is the "beer and diapers" rule. According to data mining urban
 legend, a study of customer purchase behavior in a supermarket found that men
@@ -62,7 +62,7 @@ data is stored in two columns with one item and transaction id per row.
 Transactions with multiple items will span multiple rows with one row per item.
 
 <pre>
-     tran_id | product
+    trans_id | product
     ---------+---------
            1 | 1
            1 | 2
@@ -134,7 +134,7 @@ Conv (X \Rightarrow Y) = \frac{1 - S(Y)}{1 - C(X \Rightarrow Y)}
 @par Apriori Algorithm
 
 Although there are many algorithms that generate association rules, the classic
-algorithm used is called Apriori [1] which we have implemented in this module. It is a
+algorithm is called Apriori [1] which we have implemented in this module. It is a
 breadth-first search, as opposed to depth-first searches like Eclat. Frequent
 itemsets of order \f$ n \f$ are generated from sets of order \f$ n - 1 \f$.
 Using the downward closure property, all sets must have frequent subsets. There
@@ -153,7 +153,7 @@ itemsets of order \f$ n - 1 \f$.
 This is done by doing the union of two itemsets that have identical items except one.
 -# Eliminate itemsets that have (n-1) order subsets with insufficient support.
 -# Eliminate itemsets with insufficient support.
--# Repeat until itemsets cannot be generated.
+-# Repeat until itemsets cannot be generated, or maximum itemset size is exceeded.
 
 \e Association \e rule \e generation
 
@@ -172,7 +172,8 @@ assoc_rules( support,
              item_col,
              input_table,
              output_schema,
-             verbose
+             verbose,
+             max_itemset_size
            );</pre>
 This generates all association rules that satisfy the specified minimum
 <em>support</em> and <em>confidence</em>.
@@ -180,19 +181,19 @@ This generates all association rules that satisfy the specified minimum
 \b Arguments
 <dl class="arglist">
   <dt>support</dt>
-  <dd>The minimum level of support needed for each itemset to be included in result.</dd>
+  <dd>Minimum level of support needed for each itemset to be included in result.</dd>
 
   <dt>confidence</dt>
-  <dd>The minimum level of confidence needed for each rule to be included in result.</dd>
+  <dd>Minimum level of confidence needed for each rule to be included in result.</dd>
 
   <dt>tid_col</dt>
-  <dd>The name of the column storing the transaction ids.</dd>
+  <dd>Name of the column storing the transaction ids.</dd>
 
   <dt>item_col</dt>
-  <dd>The name of the column storing the products.</dd>
+  <dd>Name of the column storing the products.</dd>
 
   <dt>input_table</dt>
-  <dd>The name of the table containing the input data.
+  <dd>Name of the table containing the input data.
 
   The input data is expected to be of the following form:
 <pre>{TABLE|VIEW} <em>input_table</em> (
@@ -227,14 +228,14 @@ This generates all association rules that satisfy the specified minimum
         <th>post</th>
         <td>text</td>
       </tr>
-      <tr>
-        <th>support</th>
-        <td>double</td>
-      </tr>
       <tr>
         <th>count</th>
         <td>integer</td>
       </tr>
+      <tr>
+        <th>support</th>
+        <td>double</td>
+      </tr>
       <tr>
         <th>confidence</th>
         <td>double</td>
@@ -261,16 +262,16 @@ This generates all association rules that satisfy the specified minimum
   as the algorithm progresses.</dd>
 
   <dt>max_itemset_size</dt>
-  <dd>INTEGER, default: itemsets of ALL sizes. Determines the maximum size of frequent
-  itemsets allowed that are used for generating association rules. Value less
-  than 2 throws an error..</dd>
+  <dd>INTEGER, default: generate itemsets of all sizes. Determines the maximum size of frequent
+  itemsets that are used for generating association rules. Must be 2 or more.
+  This parameter can be used to reduce run time for data sets where itemset size is large. </dd>
 </dl>
 
 
 @anchor examples
 @examp
 
-Let us take a look at some sample transactional data and generate association rules.
+Let's look at some sample transactional data and generate association rules.
 
 -# Create an input dataset:
 <pre class="example">
@@ -302,20 +303,20 @@ In this example we set verbose to
 TRUE so that we have some insight into progress of the function. We
 can now generate association rules as follows:
 <pre class="example">
-SELECT * FROM madlib.assoc_rules( .25,
-                                  .5,
-                                  'trans_id',
-                                  'product',
-                                  'test_data',
-                                  NULL,
-                                  TRUE
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE            -- Verbose output
                                 );
 </pre>
 Result (iteration details not shown):
 <pre class="result">
  output_schema | output_table | total_rules |   total_time    
 ---------------+--------------+-------------+-----------------
- public        | assoc_rules  |           7 | 00:00:00.028534
+ public        | assoc_rules  |           7 | 00:00:00.569254
 (1 row)
 </pre>
 The association rules are stored in the assoc_rules table:
@@ -325,33 +326,35 @@ ORDER BY support DESC, confidence DESC;
 </pre>
 Result:
 <pre class="result">
- ruleid |       pre       |      post      | count |      support      |    confidence     |       lift        |    conviction
+ ruleid |       pre       |      post      | count |      support      |    confidence     |       lift        |    conviction     
 --------+-----------------+----------------+-------+-------------------+-------------------+-------------------+-------------------
-      4 | {diapers}       | {beer}         |     5 | 0.714285714285714 |                 1 |                 1 |                 0
-      3 | {beer}          | {diapers}      |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
-      1 | {chips}         | {beer}         |     3 | 0.428571428571429 |                 1 |                 1 |                 0
-      7 | {diapers,chips} | {beer}         |     2 | 0.285714285714286 |                 1 |                 1 |                 0
-      2 | {chips}         | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
-      5 | {chips}         | {beer,diapers} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
-      6 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      2 | {diapers}       | {beer}         |     5 | 0.714285714285714 |                 1 |                 1 |                 0
+      6 | {beer}          | {diapers}      |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
+      5 | {chips}         | {beer}         |     3 | 0.428571428571429 |                 1 |                 1 |                 0
+      4 | {chips,diapers} | {beer}         |     2 | 0.285714285714286 |                 1 |                 1 |                 0
+      1 | {chips}         | {diapers,beer} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      7 | {chips}         | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      3 | {beer,chips}    | {diapers}      |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
 (7 rows)
 </pre>
-Let us now find association rules generated from itemsets of size at most 2:
+
+-# Limit association rules generated from itemsets of size at most 2:
 <pre class="example">
-SELECT * FROM madlib.assoc_rules( .25,
-                                  .5,
-                                  'trans_id',
-                                  'product',
-                                  'test_data',
-                                  NULL,
-                                  TRUE, 2
+SELECT * FROM madlib.assoc_rules( .25,            -- Support
+                                  .5,             -- Confidence
+                                  'trans_id',     -- Transaction id col
+                                  'product',      -- Product col
+                                  'test_data',    -- Input data
+                                  NULL,           -- Output schema
+                                  TRUE,           -- Verbose output
+                                  2               -- Max itemset size
                                 );
 </pre>
 Result (iteration details not shown):
 <pre class="result">
- output_schema | output_table | total_rules |   total_time
+ output_schema | output_table | total_rules |   total_time    
 ---------------+--------------+-------------+-----------------
- public        | assoc_rules  |           4 | 00:00:00.053899
+ public        | assoc_rules  |           4 | 00:00:00.565176
 (1 row)
 </pre>
 The association rules are again stored in the assoc_rules table:
@@ -361,12 +364,12 @@ ORDER BY support DESC, confidence DESC;
 </pre>
 Result:
 <pre class="result">
- ruleid |    pre    |   post    | count |      support      |    confidence     |       lift        |    conviction
+ ruleid |    pre    |   post    | count |      support      |    confidence     |       lift        |    conviction     
 --------+-----------+-----------+-------+-------------------+-------------------+-------------------+-------------------
-      4 | {diapers} | {beer}    |     5 | 0.714285714285714 |                 1 |                 1 |                 0
-      3 | {beer}    | {diapers} |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
-      1 | {chips}   | {beer}    |     3 | 0.428571428571429 |                 1 |                 1 |                 0
-      2 | {chips}   | {diapers} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
+      1 | {diapers} | {beer}    |     5 | 0.714285714285714 |                 1 |                 1 |                 0
+      2 | {beer}    | {diapers} |     5 | 0.714285714285714 | 0.714285714285714 |                 1 |                 1
+      3 | {chips}   | {beer}    |     3 | 0.428571428571429 |                 1 |                 1 |                 0
+      4 | {chips}   | {diapers} |     2 | 0.285714285714286 | 0.666666666666667 | 0.933333333333333 | 0.857142857142857
 (4 rows)
 </pre>
 
@@ -378,10 +381,10 @@ SELECT * FROM assoc_rules WHERE array_upper(pre,1) = 1 AND post = array['beer'];
 </pre>
 Result:
 <pre class="result">
- ruleid |    pre    |  post  | count |      support      | confidence | lift | conviction
+ ruleid |    pre    |  post  | count |      support      | confidence | lift | conviction 
 --------+-----------+--------+-------+-------------------+------------+------+------------
-      1 | {chips}   | {beer} |     3 | 0.428571428571429 |          1 |    1 |          0
-      4 | {diapers} | {beer} |     5 | 0.714285714285714 |          1 |    1 |          0
+      1 | {diapers} | {beer} |     5 | 0.714285714285714 |          1 |    1 |          0
+      3 | {chips}   | {beer} |     3 | 0.428571428571429 |          1 |    1 |          0
 (2 rows)
 </pre>