From 03f6e0572f80e9437a45f84702db77b2d91ab3fd Mon Sep 17 00:00:00 2001 From: Frank McQuillan Date: Thu, 9 Nov 2017 15:50:33 -0800 Subject: [PATCH] multiple doc updates for 1dot13 --- doc/mainpage.dox.in | 6 +- src/ports/postgres/modules/convex/mlp.sql_in | 3 +- .../postgres/modules/graph/measures.sql_in | 4 +- .../postgres/modules/graph/pagerank.sql_in | 15 +- .../decision_tree.sql_in | 29 +- .../random_forest.sql_in | 28 +- .../postgres/modules/regress/linear.sql_in | 194 ++++++---- .../postgres/modules/regress/logistic.sql_in | 242 ++++++++---- .../postgres/modules/stats/correlation.sql_in | 2 +- .../postgres/modules/summary/summary.sql_in | 355 +++++++++++++----- .../postgres/modules/utilities/path.sql_in | 4 +- 11 files changed, 597 insertions(+), 285 deletions(-) diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index cddc2b93b..4a58e3067 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -99,7 +99,7 @@ complete matrix stored as a distributed table. @defgroup grp_matrix_factorization Matrix Factorization @brief Matrix Factorization methods including Singular Value Decomposition and Low-rank Matrix Factorization @{ - @defgroup grp_lmf Low-rank Matrix Factorization + @defgroup grp_lmf Low-Rank Matrix Factorization @defgroup grp_svd Singular Value Decomposition @} @@ -134,10 +134,10 @@ Contains graph algorithms. 
@defgroup grp_graph_measures Measures Graph Measures @{ + @defgroup grp_graph_avg_path_length Average Path Length @defgroup grp_graph_closeness Closeness @defgroup grp_graph_diameter Graph Diameter - @defgroup grp_graph_avg_path_length Average Path Length - @defgroup grp_graph_vertex_degrees In-Out degree + @defgroup grp_graph_vertex_degrees In-Out Degree @} @defgroup grp_pagerank PageRank @defgroup grp_sssp Single Source Shortest Path diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in index bafb4dd0f..e6e271619 100644 --- a/src/ports/postgres/modules/convex/mlp.sql_in +++ b/src/ports/postgres/modules/convex/mlp.sql_in @@ -1228,8 +1228,7 @@ For details on backpropogation, see [2]. @literature @anchor mlp-lit-1 -[1] "Multilayer Perceptron." Wikipedia. Wikimedia Foundation, - 12 July 2017. Web. 12 July 2017. +[1] https://en.wikipedia.org/wiki/Multilayer_perceptron [2] Yu Hen Hu. "Lecture 11. MLP (III): Back-Propagation." University of Wisconsin Madison: Computer-Aided Engineering. Web. 12 July 2017, diff --git a/src/ports/postgres/modules/graph/measures.sql_in b/src/ports/postgres/modules/graph/measures.sql_in index b92f68396..92680ba46 100644 --- a/src/ports/postgres/modules/graph/measures.sql_in +++ b/src/ports/postgres/modules/graph/measures.sql_in @@ -84,7 +84,7 @@ the following columns (in addition to the grouping columns):
TEXT, default = NULL. Valid PostgreSQL expression that describes the vertices to generate closeness measures for. If this parameter is not specified, closeness measures are generated for all vertices in the apsp table. -This input should be treated like a WHERE clause. +You can think of this input parameter as being like a WHERE clause. Some example inputs: - If you want a short list of vertices, say 1, 2 and 3: @@ -750,7 +750,7 @@ INSERT INTO edge_gr VALUES (4,5,-20,1); --# Find APSP for all groups: +-# Find in-out degrees for all groups:
 DROP TABLE IF EXISTS out_gr;
 SELECT madlib.graph_vertex_degrees(
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in b/src/ports/postgres/modules/graph/pagerank.sql_in
index e028f9228..a4c200431 100644
--- a/src/ports/postgres/modules/graph/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -35,7 +35,6 @@ m4_include(`SQLCommon.m4')
 
Contents @@ -46,7 +45,8 @@ m4_include(`SQLCommon.m4') Given a graph, the PageRank algorithm outputs a probability distribution representing the likelihood that a person randomly traversing the graph will arrive at any particular vertex. This algorithm was originally used by Google to rank websites where the World Wide Web was -modeled as a directed graph with the vertices representing the websites. +modeled as a directed graph with the vertices representing the websites. The PageRank +algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1]. @anchor pagerank @par PageRank @@ -100,13 +100,13 @@ regarding the number of iterations required for convergence. It is named by adding the suffix '_summary' to the 'out_table' parameter. -
damping_factor
+
damping_factor (optional)
FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.
-
max_iter
+
max_iter (optional)
INTEGER, default: 100. The maximum number of iterations allowed.
-
threshold
+
threshold (optional)
FLOAT8, default: (1/number of vertices * 1000). If the difference between the PageRank of every vertex of two consecutive iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the computation stops. If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'. @@ -122,11 +122,6 @@ a single model is generated for all data. -@anchor notes -@par Notes - -The PageRank algorithm proposed by Larry Page and Sergey Brin is used [1]. - @anchor examples @examp diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index 3632f2e2d..91e900d74 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -86,19 +86,21 @@ tree_train(
list_of_features
TEXT. Comma-separated string of column names or expressions to use as predictors. - Can also be a '*' implying all columns are to be used as predictors (except the - ones included in the next argument). The types of the features can be mixed - where boolean, integer, and text columns are considered categorical and - double precision columns are considered continuous. The categorical variables - are not encoded and used as is for the training. Array columns can also be - included in the list, where each element of the array is considered as a - feature. + Can also be a '*' implying all columns are to be used as predictors (except for the + ones included in the next argument that lists exclusions). + The types of the features can be mixed - boolean, integer, and text columns + are considered categorical and + double precision columns are considered continuous. Categorical variables + are not encoded and used as is for the training. + + Array columns can also be included in the list, where the array is expanded + to treat each element of the array as a feature. It is important to note that not every combination of the levels of a categorical variable is checked when evaluating a split. The levels of the non-integer categorical variable are ordered by the entropy of the variable in predicting the response. The split at each node is evaluated between these - ordered levels. Integer categorical variables, howeve, are simply ordered + ordered levels. Integer categorical variables, however, are simply ordered by their value.
@@ -124,7 +126,10 @@ tree_train( each group.
weights (optional)
-
TEXT. Column name containing weights for each observation.
+
TEXT. Column name containing numerical weights for each observation. + This can be used to handle the case of unbalanced data sets. + If this parameter is not set, all observations (tuples) + are treated equally with a weight of 1.0.
max_depth (optional)
INTEGER, default: 7. Maximum depth of any node of the final tree, @@ -552,7 +557,7 @@ SELECT madlib.tree_train('dt_golf', -- source table NULL::text, -- exclude columns 'gini', -- split criterion NULL::text, -- no grouping - NULL::text, -- no weights + NULL::text, -- no weights, all observations treated equally 5, -- max depth 3, -- min split 1, -- min bucket @@ -784,7 +789,7 @@ SELECT madlib.tree_train('mt_cars', -- source table 'id, hp, drat, am, gear, carb', -- exclude columns 'mse', -- split criterion NULL::text, -- no grouping - NULL::text, -- no weights + NULL::text, -- no weights, all observations treated equally 10, -- max depth 8, -- min split 3, -- number of bins per continuous variable @@ -940,7 +945,7 @@ SELECT madlib.tree_train('null_handling_example', -- source table NULL, -- features to exclude 'gini', -- split criterion NULL::text, -- no grouping - NULL::text, -- no weights + NULL::text, -- no weights, all observations treated equally 4, -- max depth 1, -- min split 1, -- number of bins per continuous variable diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in index d9ae9bfd7..b9ce9104a 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in @@ -277,18 +277,24 @@ forest_train(training_table_name, while float values are considered regression outputs.
list_of_features
-
text. Comma-separated string of column names to use as predictors. Can - also be a '*' implying all columns are to be used as predictors (except the - ones included in the next argument). The types of the features can be mixed - where boolean, integer, and text columns are considered categorical and - double precision columns are considered continuous. The categorical variables - are not encoded and used as is for the training. - - It is important to note that we don't test for every combination of - levels of a categorical variable when evaluating a split. We order the levels - of the non-integer categorical variable by the entropy of the variable in +
TEXT. Comma-separated string of column names or expressions to use as predictors. + Can also be a '*' implying all columns are to be used as predictors (except for the + ones included in the next argument that lists exclusions). + The types of the features can be mixed - boolean, integer, and text columns + are considered categorical and + double precision columns are considered continuous. Categorical variables + are not encoded and used as is for the training. + + Array columns can also be included in the list, where the array is expanded + to treat each element of the array as a feature. + + It is important to note that not every combination of the levels of a + categorical variable is checked when evaluating a split. The levels of the + non-integer categorical variable are ordered by the entropy of the variable in predicting the response. The split at each node is evaluated between these - ordered levels. Integer categorical variables are ordered by their value.
+ ordered levels. Integer categorical variables, however, are simply ordered + by their value. +
list_of_features_to_exclude
text. Comma-separated string of column names to exclude from the predictors diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in index 81261ce5d..6572652c5 100644 --- a/src/ports/postgres/modules/regress/linear.sql_in +++ b/src/ports/postgres/modules/regress/linear.sql_in @@ -29,7 +29,7 @@ m4_include(`SQLCommon.m4') @brief Also called Ordinary Least Squares Regression, models linear relationship between a dependent variable and one or more independent variables. Linear regression models a linear relationship of a scalar dependent variable -\f$ y \f$ to one or more explanatory independent variables \f$ x \f$ to build +\f$ y \f$ to one or more explanatory independent variables \f$ x \f$ and builds a model of coefficients. @anchor train @@ -49,12 +49,12 @@ linregr_train( source_table, \b Arguments
source_table
-
TEXT. The name of the table containing the training data.
+
TEXT. Name of the table containing the training data.
out_table
TEXT. Name of the generated table containing the output model. - The output table contains the following columns. + The output table contains the following columns: @@ -108,11 +108,18 @@ linregr_train( source_table, - -
\<...>
num_missing_rows_skippedINTEGER. The number of rows that have NULL values in the dependent and independent variables, and were skipped in the computation for each group.
+ INTEGER. The number of rows that have NULL values in the dependent and independent variables, and were skipped in the computation for each group. + + variance_covariance + FLOAT[]. Variance/covariance matrix. + A summary table named \_summary is created together with the output table. It has the following columns: + + + + @@ -126,11 +133,16 @@ linregr_train( source_table, -
method'linregr' for linear regression.
source_table The data source table name
The total number of rows that were used in the computation.
num_missing_rows_skipped The total number of rows that were skipped because of NULL values in them.
+ + + grouping_cols + Names of the grouping columns. + +
@note For p-values, we just return the computation result directly. -Other statistical packages, like 'R', produce the same result, but on printing the +Other statistical packages like 'R' produce the same result, but on printing the result to screen, another format function is used and any p-value that is smaller than the machine epsilon (the smallest positive floating-point number 'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the @@ -145,7 +157,8 @@ in fact the same.
TEXT. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant 1 term in the independent variable list.
grouping_cols (optional)
-
TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL GROUP BY clause. When this value is null, no grouping is used and a single result model is generated.
+
TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL GROUP BY clause. When this value is null, no grouping is used and a +single result model is generated for the whole data set.
heteroskedasticity_option (optional)
BOOLEAN, default: FALSE. When TRUE, the heteroskedasticity of the model is also calculated and returned with the results.
@@ -168,43 +181,46 @@ FROM ( @anchor predict @par Prediction Function +The prediction function is as follows:
 linregr_predict(coef, col_ind)
 
\b Arguments
coef
-
FLOAT8[]. Vector of the coefficients of regression.
+
FLOAT8[]. Vector of the coefficients of regression from training.
col_ind
-
FLOAT8[]. An array containing the independent variable column names.
+
FLOAT8[]. An array containing the independent variable column names, +as was used for the training.
@anchor examples @par Examples -# Create an input data set.
+DROP TABLE IF EXISTS houses;
 CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
             size INT, lot INT);
-COPY houses FROM STDIN WITH DELIMITER '|';
-  1 |  590 |       2 |    1 |  50000 |  770 | 22100
-  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
-  3 |   20 |       3 |    1 |  22500 | 1060 |  3500
-  4 |  870 |       2 |    2 |  90000 | 1300 | 17500
-  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
-  6 | 1350 |       2 |    1 |  90500 |  820 | 25700
-  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
-  8 |  680 |       2 |    1 | 142500 | 1170 | 22000
-  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
- 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
- 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
- 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
- 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
- 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
- 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
-\\.
+INSERT INTO houses VALUES
+  (1 ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100),
+  (2 , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000),
+  (3 ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500),
+  (4 ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500),
+  (5 , 1320 ,       3 ,    2 , 133000 , 1500 , 30000),
+  (6 , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700),
+  (7 , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000),
+  (8 ,  680 ,       2 ,    1 , 142500 , 1170 , 22000),
+  (9 , 1840 ,       3 ,    2 , 160000 , 1500 , 19000),
+ (10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000),
+ (11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500),
+ (12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000),
+ (13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000),
+ (14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000),
+ (15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000);
 
-# Train a regression model. First, we generate a single regression for all data.
+DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary;
 SELECT madlib.linregr_train( 'houses',
                              'houses_linregr',
                              'price',
@@ -217,6 +233,7 @@ limit of maximum columns per table, you would pre-build the arrays and store the
 single column.)
 -# Next we generate three output models, one for each value of "bedroom".
 
+DROP TABLE IF EXISTS houses_linregr_bedroom, houses_linregr_bedroom_summary;
 SELECT madlib.linregr_train( 'houses',
                              'houses_linregr_bedroom',
                              'price',
@@ -233,43 +250,15 @@ SELECT * FROM houses_linregr;
 Result:
 
 -[ RECORD 1 ]+---------------------------------------------------------------------------
-coef         | {-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354}
-r2           | 0.768577580597443
-std_err      | {33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087}
-t_stats      | {-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605}
-p_values     | {0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186}
-condition_no | 9002.50457085737
-
--# View the results grouped by bedroom. -
-SELECT * FROM houses_linregr_bedroom;
-
-Result: -
--[ RECORD 1 ]+--------------------------------------------------------------------------
-bedroom      | 2
-coef         | {-84242.0345406597,55.4430144648696,-78966.9753675319,225.611910021192}
-r2           | 0.968809546465313
-std_err      | {35018.9991665742,19.5731125320686,23036.8071292552,49.0448678148784}
-t_stats      | {-2.40560942761235,2.83261103077151,-3.42786111480046,4.60011251070697}
-p_values     | {0.250804617665239,0.21605133377602,0.180704400437373,0.136272031474122}
-condition_no | 10086.1048721726
--[ RECORD 2 ]+--------------------------------------------------------------------------
-bedroom      | 4
-coef         | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276}
-r2           | 1
-std_err      | {0,0,0,0}
-t_stats      | {Infinity,Infinity,Infinity,Infinity}
-p_values     |
-condition_no | Infinity
--[ RECORD 3 ]+--------------------------------------------------------------------------
-bedroom      | 3
-coef         | {-88155.8292501601,27.1966436294429,41404.0293363612,62.637521075324}
-r2           | 0.841699901311252
-std_err      | {57867.9999702625,17.8272309154689,43643.1321511114,70.8506824863954}
-t_stats      | {-1.52339512849005,1.52556747362508,0.948695185143966,0.884077878676067}
-p_values     | {0.188161432894871,0.187636685729869,0.386340032374927,0.417132778705789}
-condition_no | 11722.6225642147
+coef                     | {-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354}
+r2                       | 0.768577580597443
+std_err                  | {33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087}
+t_stats                  | {-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605}
+p_values                 | {0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186}
+condition_no             | 9002.50457085737
+num_rows_processed       | 15
+num_missing_rows_skipped | 0
+variance_covariance      | {{1119105512.78479,217782.067878023,-283344228.394562,-616679.69319088}, ...
 
Alternatively you can unnest the results for easier reading of output.
@@ -281,8 +270,61 @@ SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute,
        unnest(p_values) as pvalue
 FROM houses_linregr;
 
--# Use the prediction function to evaluate residuals. +Result: +
+ attribute |    coefficient    |  standard_error  |      t_stat       |       pvalue       
+-----------+-------------------+------------------+-------------------+--------------------
+ intercept | -12849.4168959872 | 33453.0344331391 | -0.38410317968819 |  0.708223134615422
+ tax       |  28.9613922651765 | 15.8992104963997 |  1.82156166004184 | 0.0958005827189772
+ bath      |  10181.6290712648 | 19437.7710925923 | 0.523806408809133 |  0.610804093526536
+ size      |   50.516894915354 |  32.928023174087 |  1.53416118083605 |  0.153235085548186
+(4 rows)
+
+-# View the results grouped by bedroom.
+\\x ON
+SELECT * FROM houses_linregr_bedroom ORDER BY bedroom DESC;
+
+Result: +
+-[ RECORD 1 ]------------+----------------------------------------------------------------
+bedroom                  | 4
+coef                     | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276}
+r2                       | 1
+std_err                  | {0,0,0,0}
+t_stats                  | {Infinity,Infinity,Infinity,Infinity}
+p_values                 | 
+condition_no             | Infinity
+num_rows_processed       | 1
+num_missing_rows_skipped | 0
+variance_covariance      | {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}}
+-[ RECORD 2 ]------------+----------------------------------------------------------------
+bedroom                  | 3
+coef                     | {-88155.8292501601,27.1966436294429,41404.0293363612,62.637521075324}
+r2                       | 0.841699901311252
+std_err                  | {57867.9999702625,17.8272309154689,43643.1321511114,70.8506824863954}
+t_stats                  | {-1.52339512849005,1.52556747362508,0.948695185143966,0.884077878676067}
+p_values                 | {0.188161432894871,0.187636685729869,0.386340032374927,0.417132778705789}
+condition_no             | 11722.6225642147
+num_rows_processed       | 9
+num_missing_rows_skipped | 0
+variance_covariance      | {{3348705420.5583,433697.545104226,-70253017.45773,-2593488.13800193}, ...
+-[ RECORD 3 ]------------+----------------------------------------------------------------
+bedroom                  | 2
+coef                     | {-84242.0345406597,55.4430144648696,-78966.9753675319,225.611910021192}
+r2                       | 0.968809546465313
+std_err                  | {35018.9991665742,19.5731125320686,23036.8071292552,49.0448678148784}
+t_stats                  | {-2.40560942761235,2.83261103077151,-3.42786111480046,4.60011251070697}
+p_values                 | {0.250804617665239,0.21605133377602,0.180704400437373,0.136272031474122}
+condition_no             | 10086.1048721726
+num_rows_processed       | 5
+num_missing_rows_skipped | 0
+variance_covariance      | {{1226330302.62852,-300921.595596804,551696673.397849,-1544160.63236119}, ...
+
+ +-# Compare predicted price with actual. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.) +
+\\x OFF
 SELECT houses.*,
        madlib.linregr_predict( ARRAY[1,tax,bath,size],
                                m.coef
@@ -293,6 +335,28 @@ SELECT houses.*,
                                 ) as residual
 FROM houses, houses_linregr m;
 
+Result: +
+ id | tax  | bedroom | bath | price  | size |  lot  |     predict      |     residual      
+----+------+---------+------+--------+------+-------+------------------+-------------------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | 53317.4426965542 | -3317.44269655424
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | 109152.124955627 | -24152.1249556268
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | 51459.3486308563 | -28959.3486308563
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 |  98382.215907206 | -8382.21590720605
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 121518.221409606 |  11481.7785903937
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | 77853.9455638561 |  12646.0544361439
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 201007.926371721 |  58992.0736282788
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | 76130.7259665617 |  66369.2740334383
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 136578.145387498 |  23421.8546125019
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 |  255033.90159623 | -15033.9015962295
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 97440.5250982852 | -10440.5250982852
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | 117577.415360321 |  1022.58463967926
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 186203.892319613 | -46203.8923196126
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | 155946.739425521 | -7946.73942552117
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | 94497.4293105379 | -29497.4293105379
+(15 rows)
+
+ @anchor notes @par Note diff --git a/src/ports/postgres/modules/regress/logistic.sql_in b/src/ports/postgres/modules/regress/logistic.sql_in index a27e1548f..6ce347aac 100644 --- a/src/ports/postgres/modules/regress/logistic.sql_in +++ b/src/ports/postgres/modules/regress/logistic.sql_in @@ -53,7 +53,7 @@ logregr_train( source_table, \b Arguments
source_table
-
TEXT. The name of the table containing the training data.
+
TEXT. Name of the table containing the training data.
out_table
TEXT. Name of the generated table containing the output model. @@ -63,8 +63,8 @@ logregr_train( source_table, - + @@ -107,26 +107,39 @@ logregr_train( source_table, techniques may be more appropriate. - - - - + - + + + + + + + + + + +
<...>Text. Grouping columns, if provided in input. This could be multiple columns - depending on the \c grouping_col input.TEXT. Grouping columns, if provided in input. This could be multiple columns + depending on the \c grouping_cols input.
num_iterationsINTEGER. The number of iterations actually completed. This would be different - from the \c nIterations argument if a \c tolerance parameter is provided and the - algorithm converges before all iterations are completed.
num_rows_processed INTEGER. The number of rows actually processed, which is equal to the total number of rows in the source table minus the number of skipped rows.
num_missing_rows_skippedINTEGER. The number of rows skipped during the training. A row will be skipped - if the independent_varname is NULL or contains NULL values.INTEGER. The number of rows skipped during the training. + A row will be skipped if the independent_varname is NULL or + contains NULL values.
num_iterationsINTEGER. The number of iterations actually completed. This would be different + from the \c nIterations argument if a \c tolerance parameter is provided and the + algorithm converges before all iterations are completed.
variance_covarianceFLOAT[]. Variance/covariance matrix.
A summary table named \_summary is also created at the same time, which has the following columns: + + + + + @@ -139,17 +152,18 @@ logregr_train( source_table, - + - + - + @@ -159,28 +173,33 @@ logregr_train( source_table, - + - + + + + + +
method'logregr' for logistic regression.
source_table The data source table name.
dependent_varnameThe dependent variable.The dependent variable name.
independent_varnameThe independent variablesThe independent variable names.
optimizer_paramsA string that contains all the optimizer parameters, and has the form of 'optimizer=..., max_iter=..., tolerance=...'A string that contains all the optimizer parameters, and + has the form of 'optimizer=..., max_iter=..., tolerance=...'
num_failed_groupsHow many groups' fitting processes failed.How many groups failed in training.
num_rows_processedThe total number of rows usd in the computation.The total number of rows used in the computation.
num_missing_rows_skipped The total number of rows skipped.
grouping_colsNames of the grouping columns.
dependent_varname
TEXT. Name of the dependent variable column (of type BOOLEAN) in the - training data or an expression evaluating to a BOOLEAN.
+ training data, or an expression evaluating to a BOOLEAN.
independent_varname
TEXT. Expression list to evaluate for the - independent variables. An intercept variable is not assumed. It is common to + independent variables. An intercept variable is not assumed so it is common to provide an explicit intercept term by including a single constant \c 1 term in the independent variable list.
@@ -188,10 +207,10 @@ logregr_train( source_table,
TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no - grouping is used and a single result model is generated.
+ grouping is used and a single model is generated for the whole data set.
max_iter (optional)
-
INTEGER, default: 20. The maximum number of iterations that are allowed.
+
INTEGER, default: 20. The maximum number of iterations allowed.
optimizer (optional)
TEXT, default: 'irls'. The name of the optimizer to use: @@ -213,16 +232,17 @@ logregr_train( source_table,
tolerance (optional)
FLOAT8, default: 0.0001. The difference between - log-likelihood values in successive iterations that should indicate + log-likelihood values in successive iterations that indicate convergence. A zero disables the convergence criterion, so that execution - stops after \c n iterations have completed.
+ stops after the maximum iterations have completed, as set in the 'max_iter' + parameter above.
verbose (optional)
BOOLEAN, default: FALSE. Provides verbose output of the results of training.
@note For p-values, we just return the computation result directly. -Other statistical packages, like 'R', produce the same result, but on printing the +Other statistical packages like 'R' produce the same result, but on printing the result to screen, another format function is used and any p-value that is smaller than the machine epsilon (the smallest positive floating-point number 'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the @@ -232,9 +252,9 @@ in fact the same. @anchor predict @par Prediction Function -Two prediction functions are provided to either predict the boolean value of the -dependent variable or the probability of the value of dependent variable being -'True', both functions using the same syntax. +Two prediction functions are provided. One predicts the boolean value of the +dependent variable, and the other predicts the probability of the value of the +dependent variable being 'True'. Syntax is the same for both functions. The function to predict the boolean value (True/False) of the dependent variable has the following syntax: @@ -244,7 +264,7 @@ logregr_predict(coefficients, ) -The function to predict the probability of the dependent variable being True +The function to predict the probability of the dependent variable being 'True' has the following syntax:
 logregr_predict_prob(coefficients,
@@ -255,60 +275,62 @@ logregr_predict_prob(coefficients,
 \b Arguments
 
coefficients
-
DOUBLE PRECISION[]. Model coefficients obtained from \ref logregr_train().
+
DOUBLE PRECISION[]. Model coefficients obtained from training \ref logregr_train().
ind_var
-
Independent variables, as a DOUBLE array. This should be the same length +
Independent variables expressed as a DOUBLE array. This should be the same length as the array obtained by evaluation of the 'independent_varname' argument in \ref logregr_train().
@anchor examples @examp --# Create the training data table. +-# Create the training data table. This data set is related to predicting +a second heart attack given treatment and health factors.
+DROP TABLE IF EXISTS patients;
 CREATE TABLE patients( id INTEGER NOT NULL,
                        second_attack INTEGER,
                        treatment INTEGER,
-                       trait_anxiety INTEGER);
-COPY patients FROM STDIN WITH DELIMITER '|';
-  1 |             1 |         1 |            70
-  3 |             1 |         1 |            50
-  5 |             1 |         0 |            40
-  7 |             1 |         0 |            75
-  9 |             1 |         0 |            70
- 11 |             0 |         1 |            65
- 13 |             0 |         1 |            45
- 15 |             0 |         1 |            40
- 17 |             0 |         0 |            55
- 19 |             0 |         0 |            50
-  2 |             1 |         1 |            80
-  4 |             1 |         0 |            60
-  6 |             1 |         0 |            65
-  8 |             1 |         0 |            80
- 10 |             1 |         0 |            60
- 12 |             0 |         1 |            50
- 14 |             0 |         1 |            35
- 16 |             0 |         1 |            50
- 18 |             0 |         0 |            45
- 20 |             0 |         0 |            60
-\\.
+                       trait_anxiety INTEGER);                        
+INSERT INTO patients VALUES 
+(1,  1, 1, 70),
+(2,  1, 1, 80),
+(3,  1, 1, 50),
+(4,  1, 0, 60),
+(5,  1, 0, 40),
+(6,  1, 0, 65),
+(7,  1, 0, 75),
+(8,  1, 0, 80),
+(9,  1, 0, 70),
+(10, 1, 0, 60),
+(11, 0, 1, 65),
+(12, 0, 1, 50),
+(13, 0, 1, 45),
+(14, 0, 1, 35),
+(15, 0, 1, 40),
+(16, 0, 1, 50),
+(17, 0, 0, 55),
+(18, 0, 0, 45),
+(19, 0, 0, 50),
+(20, 0, 0, 60);
 
-# Train a regression model.
-SELECT madlib.logregr_train( 'patients',
-                             'patients_logregr',
-                             'second_attack',
-                             'ARRAY[1, treatment, trait_anxiety]',
-                             NULL,
-                             20,
-                             'irls'
+DROP TABLE IF EXISTS patients_logregr, patients_logregr_summary;
+SELECT madlib.logregr_train( 'patients',                             -- Source table
+                             'patients_logregr',                     -- Output table
+                             'second_attack',                        -- Dependent variable
+                             'ARRAY[1, treatment, trait_anxiety]',   -- Feature vector
+                             NULL,                                   -- Grouping
+                             20,                                     -- Max iterations
+                             'irls'                                  -- Optimizer to use
                            );
 
-(Note that in this example we are dynamically creating the array of independent variables +Note that in the example above we are dynamically creating the array of independent variables from column names. If you have large numbers of independent variables beyond the PostgreSQL -limit of maximum columns per table, you would pre-build the arrays and store them in a -single column.) +limit of maximum columns per table, you would typically pre-build the arrays and store them in a +single column. -# View the regression results.
 -- Set extended display on for easier reading of output
@@ -317,14 +339,17 @@ SELECT * from patients_logregr;
 
Result:
-coef           | {5.59049410898112,2.11077546770772,-0.237276684606453}
-log_likelihood | -467.214718489873
-std_err        | {0.318943457652178,0.101518723785383,0.294509929481773}
-z_stats        | {17.5281667482197,20.7919819024719,-0.805666162169712}
-p_values       | {8.73403463417837e-69,5.11539430631541e-96,0.420435365338518}
-odds_ratios    | {267.867942976278,8.2546400100702,0.788773016471171}
-condition_no   | 179.186118573205
-num_iterations | 9
+coef                     | {-6.36346994178192,-1.02410605239327,0.119044916668607}
+log_likelihood           | -9.41018298388876
+std_err                  | {3.21389766375099,1.17107844860319,0.0549790458269317}
+z_stats                  | {-1.97998524145757,-0.874498248699539,2.16527796868916}
+p_values                 | {0.0477051870698145,0.381846973530455,0.0303664045046183}
+odds_ratios              | {0.00172337630923221,0.359117354054956,1.12642051220895}
+condition_no             | 326.081922791575
+num_rows_processed       | 20
+num_missing_rows_skipped | 0
+num_iterations           | 5
+variance_covariance      | {{10.329138193064,-0.474304665195738,-0.171995901260057}, ...
 
-# Alternatively, unnest the arrays in the results for easier reading of output: @@ -338,8 +363,17 @@ SELECT unnest(array['intercept', 'treatment', 'trait_anxiety']) as attribute, unnest(odds_ratios) as odds_ratio FROM patients_logregr;
+Result: +
+   attribute   |    coefficient    |   standard_error   |       z_stat       |       pvalue       |     odds_ratio      
+---------------+-------------------+--------------------+--------------------+--------------------+---------------------
+ intercept     | -6.36346994178192 |   3.21389766375099 |  -1.97998524145757 | 0.0477051870698145 | 0.00172337630923221
+ treatment     | -1.02410605239327 |   1.17107844860319 | -0.874498248699539 |  0.381846973530455 |   0.359117354054956
+ trait_anxiety | 0.119044916668607 | 0.0549790458269317 |   2.16527796868916 | 0.0303664045046183 |    1.12642051220895
+(3 rows)
+
--# Predicting dependent variable using the logistic regression model. +-# Predict the dependent variable using the logistic regression model. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.) @@ -347,20 +381,72 @@ would be used for prediction.) \\x off -- Display prediction value along with the original value SELECT p.id, madlib.logregr_predict(coef, ARRAY[1, treatment, trait_anxiety]), - p.second_attack + p.second_attack::BOOLEAN FROM patients p, patients_logregr m ORDER BY p.id; +Result: +
+  id | logregr_predict | second_attack 
+----+-----------------+---------------
+  1 | t               | t
+  2 | t               | t
+  3 | f               | t
+  4 | t               | t
+  5 | f               | t
+  6 | t               | t
+  7 | t               | t
+  8 | t               | t
+  9 | t               | t
+ 10 | t               | t
+ 11 | t               | f
+ 12 | f               | f
+ 13 | f               | f
+ 14 | f               | f
+ 15 | f               | f
+ 16 | f               | f
+ 17 | t               | f
+ 18 | f               | f
+ 19 | f               | f
+ 20 | t               | f
+(20 rows)
+
--# Predicting the probability of the dependent variable being TRUE. +-# Predict the probability of the dependent variable being TRUE.
 \\x off
 -- Display prediction value along with the original value
-SELECT p.id, madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety])
+SELECT p.id, madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety]),
+       p.second_attack::BOOLEAN
 FROM patients p, patients_logregr m
 ORDER BY p.id;
 
- +Result: +
+ id | logregr_predict_prob | second_attack 
+----+----------------------+---------------
+  1 |    0.720223028941527 | t
+  2 |    0.894354902502048 | t
+  3 |    0.192269541755171 | t
+  4 |    0.685513072239347 | t
+  5 |    0.167747881508857 | t
+  6 |     0.79809810891514 | t
+  7 |    0.928568075752503 | t
+  8 |    0.959305763693571 | t
+  9 |    0.877576117431452 | t
+ 10 |    0.685513072239347 | t
+ 11 |    0.586700895943317 | f
+ 12 |    0.192269541755171 | f
+ 13 |    0.116032010632994 | f
+ 14 |   0.0383829143134982 | f
+ 15 |   0.0674976224147597 | f
+ 16 |    0.192269541755171 | f
+ 17 |    0.545870774302621 | f
+ 18 |    0.267675422387132 | f
+ 19 |    0.398618639285111 | f
+ 20 |    0.685513072239347 | f
+(20 rows)
+
@anchor notes @par Notes @@ -461,8 +547,8 @@ than 1000) indicates the presence of significant multicollinearity. @anchor literature @literature -A somewhat random selection of nice write-ups, with valuable pointers into -further literature. +A selection of references pertaining to logistic regression, +with some good pointers to other literature. [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in index 3e4e9bd78..25c285951 100644 --- a/src/ports/postgres/modules/stats/correlation.sql_in +++ b/src/ports/postgres/modules/stats/correlation.sql_in @@ -20,7 +20,7 @@ m4_include(`SQLCommon.m4') diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in index bdde21410..148191fe7 100644 --- a/src/ports/postgres/modules/summary/summary.sql_in +++ b/src/ports/postgres/modules/summary/summary.sql_in @@ -64,13 +64,11 @@ The \b summary() function returns a composite type containing three fields: \b Arguments
source_table
-
TEXT. The name of the table containing the input data.
-
output_table
-
TEXT. The name of the table to contain the output summary data. +
TEXT. Name of the table containing the input data.
-Summary statistics are saved in a table with the name specifed in the -output_table argument. The table contains the -following columns: +
output_table
+
TEXT. Name of the table for the output summary statistics. +This table contains the following columns: @@ -78,7 +76,7 @@ following columns: - + @@ -98,7 +96,11 @@ following columns: - + @@ -134,28 +136,32 @@ following columns: - + - + - + - + @@ -165,11 +171,12 @@ following columns:
group_by
group_by_valueValue of the Group-by column. NULL if there is no grouping.Value of the group-by column. NULL if there is no grouping.
target_column
distinct_valuesNumber of distinct values in the target column. When the summary() function is called with the get_estimates argument set to TRUE, this is an estimated statistic based on the Flajolet-Martin distinct count estimator.Number of distinct values in the target column. + If the summary() function is called with the get_estimates + argument set to TRUE (default), then this is an estimated statistic based on the + Flajolet-Martin distinct count estimator. If the get_estimates + argument is set to FALSE, then the exact value is computed using PostgreSQL COUNT DISTINCT.
missing_values
first_quartileFirst quartile (25th percentile), only for numeric columns. Currently unavailable for PostgreSQL 9.3 or lower.First quartile (25th percentile), only for numeric columns. + (Unavailable for PostgreSQL 9.3 or lower.)
medianMedian value of target column, if target is numeric, otherwise NULL. Currently unavailable for PostgreSQL 9.3 or lower.Median value of target column, if target is numeric, otherwise NULL. + (Unavailable for PostgreSQL 9.3 or lower.)
third_quartileThird quartile (25th percentile), only for numeric columns. Currently unavailable for PostgreSQL 9.3 or lower.Third quartile (75th percentile), only for numeric columns. + (Unavailable for PostgreSQL 9.3 or lower.)
quantile_arrayPercentile values corresponding to \e ntile_array. Currently unavailable for PostgreSQL 9.3 or lower.Percentile values corresponding to \e ntile_array. + (Unavailable for PostgreSQL 9.3 or lower.)
most_frequent_values An array containing the most frequently occurring values. The \e - how_many_mfv argument determines the length of the array, 10 by + how_many_mfv argument determines the length of the array, which is 10 by default. If the summary() function is called with the \e get_estimates argument set to TRUE (default), the frequent values computation is performed using a parallel aggregation method that is - faster, but in some cases can fail to detect the exact most frequent + faster, but in some cases may fail to detect the exact most frequent values.
target_columns (optional)
-
TEXT, default NULL. A comma-separated list of columns to summarize. If NULL, summaries are produced for all columns.
+
TEXT, default NULL. A comma-separated list of columns to summarize. +If NULL, summaries are produced for all columns.
grouping_cols (optional)
TEXT, default: null. A comma-separated list of columns on which to -group results. If NULL, summaries are produced on the complete table.
+group results. If NULL, summaries are produced for the complete table. @note Please note that summary statistics are calculated for each grouping column independently. That is, grouping columns are not combined together as in the regular PostgreSQL style GROUP BY directive. (This was done @@ -178,23 +185,31 @@ result in the case of large input tables with a lot of grouping_cols and target_cols specified.)
get_distinct (optional)
-
BOOLEAN, default TRUE. If true, distinct values are counted.
+
BOOLEAN, default TRUE. If true, distinct values are counted. +The method for computing distinct values depends on the setting of +the 'get_estimates' parameter below.
get_quartiles (optional)
BOOLEAN, default TRUE. If TRUE, quartiles are computed.
ntile_array (optional)
-
FLOAT8[], default NULL. An array of quantile values to compute. If NULL, quantile values are not computed.
-@note Quartile and quantile functions are not available for PostgreSQL 9.3 or +
FLOAT8[], default NULL. An array of quantile values to compute. +If NULL, quantile values are not computed.
+@note Quartile and quantile functions are not available in PostgreSQL 9.3 or lower. If you are using PostgreSQL 9.3 or lower, the output table will not contain these values, even if you set 'get_quartiles' = TRUE or provide an array of quantile values for the parameter 'ntile_array'.
how_many_mfv (optional)
-
INTEGER, default: 10. The number of most-frequent-values to compute.
+
INTEGER, default: 10. The number of most-frequent-values to compute. +The method for computing MFV depends on the setting of +the 'get_estimates' parameter below.
get_estimates (optional)
-
BOOLEAN, default TRUE. If TRUE, estimated values are produced for distinct values and most frequent values. If FALSE, exact values are calculated (may take longer to run depending on data size).
+
BOOLEAN, default TRUE. If TRUE, estimated values are produced for +distinct values and most frequent values. If FALSE, exact values are +calculated, which will take longer to run, with the impact depending on +data size.
n_cols_per_run (optional)
INTEGER, default: 15. The number of columns to collect summary statistics in @@ -204,8 +219,8 @@ with a total of 40 columns to summarize and 'n_cols_per_run = 15', there will be 3 passes through the data, with each pass summarizing a maximum of 15 columns. @note This parameter should be used with caution. Increasing this parameter could decrease the total run time (if number of passes decreases), but will increase -the memory consumption during each run. Since Postgresql limits the memory available -for a single aggregate run, this increased memory consumption could result in +the memory consumption during each run. Since PostgreSQL limits the memory available +for a single aggregate run, this increased memory consumption could result in an out-of-memory termination error.
@@ -220,82 +235,230 @@ out-of-memory termination error. SELECT * FROM madlib.summary(); --# Create an input data set. +-# Create an input data table using part of the well known +iris data set.
-CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
-             size INT, lot INT);
-COPY houses FROM STDIN WITH DELIMITER '|';
-  1 |  590 |       2 |    1 |  50000 |  770 | 22100
-  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
-  3 |   20 |       3 |    1 |  22500 | 1060 |  3500
-  4 |  870 |       2 |    2 |  90000 | 1300 | 17500
-  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
-  6 | 1350 |       2 |    1 |  90500 |  820 | 25700
-  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
-  8 |  680 |       2 |    1 | 142500 | 1170 | 22000
-  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
- 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
- 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
- 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
- 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
- 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
- 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
-\\.
+DROP TABLE IF EXISTS iris;
+CREATE TABLE iris (id INT, sepal_length FLOAT, sepal_width FLOAT,
+                    petal_length FLOAT, petal_width FLOAT, 
+                   class_name text);                        
+INSERT INTO iris VALUES 
+(1,5.1,3.5,1.4,0.2,'Iris-setosa'),
+(2,4.9,3.0,1.4,0.2,'Iris-setosa'),
+(3,4.7,3.2,1.3,0.2,'Iris-setosa'),
+(4,4.6,3.1,1.5,0.2,'Iris-setosa'),
+(5,5.0,3.6,1.4,0.2,'Iris-setosa'),
+(6,5.4,3.9,1.7,0.4,'Iris-setosa'),
+(7,4.6,3.4,1.4,0.3,'Iris-setosa'),
+(8,5.0,3.4,1.5,0.2,'Iris-setosa'),
+(9,4.4,2.9,1.4,0.2,'Iris-setosa'),
+(10,4.9,3.1,1.5,0.1,'Iris-setosa'),
+(11,7.0,3.2,4.7,1.4,'Iris-versicolor'),
+(12,6.4,3.2,4.5,1.5,'Iris-versicolor'),
+(13,6.9,3.1,4.9,1.5,'Iris-versicolor'),
+(14,5.5,2.3,4.0,1.3,'Iris-versicolor'),
+(15,6.5,2.8,4.6,1.5,'Iris-versicolor'),
+(16,5.7,2.8,4.5,1.3,'Iris-versicolor'),
+(17,6.3,3.3,4.7,1.6,'Iris-versicolor'),
+(18,4.9,2.4,3.3,1.0,'Iris-versicolor'),
+(19,6.6,2.9,4.6,1.3,'Iris-versicolor'),
+(20,5.2,2.7,3.9,1.4,'Iris-versicolor'),
+(21,6.3,3.3,6.0,2.5,'Iris-virginica'),
+(22,5.8,2.7,5.1,1.9,'Iris-virginica'),
+(23,7.1,3.0,5.9,2.1,'Iris-virginica'),
+(24,6.3,2.9,5.6,1.8,'Iris-virginica'),
+(25,6.5,3.0,5.8,2.2,'Iris-virginica'),
+(26,7.6,3.0,6.6,2.1,'Iris-virginica'),
+(27,4.9,2.5,4.5,1.7,'Iris-virginica'),
+(28,7.3,2.9,6.3,1.8,'Iris-virginica'),
+(29,6.7,2.5,5.8,1.8,'Iris-virginica'),
+(30,7.2,3.6,6.1,2.5,'Iris-virginica');
 
--# Run the \b summary() function. +-# Run the \b summary() function using all defaults.
-SELECT * FROM madlib.summary( 'houses',
-                              'houses_summary',
-                              'tax,bedroom,lot,bath,price,size,lot',
-                              'bedroom',
-                              TRUE,
-                              TRUE,
-                              NULL,
-                              5,
-                              FALSE
+DROP TABLE IF EXISTS iris_summary;
+SELECT * FROM madlib.summary( 'iris',            -- Source table
+                              'iris_summary'     -- Output table
                             );
 
Result:
-  output_table  | row_count |    duration
-----------------+-----------+----------------
- houses_summary |        21 | 0.207587003708
+ output_table | row_count |      duration       
+--------------+-----------+---------------------
+ iris_summary |         6 | 0.00712704658508301
 (1 row)
 
- --# View the summary data. +View the summary data.
 -- Turn on expanded display for readability.
 \\x on
-SELECT * FROM houses_summary;
+SELECT * FROM iris_summary;
+
+Result (partial): +
+...
+ -[ RECORD 2 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 30
+distinct_values      | 22
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 5.84333333333333
+variance             | 0.9294367816092
+min                  | 4.4
+max                  | 7.6
+first_quartile       | 4.925
+median               | 5.75
+third_quartile       | 6.575
+most_frequent_values | {4.9,6.3,6.5,4.6,5,6.9,5.4,4.4,7,6.4}
+mfv_frequencies      | {4,3,2,2,2,1,1,1,1,1}  
+...
+ -[ RECORD 6 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | class_name
+column_number        | 6
+data_type            | text
+row_count            | 30
+distinct_values      | 3
+missing_values       | 0
+blank_values         | 0
+fraction_missing     | 0
+fraction_blank       | 0
+mean                 | 
+variance             | 
+min                  | 11
+max                  | 15
+first_quartile       | 
+median               | 
+third_quartile       | 
+most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica}
+mfv_frequencies      | {10,10,10}
+
+Note that for the text column in record 6, some statistics are n/a, +and the min and max values represent the length of the shortest and +longest strings respectively. + +-# Now group by the class of iris: +
+DROP TABLE IF EXISTS iris_summary;
+SELECT * FROM madlib.summary( 'iris',                       -- Source table
+                              'iris_summary',               -- Output table
+                              'sepal_length, sepal_width',  -- Columns to summarize
+                              'class_name'                  -- Grouping column
+                            );
+SELECT * FROM iris_summary;
+
+Result (partial): +
+ -[ RECORD 1 ]-------+-----------------------------------
+group_by             | class_name
+group_by_value       | Iris-setosa
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 10
+distinct_values      | 7
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 4.86
+variance             | 0.0848888888888976
+min                  | 4.4
+max                  | 5.4
+first_quartile       | 4.625
+median               | 4.9
+third_quartile       | 5
+most_frequent_values | {4.6,4.9,5,5.1,4.4,5.4,4.7}
+mfv_frequencies      | {2,2,2,1,1,1,1}
+...
+ -[ RECORD 3 ]-------+-----------------------------------
+group_by             | class_name
+group_by_value       | Iris-versicolor
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 10
+distinct_values      | 10
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 6.1
+variance             | 0.528888888888893
+min                  | 4.9
+max                  | 7
+first_quartile       | 5.55
+median               | 6.35
+third_quartile       | 6.575
+most_frequent_values | {7,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2}
+mfv_frequencies      | {1,1,1,1,1,1,1,1,1,1}
+...
+
+ +-# Trying some other parameters: +
+DROP TABLE IF EXISTS iris_summary;
+SELECT * FROM madlib.summary( 'iris',                       -- Source table
+                              'iris_summary',               -- Output table
+                              'sepal_length, sepal_width',  -- Columns to summarize
+                               NULL,                        -- No grouping
+                               TRUE,                        -- Get distinct values
+                               FALSE,                       -- Don't get quartiles
+                               ARRAY[0.33, 0.66],           -- Get ntiles
+                               3,                           -- Number of MFV to compute
+                               FALSE                        -- Get exact values
+                            );
+SELECT * FROM iris_summary;
 
Result:
- -[ RECORD 1 ]--------+-----------------------------------
- group_by             | bedroom
- group_by_value       | 3
- target_column        | tax
- column_number        | 2
- data_type            | int4
- row_count            | 9
- distinct_values      | 9
- missing_values       | 0
- blank_values         |
- fraction_missing     | 0
- fraction_blank       |
- mean                 | 1561.11111111111
- variance             | 936736.111111111
- min                  | 20
- max                  | 3100
- most_frequent_values | {20,1320,2790,1840,1660}
- mfv_frequencies      | {1,1,1,1,1}
- -[ RECORD 2 ]--------+-----------------------------------
- group_by             | bedroom
- group_by_value       | 3
- target_column        | bath
- column_number        | 4
- ...
+ -[ RECORD 1 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 30
+distinct_values      | 22
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 5.84333333333333
+variance             | 0.9294367816092
+min                  | 4.4
+max                  | 7.6
+quantile_array       | {5.057,6.414}
+most_frequent_values | {4.9,6.3,5}
+mfv_frequencies      | {4,3,2}
+ -[ RECORD 2 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | sepal_width
+column_number        | 3
+data_type            | float8
+row_count            | 30
+distinct_values      | 14
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 3.04
+variance             | 0.13903448275862
+min                  | 2.3
+max                  | 3.9
+quantile_array       | {2.9,3.2}
+most_frequent_values | {3,2.9,3.2}
+mfv_frequencies      | {4,4,3}
 
@anchor notes @@ -306,33 +469,25 @@ should follow case-sensitivity and quoting rules per the database. (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. If mixed-case or multi-byte characters are desired for entity names then the string should be double-quoted; in this case the input would be '"MyTable"'). -- Estimated values are only implemented for the distinct values computation. -- The get_estimates parameter controls computation for two statistics: +- The get_estimates parameter controls computation for both distinct +count and most frequent values: - If get_estimates is TRUE then the distinct value computation is - estimated. Further, the most frequent values computation is computed using a - "quick and dirty" method that does parallel aggregation in Greenplum Database at the expense + estimated using Flajolet-Martin. MFV is computed using a + fast method that does parallel aggregation in Greenplum Database at the expense of missing some of the most frequent values. - If get_estimates is FALSE then the distinct values are computed - in a slow but exact method. The most frequent values are computed using a + in a slower but exact method using PostgreSQL COUNT DISTINCT. MFV is computed using a faithful implementation that preserves the approximation guarantees of - the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch). -- Summary statistics are calculated for each grouping -column independently. That is, grouping columns are not combined together -as in the regular PostgreSQL style GROUP BY directive. (This was done -to reduce long run time and huge output table size which would otherwise -result in the case of large input tables with a lot of grouping_cols and -target_cols specified.) -- Quartile and quantile functions are not available for PostgreSQL 9.3 or -lower. 
If you are using PostgreSQL 9.3 or lower, the output table will not -contain these values, even if you set 'get_quartiles' = TRUE or -provide an array of quantile values for the parameter 'ntile_array'. + the Cormode/Muthukrishnan method (more information at \ref grp_mfvsketch). @anchor related @par Related Topics File summary.sql_in documenting the \b summary() function -\ref grp_mfvsketch +\ref grp_fmsketch
+\ref grp_mfvsketch
+\ref grp_countmin */ diff --git a/src/ports/postgres/modules/utilities/path.sql_in b/src/ports/postgres/modules/utilities/path.sql_in index d98b50077..8fe03ec33 100644 --- a/src/ports/postgres/modules/utilities/path.sql_in +++ b/src/ports/postgres/modules/utilities/path.sql_in @@ -152,7 +152,9 @@ path(
aggregate_func (optional)
VARCHAR, default NULL. A comma-separated list of aggregates to be - applied to the pattern matches [3]. Please note that window functions + applied to the pattern matches [3]. + You can think of this input parameter as being like a SELECT clause. + Please note that window functions cannot currently be used in the parameter 'aggregate_func'. If you want to use a window function [4], output the pattern matches and write a SQL query with a window function over the output tuples (see 'persist_rows'