From 03f6e0572f80e9437a45f84702db77b2d91ab3fd Mon Sep 17 00:00:00 2001 From: Frank McQuillan Date: Thu, 9 Nov 2017 15:50:33 -0800 Subject: [PATCH] multiple doc updates for 1dot13 --- doc/mainpage.dox.in | 6 +- src/ports/postgres/modules/convex/mlp.sql_in | 3 +- .../postgres/modules/graph/measures.sql_in | 4 +- .../postgres/modules/graph/pagerank.sql_in | 15 +- .../decision_tree.sql_in | 29 +- .../random_forest.sql_in | 28 +- .../postgres/modules/regress/linear.sql_in | 194 ++++++---- .../postgres/modules/regress/logistic.sql_in | 242 ++++++++---- .../postgres/modules/stats/correlation.sql_in | 2 +- .../postgres/modules/summary/summary.sql_in | 355 +++++++++++++----- .../postgres/modules/utilities/path.sql_in | 4 +- 11 files changed, 597 insertions(+), 285 deletions(-) diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index cddc2b93b..4a58e3067 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -99,7 +99,7 @@ complete matrix stored as a distributed table. @defgroup grp_matrix_factorization Matrix Factorization @brief Matrix Factorization methods including Singular Value Decomposition and Low-rank Matrix Factorization @{ - @defgroup grp_lmf Low-rank Matrix Factorization + @defgroup grp_lmf Low-Rank Matrix Factorization @defgroup grp_svd Singular Value Decomposition @} @@ -134,10 +134,10 @@ Contains graph algorithms. 
@defgroup grp_graph_measures Measures Graph Measures @{ + @defgroup grp_graph_avg_path_length Average Path Length @defgroup grp_graph_closeness Closeness @defgroup grp_graph_diameter Graph Diameter - @defgroup grp_graph_avg_path_length Average Path Length - @defgroup grp_graph_vertex_degrees In-Out degree + @defgroup grp_graph_vertex_degrees In-Out Degree @} @defgroup grp_pagerank PageRank @defgroup grp_sssp Single Source Shortest Path diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in index bafb4dd0f..e6e271619 100644 --- a/src/ports/postgres/modules/convex/mlp.sql_in +++ b/src/ports/postgres/modules/convex/mlp.sql_in @@ -1228,8 +1228,7 @@ For details on backpropogation, see [2]. @literature @anchor mlp-lit-1 -[1] "Multilayer Perceptron." Wikipedia. Wikimedia Foundation, - 12 July 2017. Web. 12 July 2017. +[1] https://en.wikipedia.org/wiki/Multilayer_perceptron [2] Yu Hen Hu. "Lecture 11. MLP (III): Back-Propagation." University of Wisconsin Madison: Computer-Aided Engineering. Web. 12 July 2017, diff --git a/src/ports/postgres/modules/graph/measures.sql_in b/src/ports/postgres/modules/graph/measures.sql_in index b92f68396..92680ba46 100644 --- a/src/ports/postgres/modules/graph/measures.sql_in +++ b/src/ports/postgres/modules/graph/measures.sql_in @@ -84,7 +84,7 @@ the following columns (in addition to the grouping columns):
TEXT, default = NULL. Valid PostgreSQL expression that describes the vertices to generate closeness measures for. If this parameter is not specified, closeness measures are generated for all vertices in the apsp table. -This input should be treated like a WHERE clause. +You can think of this input parameter as being like a WHERE clause. Some example inputs: - If you want a short list of vertices, say 1, 2 and 3: @@ -750,7 +750,7 @@ INSERT INTO edge_gr VALUES (4,5,-20,1); --# Find APSP for all groups: +-# Find in-out degrees for all groups:
 DROP TABLE IF EXISTS out_gr;
 SELECT madlib.graph_vertex_degrees(
diff --git a/src/ports/postgres/modules/graph/pagerank.sql_in b/src/ports/postgres/modules/graph/pagerank.sql_in
index e028f9228..a4c200431 100644
--- a/src/ports/postgres/modules/graph/pagerank.sql_in
+++ b/src/ports/postgres/modules/graph/pagerank.sql_in
@@ -35,7 +35,6 @@ m4_include(`SQLCommon.m4')
 
Contents @@ -46,7 +45,8 @@ m4_include(`SQLCommon.m4') Given a graph, the PageRank algorithm outputs a probability distribution representing the likelihood that a person randomly traversing the graph will arrive at any particular vertex. This algorithm was originally used by Google to rank websites where the World Wide Web was -modeled as a directed graph with the vertices representing the websites. +modeled as a directed graph with the vertices representing the websites. The PageRank +algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1]. @anchor pagerank @par PageRank @@ -100,13 +100,13 @@ regarding the number of iterations required for convergence. It is named by adding the suffix '_summary' to the 'out_table' parameter. -
damping_factor
+
damping_factor (optional)
FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.
-
max_iter
+
max_iter (optional)
INTEGER, default: 100. The maximum number of iterations allowed.
-
threshold
+
threshold (optional)
FLOAT8, default: (1/number of vertices * 1000). If the difference between the PageRank of every vertex of two consecutive iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the computation stops. If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'. @@ -122,11 +122,6 @@ a single model is generated for all data. -@anchor notes -@par Notes - -The PageRank algorithm proposed by Larry Page and Sergey Brin is used [1]. - @anchor examples @examp diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index 3632f2e2d..91e900d74 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -86,19 +86,21 @@ tree_train(
list_of_features
TEXT. Comma-separated string of column names or expressions to use as predictors. - Can also be a '*' implying all columns are to be used as predictors (except the - ones included in the next argument). The types of the features can be mixed - where boolean, integer, and text columns are considered categorical and - double precision columns are considered continuous. The categorical variables - are not encoded and used as is for the training. Array columns can also be - included in the list, where each element of the array is considered as a - feature. + Can also be a '*' implying all columns are to be used as predictors (except for the + ones included in the next argument that lists exclusions). + The types of the features can be mixed - boolean, integer, and text columns + are considered categorical and + double precision columns are considered continuous. Categorical variables + are not encoded and used as is for the training. + + Array columns can also be included in the list, where the array is expanded + to treat each element of the array as a feature. It is important to note that not every combination of the levels of a categorical variable is checked when evaluating a split. The levels of the non-integer categorical variable are ordered by the entropy of the variable in predicting the response. The split at each node is evaluated between these - ordered levels. Integer categorical variables, howeve, are simply ordered + ordered levels. Integer categorical variables, however, are simply ordered by their value.
@@ -124,7 +126,10 @@ tree_train( each group.
weights (optional)
-
TEXT. Column name containing weights for each observation.
+
TEXT. Column name containing numerical weights for each observation. + This can be used to handle the case of unbalanced data sets. + If this parameter is not set, all observations (tuples) + are treated equally with a weight of 1.0.
max_depth (optional)
INTEGER, default: 7. Maximum depth of any node of the final tree, @@ -552,7 +557,7 @@ SELECT madlib.tree_train('dt_golf', -- source table NULL::text, -- exclude columns 'gini', -- split criterion NULL::text, -- no grouping - NULL::text, -- no weights + NULL::text, -- no weights, all observations treated equally 5, -- max depth 3, -- min split 1, -- min bucket @@ -784,7 +789,7 @@ SELECT madlib.tree_train('mt_cars', -- source table 'id, hp, drat, am, gear, carb', -- exclude columns 'mse', -- split criterion NULL::text, -- no grouping - NULL::text, -- no weights + NULL::text, -- no weights, all observations treated equally 10, -- max depth 8, -- min split 3, -- number of bins per continuous variable @@ -940,7 +945,7 @@ SELECT madlib.tree_train('null_handling_example', -- source table NULL, -- features to exclude 'gini', -- split criterion NULL::text, -- no grouping - NULL::text, -- no weights + NULL::text, -- no weights, all observations treated equally 4, -- max depth 1, -- min split 1, -- number of bins per continuous variable diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in index d9ae9bfd7..b9ce9104a 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in @@ -277,18 +277,24 @@ forest_train(training_table_name, while float values are considered regression outputs.
list_of_features
-
text. Comma-separated string of column names to use as predictors. Can - also be a '*' implying all columns are to be used as predictors (except the - ones included in the next argument). The types of the features can be mixed - where boolean, integer, and text columns are considered categorical and - double precision columns are considered continuous. The categorical variables - are not encoded and used as is for the training. - - It is important to note that we don't test for every combination of - levels of a categorical variable when evaluating a split. We order the levels - of the non-integer categorical variable by the entropy of the variable in +
TEXT. Comma-separated string of column names or expressions to use as predictors. + Can also be a '*' implying all columns are to be used as predictors (except for the + ones included in the next argument that lists exclusions). + The types of the features can be mixed - boolean, integer, and text columns + are considered categorical and + double precision columns are considered continuous. Categorical variables + are not encoded and used as is for the training. + + Array columns can also be included in the list, where the array is expanded + to treat each element of the array as a feature. + + It is important to note that not every combination of the levels of a + categorical variable is checked when evaluating a split. The levels of the + non-integer categorical variable are ordered by the entropy of the variable in predicting the response. The split at each node is evaluated between these - ordered levels. Integer categorical variables are ordered by their value.
+ ordered levels. Integer categorical variables, however, are simply ordered + by their value. +
list_of_features_to_exclude
text. Comma-separated string of column names to exclude from the predictors diff --git a/src/ports/postgres/modules/regress/linear.sql_in b/src/ports/postgres/modules/regress/linear.sql_in index 81261ce5d..6572652c5 100644 --- a/src/ports/postgres/modules/regress/linear.sql_in +++ b/src/ports/postgres/modules/regress/linear.sql_in @@ -29,7 +29,7 @@ m4_include(`SQLCommon.m4') @brief Also called Ordinary Least Squares Regression, models linear relationship between a dependent variable and one or more independent variables. Linear regression models a linear relationship of a scalar dependent variable -\f$ y \f$ to one or more explanatory independent variables \f$ x \f$ to build +\f$ y \f$ to one or more explanatory independent variables \f$ x \f$ and builds a model of coefficients. @anchor train @@ -49,12 +49,12 @@ linregr_train( source_table, \b Arguments
source_table
-
TEXT. The name of the table containing the training data.
+
TEXT. Name of the table containing the training data.
out_table
TEXT. Name of the generated table containing the output model. - The output table contains the following columns. + The output table contains the following columns: @@ -108,11 +108,18 @@ linregr_train( source_table, - -
\<...>
num_missing_rows_skippedINTEGER. The number of rows that have NULL values in the dependent and independent variables, and were skipped in the computation for each group.
+ INTEGER. The number of rows that have NULL values in the dependent and independent variables, and were skipped in the computation for each group. + + variance_covariance + FLOAT[]. Variance/covariance matrix. + A summary table named \_summary is created together with the output table. It has the following columns: + + + + @@ -126,11 +133,16 @@ linregr_train( source_table, -
method'linregr' for linear regression.
source_table The data source table name
The total number of rows that were used in the computation.
num_missing_rows_skipped The total number of rows that were skipped because of NULL values in them.
+ + + grouping_cols + Names of the grouping columns. + +
@note For p-values, we just return the computation result directly. -Other statistical packages, like 'R', produce the same result, but on printing the +Other statistical packages like 'R' produce the same result, but on printing the result to screen, another format function is used and any p-value that is smaller than the machine epsilon (the smallest positive floating-point number 'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the @@ -145,7 +157,8 @@ in fact the same.
TEXT. Expression list to evaluate for the independent variables. An intercept variable is not assumed. It is common to provide an explicit intercept term by including a single constant 1 term in the independent variable list.
grouping_cols (optional)
-
TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL GROUP BY clause. When this value is null, no grouping is used and a single result model is generated.
+
TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL GROUP BY clause. When this value is null, no grouping is used and a +single result model is generated for the whole data set.
heteroskedasticity_option (optional)
BOOLEAN, default: FALSE. When TRUE, the heteroskedasticity of the model is also calculated and returned with the results.
@@ -168,43 +181,46 @@ FROM ( @anchor predict @par Prediction Function +The prediction function is as follows:
 linregr_predict(coef, col_ind)
 
\b Arguments
coef
-
FLOAT8[]. Vector of the coefficients of regression.
+
FLOAT8[]. Vector of the coefficients of regression from training.
col_ind
-
FLOAT8[]. An array containing the independent variable column names.
+
FLOAT8[]. An array containing the independent variable column names, +as was used for the training.
@anchor examples @par Examples -# Create an input data set.
+DROP TABLE IF EXISTS houses;
 CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
             size INT, lot INT);
-COPY houses FROM STDIN WITH DELIMITER '|';
-  1 |  590 |       2 |    1 |  50000 |  770 | 22100
-  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
-  3 |   20 |       3 |    1 |  22500 | 1060 |  3500
-  4 |  870 |       2 |    2 |  90000 | 1300 | 17500
-  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
-  6 | 1350 |       2 |    1 |  90500 |  820 | 25700
-  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
-  8 |  680 |       2 |    1 | 142500 | 1170 | 22000
-  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
- 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
- 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
- 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
- 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
- 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
- 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
-\\.
+INSERT INTO houses VALUES
+  (1 ,  590 ,       2 ,    1 ,  50000 ,  770 , 22100),
+  (2 , 1050 ,       3 ,    2 ,  85000 , 1410 , 12000),
+  (3 ,   20 ,       3 ,    1 ,  22500 , 1060 ,  3500),
+  (4 ,  870 ,       2 ,    2 ,  90000 , 1300 , 17500),
+  (5 , 1320 ,       3 ,    2 , 133000 , 1500 , 30000),
+  (6 , 1350 ,       2 ,    1 ,  90500 ,  820 , 25700),
+  (7 , 2790 ,       3 ,  2.5 , 260000 , 2130 , 25000),
+  (8 ,  680 ,       2 ,    1 , 142500 , 1170 , 22000),
+  (9 , 1840 ,       3 ,    2 , 160000 , 1500 , 19000),
+ (10 , 3680 ,       4 ,    2 , 240000 , 2790 , 20000),
+ (11 , 1660 ,       3 ,    1 ,  87000 , 1030 , 17500),
+ (12 , 1620 ,       3 ,    2 , 118600 , 1250 , 20000),
+ (13 , 3100 ,       3 ,    2 , 140000 , 1760 , 38000),
+ (14 , 2070 ,       2 ,    3 , 148000 , 1550 , 14000),
+ (15 ,  650 ,       3 ,  1.5 ,  65000 , 1450 , 12000);
 
-# Train a regression model. First, we generate a single regression for all data.
+DROP TABLE IF EXISTS houses_linregr, houses_linregr_summary;
 SELECT madlib.linregr_train( 'houses',
                              'houses_linregr',
                              'price',
@@ -217,6 +233,7 @@ limit of maximum columns per table, you would pre-build the arrays and store the
 single column.)
 -# Next we generate three output models, one for each value of "bedroom".
 
+DROP TABLE IF EXISTS houses_linregr_bedroom, houses_linregr_bedroom_summary;
 SELECT madlib.linregr_train( 'houses',
                              'houses_linregr_bedroom',
                              'price',
@@ -233,43 +250,15 @@ SELECT * FROM houses_linregr;
 Result:
 
 -[ RECORD 1 ]+---------------------------------------------------------------------------
-coef         | {-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354}
-r2           | 0.768577580597443
-std_err      | {33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087}
-t_stats      | {-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605}
-p_values     | {0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186}
-condition_no | 9002.50457085737
-
--# View the results grouped by bedroom. -
-SELECT * FROM houses_linregr_bedroom;
-
-Result: -
--[ RECORD 1 ]+--------------------------------------------------------------------------
-bedroom      | 2
-coef         | {-84242.0345406597,55.4430144648696,-78966.9753675319,225.611910021192}
-r2           | 0.968809546465313
-std_err      | {35018.9991665742,19.5731125320686,23036.8071292552,49.0448678148784}
-t_stats      | {-2.40560942761235,2.83261103077151,-3.42786111480046,4.60011251070697}
-p_values     | {0.250804617665239,0.21605133377602,0.180704400437373,0.136272031474122}
-condition_no | 10086.1048721726
--[ RECORD 2 ]+--------------------------------------------------------------------------
-bedroom      | 4
-coef         | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276}
-r2           | 1
-std_err      | {0,0,0,0}
-t_stats      | {Infinity,Infinity,Infinity,Infinity}
-p_values     |
-condition_no | Infinity
--[ RECORD 3 ]+--------------------------------------------------------------------------
-bedroom      | 3
-coef         | {-88155.8292501601,27.1966436294429,41404.0293363612,62.637521075324}
-r2           | 0.841699901311252
-std_err      | {57867.9999702625,17.8272309154689,43643.1321511114,70.8506824863954}
-t_stats      | {-1.52339512849005,1.52556747362508,0.948695185143966,0.884077878676067}
-p_values     | {0.188161432894871,0.187636685729869,0.386340032374927,0.417132778705789}
-condition_no | 11722.6225642147
+coef                     | {-12849.4168959872,28.9613922651765,10181.6290712648,50.516894915354}
+r2                       | 0.768577580597443
+std_err                  | {33453.0344331391,15.8992104963997,19437.7710925923,32.928023174087}
+t_stats                  | {-0.38410317968819,1.82156166004184,0.523806408809133,1.53416118083605}
+p_values                 | {0.708223134615422,0.0958005827189772,0.610804093526536,0.153235085548186}
+condition_no             | 9002.50457085737
+num_rows_processed       | 15
+num_missing_rows_skipped | 0
+variance_covariance      | {{1119105512.78479,217782.067878023,-283344228.394562,-616679.69319088}, ...
 
Alternatively you can unnest the results for easier reading of output.
@@ -281,8 +270,61 @@ SELECT unnest(ARRAY['intercept','tax','bath','size']) as attribute,
        unnest(p_values) as pvalue
 FROM houses_linregr;
 
--# Use the prediction function to evaluate residuals. +Result: +
+ attribute |    coefficient    |  standard_error  |      t_stat       |       pvalue       
+-----------+-------------------+------------------+-------------------+--------------------
+ intercept | -12849.4168959872 | 33453.0344331391 | -0.38410317968819 |  0.708223134615422
+ tax       |  28.9613922651765 | 15.8992104963997 |  1.82156166004184 | 0.0958005827189772
+ bath      |  10181.6290712648 | 19437.7710925923 | 0.523806408809133 |  0.610804093526536
+ size      |   50.516894915354 |  32.928023174087 |  1.53416118083605 |  0.153235085548186
+(4 rows)
+
+-# View the results grouped by bedroom.
+\\x ON
+SELECT * FROM houses_linregr_bedroom ORDER BY bedroom DESC;
+
+Result: +
+-[ RECORD 1 ]------------+----------------------------------------------------------------
+bedroom                  | 4
+coef                     | {0.0112536020318378,41.4132554771633,0.0225072040636757,31.3975496688276}
+r2                       | 1
+std_err                  | {0,0,0,0}
+t_stats                  | {Infinity,Infinity,Infinity,Infinity}
+p_values                 | 
+condition_no             | Infinity
+num_rows_processed       | 1
+num_missing_rows_skipped | 0
+variance_covariance      | {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,0}}
+-[ RECORD 2 ]------------+----------------------------------------------------------------
+bedroom                  | 3
+coef                     | {-88155.8292501601,27.1966436294429,41404.0293363612,62.637521075324}
+r2                       | 0.841699901311252
+std_err                  | {57867.9999702625,17.8272309154689,43643.1321511114,70.8506824863954}
+t_stats                  | {-1.52339512849005,1.52556747362508,0.948695185143966,0.884077878676067}
+p_values                 | {0.188161432894871,0.187636685729869,0.386340032374927,0.417132778705789}
+condition_no             | 11722.6225642147
+num_rows_processed       | 9
+num_missing_rows_skipped | 0
+variance_covariance      | {{3348705420.5583,433697.545104226,-70253017.45773,-2593488.13800193}, ...
+-[ RECORD 3 ]------------+----------------------------------------------------------------
+bedroom                  | 2
+coef                     | {-84242.0345406597,55.4430144648696,-78966.9753675319,225.611910021192}
+r2                       | 0.968809546465313
+std_err                  | {35018.9991665742,19.5731125320686,23036.8071292552,49.0448678148784}
+t_stats                  | {-2.40560942761235,2.83261103077151,-3.42786111480046,4.60011251070697}
+p_values                 | {0.250804617665239,0.21605133377602,0.180704400437373,0.136272031474122}
+condition_no             | 10086.1048721726
+num_rows_processed       | 5
+num_missing_rows_skipped | 0
+variance_covariance      | {{1226330302.62852,-300921.595596804,551696673.397849,-1544160.63236119}, ...
+
+ +-# Compare predicted price with actual. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.) +
+\\x OFF
 SELECT houses.*,
        madlib.linregr_predict( ARRAY[1,tax,bath,size],
                                m.coef
@@ -293,6 +335,28 @@ SELECT houses.*,
                                 ) as residual
 FROM houses, houses_linregr m;
 
+Result: +
+ id | tax  | bedroom | bath | price  | size |  lot  |     predict      |     residual      
+----+------+---------+------+--------+------+-------+------------------+-------------------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | 53317.4426965542 | -3317.44269655424
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | 109152.124955627 | -24152.1249556268
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | 51459.3486308563 | -28959.3486308563
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 |  98382.215907206 | -8382.21590720605
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 121518.221409606 |  11481.7785903937
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | 77853.9455638561 |  12646.0544361439
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 201007.926371721 |  58992.0736282788
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | 76130.7259665617 |  66369.2740334383
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 136578.145387498 |  23421.8546125019
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 |  255033.90159623 | -15033.9015962295
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 97440.5250982852 | -10440.5250982852
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | 117577.415360321 |  1022.58463967926
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 186203.892319613 | -46203.8923196126
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | 155946.739425521 | -7946.73942552117
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | 94497.4293105379 | -29497.4293105379
+(15 rows)
+
+ @anchor notes @par Note diff --git a/src/ports/postgres/modules/regress/logistic.sql_in b/src/ports/postgres/modules/regress/logistic.sql_in index a27e1548f..6ce347aac 100644 --- a/src/ports/postgres/modules/regress/logistic.sql_in +++ b/src/ports/postgres/modules/regress/logistic.sql_in @@ -53,7 +53,7 @@ logregr_train( source_table, \b Arguments
source_table
-
TEXT. The name of the table containing the training data.
+
TEXT. Name of the table containing the training data.
out_table
TEXT. Name of the generated table containing the output model. @@ -63,8 +63,8 @@ logregr_train( source_table, - + @@ -107,26 +107,39 @@ logregr_train( source_table, techniques may be more appropriate. - - - - + - + + + + + + + + + + +
<...>Text. Grouping columns, if provided in input. This could be multiple columns - depending on the \c grouping_col input.TEXT. Grouping columns, if provided in input. This could be multiple columns + depending on the \c grouping_cols input.
num_iterationsINTEGER. The number of iterations actually completed. This would be different - from the \c nIterations argument if a \c tolerance parameter is provided and the - algorithm converges before all iterations are completed.
num_rows_processed INTEGER. The number of rows actually processed, which is equal to the total number of rows in the source table minus the number of skipped rows.
num_missing_rows_skippedINTEGER. The number of rows skipped during the training. A row will be skipped - if the independent_varname is NULL or contains NULL values.INTEGER. The number of rows skipped during the training. + A row will be skipped if the independent_varname is NULL or + contains NULL values.
num_iterationsINTEGER. The number of iterations actually completed. This would be different + from the \c nIterations argument if a \c tolerance parameter is provided and the + algorithm converges before all iterations are completed.
variance_covarianceFLOAT[]. Variance/covariance matrix.
A summary table named \_summary is also created at the same time, which has the following columns: + + + + + @@ -139,17 +152,18 @@ logregr_train( source_table, - + - + - + @@ -159,28 +173,33 @@ logregr_train( source_table, - + - + + + + + +
method'logregr' for logistic regression.
source_table The data source table name.
dependent_varnameThe dependent variable.The dependent variable name.
independent_varnameThe independent variablesThe independent variable names.
optimizer_paramsA string that contains all the optimizer parameters, and has the form of 'optimizer=..., max_iter=..., tolerance=...'A string that contains all the optimizer parameters, and + has the form of 'optimizer=..., max_iter=..., tolerance=...'
num_failed_groupsHow many groups' fitting processes failed.How many groups failed in training.
num_rows_processedThe total number of rows usd in the computation.The total number of rows used in the computation.
num_missing_rows_skipped The total number of rows skipped.
grouping_colsNames of the grouping columns.
dependent_varname
TEXT. Name of the dependent variable column (of type BOOLEAN) in the - training data or an expression evaluating to a BOOLEAN.
+ training data, or an expression evaluating to a BOOLEAN.
independent_varname
TEXT. Expression list to evaluate for the - independent variables. An intercept variable is not assumed. It is common to + independent variables. An intercept variable is not assumed so it is common to provide an explicit intercept term by including a single constant \c 1 term in the independent variable list.
@@ -188,10 +207,10 @@ logregr_train( source_table,
TEXT, default: NULL. An expression list used to group the input dataset into discrete groups, running one regression per group. Similar to the SQL "GROUP BY" clause. When this value is NULL, no - grouping is used and a single result model is generated.
+ grouping is used and a single model is generated for the whole data set.
max_iter (optional)
-
INTEGER, default: 20. The maximum number of iterations that are allowed.
+
INTEGER, default: 20. The maximum number of iterations allowed.
optimizer (optional)
TEXT, default: 'irls'. The name of the optimizer to use: @@ -213,16 +232,17 @@ logregr_train( source_table,
tolerance (optional)
FLOAT8, default: 0.0001. The difference between - log-likelihood values in successive iterations that should indicate + log-likelihood values in successive iterations that indicate convergence. A zero disables the convergence criterion, so that execution - stops after \c n iterations have completed.
+ stops after the maximum iterations have completed, as set in the 'max_iter' + parameter above.
verbose (optional)
BOOLEAN, default: FALSE. Provides verbose output of the results of training.
@note For p-values, we just return the computation result directly. -Other statistical packages, like 'R', produce the same result, but on printing the +Other statistical packages like 'R' produce the same result, but on printing the result to screen, another format function is used and any p-value that is smaller than the machine epsilon (the smallest positive floating-point number 'x' such that '1 + x != 1') will be printed on screen as "< xxx" (xxx is the @@ -232,9 +252,9 @@ in fact the same. @anchor predict @par Prediction Function -Two prediction functions are provided to either predict the boolean value of the -dependent variable or the probability of the value of dependent variable being -'True', both functions using the same syntax. +Two prediction functions are provided. One predicts the boolean value of the +dependent variable, and the other predicts the probability of the value of the +dependent variable being 'True'. Syntax is the same for both functions. The function to predict the boolean value (True/False) of the dependent variable has the following syntax: @@ -244,7 +264,7 @@ logregr_predict(coefficients, ) -The function to predict the probability of the dependent variable being True +The function to predict the probability of the dependent variable being 'True' has the following syntax:
 logregr_predict_prob(coefficients,
@@ -255,60 +275,62 @@ logregr_predict_prob(coefficients,
 \b Arguments
 
coefficients
-
DOUBLE PRECISION[]. Model coefficients obtained from \ref logregr_train().
+
DOUBLE PRECISION[]. Model coefficients obtained from training \ref logregr_train().
ind_var
-
Independent variables, as a DOUBLE array. This should be the same length +
Independent variables expressed as a DOUBLE array. This should be the same length as the array obtained by evaluation of the 'independent_varname' argument in \ref logregr_train().
@anchor examples @examp --# Create the training data table. +-# Create the training data table. This data set is related to predicting +a second heart attack given treatment and health factors.
+DROP TABLE IF EXISTS patients;
 CREATE TABLE patients( id INTEGER NOT NULL,
                        second_attack INTEGER,
                        treatment INTEGER,
-                       trait_anxiety INTEGER);
-COPY patients FROM STDIN WITH DELIMITER '|';
-  1 |             1 |         1 |            70
-  3 |             1 |         1 |            50
-  5 |             1 |         0 |            40
-  7 |             1 |         0 |            75
-  9 |             1 |         0 |            70
- 11 |             0 |         1 |            65
- 13 |             0 |         1 |            45
- 15 |             0 |         1 |            40
- 17 |             0 |         0 |            55
- 19 |             0 |         0 |            50
-  2 |             1 |         1 |            80
-  4 |             1 |         0 |            60
-  6 |             1 |         0 |            65
-  8 |             1 |         0 |            80
- 10 |             1 |         0 |            60
- 12 |             0 |         1 |            50
- 14 |             0 |         1 |            35
- 16 |             0 |         1 |            50
- 18 |             0 |         0 |            45
- 20 |             0 |         0 |            60
-\\.
+                       trait_anxiety INTEGER);                        
+INSERT INTO patients VALUES 
+(1,  1, 1, 70),
+(2,  1, 1, 80),
+(3,  1, 1, 50),
+(4,  1, 0, 60),
+(5,  1, 0, 40),
+(6,  1, 0, 65),
+(7,  1, 0, 75),
+(8,  1, 0, 80),
+(9,  1, 0, 70),
+(10, 1, 0, 60),
+(11, 0, 1, 65),
+(12, 0, 1, 50),
+(13, 0, 1, 45),
+(14, 0, 1, 35),
+(15, 0, 1, 40),
+(16, 0, 1, 50),
+(17, 0, 0, 55),
+(18, 0, 0, 45),
+(19, 0, 0, 50),
+(20, 0, 0, 60);
 
-# Train a regression model.
-SELECT madlib.logregr_train( 'patients',
-                             'patients_logregr',
-                             'second_attack',
-                             'ARRAY[1, treatment, trait_anxiety]',
-                             NULL,
-                             20,
-                             'irls'
+DROP TABLE IF EXISTS patients_logregr, patients_logregr_summary;
+SELECT madlib.logregr_train( 'patients',                             -- Source table
+                             'patients_logregr',                     -- Output table
+                             'second_attack',                        -- Dependent variable
+                             'ARRAY[1, treatment, trait_anxiety]',   -- Feature vector
+                             NULL,                                   -- Grouping
+                             20,                                     -- Max iterations
+                             'irls'                                  -- Optimizer to use
                            );
 
-(Note that in this example we are dynamically creating the array of independent variables +Note that in the example above we are dynamically creating the array of independent variables from column names. If you have large numbers of independent variables beyond the PostgreSQL -limit of maximum columns per table, you would pre-build the arrays and store them in a -single column.) +limit of maximum columns per table, you would typically pre-build the arrays and store them in a +single column. -# View the regression results.
 -- Set extended display on for easier reading of output
@@ -317,14 +339,17 @@ SELECT * from patients_logregr;
 
Result:
-coef           | {5.59049410898112,2.11077546770772,-0.237276684606453}
-log_likelihood | -467.214718489873
-std_err        | {0.318943457652178,0.101518723785383,0.294509929481773}
-z_stats        | {17.5281667482197,20.7919819024719,-0.805666162169712}
-p_values       | {8.73403463417837e-69,5.11539430631541e-96,0.420435365338518}
-odds_ratios    | {267.867942976278,8.2546400100702,0.788773016471171}
-condition_no   | 179.186118573205
-num_iterations | 9
+coef                     | {-6.36346994178192,-1.02410605239327,0.119044916668607}
+log_likelihood           | -9.41018298388876
+std_err                  | {3.21389766375099,1.17107844860319,0.0549790458269317}
+z_stats                  | {-1.97998524145757,-0.874498248699539,2.16527796868916}
+p_values                 | {0.0477051870698145,0.381846973530455,0.0303664045046183}
+odds_ratios              | {0.00172337630923221,0.359117354054956,1.12642051220895}
+condition_no             | 326.081922791575
+num_rows_processed       | 20
+num_missing_rows_skipped | 0
+num_iterations           | 5
+variance_covariance      | {{10.329138193064,-0.474304665195738,-0.171995901260057}, ...
 
-# Alternatively, unnest the arrays in the results for easier reading of output: @@ -338,8 +363,17 @@ SELECT unnest(array['intercept', 'treatment', 'trait_anxiety']) as attribute, unnest(odds_ratios) as odds_ratio FROM patients_logregr;
+Result: +
+   attribute   |    coefficient    |   standard_error   |       z_stat       |       pvalue       |     odds_ratio      
+---------------+-------------------+--------------------+--------------------+--------------------+---------------------
+ intercept     | -6.36346994178192 |   3.21389766375099 |  -1.97998524145757 | 0.0477051870698145 | 0.00172337630923221
+ treatment     | -1.02410605239327 |   1.17107844860319 | -0.874498248699539 |  0.381846973530455 |   0.359117354054956
+ trait_anxiety | 0.119044916668607 | 0.0549790458269317 |   2.16527796868916 | 0.0303664045046183 |    1.12642051220895
+(3 rows)
+
--# Predicting dependent variable using the logistic regression model. +-# Predict the dependent variable using the logistic regression model. (This example uses the original data table to perform the prediction. Typically a different test dataset with the same features as the original training dataset would be used for prediction.) @@ -347,20 +381,72 @@ would be used for prediction.) \\x off -- Display prediction value along with the original value SELECT p.id, madlib.logregr_predict(coef, ARRAY[1, treatment, trait_anxiety]), - p.second_attack + p.second_attack::BOOLEAN FROM patients p, patients_logregr m ORDER BY p.id; +Result: +
+  id | logregr_predict | second_attack 
+----+-----------------+---------------
+  1 | t               | t
+  2 | t               | t
+  3 | f               | t
+  4 | t               | t
+  5 | f               | t
+  6 | t               | t
+  7 | t               | t
+  8 | t               | t
+  9 | t               | t
+ 10 | t               | t
+ 11 | t               | f
+ 12 | f               | f
+ 13 | f               | f
+ 14 | f               | f
+ 15 | f               | f
+ 16 | f               | f
+ 17 | t               | f
+ 18 | f               | f
+ 19 | f               | f
+ 20 | t               | f
+(20 rows)
+
--# Predicting the probability of the dependent variable being TRUE. +-# Predict the probability of the dependent variable being TRUE.
 \\x off
 -- Display prediction value along with the original value
-SELECT p.id, madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety])
+SELECT p.id, madlib.logregr_predict_prob(coef, ARRAY[1, treatment, trait_anxiety]),
+       p.second_attack::BOOLEAN
 FROM patients p, patients_logregr m
 ORDER BY p.id;
 
- +Result: +
+ id | logregr_predict_prob | second_attack 
+----+----------------------+---------------
+  1 |    0.720223028941527 | t
+  2 |    0.894354902502048 | t
+  3 |    0.192269541755171 | t
+  4 |    0.685513072239347 | t
+  5 |    0.167747881508857 | t
+  6 |     0.79809810891514 | t
+  7 |    0.928568075752503 | t
+  8 |    0.959305763693571 | t
+  9 |    0.877576117431452 | t
+ 10 |    0.685513072239347 | t
+ 11 |    0.586700895943317 | f
+ 12 |    0.192269541755171 | f
+ 13 |    0.116032010632994 | f
+ 14 |   0.0383829143134982 | f
+ 15 |   0.0674976224147597 | f
+ 16 |    0.192269541755171 | f
+ 17 |    0.545870774302621 | f
+ 18 |    0.267675422387132 | f
+ 19 |    0.398618639285111 | f
+ 20 |    0.685513072239347 | f
+(20 rows)
+
@anchor notes @par Notes @@ -461,8 +547,8 @@ than 1000) indicates the presence of significant multicollinearity. @anchor literature @literature -A somewhat random selection of nice write-ups, with valuable pointers into -further literature. +A selection of references pertaining to logistic regression, +with some good pointers to other literature. [1] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 November 2009, http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf diff --git a/src/ports/postgres/modules/stats/correlation.sql_in b/src/ports/postgres/modules/stats/correlation.sql_in index 3e4e9bd78..25c285951 100644 --- a/src/ports/postgres/modules/stats/correlation.sql_in +++ b/src/ports/postgres/modules/stats/correlation.sql_in @@ -20,7 +20,7 @@ m4_include(`SQLCommon.m4') diff --git a/src/ports/postgres/modules/summary/summary.sql_in b/src/ports/postgres/modules/summary/summary.sql_in index bdde21410..148191fe7 100644 --- a/src/ports/postgres/modules/summary/summary.sql_in +++ b/src/ports/postgres/modules/summary/summary.sql_in @@ -64,13 +64,11 @@ The \b summary() function returns a composite type containing three fields: \b Arguments
source_table
-
TEXT. The name of the table containing the input data.
-
output_table
-
TEXT. The name of the table to contain the output summary data. +
TEXT. Name of the table containing the input data.
-Summary statistics are saved in a table with the name specifed in the -output_table argument. The table contains the -following columns: +
output_table
+
TEXT. Name of the table for the output summary statistics. +This table contains the following columns: @@ -78,7 +76,7 @@ following columns: - + @@ -98,7 +96,11 @@ following columns: - + @@ -134,28 +136,32 @@ following columns: - + - + - + - + @@ -165,11 +171,12 @@ following columns:
group_by
group_by_valueValue of the Group-by column. NULL if there is no grouping.Value of the group-by column. NULL if there is no grouping.
target_column
distinct_valuesNumber of distinct values in the target column. When the summary() function is called with the get_estimates argument set to TRUE, this is an estimated statistic based on the Flajolet-Martin distinct count estimator.Number of distinct values in the target column. + If the summary() function is called with the get_estimates + argument set to TRUE (default), then this is an estimated statistic based on the + Flajolet-Martin distinct count estimator. If the get_estimates + argument is set to FALSE, then the exact value is computed using PostgreSQL COUNT DISTINCT.
missing_values
first_quartileFirst quartile (25th percentile), only for numeric columns. Currently unavailable for PostgreSQL 9.3 or lower.First quartile (25th percentile), only for numeric columns. + (Unavailable for PostgreSQL 9.3 or lower.)
medianMedian value of target column, if target is numeric, otherwise NULL. Currently unavailable for PostgreSQL 9.3 or lower.Median value of target column, if target is numeric, otherwise NULL. + (Unavailable for PostgreSQL 9.3 or lower.)
third_quartileThird quartile (25th percentile), only for numeric columns. Currently unavailable for PostgreSQL 9.3 or lower.Third quartile (75th percentile), only for numeric columns. + (Unavailable for PostgreSQL 9.3 or lower.)
quantile_arrayPercentile values corresponding to \e ntile_array. Currently unavailable for PostgreSQL 9.3 or lower.Percentile values corresponding to \e ntile_array. + (Unavailable for PostgreSQL 9.3 or lower.)
most_frequent_values An array containing the most frequently occurring values. The \e - how_many_mfv argument determines the length of the array, 10 by + how_many_mfv argument determines the length of the array, which is 10 by default. If the summary() function is called with the \e get_estimates argument set to TRUE (default), the frequent values computation is performed using a parallel aggregation method that is - faster, but in some cases can fail to detect the exact most frequent + faster, but in some cases may fail to detect the exact most frequent values.
target_columns (optional)
-
TEXT, default NULL. A comma-separated list of columns to summarize. If NULL, summaries are produced for all columns.
+
TEXT, default NULL. A comma-separated list of columns to summarize. +If NULL, summaries are produced for all columns.
grouping_cols (optional)
TEXT, default: null. A comma-separated list of columns on which to -group results. If NULL, summaries are produced on the complete table.
+group results. If NULL, summaries are produced for the complete table. @note Please note that summary statistics are calculated for each grouping column independently. That is, grouping columns are not combined together as in the regular PostgreSQL style GROUP BY directive. (This was done @@ -178,23 +185,31 @@ result in the case of large input tables with a lot of grouping_cols and target_cols specified.)
get_distinct (optional)
-
BOOLEAN, default TRUE. If true, distinct values are counted.
+
BOOLEAN, default TRUE. If true, distinct values are counted. +The method for computing distinct values depends on the setting of +the 'get_estimates' parameter below.
get_quartiles (optional)
BOOLEAN, default TRUE. If TRUE, quartiles are computed.
ntile_array (optional)
-
FLOAT8[], default NULL. An array of quantile values to compute. If NULL, quantile values are not computed.
-@note Quartile and quantile functions are not available for PostgreSQL 9.3 or +
FLOAT8[], default NULL. An array of quantile values to compute. +If NULL, quantile values are not computed.
+@note Quartile and quantile functions are not available in PostgreSQL 9.3 or lower. If you are using PostgreSQL 9.3 or lower, the output table will not contain these values, even if you set 'get_quartiles' = TRUE or provide an array of quantile values for the parameter 'ntile_array'.
how_many_mfv (optional)
-
INTEGER, default: 10. The number of most-frequent-values to compute.
+
INTEGER, default: 10. The number of most-frequent-values to compute. +The method for computing MFV depends on the setting of +the 'get_estimates' parameter below.
get_estimates (optional)
-
BOOLEAN, default TRUE. If TRUE, estimated values are produced for distinct values and most frequent values. If FALSE, exact values are calculated (may take longer to run depending on data size).
+
BOOLEAN, default TRUE. If TRUE, estimated values are produced for +distinct values and most frequent values. If FALSE, exact values are +calculated, which will take longer to run, with the impact depending on +data size.
n_cols_per_run (optional)
INTEGER, default: 15. The number of columns to collect summary statistics in @@ -204,8 +219,8 @@ with a total of 40 columns to summarize and 'n_cols_per_run = 15', there will be 3 passes through the data, with each pass summarizing a maximum of 15 columns. @note This parameter should be used with caution. Increasing this parameter could decrease the total run time (if number of passes decreases), but will increase -the memory consumption during each run. Since Postgresql limits the memory available -for a single aggregate run, this increased memory consumption could result in +the memory consumption during each run. Since PostgreSQL limits the memory available +for a single aggregate run, this increased memory consumption could result in an out-of-memory termination error.
@@ -220,82 +235,230 @@ out-of-memory termination error. SELECT * FROM madlib.summary(); --# Create an input data set. +-# Create an input data table using part of the well known +iris data set.
-CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
-             size INT, lot INT);
-COPY houses FROM STDIN WITH DELIMITER '|';
-  1 |  590 |       2 |    1 |  50000 |  770 | 22100
-  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
-  3 |   20 |       3 |    1 |  22500 | 1060 |  3500
-  4 |  870 |       2 |    2 |  90000 | 1300 | 17500
-  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
-  6 | 1350 |       2 |    1 |  90500 |  820 | 25700
-  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
-  8 |  680 |       2 |    1 | 142500 | 1170 | 22000
-  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
- 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
- 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
- 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
- 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
- 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
- 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
-\\.
+DROP TABLE IF EXISTS iris;
+CREATE TABLE iris (id INT, sepal_length FLOAT, sepal_width FLOAT,
+                    petal_length FLOAT, petal_width FLOAT, 
+                   class_name text);                        
+INSERT INTO iris VALUES 
+(1,5.1,3.5,1.4,0.2,'Iris-setosa'),
+(2,4.9,3.0,1.4,0.2,'Iris-setosa'),
+(3,4.7,3.2,1.3,0.2,'Iris-setosa'),
+(4,4.6,3.1,1.5,0.2,'Iris-setosa'),
+(5,5.0,3.6,1.4,0.2,'Iris-setosa'),
+(6,5.4,3.9,1.7,0.4,'Iris-setosa'),
+(7,4.6,3.4,1.4,0.3,'Iris-setosa'),
+(8,5.0,3.4,1.5,0.2,'Iris-setosa'),
+(9,4.4,2.9,1.4,0.2,'Iris-setosa'),
+(10,4.9,3.1,1.5,0.1,'Iris-setosa'),
+(11,7.0,3.2,4.7,1.4,'Iris-versicolor'),
+(12,6.4,3.2,4.5,1.5,'Iris-versicolor'),
+(13,6.9,3.1,4.9,1.5,'Iris-versicolor'),
+(14,5.5,2.3,4.0,1.3,'Iris-versicolor'),
+(15,6.5,2.8,4.6,1.5,'Iris-versicolor'),
+(16,5.7,2.8,4.5,1.3,'Iris-versicolor'),
+(17,6.3,3.3,4.7,1.6,'Iris-versicolor'),
+(18,4.9,2.4,3.3,1.0,'Iris-versicolor'),
+(19,6.6,2.9,4.6,1.3,'Iris-versicolor'),
+(20,5.2,2.7,3.9,1.4,'Iris-versicolor'),
+(21,6.3,3.3,6.0,2.5,'Iris-virginica'),
+(22,5.8,2.7,5.1,1.9,'Iris-virginica'),
+(23,7.1,3.0,5.9,2.1,'Iris-virginica'),
+(24,6.3,2.9,5.6,1.8,'Iris-virginica'),
+(25,6.5,3.0,5.8,2.2,'Iris-virginica'),
+(26,7.6,3.0,6.6,2.1,'Iris-virginica'),
+(27,4.9,2.5,4.5,1.7,'Iris-virginica'),
+(28,7.3,2.9,6.3,1.8,'Iris-virginica'),
+(29,6.7,2.5,5.8,1.8,'Iris-virginica'),
+(30,7.2,3.6,6.1,2.5,'Iris-virginica');
 
--# Run the \b summary() function. +-# Run the \b summary() function using all defaults.
-SELECT * FROM madlib.summary( 'houses',
-                              'houses_summary',
-                              'tax,bedroom,lot,bath,price,size,lot',
-                              'bedroom',
-                              TRUE,
-                              TRUE,
-                              NULL,
-                              5,
-                              FALSE
+DROP TABLE IF EXISTS iris_summary;
+SELECT * FROM madlib.summary( 'iris',            -- Source table
+                              'iris_summary'     -- Output table
                             );
 
Result:
-  output_table  | row_count |    duration
-----------------+-----------+----------------
- houses_summary |        21 | 0.207587003708
+ output_table | row_count |      duration       
+--------------+-----------+---------------------
+ iris_summary |         6 | 0.00712704658508301
 (1 row)
 
- --# View the summary data. +View the summary data.
 -- Turn on expanded display for readability.
 \\x on
-SELECT * FROM houses_summary;
+SELECT * FROM iris_summary;
+
+Result (partial): +
+...
+ -[ RECORD 2 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 30
+distinct_values      | 22
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 5.84333333333333
+variance             | 0.9294367816092
+min                  | 4.4
+max                  | 7.6
+first_quartile       | 4.925
+median               | 5.75
+third_quartile       | 6.575
+most_frequent_values | {4.9,6.3,6.5,4.6,5,6.9,5.4,4.4,7,6.4}
+mfv_frequencies      | {4,3,2,2,2,1,1,1,1,1}  
+...
+ -[ RECORD 6 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | class_name
+column_number        | 6
+data_type            | text
+row_count            | 30
+distinct_values      | 3
+missing_values       | 0
+blank_values         | 0
+fraction_missing     | 0
+fraction_blank       | 0
+mean                 | 
+variance             | 
+min                  | 11
+max                  | 15
+first_quartile       | 
+median               | 
+third_quartile       | 
+most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica}
+mfv_frequencies      | {10,10,10}
+
+Note that for the text column in record 6, some statistics are n/a, +and the min and max values represent the length of the shortest and +longest strings respectively. + +-# Now group by the class of iris: +
+DROP TABLE IF EXISTS iris_summary;
+SELECT * FROM madlib.summary( 'iris',                       -- Source table
+                              'iris_summary',               -- Output table
+                              'sepal_length, sepal_width',  -- Columns to summarize
+                              'class_name'                  -- Grouping column
+                            );
+SELECT * FROM iris_summary;
+
+Result (partial): +
+ -[ RECORD 1 ]-------+-----------------------------------
+group_by             | class_name
+group_by_value       | Iris-setosa
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 10
+distinct_values      | 7
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 4.86
+variance             | 0.0848888888888976
+min                  | 4.4
+max                  | 5.4
+first_quartile       | 4.625
+median               | 4.9
+third_quartile       | 5
+most_frequent_values | {4.6,4.9,5,5.1,4.4,5.4,4.7}
+mfv_frequencies      | {2,2,2,1,1,1,1}
+...
+ -[ RECORD 3 ]-------+-----------------------------------
+group_by             | class_name
+group_by_value       | Iris-versicolor
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 10
+distinct_values      | 10
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 6.1
+variance             | 0.528888888888893
+min                  | 4.9
+max                  | 7
+first_quartile       | 5.55
+median               | 6.35
+third_quartile       | 6.575
+most_frequent_values | {7,6.4,6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2}
+mfv_frequencies      | {1,1,1,1,1,1,1,1,1,1}
+...
+
+ +-# Trying some other parameters: +
+DROP TABLE IF EXISTS iris_summary;
+SELECT * FROM madlib.summary( 'iris',                       -- Source table
+                              'iris_summary',               -- Output table
+                              'sepal_length, sepal_width',  -- Columns to summarize
+                               NULL,                        -- No grouping
+                               TRUE,                        -- Get distinct values
+                               FALSE,                       -- Don't get quartiles
+                               ARRAY[0.33, 0.66],           -- Get ntiles
+                               3,                           -- Number of MFV to compute
+                               FALSE                        -- Get exact values
+                            );
+SELECT * FROM iris_summary;
 
Result:
- -[ RECORD 1 ]--------+-----------------------------------
- group_by             | bedroom
- group_by_value       | 3
- target_column        | tax
- column_number        | 2
- data_type            | int4
- row_count            | 9
- distinct_values      | 9
- missing_values       | 0
- blank_values         |
- fraction_missing     | 0
- fraction_blank       |
- mean                 | 1561.11111111111
- variance             | 936736.111111111
- min                  | 20
- max                  | 3100
- most_frequent_values | {20,1320,2790,1840,1660}
- mfv_frequencies      | {1,1,1,1,1}
- -[ RECORD 2 ]--------+-----------------------------------
- group_by             | bedroom
- group_by_value       | 3
- target_column        | bath
- column_number        | 4
- ...
+ -[ RECORD 1 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | sepal_length
+column_number        | 2
+data_type            | float8
+row_count            | 30
+distinct_values      | 22
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 5.84333333333333
+variance             | 0.9294367816092
+min                  | 4.4
+max                  | 7.6
+quantile_array       | {5.057,6.414}
+most_frequent_values | {4.9,6.3,5}
+mfv_frequencies      | {4,3,2}
+ -[ RECORD 2 ]-------+-----------------------------------
+group_by             | 
+group_by_value       | 
+target_column        | sepal_width
+column_number        | 3
+data_type            | float8
+row_count            | 30
+distinct_values      | 14
+missing_values       | 0
+blank_values         | 
+fraction_missing     | 0
+fraction_blank       | 
+mean                 | 3.04
+variance             | 0.13903448275862
+min                  | 2.3
+max                  | 3.9
+quantile_array       | {2.9,3.2}
+most_frequent_values | {3,2.9,3.2}
+mfv_frequencies      | {4,4,3}
 
@anchor notes @@ -306,33 +469,25 @@ should follow case-sensitivity and quoting rules per the database. (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. If mixed-case or multi-byte characters are desired for entity names then the string should be double-quoted; in this case the input would be '"MyTable"'). -- Estimated values are only implemented for the distinct values computation. -- The get_estimates parameter controls computation for two statistics: +- The get_estimates parameter controls computation for both distinct +count and most frequent values: - If get_estimates is TRUE then the distinct value computation is - estimated. Further, the most frequent values computation is computed using a - "quick and dirty" method that does parallel aggregation in Greenplum Database at the expense + estimated using Flajolet-Martin. MFV is computed using a + fast method that does parallel aggregation in Greenplum Database at the expense of missing some of the most frequent values. - If get_estimates is FALSE then the distinct values are computed - in a slow but exact method. The most frequent values are computed using a + in a slower but exact method using PostgreSQL COUNT DISTINCT. MFV is computed using a faithful implementation that preserves the approximation guarantees of - the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch). -- Summary statistics are calculated for each grouping -column independently. That is, grouping columns are not combined together -as in the regular PostgreSQL style GROUP BY directive. (This was done -to reduce long run time and huge output table size which would otherwise -result in the case of large input tables with a lot of grouping_cols and -target_cols specified.) -- Quartile and quantile functions are not available for PostgreSQL 9.3 or -lower. 
If you are using PostgreSQL 9.3 or lower, the output table will not -contain these values, even if you set 'get_quartiles' = TRUE or -provide an array of quantile values for the parameter 'ntile_array'. + the Cormode/Muthukrishnan method (more information at \ref grp_mfvsketch). @anchor related @par Related Topics File summary.sql_in documenting the \b summary() function -\ref grp_mfvsketch +\ref grp_fmsketch
+\ref grp_mfvsketch
+\ref grp_countmin */ diff --git a/src/ports/postgres/modules/utilities/path.sql_in b/src/ports/postgres/modules/utilities/path.sql_in index d98b50077..8fe03ec33 100644 --- a/src/ports/postgres/modules/utilities/path.sql_in +++ b/src/ports/postgres/modules/utilities/path.sql_in @@ -152,7 +152,9 @@ path(
aggregate_func (optional)
VARCHAR, default NULL. A comma-separated list of aggregates to be - applied to the pattern matches [3]. Please note that window functions + applied to the pattern matches [3]. + You can think of this input parameter as being like a SELECT clause. + Please note that window functions cannot currently be used in the parameter 'aggregate_func'. If you want to use a window function [4], output the pattern matches and write a SQL query with a window function over the output tuples (see 'persist_rows'