From e384c1fc7bb27b7c2401b17b6049cee1374fee1a Mon Sep 17 00:00:00 2001 From: Orhan Kislal Date: Mon, 23 Jan 2017 15:45:08 -0800 Subject: [PATCH] RF: Fixes the online help and example --- .../random_forest.py_in | 83 +++++++++++-------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index e006a3468..0eb5985c6 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -103,6 +103,10 @@ SELECT {schema_madlib}.forest_train( is an positive integer with the default 0. verbose, -- Boolean, whether to print more info, default is False + sample_ratio -- Double precision, in the range of (0, 1], default: 1 + If sample_ratio is less than 1, a bootstrap sample + size smaller than the data table is expected to be + used for training each tree in the forest. ); ------------------------------------------------------------ @@ -175,44 +179,51 @@ it has the following columns: ------------------------------------------------------------ EXAMPLE ------------------------------------------------------------ -DROP TABLE IF EXISTS dummy_dt_con_src CASCADE; -CREATE TABLE dummy_dt_con_src ( - id INTEGER, - cat INTEGER[], - con FLOAT8[], - y FLOAT8 +DROP TABLE IF EXISTS dt_golf; +CREATE TABLE dt_golf ( + id integer NOT NULL, + "OUTLOOK" text, + temperature double precision, + humidity double precision, + windy text, + class text ); -INSERT INTO dummy_dt_src VALUES -(1, '{0}'::INTEGER[], ARRAY[0], 0.5), -(2, '{0}'::INTEGER[], ARRAY[1], 0.5), -(3, '{0}'::INTEGER[], ARRAY[4], 0.5), -(4, '{0}'::INTEGER[], ARRAY[4], 0.5), -(5, '{0}'::INTEGER[], ARRAY[4], 0.5), -(6, '{0}'::INTEGER[], ARRAY[5], 0.1), -(7, '{0}'::INTEGER[], ARRAY[6], 0.1), -(8, '{1}'::INTEGER[], ARRAY[9], 0.1); -(9, '{1}'::INTEGER[], ARRAY[9], 0.1); -(10, '{1}'::INTEGER[], ARRAY[9], 0.1); -(11, '{1}'::INTEGER[], ARRAY[9], 0.1); - -DROP TABLE IF EXISTS forest_out, forest_out_summary; -SELECT madlib.forest_train( - 'dummy_dt_src', - 'forest_out', - 'id', - 'y', - 'cat, con', - '', - 'mse', - NULL::Text, - NULL::Text, - 3, - 2, - 1, - 5); - -SELECT madlib.forest_display('forest_out'); +INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES +(1, 'sunny', 85, 85, 'false', 'Don''t Play'), +(2, 'sunny', 80, 90, 'true', 'Don''t Play'), +(3, 'overcast', 83, 78, 'false', 'Play'), +(4, 'rain', 70, 96, 'false', 'Play'), +(5, 'rain', 68, 80, 'false', 'Play'), +(6, 'rain', 65, 70, 'true', 'Don''t Play'), +(7, 'overcast', 64, 65, 'true', 'Play'), +(8, 'sunny', 72, 95, 'false', 'Don''t Play'), +(9, 'sunny', 69, 70, 'false', 'Play'), +(10, 'rain', 75, 80, 'false', 'Play'), +(11, 'sunny', 75, 70, 'true', 'Play'), +(12, 'overcast', 72, 90, 'true', 'Play'), +(13, 'overcast', 81, 75, 'false', 'Play'), +(14, 'rain', 71, 80, 'true', 'Don''t Play'); + +DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary; +SELECT madlib.forest_train('dt_golf', -- source table + 'train_output', -- output model table + 'id', -- id column + 'class', -- response + '"OUTLOOK", temperature, humidity, windy', -- features + NULL, -- exclude columns + NULL, -- grouping columns + 20::integer, -- number of trees + 2::integer, -- number of random features + TRUE::boolean, -- variable importance + 1::integer, -- num_permutations + 8::integer, -- max depth + 3::integer, -- min split + 1::integer, -- min bucket + 10::integer -- number of splits per continuous variable +); +SELECT madlib.get_tree('train_output',1,2,FALSE); + """ else: help_string = "No such option. Use {schema_madlib}.forest_train('usage')"