diff --git a/src/ports/postgres/modules/utilities/sessionize.py_in b/src/ports/postgres/modules/utilities/sessionize.py_in index cff86a57e..7b8e77791 100644 --- a/src/ports/postgres/modules/utilities/sessionize.py_in +++ b/src/ports/postgres/modules/utilities/sessionize.py_in @@ -17,16 +17,17 @@ import plpy import string +import re from control import MinWarning -from utilities import unique_string, _assert +from utilities import unique_string, _assert, split_quoted_delimited_str from validate_args import get_cols from validate_args import input_tbl_valid, output_tbl_valid, is_var_valid m4_changequote(`') def sessionize(schema_madlib, source_table, output_table, partition_expr, - time_stamp, max_time, **kwargs): + time_stamp, max_time, output_cols=None, create_view=None, **kwargs): """ Perform sessionization over a sequence of rows. @@ -35,41 +36,74 @@ def sessionize(schema_madlib, source_table, output_table, partition_expr, @param source_table: str, Name of the input table/view @param output_table: str, Name of the table to store result @param partition_expr: str, Expression to partition (group) the input data - @param time_stamp: str, Column name with time used for sessionization calculation + @param time_stamp: str, The time stamp column name that is used for sessionization calculation @param max_time: interval, Delta time between subsequent events to define a session - + @param output_cols: str, a valid postgres SELECT expression + @param create_view: boolean, indicates if the output is a view or a table with name + specified by output_table (default TRUE): + TRUE - create view + FALSE - materialize results into a table """ with MinWarning("error"): _validate(source_table, output_table, partition_expr, time_stamp, max_time) - - all_input_cols_str = ', '.join([i.strip() for i in get_cols(source_table, schema_madlib)]) - session_id = 'session_id' if not is_var_valid(source_table, 'session_id') else unique_string('session_id') + table_or_view = 'VIEW' if 
create_view or create_view is None else 'TABLE' + output_cols = '*' if output_cols is None else output_cols + + # If the output_cols has '*' as one of the elements, expand it to + # include all columns in the source table. The following list + # comprehension is only to handle the case where '*' is included + # in output_cols. Using '*' as is, without expanding it to specific + # column names leads to some temporary intermediate columns + # (new_partition and new_session defined below) occurring in the output. + cols_to_project_list = [', '.join(get_cols(source_table, schema_madlib)) if i=='*' else i + for i in split_quoted_delimited_str(output_cols)] + + # Examples of Invalid SELECT expression in output_cols: + # 1) If output_cols contains '*' along with an existing column name + # in the source table, postgres will throw an error and fail + # for specifying duplicate column names in the output table/view. + # 2) If output_cols contains more than 1 expressions which are not + # renamed using ' AS ', postgres will fail since it will try to + # rename all such new columns as '?column?'. This is considered an + # invalid SELECT expression. + cols_to_project = ', '.join(cols_to_project_list) + + session_id = 'session_id' if not is_var_valid(source_table, 'session_id')\ + else unique_string('session_id') # Create temp column names for intermediate columns. 
new_partition = unique_string('new_partition') new_session = unique_string('new_session') - plpy.execute(""" - CREATE TABLE {output_table} AS - SELECT - {all_input_cols_str}, - CASE WHEN {time_stamp} IS NOT NULL - THEN SUM(CASE WHEN {new_partition} OR {new_session} THEN 1 END) - OVER (PARTITION BY {partition_expr} - ORDER BY {time_stamp}) - END AS {session_id} - FROM ( - SELECT *, - ROW_NUMBER() OVER (w) = 1 - AND {time_stamp} IS NOT NULL AS {new_partition}, - ({time_stamp} - LAG({time_stamp}, 1) - OVER (w)) > '{max_time}'::INTERVAL AS {new_session} - FROM {source_table} - WINDOW w AS (PARTITION BY {partition_expr} - ORDER BY {time_stamp}) - ) a - """.format(**locals())) - + try: + plpy.execute(""" + CREATE {table_or_view} {output_table} AS + SELECT + {cols_to_project}, + CASE WHEN {time_stamp} IS NOT NULL + THEN SUM(CASE WHEN {new_partition} OR {new_session} THEN 1 END) + OVER (PARTITION BY {partition_expr} ORDER BY {time_stamp}) + END AS {session_id} + FROM ( + SELECT *, + ROW_NUMBER() OVER (w) = 1 AND {time_stamp} IS NOT NULL AS {new_partition}, + ({time_stamp}-LAG({time_stamp}, 1) OVER (w)) > '{max_time}'::INTERVAL AS {new_session} + FROM {source_table} WINDOW w AS (PARTITION BY {partition_expr} ORDER BY {time_stamp}) + ) a + """.format(**locals())) + except plpy.SPIError as e: + # The specific exception we want to catch here is + # "spiexceptions.DuplicateColumn". But the current version of gpdb + # does not seem to have implemented it. So catching a more generic + # exception and displaying this warning message. The reason for + # doing this is that the default error message shown by postgres + # when we have more than one expressions in output_cols that do + # not use ' AS ' to rename them is not user-friendly. 
+ with MinWarning("warning"): + plpy.warning("A plausible error condition: the output_cols\ + parameter might be an invalid SELECT expression, resulting\ + in duplicate column names.") + raise def _validate(source_table, output_table, partition_expr, time_stamp, max_time): input_tbl_valid(source_table, 'Sessionization') @@ -80,8 +114,7 @@ def _validate(source_table, output_table, partition_expr, time_stamp, max_time): _assert(max_time, "Sessionization error: Invalid max time value") # ensure the partition/order expression can actually be used _assert(is_var_valid(source_table, partition_expr, time_stamp), - "Sessionization error: invalid partition expression or time stamp column name") - + "Sessionization error: Invalid partition expression or time stamp column name") def sessionize_help_message(schema_madlib, message, **kwargs): """ @@ -94,17 +127,19 @@ def sessionize_help_message(schema_madlib, message, **kwargs): Functionality: Sessionize The goal of the MADlib sessionize function is to perform sessionization over - a time-series based data. + a time-series based data. 
------------------------------------------------------------ USAGE ------------------------------------------------------------ SELECT {schema_madlib}.sessionize( - 'source_table', -- str, Name of the table - 'output_table', -- str, Table name to store the Sessionization results - 'partition_expr', -- str, Partition expression to group the data table - 'time_stamp' -- str, Column name with time used for sessionization calculation - 'max_time' -- str, Delta time between subsequent events to define a session + 'source_table', -- str, Name of the table + 'output_table', -- str, Table name to store the Sessionization results + 'partition_expr', -- str, Partition expression to group the data table + 'time_stamp', -- str, The time stamp column name that is used for sessionization calculation + 'max_time', -- str, Delta time between subsequent events to define a session + 'output_cols', -- str, an optional valid postgres SELECT expression for the output table/view (default *) + 'create_view' -- boolean, optional parameter to specify if output is a view or materialized to a table (default True) ); ------------------------------------------------------------ @@ -171,19 +206,40 @@ def sessionize_help_message(schema_madlib, message, **kwargs): '04/15/2016 02:19:00'|103711|109|'WINE'|0|1 \.
- - Sessionize the table for each user_id: + - Sessionize the table for each user_id, keeping all columns of the source table + in the output, which is created as a view by default: SELECT {schema_madlib}.sessionize( - 'eventlog', -- Name of input table - 'sessionize_output', -- Table name to store sessionized results - 'user_id', -- Partition input table by session + 'eventlog', -- Name of input table + 'sessionize_output', -- Table name to store sessionized results + 'user_id', -- Partition input table by session 'event_timestamp', -- Order partitions in input table by time - '0:3:0' -- Events within a window of this time unit (180 seconds) must be in the same session + '0:3:0' -- Events within a window of this time unit (180 seconds) must be in the same session ); - View the output table containing the session IDs: SELECT * FROM sessionize_output; + + DROP VIEW sessionize_output; + + - Sessionize the table for each user_id, and materialize only the user_id, event_timestamp and session id columns into an output table: + SELECT {schema_madlib}.sessionize( + 'eventlog', -- Name of input table + 'sessionize_output', -- Table name to store sessionized results + 'user_id', -- Partition input table by session + 'event_timestamp', -- Order partitions in input table by time + '180', -- Events within a window of this time unit (180 seconds) must be in the same session + 'user_id, event_timestamp', -- Preserve only user_id and event_timestamp columns, along with the session id column + 'false' -- Materialize results into a table, and not a view + ); + + - View the output table containing the session IDs: + + SELECT eventlog.*, sessionize_output.session_id FROM eventlog INNER JOIN sessionize_output ON + (eventlog.user_id=sessionize_output.user_id AND eventlog.event_timestamp=sessionize_output.event_timestamp); + + DROP TABLE sessionize_output; """ return help_string.format(schema_madlib=schema_madlib) diff --git a/src/ports/postgres/modules/utilities/sessionize.sql_in
b/src/ports/postgres/modules/utilities/sessionize.sql_in index 814708e54..d0fab1189 100644 --- a/src/ports/postgres/modules/utilities/sessionize.sql_in +++ b/src/ports/postgres/modules/utilities/sessionize.sql_in @@ -41,8 +41,8 @@ m4_include(`SQLCommon.m4') @brief Sessionize Functions @details The goal of the MADlib sessionize function is to perform session reconstruction on a data set, so that it can be prepared for input into other algorithms such as path functions, -or predictive analytics algorithms. Sessions for instance, can be defined based on time, -as a sequence of events by a particular user where no more than n-seconds have elapsed between +or predictive analytics algorithms. Sessions for instance, can be defined based on time, +as a sequence of events by a particular user where no more than n-seconds have elapsed between successive events. That is, if we don't see an event from a user for n seconds, a new session is started. @@ -54,7 +54,9 @@ sessionize( output_table, partition_expr, time_stamp, - max_time + max_time, + output_cols, + create_view ) @@ -65,7 +67,7 @@ sessionize( analysis.
DROP TABLE IF EXISTS eventlog; CREATE TABLE eventlog (event_timestamp TIMESTAMP, @@ -140,18 +150,19 @@ INSERT INTO eventlog VALUES ('04/15/2015 02:19:00', 103711, 'WINE', 0);--# Sessionize the table for each user_id: +Sessionize the table for each user_id:
DROP TABLE IF EXISTS sessionize_output; SELECT madlib.sessionize( - 'eventlog', -- Name of input table - 'sessionize_output', -- Table name to store sessionized results - 'user_id', -- Partition input table by session - 'event_timestamp ASC', -- Order partitions in input table by time - '0:3:0' -- Events within a window of this time unit (3 minutes) must be in the same session + 'eventlog', -- Name of input table + 'sessionize_output', -- Table name to store sessionized results + 'user_id', -- Partition input table by session + 'event_timestamp', -- Order partitions in input table by time + '0:3:0' -- Events within a window of this time unit (3 minutes) must be in the same session ); SELECT * FROM sessionize_output;+ Result:
event_timestamp | user_id | page | revenue | session_id @@ -190,59 +201,65 @@ Result: 2015-04-15 02:17:00 | 103711 | BEER | 0 | 1 2015-04-15 02:18:00 | 103711 | LANDING | 0 | 1 2015-04-15 02:19:00 | 103711 | WINE | 0 | 1 +(34 rows)--# Sessionize the table based on a partition expression: +Sessionize the table based on a partition expression:
-DROP TABLE IF EXISTS sessionize_output; -SELECT madlib.sessionize( - 'eventlog', -- Name of input table - 'sessionize_output', -- Table name to store sessionized results - 'user_id < 101500', -- Partition input table by session - 'event_timestamp ASC', -- Order partitions in input table by time - '180' -- Events within a window of this time unit (180 seconds) must be in the same session - -- Note that this is the same as '0:3:0' + DROP VIEW IF EXISTS sessionize_output; + SELECT madlib.sessionize( + 'eventlog', -- Name of input table + 'sessionize_output', -- Table name to store sessionized results + 'user_id < 101500', -- Partition input table by session + 'event_timestamp', -- Order partitions in input table by time + '180', -- Events within a window of this time unit (180 seconds) must be in the same session + -- Note that this is the same as '0:3:0' + 'user_id, event_timestamp, user_id < 101500 AS "user_id < 101500"', -- Select only the user_id and event_timestamp columns and the partition expression, along with the session id, as output + 'f' -- Materialize the results into a table ); -SELECT * FROM sessionize_output; +SELECT eventlog.*, sessionize_output.session_id, sessionize_output."user_id < 101500" FROM sessionize_output INNER JOIN eventlog ON +(eventlog.user_id=sessionize_output.user_id AND eventlog.event_timestamp=sessionize_output.event_timestamp) ORDER BY "user_id < 101500", session_id;+ Result:
- event_timestamp | user_id | page | revenue | session_id ----------------------+---------+----------+---------+------------ - 2015-04-15 01:05:00 | 102201 | LANDING | 0 | 1 - 2015-04-15 01:06:00 | 102201 | HELP | 0 | 1 - 2015-04-15 01:09:00 | 102201 | LANDING | 0 | 1 - 2015-04-15 02:15:00 | 102201 | WINE | 0 | 2 - 2015-04-15 02:16:00 | 102201 | BEER | 0 | 2 - 2015-04-15 02:17:00 | 103711 | BEER | 0 | 2 - 2015-04-15 02:17:00 | 102201 | WINE | 0 | 2 - 2015-04-15 02:18:00 | 103711 | LANDING | 0 | 2 - 2015-04-15 02:18:00 | 102871 | BEER | 0 | 2 - 2015-04-15 02:19:00 | 102871 | WINE | 0 | 2 - 2015-04-15 02:19:00 | 103711 | WINE | 0 | 2 - 2015-04-15 02:22:00 | 102871 | CHECKOUT | 21 | 2 - 2015-04-15 02:25:00 | 102871 | LANDING | 0 | 2 - 2015-04-15 02:29:00 | 101881 | LANDING | 0 | 3 - 2015-04-15 02:30:00 | 101881 | BEER | 0 | 3 - 2015-04-15 01:03:00 | 100821 | LANDING | 0 | 1 - 2015-04-15 01:04:00 | 100821 | WINE | 0 | 1 - 2015-04-15 01:05:00 | 100821 | CHECKOUT | 39 | 1 - 2015-04-15 01:15:00 | 101121 | LANDING | 0 | 2 - 2015-04-15 01:16:00 | 101121 | WINE | 0 | 2 - 2015-04-15 01:17:00 | 101121 | CHECKOUT | 15 | 2 - 2015-04-15 01:18:00 | 101121 | LANDING | 0 | 2 - 2015-04-15 01:19:00 | 101121 | HELP | 0 | 2 - 2015-04-15 01:21:00 | 101121 | WINE | 0 | 2 - 2015-04-15 01:22:00 | 101121 | CHECKOUT | 23 | 2 - 2015-04-15 02:06:00 | 100821 | WINE | 0 | 3 - 2015-04-15 02:09:00 | 100821 | WINE | 0 | 3 - 2015-04-15 02:15:00 | 101331 | LANDING | 0 | 4 - 2015-04-15 02:16:00 | 101331 | WINE | 0 | 4 - 2015-04-15 02:17:00 | 101331 | HELP | 0 | 4 - 2015-04-15 02:18:00 | 101331 | WINE | 0 | 4 - 2015-04-15 02:19:00 | 101331 | CHECKOUT | 16 | 4 - 2015-04-15 02:22:00 | 101443 | BEER | 0 | 4 - 2015-04-15 02:25:00 | 101443 | CHECKOUT | 12 | 4 + event_timestamp | user_id | page | revenue | session_id | user_id < 101500 +---------------------+---------+----------+---------+------------+------------------ + 2015-04-15 01:05:00 | 102201 | LANDING | 0 | 1 | f + 2015-04-15 01:09:00 | 102201 | 
LANDING | 0 | 1 | f + 2015-04-15 01:06:00 | 102201 | HELP | 0 | 1 | f + 2015-04-15 02:19:00 | 103711 | WINE | 0 | 2 | f + 2015-04-15 02:18:00 | 103711 | LANDING | 0 | 2 | f + 2015-04-15 02:17:00 | 103711 | BEER | 0 | 2 | f + 2015-04-15 02:25:00 | 102871 | LANDING | 0 | 2 | f + 2015-04-15 02:22:00 | 102871 | CHECKOUT | 21 | 2 | f + 2015-04-15 02:19:00 | 102871 | WINE | 0 | 2 | f + 2015-04-15 02:18:00 | 102871 | BEER | 0 | 2 | f + 2015-04-15 02:17:00 | 102201 | WINE | 0 | 2 | f + 2015-04-15 02:16:00 | 102201 | BEER | 0 | 2 | f + 2015-04-15 02:15:00 | 102201 | WINE | 0 | 2 | f + 2015-04-15 02:30:00 | 101881 | BEER | 0 | 3 | f + 2015-04-15 02:29:00 | 101881 | LANDING | 0 | 3 | f + 2015-04-15 01:03:00 | 100821 | LANDING | 0 | 1 | t + 2015-04-15 01:04:00 | 100821 | WINE | 0 | 1 | t + 2015-04-15 01:05:00 | 100821 | CHECKOUT | 39 | 1 | t + 2015-04-15 01:22:00 | 101121 | CHECKOUT | 23 | 2 | t + 2015-04-15 01:15:00 | 101121 | LANDING | 0 | 2 | t + 2015-04-15 01:16:00 | 101121 | WINE | 0 | 2 | t + 2015-04-15 01:17:00 | 101121 | CHECKOUT | 15 | 2 | t + 2015-04-15 01:18:00 | 101121 | LANDING | 0 | 2 | t + 2015-04-15 01:19:00 | 101121 | HELP | 0 | 2 | t + 2015-04-15 01:21:00 | 101121 | WINE | 0 | 2 | t + 2015-04-15 02:06:00 | 100821 | WINE | 0 | 3 | t + 2015-04-15 02:09:00 | 100821 | WINE | 0 | 3 | t + 2015-04-15 02:16:00 | 101331 | WINE | 0 | 4 | t + 2015-04-15 02:17:00 | 101331 | HELP | 0 | 4 | t + 2015-04-15 02:18:00 | 101331 | WINE | 0 | 4 | t + 2015-04-15 02:19:00 | 101331 | CHECKOUT | 16 | 4 | t + 2015-04-15 02:22:00 | 101443 | BEER | 0 | 4 | t + 2015-04-15 02:25:00 | 101443 | CHECKOUT | 12 | 4 | t + 2015-04-15 02:15:00 | 101331 | LANDING | 0 | 4 | t +(34 rows)@anchor literature @@ -259,12 +276,37 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( output_table VARCHAR, partition_expr VARCHAR, time_stamp VARCHAR, - max_time INTERVAL -) RETURNS TEXT AS $$ + max_time INTERVAL, + output_cols VARCHAR, + create_view BOOLEAN +) RETURNS void AS $$ PythonFunction(utilities, 
sessionize, sessionize) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( + source_table VARCHAR, + output_table VARCHAR, + partition_expr VARCHAR, + time_stamp VARCHAR, + max_time INTERVAL, + output_cols VARCHAR +) RETURNS void AS $$ + SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, $6, NULL); +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( + source_table VARCHAR, + output_table VARCHAR, + partition_expr VARCHAR, + time_stamp VARCHAR, + max_time INTERVAL +) RETURNS void AS $$ + SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, NULL, NULL); +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + ------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(message TEXT) RETURNS text AS $$ @@ -277,3 +319,4 @@ RETURNS text AS $$ SELECT MADLIB_SCHEMA.sessionize(''); $$ language SQL m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); + diff --git a/src/ports/postgres/modules/utilities/test/sessionize.sql_in b/src/ports/postgres/modules/utilities/test/sessionize.sql_in index e3c16ae9d..8e58d1b89 100644 --- a/src/ports/postgres/modules/utilities/test/sessionize.sql_in +++ b/src/ports/postgres/modules/utilities/test/sessionize.sql_in @@ -25,60 +25,62 @@ */ /* ----------------------------------------------------------------------- */ -CREATE TABLE eventlog (event_timestamp TIMESTAMP, - user_id INT, +CREATE TABLE eventlog_installchk (event_timestamp TIMESTAMP, + "user id" INT, original_session_id INT, page TEXT, revenue FLOAT, - row INT); -INSERT INTO eventlog VALUES -('04/15/2015 01:03:0.5', 100821, 1, 'LANDING', 0, 1), -('04/15/2015 01:05:00', 100821, 1, 'WINE', 0, 1), -('04/15/2015 01:07:00', 100821, 1, 'CHECKOUT', 39, 1), -('04/15/2015 02:06:00', 100821, 2, 'WINE', 0, 1), 
-('04/15/2015 02:07:00', 100821, 2, 'WINE', 0, 1), -('04/15/2015 01:15:00', 101121, 1, 'LANDING', 0, 1), -('04/15/2015 01:16:00', 101121, 1, 'WINE', 0, 1), -('04/15/2015 01:18:00', 101121, 1, 'CHECKOUT', 15, 1), -('04/15/2015 01:19:00', 101121, 1, 'LANDING', 0, 1), -('04/15/2015 01:21:00', 101121, 1, 'HELP', 0, 1), -(NULL, 101121, NULL, 'LANDING', 0, 1), -(NULL, 101121, NULL, 'HELP', 0, 1), -('04/15/2015 01:24:00', 101121, 1, 'WINE', 0, 1), -('04/15/2015 01:26:00', 101121, 1, 'CHECKOUT', 23, 1), -('04/15/2015 02:21:00', 101121, 2, 'HELP', 0, 1), -('04/15/2015 02:24:00', 101121, 2, 'WINE', 0, 1), -('04/15/2015 02:26:00', 101121, 2, 'CHECKOUT', 23, 1), -('04/15/2015 02:15:00', 101331, 1, 'LANDING', 0, 1), -('04/15/2015 02:16:0.56', 101331, 1, 'WINE', 0, 1), -('04/15/2015 02:18:00', 101331, 1, 'HELP', 0, 1), -('04/15/2015 02:20:00', 101331, 1, 'WINE', 0, 1), -('04/15/2015 02:21:00', 101331, 1, 'CHECKOUT', 16, 1), -('04/15/2015 02:22:00', 101443, 1, 'BEER', 0, 1), -('04/15/2015 02:27:00', 101443, 2, 'CHECKOUT', 12, 1), -('04/15/2015 02:29:00', 101881, 1, 'LANDING', 0, 1), -('04/15/2015 02:30:00', 101881, 1, 'BEER', 0, 1), -('04/15/2015 01:05:00', 102201, 1, 'LANDING', 0, 1), -('04/15/2015 01:06:00', 102201, 1, 'HELP', 0, 1), -('04/15/2015 01:10:00', 102201, 2, 'LANDING', 0, 1), -('04/15/2015 02:15:00', 102201, 3, 'WINE', 0, 1), -('04/15/2015 02:16:00', 102201, 3, 'BEER', 0, 1), -('04/15/2015 02:17:00', 102201, 3, 'WINE', 0, 1), -('04/15/2015 02:18:00', 102871, 1, 'BEER', 0, 1), -('04/15/2015 02:19:00', 102871, 1, 'WINE', 0, 1), -('04/15/2015 02:22:00', 102871, 1, 'CHECKOUT', 21, 1), -('04/15/2015 02:25:00', 102871, 1, 'LANDING', 0, 1), -(NULL, 103711, NULL, 'BEER', 0, 1), -(NULL, 103711, NULL, 'LANDING', 0, 1), -(NULL, 103711, NULL, 'WINE', 0, 1), -('04/15/2016 02:17:00', 103711, 1, 'BEER', 0, 1), -('04/15/2016 02:21:00', 103711, 2, 'LANDING', 0, 1), -('04/15/2016 02:31:0.05', 103711, 3, 'WINE', 0, 1); + row INT, + part_expr BOOLEAN); +INSERT INTO eventlog_installchk 
VALUES +('04/15/2015 01:03:0.5', 100821, 1, 'LANDING', 0, 1, 'f'), +('04/15/2015 01:05:00', 100821, 1, 'WINE', 0, 1, 'f'), +('04/15/2015 01:07:00', 100821, 1, 'CHECKOUT', 39, 1, 'f'), +('04/15/2015 02:06:00', 100821, 2, 'WINE', 0, 1, 'f'), +('04/15/2015 02:07:00', 100821, 2, 'WINE', 0, 1, 'f'), +('04/15/2015 01:15:00', 101121, 1, 'LANDING', 0, 1, 'f'), +('04/15/2015 01:16:00', 101121, 1, 'WINE', 0, 1, 'f'), +('04/15/2015 01:18:00', 101121, 1, 'CHECKOUT', 15, 1, 'f'), +('04/15/2015 01:19:00', 101121, 1, 'LANDING', 0, 1, 'f'), +('04/15/2015 01:21:00', 101121, 1, 'HELP', 0, 1, 'f'), +(NULL, 101121, NULL, 'LANDING', 0, 1, 'f'), +(NULL, 101121, NULL, 'HELP', 0, 1, 'f'), +('04/15/2015 01:24:00', 101121, 1, 'WINE', 0, 1, 'f'), +('04/15/2015 01:26:00', 101121, 1, 'CHECKOUT', 23, 1, 'f'), +('04/15/2015 02:21:00', 101121, 2, 'HELP', 0, 1, 'f'), +('04/15/2015 02:24:00', 101121, 2, 'WINE', 0, 1, 'f'), +('04/15/2015 02:26:00', 101121, 2, 'CHECKOUT', 23, 1, 't'), +('04/15/2015 02:15:00', 101331, 1, 'LANDING', 0, 1, 't'), +('04/15/2015 02:16:0.56', 101331, 1, 'WINE', 0, 1, 't'), +('04/15/2015 02:18:00', 101331, 1, 'HELP', 0, 1, 't'), +('04/15/2015 02:20:00', 101331, 1, 'WINE', 0, 1, 't'), +('04/15/2015 02:21:00', 101331, 1, 'CHECKOUT', 16, 1, 't'), +('04/15/2015 02:22:00', 101443, 1, 'BEER', 0, 1, 't'), +('04/15/2015 02:27:00', 101443, 2, 'CHECKOUT', 12, 1, 't'), +('04/15/2015 02:29:00', 101881, 1, 'LANDING', 0, 1, 't'), +('04/15/2015 02:30:00', 101881, 1, 'BEER', 0, 1, 't'), +('04/15/2015 01:05:00', 102201, 1, 'LANDING', 0, 1, 't'), +('04/15/2015 01:06:00', 102201, 1, 'HELP', 0, 1, 't'), +('04/15/2015 01:10:00', 102201, 2, 'LANDING', 0, 1, 't'), +('04/15/2015 02:15:00', 102201, 3, 'WINE', 0, 1, 't'), +('04/15/2015 02:16:00', 102201, 3, 'BEER', 0, 1, 't'), +('04/15/2015 02:17:00', 102201, 3, 'WINE', 0, 1, 't'), +('04/15/2015 02:18:00', 102871, 1, 'BEER', 0, 1, 't'), +('04/15/2015 02:19:00', 102871, 1, 'WINE', 0, 1, 't'), +('04/15/2015 02:22:00', 102871, 1, 'CHECKOUT', 21, 1, 
't'), +('04/15/2015 02:25:00', 102871, 1, 'LANDING', 0, 1, 't'), +(NULL, 103711, NULL, 'BEER', 0, 1, 't'), +(NULL, 103711, NULL, 'LANDING', 0, 1, 't'), +(NULL, 103711, NULL, 'WINE', 0, 1, 't'), +('04/15/2016 02:17:00', 103711, 1, 'BEER', 0, 1, 't'), +('04/15/2016 02:21:00', 103711, 2, 'LANDING', 0, 1, 't'), +('04/15/2016 02:31:0.05', 103711, 3, 'WINE', 0, 1, 't'); + SELECT sessionize( - 'eventlog', -- Name of the input table - 'sessionize_output', -- Name of the output table - 'user_id', -- Partition expression to group the data + 'eventlog_installchk', -- Name of the input table + 'sessionize_output_v', -- Name of the output table + '"user id"', -- Partition expression to group the data 'event_timestamp', -- Order expression to sort the tuples of the data table '0:3:0' -- Max time that can elapse between consecutive rows to be considered part of the same session ); @@ -87,6 +89,23 @@ SELECT assert( relative_error(array_agg(CASE WHEN original_session_id NOTNULL THEN original_session_id ELSE 0 END), array_agg(CASE WHEN session_id NOTNULL THEN session_id ELSE 0 END)) < 1e-6, 'wrong output in sessionization') -FROM sessionize_output; +FROM sessionize_output_v; +SELECT * FROM sessionize_output_v; + +SELECT sessionize( + 'eventlog_installchk', -- Name of the input table + 'sessionize_output_t', -- Name of the output table + '"user id"<102000', -- Partition expression to group the data + 'event_timestamp', -- Order expression to sort the tuples of the data table + '180', -- Max time that can elapse between consecutive rows to be considered part of the same session + '*,"user id"<102000 AS uid,revenue>5 AS rid', -- Select all columns in the input table, along with the partition expression and session id columns + 'f' -- Materialize results into a table + ); + +SELECT + assert( + relative_error(array_agg(CASE WHEN eventlog_installchk.part_expr THEN 1 ELSE 0 END), array_agg(CASE WHEN sessionize_output_t.uid THEN 1 ELSE 0 END)) != 0, + 'wrong output in sessionization') +FROM 
sessionize_output_t, eventlog_installchk; +SELECT * FROM sessionize_output_t; -SELECT * FROM sessionize_output;