diff --git a/src/ports/postgres/modules/utilities/sessionize.py_in b/src/ports/postgres/modules/utilities/sessionize.py_in index cff86a57e..7b8e77791 100644 --- a/src/ports/postgres/modules/utilities/sessionize.py_in +++ b/src/ports/postgres/modules/utilities/sessionize.py_in @@ -17,16 +17,17 @@ import plpy import string +import re from control import MinWarning -from utilities import unique_string, _assert +from utilities import unique_string, _assert, split_quoted_delimited_str from validate_args import get_cols from validate_args import input_tbl_valid, output_tbl_valid, is_var_valid m4_changequote(`') def sessionize(schema_madlib, source_table, output_table, partition_expr, - time_stamp, max_time, **kwargs): + time_stamp, max_time, output_cols=None, create_view=None, **kwargs): """ Perform sessionization over a sequence of rows. @@ -35,41 +36,74 @@ def sessionize(schema_madlib, source_table, output_table, partition_expr, @param source_table: str, Name of the input table/view @param output_table: str, Name of the table to store result @param partition_expr: str, Expression to partition (group) the input data - @param time_stamp: str, Column name with time used for sessionization calculation + @param time_stamp: str, The time stamp column name that is used for sessionization calculation @param max_time: interval, Delta time between subsequent events to define a session - + @param output_cols: str, a valid postgres SELECT expression + @param create_view: boolean, indicates if the output is a view or a table with name + specified by output_table (default TRUE): + TRUE - create view + FALSE - materialize results into a table """ with MinWarning("error"): _validate(source_table, output_table, partition_expr, time_stamp, max_time) - - all_input_cols_str = ', '.join([i.strip() for i in get_cols(source_table, schema_madlib)]) - session_id = 'session_id' if not is_var_valid(source_table, 'session_id') else unique_string('session_id') + table_or_view = 'VIEW' if 
create_view or create_view is None else 'TABLE' + output_cols = '*' if output_cols is None else output_cols + + # If the output_cols has '*' as one of the elements, expand it to + # include all columns in the source table. The following list + # comprehension is only to handle the case where '*' is included + # in output_cols. Using '*' as is, without expanding it to specific + # column names leads to some temporary intermediate columns + # (new_partition and new_session defined below) occurring in the output. + cols_to_project_list = [', '.join(get_cols(source_table, schema_madlib)) if i=='*' else i + for i in split_quoted_delimited_str(output_cols)] + + # Examples of Invalid SELECT expression in output_cols: + # 1) If output_cols contains '*' along with an existing column name + # in the source table, postgres will throw an error and fail + # for specifying duplicate column names in the output table/view. + # 2) If output_cols contains more than 1 expressions which are not + # renamed using ' AS ', postgres will fail since it will try to + # rename all such new columns as '?column?'. This is considered an + # invalid SELECT expression. + cols_to_project = ', '.join(cols_to_project_list) + + session_id = 'session_id' if not is_var_valid(source_table, 'session_id')\ + else unique_string('session_id') # Create temp column names for intermediate columns. 
new_partition = unique_string('new_partition') new_session = unique_string('new_session') - plpy.execute(""" - CREATE TABLE {output_table} AS - SELECT - {all_input_cols_str}, - CASE WHEN {time_stamp} IS NOT NULL - THEN SUM(CASE WHEN {new_partition} OR {new_session} THEN 1 END) - OVER (PARTITION BY {partition_expr} - ORDER BY {time_stamp}) - END AS {session_id} - FROM ( - SELECT *, - ROW_NUMBER() OVER (w) = 1 - AND {time_stamp} IS NOT NULL AS {new_partition}, - ({time_stamp} - LAG({time_stamp}, 1) - OVER (w)) > '{max_time}'::INTERVAL AS {new_session} - FROM {source_table} - WINDOW w AS (PARTITION BY {partition_expr} - ORDER BY {time_stamp}) - ) a - """.format(**locals())) - + try: + plpy.execute(""" + CREATE {table_or_view} {output_table} AS + SELECT + {cols_to_project}, + CASE WHEN {time_stamp} IS NOT NULL + THEN SUM(CASE WHEN {new_partition} OR {new_session} THEN 1 END) + OVER (PARTITION BY {partition_expr} ORDER BY {time_stamp}) + END AS {session_id} + FROM ( + SELECT *, + ROW_NUMBER() OVER (w) = 1 AND {time_stamp} IS NOT NULL AS {new_partition}, + ({time_stamp}-LAG({time_stamp}, 1) OVER (w)) > '{max_time}'::INTERVAL AS {new_session} + FROM {source_table} WINDOW w AS (PARTITION BY {partition_expr} ORDER BY {time_stamp}) + ) a + """.format(**locals())) + except plpy.SPIError as e: + # The specific exception we want to catch here is + # "spiexceptions.DuplicateColumn". But the current version of gpdb + # does not seem to have implemented it. So catching a more generic + # exception and displaying this warning message. The reason for + # doing this is that the default error message shown by postgres + # when we have more than one expressions in output_cols that do + # not use ' AS ' to rename them is not user-friendly. 
+ with MinWarning("warning"): + plpy.warning("A plausible error condition: the output_cols\ + parameter might be an invalid SELECT expression, resulting\ + in duplicate column names.") + raise def _validate(source_table, output_table, partition_expr, time_stamp, max_time): input_tbl_valid(source_table, 'Sessionization') @@ -80,8 +114,7 @@ def _validate(source_table, output_table, partition_expr, time_stamp, max_time): _assert(max_time, "Sessionization error: Invalid max time value") # ensure the partition/order expression can actually be used _assert(is_var_valid(source_table, partition_expr, time_stamp), - "Sessionization error: invalid partition expression or time stamp column name") - + "Sessionization error: Invalid partition expression or time stamp column name") def sessionize_help_message(schema_madlib, message, **kwargs): """ @@ -94,17 +127,19 @@ def sessionize_help_message(schema_madlib, message, **kwargs): Functionality: Sessionize The goal of the MADlib sessionize function is to perform sessionization over - a time-series based data. + a time-series based data. 
------------------------------------------------------------ USAGE ------------------------------------------------------------ SELECT {schema_madlib}.sessionize( - 'source_table', -- str, Name of the table - 'output_table', -- str, Table name to store the Sessionization results - 'partition_expr', -- str, Partition expression to group the data table - 'time_stamp' -- str, Column name with time used for sessionization calculation - 'max_time' -- str, Delta time between subsequent events to define a session + 'source_table', -- str, Name of the table + 'output_table', -- str, Table name to store the Sessionization results + 'partition_expr', -- str, Partition expression to group the data table + 'time_stamp', -- str, The time stamp column name that is used for sessionization calculation + 'max_time', -- str, Delta time between subsequent events to define a session + 'output_cols', -- str, an optional valid postgres SELECT expression for the output table/view (default *) + 'create_view' -- boolean, optional parameter to specify if output is a view or materialized to a table (default True) ); ------------------------------------------------------------ @@ -171,19 +206,40 @@ def sessionize_help_message(schema_madlib, message, **kwargs): '04/15/2016 02:19:00'|103711|109|'WINE'|0|1 \. 
- - Sessionize the table for each user_id: + - Sessionize the table for each user_id, and create a view that contains all the columns of the input table, + along with the session_id column: SELECT {schema_madlib}.sessionize( - 'eventlog', -- Name of input table - 'sessionize_output', -- Table name to store sessionized results - 'user_id', -- Partition input table by session + 'eventlog', -- Name of input table + 'sessionize_output', -- Table name to store sessionized results + 'user_id', -- Partition input table by session 'event_timestamp', -- Order partitions in input table by time - '0:3:0' -- Events within a window of this time unit (180 seconds) must be in the same session + '0:3:0' -- Events within a window of this time unit (180 seconds) must be in the same session ); - View the output table containing the session IDs: SELECT * FROM sessionize_output; + + DROP VIEW sessionize_output; + + - Sessionize the table for each user_id, and materialize only the user_id and event_timestamp columns, along with the session id, into an output table: + SELECT {schema_madlib}.sessionize( + 'eventlog', -- Name of input table + 'sessionize_output', -- Table name to store sessionized results + 'user_id', -- Partition input table by session + 'event_timestamp', -- Order partitions in input table by time + '180', -- Events within a window of this time unit (180 seconds) must be in the same session + 'user_id, event_timestamp', -- Preserve only user_id and event_timestamp columns, along with the session id column + 'false' -- Materialize results into a table, and not a view + ); + + - View the output table containing the session IDs: + + SELECT eventlog.*, sessionize_output.session_id FROM eventlog INNER JOIN sessionize_output ON + (eventlog.user_id=sessionize_output.user_id AND eventlog.event_timestamp=sessionize_output.event_timestamp); + + DROP TABLE sessionize_output; """ return help_string.format(schema_madlib=schema_madlib) diff --git a/src/ports/postgres/modules/utilities/sessionize.sql_in 
b/src/ports/postgres/modules/utilities/sessionize.sql_in index 814708e54..d0fab1189 100644 --- a/src/ports/postgres/modules/utilities/sessionize.sql_in +++ b/src/ports/postgres/modules/utilities/sessionize.sql_in @@ -41,8 +41,8 @@ m4_include(`SQLCommon.m4') @brief Sessionize Functions @details The goal of the MADlib sessionize function is to perform session reconstruction on a data set, so that it can be prepared for input into other algorithms such as path functions, -or predictive analytics algorithms. Sessions for instance, can be defined based on time, -as a sequence of events by a particular user where no more than n-seconds have elapsed between +or predictive analytics algorithms. Sessions for instance, can be defined based on time, +as a sequence of events by a particular user where no more than n-seconds have elapsed between successive events. That is, if we don't see an event from a user for n seconds, a new session is started. @@ -54,7 +54,9 @@ sessionize( output_table, partition_expr, time_stamp, - max_time + max_time, + output_cols, + create_view ) @@ -65,7 +67,7 @@ sessionize( analysis.
output_table
-
VARCHAR. Name of the result table, that contains 2 new columns apart from +
VARCHAR. Name of the result view/table, that contains 2 new columns apart from the columns in the source_table: session_id and new_session.
time_stamp
-
VARCHAR. Column name with time used for sessionization calculation.
+
VARCHAR. The time stamp column name that is used for sessionization calculation.
max_time
-
INTERVAL. Delta time between subsequent events to define a sessions. - User should make sure this is in the same unit as the time_stamp parameter - (e.g., 'hour:minute:seconds', when time_stamp is of type TIMESTAMP).
+
INTERVAL. Maximum delta time between subsequent events that fall under the same session.
+ +
output_cols
+
VARCHAR. An optional valid SELECT expression indicating the columns to be materialized in the output. + This is set to '*' by default, which includes all the columns in the input table and a new session id column.
+ +
create_view
+
BOOLEAN. If TRUE, the output is created as a view; if FALSE, the output is materialized into a table. The default + value is TRUE, thus creating a view by default.
@anchor examples @@ -94,9 +102,11 @@ sessionize( The data set describes shopper behavior on a notional web site that sells beer and wine. A beacon fires an event to a log file when the shopper visits different pages on the site: landing page, beer selection page, -wine selection page, and checkout. +wine selection page, and checkout. Each user is identified by a user id, +and every time a page is visited, its time stamp is also recorded. + +Create the data table: -Create the date table:
 DROP TABLE IF EXISTS eventlog;
 CREATE TABLE eventlog (event_timestamp TIMESTAMP,
@@ -140,18 +150,19 @@ INSERT INTO eventlog VALUES
 ('04/15/2015 02:19:00', 103711, 'WINE', 0);
 
--# Sessionize the table for each user_id: +Sessionize the table for each user_id:
  DROP TABLE IF EXISTS sessionize_output;
  SELECT madlib.sessionize(
-     'eventlog',                -- Name of input table
-     'sessionize_output',             -- Table name to store sessionized results
-     'user_id',              -- Partition input table by session
-     'event_timestamp ASC',     -- Order partitions in input table by time
-     '0:3:0'    -- Events within a window of this time unit (3 minutes) must be in the same session
+     'eventlog',            -- Name of input table
+     'sessionize_output',   -- Table name to store sessionized results
+     'user_id',             -- Partition input table by session
+     'event_timestamp',     -- Order partitions in input table by time
+     '0:3:0'                -- Events within a window of this time unit (3 minutes) must be in the same session
      );
 SELECT * FROM sessionize_output;
 
+ Result:
    event_timestamp   | user_id |   page   | revenue | session_id
@@ -190,59 +201,65 @@ Result:
  2015-04-15 02:17:00 |  103711 | BEER     |       0 |          1
  2015-04-15 02:18:00 |  103711 | LANDING  |       0 |          1
  2015-04-15 02:19:00 |  103711 | WINE     |       0 |          1
+(34 rows)
 
--# Sessionize the table based on a partition expression: +Sessionize the table based on a partition expression:
-DROP TABLE IF EXISTS sessionize_output;
-SELECT madlib.sessionize(
-     'eventlog',                -- Name of input table
-     'sessionize_output',             -- Table name to store sessionized results
-     'user_id < 101500',              -- Partition input table by session
-     'event_timestamp ASC',     -- Order partitions in input table by time
-     '180'    -- Events within a window of this time unit (180 seconds) must be in the same session
-              -- Note that this is the same as '0:3:0'
+ DROP VIEW IF EXISTS sessionize_output;
+ SELECT madlib.sessionize(
+     'eventlog',                    -- Name of input table
+     'sessionize_output',           -- Table name to store sessionized results
+     'user_id < 101500',            -- Partition input table by session
+     'event_timestamp',             -- Order partitions in input table by time
+     '180',                         -- Events within a window of this time unit (180 seconds) must be in the same session
+                                    -- Note that this is the same as '0:3:0'
+     'user_id, event_timestamp, user_id < 101500 AS "user_id < 101500"',    -- Select user_id, event_timestamp and the partition expression as output columns, along with the session id
+     'f'                            -- Materialize the results into a table
      );
-SELECT * FROM sessionize_output;
+SELECT eventlog.*, sessionize_output.session_id, sessionize_output."user_id < 101500" FROM sessionize_output INNER JOIN eventlog ON
+(eventlog.user_id=sessionize_output.user_id AND eventlog.event_timestamp=sessionize_output.event_timestamp) ORDER BY "user_id < 101500", session_id;
 
+ Result:
-   event_timestamp   | user_id |   page   | revenue | session_id
----------------------+---------+----------+---------+------------
- 2015-04-15 01:05:00 |  102201 | LANDING  |       0 |          1
- 2015-04-15 01:06:00 |  102201 | HELP     |       0 |          1
- 2015-04-15 01:09:00 |  102201 | LANDING  |       0 |          1
- 2015-04-15 02:15:00 |  102201 | WINE     |       0 |          2
- 2015-04-15 02:16:00 |  102201 | BEER     |       0 |          2
- 2015-04-15 02:17:00 |  103711 | BEER     |       0 |          2
- 2015-04-15 02:17:00 |  102201 | WINE     |       0 |          2
- 2015-04-15 02:18:00 |  103711 | LANDING  |       0 |          2
- 2015-04-15 02:18:00 |  102871 | BEER     |       0 |          2
- 2015-04-15 02:19:00 |  102871 | WINE     |       0 |          2
- 2015-04-15 02:19:00 |  103711 | WINE     |       0 |          2
- 2015-04-15 02:22:00 |  102871 | CHECKOUT |      21 |          2
- 2015-04-15 02:25:00 |  102871 | LANDING  |       0 |          2
- 2015-04-15 02:29:00 |  101881 | LANDING  |       0 |          3
- 2015-04-15 02:30:00 |  101881 | BEER     |       0 |          3
- 2015-04-15 01:03:00 |  100821 | LANDING  |       0 |          1
- 2015-04-15 01:04:00 |  100821 | WINE     |       0 |          1
- 2015-04-15 01:05:00 |  100821 | CHECKOUT |      39 |          1
- 2015-04-15 01:15:00 |  101121 | LANDING  |       0 |          2
- 2015-04-15 01:16:00 |  101121 | WINE     |       0 |          2
- 2015-04-15 01:17:00 |  101121 | CHECKOUT |      15 |          2
- 2015-04-15 01:18:00 |  101121 | LANDING  |       0 |          2
- 2015-04-15 01:19:00 |  101121 | HELP     |       0 |          2
- 2015-04-15 01:21:00 |  101121 | WINE     |       0 |          2
- 2015-04-15 01:22:00 |  101121 | CHECKOUT |      23 |          2
- 2015-04-15 02:06:00 |  100821 | WINE     |       0 |          3
- 2015-04-15 02:09:00 |  100821 | WINE     |       0 |          3
- 2015-04-15 02:15:00 |  101331 | LANDING  |       0 |          4
- 2015-04-15 02:16:00 |  101331 | WINE     |       0 |          4
- 2015-04-15 02:17:00 |  101331 | HELP     |       0 |          4
- 2015-04-15 02:18:00 |  101331 | WINE     |       0 |          4
- 2015-04-15 02:19:00 |  101331 | CHECKOUT |      16 |          4
- 2015-04-15 02:22:00 |  101443 | BEER     |       0 |          4
- 2015-04-15 02:25:00 |  101443 | CHECKOUT |      12 |          4
+   event_timestamp   | user_id |   page   | revenue | session_id | user_id < 101500
+---------------------+---------+----------+---------+------------+------------------
+ 2015-04-15 01:05:00 |  102201 | LANDING  |       0 |          1 | f
+ 2015-04-15 01:09:00 |  102201 | LANDING  |       0 |          1 | f
+ 2015-04-15 01:06:00 |  102201 | HELP     |       0 |          1 | f
+ 2015-04-15 02:19:00 |  103711 | WINE     |       0 |          2 | f
+ 2015-04-15 02:18:00 |  103711 | LANDING  |       0 |          2 | f
+ 2015-04-15 02:17:00 |  103711 | BEER     |       0 |          2 | f
+ 2015-04-15 02:25:00 |  102871 | LANDING  |       0 |          2 | f
+ 2015-04-15 02:22:00 |  102871 | CHECKOUT |      21 |          2 | f
+ 2015-04-15 02:19:00 |  102871 | WINE     |       0 |          2 | f
+ 2015-04-15 02:18:00 |  102871 | BEER     |       0 |          2 | f
+ 2015-04-15 02:17:00 |  102201 | WINE     |       0 |          2 | f
+ 2015-04-15 02:16:00 |  102201 | BEER     |       0 |          2 | f
+ 2015-04-15 02:15:00 |  102201 | WINE     |       0 |          2 | f
+ 2015-04-15 02:30:00 |  101881 | BEER     |       0 |          3 | f
+ 2015-04-15 02:29:00 |  101881 | LANDING  |       0 |          3 | f
+ 2015-04-15 01:03:00 |  100821 | LANDING  |       0 |          1 | t
+ 2015-04-15 01:04:00 |  100821 | WINE     |       0 |          1 | t
+ 2015-04-15 01:05:00 |  100821 | CHECKOUT |      39 |          1 | t
+ 2015-04-15 01:22:00 |  101121 | CHECKOUT |      23 |          2 | t
+ 2015-04-15 01:15:00 |  101121 | LANDING  |       0 |          2 | t
+ 2015-04-15 01:16:00 |  101121 | WINE     |       0 |          2 | t
+ 2015-04-15 01:17:00 |  101121 | CHECKOUT |      15 |          2 | t
+ 2015-04-15 01:18:00 |  101121 | LANDING  |       0 |          2 | t
+ 2015-04-15 01:19:00 |  101121 | HELP     |       0 |          2 | t
+ 2015-04-15 01:21:00 |  101121 | WINE     |       0 |          2 | t
+ 2015-04-15 02:06:00 |  100821 | WINE     |       0 |          3 | t
+ 2015-04-15 02:09:00 |  100821 | WINE     |       0 |          3 | t
+ 2015-04-15 02:16:00 |  101331 | WINE     |       0 |          4 | t
+ 2015-04-15 02:17:00 |  101331 | HELP     |       0 |          4 | t
+ 2015-04-15 02:18:00 |  101331 | WINE     |       0 |          4 | t
+ 2015-04-15 02:19:00 |  101331 | CHECKOUT |      16 |          4 | t
+ 2015-04-15 02:22:00 |  101443 | BEER     |       0 |          4 | t
+ 2015-04-15 02:25:00 |  101443 | CHECKOUT |      12 |          4 | t
+ 2015-04-15 02:15:00 |  101331 | LANDING  |       0 |          4 | t
+(34 rows)
 
@anchor literature @@ -259,12 +276,37 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( output_table VARCHAR, partition_expr VARCHAR, time_stamp VARCHAR, - max_time INTERVAL -) RETURNS TEXT AS $$ + max_time INTERVAL, + output_cols VARCHAR, + create_view BOOLEAN +) RETURNS void AS $$ PythonFunction(utilities, sessionize, sessionize) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( + source_table VARCHAR, + output_table VARCHAR, + partition_expr VARCHAR, + time_stamp VARCHAR, + max_time INTERVAL, + output_cols VARCHAR +) RETURNS void AS $$ + SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, $6, NULL); +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( + source_table VARCHAR, + output_table VARCHAR, + partition_expr VARCHAR, + time_stamp VARCHAR, + max_time INTERVAL +) RETURNS void AS $$ + SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, NULL, NULL); +$$ LANGUAGE SQL +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + ------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(message TEXT) RETURNS text AS $$ @@ -277,3 +319,4 @@ RETURNS text AS $$ SELECT MADLIB_SCHEMA.sessionize(''); $$ language SQL m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); + diff --git a/src/ports/postgres/modules/utilities/test/sessionize.sql_in b/src/ports/postgres/modules/utilities/test/sessionize.sql_in index e3c16ae9d..8e58d1b89 100644 --- a/src/ports/postgres/modules/utilities/test/sessionize.sql_in +++ b/src/ports/postgres/modules/utilities/test/sessionize.sql_in @@ -25,60 +25,62 @@ */ /* ----------------------------------------------------------------------- */ -CREATE TABLE eventlog (event_timestamp TIMESTAMP, - user_id INT, +CREATE TABLE eventlog_installchk (event_timestamp TIMESTAMP, + "user id" INT, 
original_session_id INT, page TEXT, revenue FLOAT, - row INT); -INSERT INTO eventlog VALUES -('04/15/2015 01:03:0.5', 100821, 1, 'LANDING', 0, 1), -('04/15/2015 01:05:00', 100821, 1, 'WINE', 0, 1), -('04/15/2015 01:07:00', 100821, 1, 'CHECKOUT', 39, 1), -('04/15/2015 02:06:00', 100821, 2, 'WINE', 0, 1), -('04/15/2015 02:07:00', 100821, 2, 'WINE', 0, 1), -('04/15/2015 01:15:00', 101121, 1, 'LANDING', 0, 1), -('04/15/2015 01:16:00', 101121, 1, 'WINE', 0, 1), -('04/15/2015 01:18:00', 101121, 1, 'CHECKOUT', 15, 1), -('04/15/2015 01:19:00', 101121, 1, 'LANDING', 0, 1), -('04/15/2015 01:21:00', 101121, 1, 'HELP', 0, 1), -(NULL, 101121, NULL, 'LANDING', 0, 1), -(NULL, 101121, NULL, 'HELP', 0, 1), -('04/15/2015 01:24:00', 101121, 1, 'WINE', 0, 1), -('04/15/2015 01:26:00', 101121, 1, 'CHECKOUT', 23, 1), -('04/15/2015 02:21:00', 101121, 2, 'HELP', 0, 1), -('04/15/2015 02:24:00', 101121, 2, 'WINE', 0, 1), -('04/15/2015 02:26:00', 101121, 2, 'CHECKOUT', 23, 1), -('04/15/2015 02:15:00', 101331, 1, 'LANDING', 0, 1), -('04/15/2015 02:16:0.56', 101331, 1, 'WINE', 0, 1), -('04/15/2015 02:18:00', 101331, 1, 'HELP', 0, 1), -('04/15/2015 02:20:00', 101331, 1, 'WINE', 0, 1), -('04/15/2015 02:21:00', 101331, 1, 'CHECKOUT', 16, 1), -('04/15/2015 02:22:00', 101443, 1, 'BEER', 0, 1), -('04/15/2015 02:27:00', 101443, 2, 'CHECKOUT', 12, 1), -('04/15/2015 02:29:00', 101881, 1, 'LANDING', 0, 1), -('04/15/2015 02:30:00', 101881, 1, 'BEER', 0, 1), -('04/15/2015 01:05:00', 102201, 1, 'LANDING', 0, 1), -('04/15/2015 01:06:00', 102201, 1, 'HELP', 0, 1), -('04/15/2015 01:10:00', 102201, 2, 'LANDING', 0, 1), -('04/15/2015 02:15:00', 102201, 3, 'WINE', 0, 1), -('04/15/2015 02:16:00', 102201, 3, 'BEER', 0, 1), -('04/15/2015 02:17:00', 102201, 3, 'WINE', 0, 1), -('04/15/2015 02:18:00', 102871, 1, 'BEER', 0, 1), -('04/15/2015 02:19:00', 102871, 1, 'WINE', 0, 1), -('04/15/2015 02:22:00', 102871, 1, 'CHECKOUT', 21, 1), -('04/15/2015 02:25:00', 102871, 1, 'LANDING', 0, 1), -(NULL, 103711, NULL, 'BEER', 0, 
1), -(NULL, 103711, NULL, 'LANDING', 0, 1), -(NULL, 103711, NULL, 'WINE', 0, 1), -('04/15/2016 02:17:00', 103711, 1, 'BEER', 0, 1), -('04/15/2016 02:21:00', 103711, 2, 'LANDING', 0, 1), -('04/15/2016 02:31:0.05', 103711, 3, 'WINE', 0, 1); + row INT, + part_expr BOOLEAN); +INSERT INTO eventlog_installchk VALUES +('04/15/2015 01:03:0.5', 100821, 1, 'LANDING', 0, 1, 'f'), +('04/15/2015 01:05:00', 100821, 1, 'WINE', 0, 1, 'f'), +('04/15/2015 01:07:00', 100821, 1, 'CHECKOUT', 39, 1, 'f'), +('04/15/2015 02:06:00', 100821, 2, 'WINE', 0, 1, 'f'), +('04/15/2015 02:07:00', 100821, 2, 'WINE', 0, 1, 'f'), +('04/15/2015 01:15:00', 101121, 1, 'LANDING', 0, 1, 'f'), +('04/15/2015 01:16:00', 101121, 1, 'WINE', 0, 1, 'f'), +('04/15/2015 01:18:00', 101121, 1, 'CHECKOUT', 15, 1, 'f'), +('04/15/2015 01:19:00', 101121, 1, 'LANDING', 0, 1, 'f'), +('04/15/2015 01:21:00', 101121, 1, 'HELP', 0, 1, 'f'), +(NULL, 101121, NULL, 'LANDING', 0, 1, 'f'), +(NULL, 101121, NULL, 'HELP', 0, 1, 'f'), +('04/15/2015 01:24:00', 101121, 1, 'WINE', 0, 1, 'f'), +('04/15/2015 01:26:00', 101121, 1, 'CHECKOUT', 23, 1, 'f'), +('04/15/2015 02:21:00', 101121, 2, 'HELP', 0, 1, 'f'), +('04/15/2015 02:24:00', 101121, 2, 'WINE', 0, 1, 'f'), +('04/15/2015 02:26:00', 101121, 2, 'CHECKOUT', 23, 1, 't'), +('04/15/2015 02:15:00', 101331, 1, 'LANDING', 0, 1, 't'), +('04/15/2015 02:16:0.56', 101331, 1, 'WINE', 0, 1, 't'), +('04/15/2015 02:18:00', 101331, 1, 'HELP', 0, 1, 't'), +('04/15/2015 02:20:00', 101331, 1, 'WINE', 0, 1, 't'), +('04/15/2015 02:21:00', 101331, 1, 'CHECKOUT', 16, 1, 't'), +('04/15/2015 02:22:00', 101443, 1, 'BEER', 0, 1, 't'), +('04/15/2015 02:27:00', 101443, 2, 'CHECKOUT', 12, 1, 't'), +('04/15/2015 02:29:00', 101881, 1, 'LANDING', 0, 1, 't'), +('04/15/2015 02:30:00', 101881, 1, 'BEER', 0, 1, 't'), +('04/15/2015 01:05:00', 102201, 1, 'LANDING', 0, 1, 't'), +('04/15/2015 01:06:00', 102201, 1, 'HELP', 0, 1, 't'), +('04/15/2015 01:10:00', 102201, 2, 'LANDING', 0, 1, 't'), +('04/15/2015 02:15:00', 102201, 
3, 'WINE', 0, 1, 't'), +('04/15/2015 02:16:00', 102201, 3, 'BEER', 0, 1, 't'), +('04/15/2015 02:17:00', 102201, 3, 'WINE', 0, 1, 't'), +('04/15/2015 02:18:00', 102871, 1, 'BEER', 0, 1, 't'), +('04/15/2015 02:19:00', 102871, 1, 'WINE', 0, 1, 't'), +('04/15/2015 02:22:00', 102871, 1, 'CHECKOUT', 21, 1, 't'), +('04/15/2015 02:25:00', 102871, 1, 'LANDING', 0, 1, 't'), +(NULL, 103711, NULL, 'BEER', 0, 1, 't'), +(NULL, 103711, NULL, 'LANDING', 0, 1, 't'), +(NULL, 103711, NULL, 'WINE', 0, 1, 't'), +('04/15/2016 02:17:00', 103711, 1, 'BEER', 0, 1, 't'), +('04/15/2016 02:21:00', 103711, 2, 'LANDING', 0, 1, 't'), +('04/15/2016 02:31:0.05', 103711, 3, 'WINE', 0, 1, 't'); + SELECT sessionize( - 'eventlog', -- Name of the input table - 'sessionize_output', -- Name of the output table - 'user_id', -- Partition expression to group the data + 'eventlog_installchk', -- Name of the input table + 'sessionize_output_v', -- Name of the output table + '"user id"', -- Partition expression to group the data 'event_timestamp', -- Order expression to sort the tuples of the data table '0:3:0' -- Max time that can elapse between consecutive rows to be considered part of the same session ); @@ -87,6 +89,23 @@ SELECT assert( relative_error(array_agg(CASE WHEN original_session_id NOTNULL THEN original_session_id ELSE 0 END), array_agg(CASE WHEN session_id NOTNULL THEN session_id ELSE 0 END)) < 1e-6, 'wrong output in sessionization') -FROM sessionize_output; +FROM sessionize_output_v; +SELECT * FROM sessionize_output_v; + +SELECT sessionize( + 'eventlog_installchk', -- Name of the input table + 'sessionize_output_t', -- Name of the output table + '"user id"<102000', -- Partition expression to group the data + 'event_timestamp', -- Order expression to sort the tuples of the data table + '180', -- Max time that can elapse between consecutive rows to be considered part of the same session + '*,"user id"<102000 AS uid,revenue>5 AS rid', -- Select all columns in the input table, along with the 
partition expression and session id columns + 'f' -- Materialize results into a table + ); + +SELECT + assert( + relative_error(array_agg(CASE WHEN eventlog_installchk.part_expr THEN 1 ELSE 0 END), array_agg(CASE WHEN sessionize_output_t.uid THEN 1 ELSE 0 END)) != 0, + 'wrong output in sessionization') +FROM sessionize_output_t, eventlog_installchk; +SELECT * FROM sessionize_output_t; -SELECT * FROM sessionize_output;