Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Pivot Function #47

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 7 additions & 4 deletions doc/mainpage.dox.in
@@ -1,12 +1,12 @@
/**
@mainpage
Apache MADlib (incubating) is an open-source library for scalable
in-database analytics. It provides data-parallel implementations of
mathematical, statistical and machine learning methods for structured
Apache MADlib (incubating) is an open-source library for scalable
in-database analytics. It provides data-parallel implementations of
mathematical, statistical and machine learning methods for structured
and unstructured data.

The MADlib mission: to foster widespread development of scalable analytic
skills, by harnessing efforts from commercial practice, academic research,
skills, by harnessing efforts from commercial practice, academic research,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for removing the trailing whitespace

and open-source development.

Useful links:
Expand Down Expand Up @@ -113,6 +113,9 @@ complete matrix stored as a distributed table.
@defgroup grp_data_prep Encoding Categorical Variables
@ingroup grp_datatrans

@defgroup grp_pivot Pivoting
@ingroup grp_datatrans

@defgroup grp_stemmer Stemming
@ingroup grp_datatrans

Expand Down
@@ -1,5 +1,5 @@
"""
@file dummy_coding.py_in
@file data_preparation.py_in

@brief Marginal Effects with Interactions: Contains the main interface function
and other functions that are common to the various methods and related to
Expand Down
219 changes: 219 additions & 0 deletions src/ports/postgres/modules/utilities/pivot.py_in
@@ -0,0 +1,219 @@
# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Pivoting
# The goal of the MADlib pivot function is to provide a data summarization tool
# that can do basic OLAP type operations on data stored in one table and output
# the summarized data to a second table. Typical operations are count, average,
# min, max and standard deviation, however user defined aggregates (UDAs) are
# also be allowed.

# Please refer to the pivot.sql_in file for the documentation

"""
@file pivot.py_in

"""
import plpy
from control import MinWarning
from utilities import _assert
from utilities import split_quoted_delimited_str
from utilities import strip_end_quotes
from validate_args import table_exists
from validate_args import columns_exist_in_table
from validate_args import table_is_empty
from validate_args import _get_table_schema_names
from validate_args import get_first_schema

m4_changequote(`<!', `!>')

def pivot(schema_madlib, source_table, out_table,
index, pivot_cols, pivot_values,
aggregate_func, **kwargs):
"""
Helper function that can be used to pivot tables
Args:
@param source_table The original data table
@param out_table The output table that contains the dummy
variable columns
@param index The index columns to group by the records by
@param pivot_cols The columns to pivot the table
@param pivot_values The value columns to be summarized in the
pivoted table
@param aggregate_func The aggregate function to be applied to the
values
"""

"""
Assume we have the following table
pivset( id INTEGER, piv FLOAT8, val FLOAT8 )
where the piv column has 3 distinct values (10.0, 20.0 and 30.0).
If the pivot function call is :
SELECT madlib.pivot('pivset', 'pivout', 'id', 'piv', 'val');
We want to construct the following sql code to pivot the table.
CREATE TABLE pivout AS (SELECT id,
sum(CASE WHEN "piv" = '10.0' THEN val ELSE NULL END ) as "piv_10.0",
sum(CASE WHEN "piv" = '20.0' THEN val ELSE NULL END ) as "piv_20.0",
sum(CASE WHEN "piv" = '30.0' THEN val ELSE NULL END ) as "piv_30.0"
FROM pivset GROUP BY id ORDER BY id)

"""
indices = split_quoted_delimited_str(index)
pcol = split_quoted_delimited_str(pivot_cols)
pval = split_quoted_delimited_str(pivot_values)
validate_pivot_coding(source_table, out_table, indices, pcol, pval)
new_col_names =[]
sql_list = ["CREATE TABLE " + out_table + " AS (SELECT " + index]

pcol_no_quotes = strip_end_quotes(pcol[0].strip())
pval_no_quotes = strip_end_quotes(pval[0].strip())

# Find the distinct values of pivot_cols
distinct_values = plpy.execute(
"SELECT array_agg(DISTINCT {pcol} ORDER BY {pcol}) AS value "
"FROM {source_table}".
format(pcol=pcol[0], source_table=source_table))

distinct_values = [strip_end_quotes(item)
for item in distinct_values[0]['value']]
# The aggregate collects pivot_values values for a given pivot_cols value
case_str = ("{agg}("
"CASE WHEN \"{{pcol}}\" = '{{value}}' THEN {pval} ELSE NULL END"
")".
format(agg=aggregate_func,
pval=pval_no_quotes))
sql_list.append(
", " +
# Assign the name of the new column
', '.join("{case_str} as \"{{pcol}}_{{value}}\"".
format(case_str=case_str).
format(pcol=pcol_no_quotes, value=str(value))
for value in distinct_values if value is not None))
sql_list.append(" FROM " + source_table +
" GROUP BY " + index + " ORDER BY " + index + ") ")
try:
plpy.execute(''.join(sql_list))
except plpy.SPIError:
# Warn user if the number of columns is over 1000 for the output table
with MinWarning("warning"):
if (len(distinct_values) + len(indices)) > 1000:
plpy.warning("Too many distinct values for pivoting! The "
"execution may fail due to too many columns in the output "
"table.")

raise
return None
#------------------------------------------------------------------------------

def validate_pivot_coding(source_table, out_table, indices, pivs, vals):
"""
Args:
@param source_table The original data table
@param out_table The output table that will contain dummy columns
@param indices An array of index column names
@param cols An array of categorical column names
"""
_assert(out_table and
out_table.strip().lower() not in ('null', ''),
"Invalid output table name!")
_assert(not table_exists(out_table),
"Output table already exists!")
_assert(source_table and source_table.strip().lower() not in ('null', ''),
"Invalid data table name!")
_assert(table_exists(source_table),
"Data table ({0}) is missing!". format(source_table))
_assert(not table_is_empty(source_table),
"Data table ({0}) is empty!". format(source_table))

_assert(indices and indices not in ('null', ''),
"Invalid index column!")
_assert(pivs and pivs not in ('null', ''),
"Invalid pivot column!")
_assert(vals and vals not in ('null', ''),
"Invalid value column!")

_assert(len(indices) is 1, "Multiple index columns are not supported!")
_assert(len(pivs) is 1, "Multiple pivot columns are not supported!")
_assert(len(vals) is 1, "Multiple value columns are not supported!")

_assert(columns_exist_in_table(source_table, indices),
"Not all columns from {0} present in source table ({1})"
.format(indices, source_table))
_assert(columns_exist_in_table(source_table, pivs),
"Not all columns from {0} present in source table ({1})"
.format(pivs, source_table))
_assert(columns_exist_in_table(source_table, vals),
"Not all columns from {0} present in source table ({1})"
.format(vals, source_table))
#------------------------------------------------------------------------------

def pivot_help(schema_madlib, message, **kwargs):
"""
Help function for pivot

Args:
@param schema_madlib
@param message: string, Help message string
@param kwargs

Returns:
String. Help/usage information
"""
if not message:
help_string = """
-----------------------------------------------------------------------
SUMMARY
-----------------------------------------------------------------------
Provide a data summarization tool that can do basic OLAP type operations on
data stored in one table and output the summarized data to a second table.
Typical operations are count, average, min, max and standard deviation, however
user defined aggregates (UDAs) are also be allowed.

For more details on function usage:
SELECT {schema_madlib}.pivot('usage')
"""
elif message in ['usage', 'help', '?']:
help_string = """
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.pivot(
source_table, -- Name of source table containing data for pivoting
out_table, -- Name of output table taht contains pivoted data
index, -- Comma-separated columns that will form the index of
-- the output pivot table
pivot_cols, -- Comma-separated columns that will form the columns
-- of the output pivot table
pivot_values -- Comma-separated columns that contain the values to
-- be summarized in the output pivot table
);

-----------------------------------------------------------------------
OUTPUT
-----------------------------------------------------------------------
The output table ('output_table' above) has all the columns present in index
column list, plus additional columns for each distinct value in pivot_cols.
The column name for the pivot is
set as '<pivot name>_<pivot value>'.
"""
else:
help_string = "No such option. Use {schema_madlib}.pivot()"

return help_string.format(schema_madlib=schema_madlib)
# ---------------------------------------------------------------------