Skip to content

Commit

Permalink
Working on the date truncation
Browse files Browse the repository at this point in the history
  • Loading branch information
betodealmeida committed Oct 16, 2018
1 parent fa04132 commit 4cbcff2
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 12 deletions.
241 changes: 231 additions & 10 deletions gsheetsdb/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,52 @@
from __future__ import print_function
from __future__ import unicode_literals

import itertools
import warnings

from six import string_types


GRANULARITIES = [
'year',
'month',
'day',
'hour',
'minute',
'second',
]

LOWER_BOUNDS = [
None, # year
0, # month
1, # day
0, # hour
0, # minute
0, # second
]


class Any:
def __eq__(self, other):
return True


class OneOf:
def __init__(self, valid_values):
self.valid_values = valid_values

def __eq__(self, other):
return other in self.valid_values


class JSONMatcher:

def __init__(self, json=None):
self.json = json

def __getitem__(self, key):
return self.__class__(self.json[key])

def match(self, other):
raise NotImplementedError('Subclasses should implement `match`')

Expand Down Expand Up @@ -48,7 +84,7 @@ def is_subset(json, other):
for k, v in json.items():
# each value should be a subset of the value in other
for option in other:
if is_subset(v, option.get(k)):
if k in option and is_subset(v, option[k]):
break
else:
return False
Expand Down Expand Up @@ -77,6 +113,185 @@ def post_process(self, payload, aliases):
return payload


class DateTrunc(Processor):

"""
Implement `datetrunc` UDF.
sql> SELECT time, datetrunc('month', time) FROM "http://example.com"
time datetrunc-time-month
------------------- ----------------------
2018-09-01 00:00:00 2018-09-01 00:00:00
2018-09-02 00:00:00 2018-09-01 00:00:00
2018-09-03 00:00:00 2018-09-01 00:00:00
2018-09-04 00:00:00 2018-09-01 00:00:00
2018-09-05 00:00:00 2018-09-01 00:00:00
2018-09-06 00:00:00 2018-09-01 00:00:00
2018-09-07 00:00:00 2018-09-01 00:00:00
2018-09-08 00:00:00 2018-09-01 00:00:00
2018-09-09 00:00:00 2018-09-01 00:00:00
sql>
This works by calling multiple time functions that extract year/month/etc,
and padding the values below the requested granularity. The query above
would be translated to:
SELECT time, year(time), month(time)
The post-processor then build the new datetime by using the year and month,
and padding day/time to a lower boundary.
"""

pattern = SubsetMatcher(
{
'select': {
'value': {
'datetrunc': [{'literal': OneOf(GRANULARITIES)}, Any()],
},
},
},
)

def pre_process(self, parsed_query, column_map):
select = parsed_query['select']
if not isinstance(select, list):
select = [select]

# match select
new_select = []
matcher = self.pattern['select']
self.new_columns = []
for i, expr in enumerate(select):
if matcher.match(expr):
alias = expr.get('name')
self.new_columns.append((i, alias, expr))
new_select.extend(self.get_columns(expr))
else:
new_select.append(expr)

# remove duplicates
seen = set()
deduped_select = []
for expr in new_select:
value = expr['value']
if isinstance(value, dict):
value = tuple(value.items())
if value not in seen:
seen.add(value)
deduped_select.append(expr)

# remove columns from group by
groupby = parsed_query.get('groupby')
if groupby:
new_groupby = []
matcher = SubsetMatcher({'value': {
'datetrunc': [{'literal': OneOf(GRANULARITIES)}, Any()]},
})
if not isinstance(groupby, list):
groupby = [groupby]
for expr in groupby:
if matcher.match(expr):
new_groupby.extend(
self.get_columns(expr, alias_column=False))
else:
new_groupby.append(expr)

# remove duplicates
seen = set()
deduped_groupby = []
for expr in new_groupby:
value = expr['value']
if isinstance(value, dict):
value = tuple(value.items())
if value not in seen:
seen.add(value)
deduped_groupby.append(expr)

parsed_query['groupby'] = deduped_groupby

parsed_query['select'] = deduped_select
return parsed_query

def post_process(self, payload, aliases):
added_columns = [
alias and
alias.startswith('__{0}__'.format(self.__class__.__name__))
for alias in aliases
]

cols = payload['table']['cols']
payload['table']['cols'] = [
col for (col, added) in zip(cols, added_columns) if not added
]

for position, alias, expr in self.new_columns:
id_ = 'datetrunc-{name}-{granularity}'.format(
name=expr['value']['datetrunc'][1],
granularity=expr['value']['datetrunc'][0]['literal'],
)
payload['table']['cols'].insert(
position,
{'id': id_, 'label': alias or id_, 'type': 'datetime'})

for row in payload['table']['rows']:
row_c = row['c']
row['c'] = [
value for (value, added) in zip(row['c'], added_columns)
if not added
]
for position, alias, expr in self.new_columns:
row['c'].insert(
position, {'v': self.get_value(cols, row_c, expr)})

return payload

def get_value(self, cols, row_c, expr):
"""
Build the datetime from individual columns.
"""
name = expr['value']['datetrunc'][1]
granularity = expr['value']['datetrunc'][0]['literal']
i = GRANULARITIES.index(granularity)

# map function to index
labels = [col['label'] for col in cols]
values = []
for func_name in GRANULARITIES:
label = '{0}({1})'.format(func_name, name)
if label in labels:
values.append(row_c[labels.index(label)]['v'])

# truncate values to requested granularity and pad with lower bounds
args = values[:i+1]
args += LOWER_BOUNDS[len(args):]
args = [str(int(arg)) for arg in args]

return 'Date({0})'.format(','.join(args))

def get_columns(self, expr, alias_column=True):
"""
Get all columns required to compute a given granularity.
"""
name = expr['value']['datetrunc'][1]
granularity = expr['value']['datetrunc'][0]['literal']

for func_name in GRANULARITIES:
alias = '__{namespace}__{func_name}__{name}'.format(
namespace=self.__class__.__name__,
func_name=func_name,
name=name,
)
column = {'value': {func_name: name}}
if alias_column:
column['name'] = alias
yield column
if func_name == granularity:
break


class CountStar(Processor):

pattern = SubsetMatcher({'select': {'value': {'count': '*'}}})
Expand All @@ -90,9 +305,12 @@ def pre_process(self, parsed_query, column_map):
select = [select]

new_select = []
for expr in select:
if isinstance(expr, dict) and expr['value'] == {'count': '*'}:
self.alias = expr.get('name', 'count star')
matcher = self.pattern['select']
self.new_columns = []
for i, expr in enumerate(select):
if matcher.match(expr):
alias = expr.get('name', 'count star')
self.new_columns.append((i, alias, expr))
else:
new_select.append(expr)

Expand All @@ -111,25 +329,28 @@ def post_process(self, payload, aliases):
alias.startswith('__{0}__'.format(self.__class__.__name__))
for alias in aliases
]

payload['table']['cols'] = [
col for (col, added)
in zip(payload['table']['cols'], added_columns)
if not added
]
payload['table']['cols'].append(
{'id': 'count-star', 'label': self.alias, 'type': 'number'})

position, alias, expr = self.new_columns[0]
payload['table']['cols'].insert(
position, {'id': 'count-star', 'label': alias, 'type': 'number'})

for row in payload['table']['rows']:
values = [
value for (value, added) in zip(row['c'], added_columns)
value['v'] for (value, added) in zip(row['c'], added_columns)
if added
]
count_star = max(col['v'] for col in values)
count_star = max(values)
row['c'] = [
value for (value, added) in zip(row['c'], added_columns)
if not added
]
row['c'].append({'v': count_star})
row['c'].insert(position, {'v': count_star})

# the API returns no rows when the count is zero
if not payload['table']['rows']:
Expand All @@ -138,4 +359,4 @@ def post_process(self, payload, aliases):
return payload


processors = [CountStar]
processors = [CountStar, DateTrunc]
9 changes: 8 additions & 1 deletion tests/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@
check_result,
)
from gsheetsdb.dialect import add_headers, GSheetsDialect
from gsheetsdb.processors import CountStar, is_subset, Processor, SubsetMatcher
from gsheetsdb.processors import (
Any,
CountStar,
DateTrunc,
is_subset,
Processor,
SubsetMatcher,
)
from gsheetsdb.query import (
execute,
get_column_map,
Expand Down
23 changes: 22 additions & 1 deletion tests/test_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

from moz_sql_parser import parse

from .context import CountStar, is_subset, Processor, SubsetMatcher
from .context import (
Any,
CountStar,
DateTrunc,
is_subset,
Processor,
SubsetMatcher,
)


class ProcessingTestSuite(unittest.TestCase):
Expand Down Expand Up @@ -215,3 +222,17 @@ def test_is_subset(self):

other = [1, 3, 4]
self.assertFalse(is_subset(json, other))

def test_any_match(self):
pattern = SubsetMatcher({'name': Any()})
self.assertTrue(pattern.match({'name': 'Alice'}))
self.assertTrue(pattern.match({'name': 'Bob'}))

def test_datetrunc_match(self):
self.assertTrue(DateTrunc.match(
{'select': {'value': {'datetrunc': [{'literal': 'month'}, 'datetime']}}}))
self.assertFalse(DateTrunc.match(
{'select': {'value': {'datetrunc': [{'literal': 'week'}, 'datetime']}}}))
self.assertFalse(DateTrunc.match({'select': {'value': 'datetime'}}))
self.assertFalse(DateTrunc.match({'select': {'value': {'year': 'datetime'}}}))
self.assertFalse(DateTrunc.match({'select': {'value': {'month': 'datetime'}}}))

0 comments on commit 4cbcff2

Please sign in to comment.