Working on the date truncation

betodealmeida · Oct 16, 2018 · 4cbcff2 · 4cbcff2
1 parent fa04132
commit 4cbcff2
Show file tree

Hide file tree

Showing 3 changed files with 261 additions and 12 deletions.
diff --git a/gsheetsdb/processors.py b/gsheetsdb/processors.py
@@ -3,16 +3,52 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
+import itertools
 import warnings
 
 from six import string_types
 
 
+GRANULARITIES = [
+    'year',
+    'month',
+    'day',
+    'hour',
+    'minute',
+    'second',
+]
+
+LOWER_BOUNDS = [
+    None,  # year
+    0,     # month
+    1,     # day
+    0,     # hour
+    0,     # minute
+    0,     # second
+]
+
+
+class Any:
+    def __eq__(self, other):
+        return True
+
+
+class OneOf:
+    def __init__(self, valid_values):
+        self.valid_values = valid_values
+
+    def __eq__(self, other):
+        return other in self.valid_values
+
+
 class JSONMatcher:
 
     def __init__(self, json=None):
         self.json = json
 
+    def __getitem__(self, key):
+        return self.__class__(self.json[key])
+
     def match(self, other):
         raise NotImplementedError('Subclasses should implement `match`')
 
@@ -48,7 +84,7 @@ def is_subset(json, other):
         for k, v in json.items():
             # each value should be a subset of the value in other
             for option in other:
-                if is_subset(v, option.get(k)):
+                if k in option and is_subset(v, option[k]):
                     break
             else:
                 return False
@@ -77,6 +113,185 @@ def post_process(self, payload, aliases):
         return payload
 
 
+class DateTrunc(Processor):
+
+    """
+    Implement `datetrunc` UDF.
+
+        sql> SELECT time, datetrunc('month', time) FROM "http://example.com"
+        time                 datetrunc-time-month
+        -------------------  ----------------------
+        2018-09-01 00:00:00  2018-09-01 00:00:00
+        2018-09-02 00:00:00  2018-09-01 00:00:00
+        2018-09-03 00:00:00  2018-09-01 00:00:00
+        2018-09-04 00:00:00  2018-09-01 00:00:00
+        2018-09-05 00:00:00  2018-09-01 00:00:00
+        2018-09-06 00:00:00  2018-09-01 00:00:00
+        2018-09-07 00:00:00  2018-09-01 00:00:00
+        2018-09-08 00:00:00  2018-09-01 00:00:00
+        2018-09-09 00:00:00  2018-09-01 00:00:00
+        sql>
+
+    This works by calling multiple time functions that extract year/month/etc,
+    and padding the values below the requested granularity. The query above
+    would be translated to:
+
+        SELECT time, year(time), month(time)
+
+    The post-processor then build the new datetime by using the year and month,
+    and padding day/time to a lower boundary.
+
+    """
+
+    pattern = SubsetMatcher(
+        {
+            'select': {
+                'value': {
+                    'datetrunc': [{'literal': OneOf(GRANULARITIES)}, Any()],
+                },
+            },
+        },
+    )
+
+    def pre_process(self, parsed_query, column_map):
+        select = parsed_query['select']
+        if not isinstance(select, list):
+            select = [select]
+
+        # match select
+        new_select = []
+        matcher = self.pattern['select']
+        self.new_columns = []
+        for i, expr in enumerate(select):
+            if matcher.match(expr):
+                alias = expr.get('name')
+                self.new_columns.append((i, alias, expr))
+                new_select.extend(self.get_columns(expr))
+            else:
+                new_select.append(expr)
+
+        # remove duplicates
+        seen = set()
+        deduped_select = []
+        for expr in new_select:
+            value = expr['value']
+            if isinstance(value, dict):
+                value = tuple(value.items())
+            if value not in seen:
+                seen.add(value)
+                deduped_select.append(expr)
+
+        # remove columns from group by
+        groupby = parsed_query.get('groupby')
+        if groupby:
+            new_groupby = []
+            matcher = SubsetMatcher({'value': {
+                'datetrunc': [{'literal': OneOf(GRANULARITIES)}, Any()]},
+            })
+            if not isinstance(groupby, list):
+                groupby = [groupby]
+            for expr in groupby:
+                if matcher.match(expr):
+                    new_groupby.extend(
+                        self.get_columns(expr, alias_column=False))
+                else:
+                    new_groupby.append(expr)
+
+            # remove duplicates
+            seen = set()
+            deduped_groupby = []
+            for expr in new_groupby:
+                value = expr['value']
+                if isinstance(value, dict):
+                    value = tuple(value.items())
+                if value not in seen:
+                    seen.add(value)
+                    deduped_groupby.append(expr)
+
+            parsed_query['groupby'] = deduped_groupby
+
+        parsed_query['select'] = deduped_select
+        return parsed_query
+
+    def post_process(self, payload, aliases):
+        added_columns = [
+            alias and
+            alias.startswith('__{0}__'.format(self.__class__.__name__))
+            for alias in aliases
+        ]
+
+        cols = payload['table']['cols']
+        payload['table']['cols'] = [
+            col for (col, added) in zip(cols, added_columns) if not added
+        ]
+
+        for position, alias, expr in self.new_columns:
+            id_ = 'datetrunc-{name}-{granularity}'.format(
+                name=expr['value']['datetrunc'][1],
+                granularity=expr['value']['datetrunc'][0]['literal'],
+            )
+            payload['table']['cols'].insert(
+                position,
+                {'id': id_, 'label': alias or id_, 'type': 'datetime'})
+
+        for row in payload['table']['rows']:
+            row_c = row['c']
+            row['c'] = [
+                value for (value, added) in zip(row['c'], added_columns)
+                if not added
+            ]
+            for position, alias, expr in self.new_columns:
+                row['c'].insert(
+                    position, {'v': self.get_value(cols, row_c, expr)})
+
+        return payload
+
+    def get_value(self, cols, row_c, expr):
+        """
+        Build the datetime from individual columns.
+
+        """
+        name = expr['value']['datetrunc'][1]
+        granularity = expr['value']['datetrunc'][0]['literal']
+        i = GRANULARITIES.index(granularity)
+
+        # map function to index
+        labels = [col['label'] for col in cols]
+        values = []
+        for func_name in GRANULARITIES:
+            label = '{0}({1})'.format(func_name, name)
+            if label in labels:
+                values.append(row_c[labels.index(label)]['v'])
+
+        # truncate values to requested granularity and pad with lower bounds
+        args = values[:i+1]
+        args += LOWER_BOUNDS[len(args):]
+        args = [str(int(arg)) for arg in args]
+
+        return 'Date({0})'.format(','.join(args))
+
+    def get_columns(self, expr, alias_column=True):
+        """
+        Get all columns required to compute a given granularity.
+
+        """
+        name = expr['value']['datetrunc'][1]
+        granularity = expr['value']['datetrunc'][0]['literal']
+
+        for func_name in GRANULARITIES:
+            alias = '__{namespace}__{func_name}__{name}'.format(
+                namespace=self.__class__.__name__,
+                func_name=func_name,
+                name=name,
+            )
+            column = {'value': {func_name: name}}
+            if alias_column:
+                column['name'] = alias
+            yield column
+            if func_name == granularity:
+                break
+
+
 class CountStar(Processor):
 
     pattern = SubsetMatcher({'select': {'value': {'count': '*'}}})
@@ -90,9 +305,12 @@ def pre_process(self, parsed_query, column_map):
             select = [select]
 
         new_select = []
-        for expr in select:
-            if isinstance(expr, dict) and expr['value'] == {'count': '*'}:
-                self.alias = expr.get('name', 'count star')
+        matcher = self.pattern['select']
+        self.new_columns = []
+        for i, expr in enumerate(select):
+            if matcher.match(expr):
+                alias = expr.get('name', 'count star')
+                self.new_columns.append((i, alias, expr))
             else:
                 new_select.append(expr)
 
@@ -111,25 +329,28 @@ def post_process(self, payload, aliases):
             alias.startswith('__{0}__'.format(self.__class__.__name__))
             for alias in aliases
         ]
+
         payload['table']['cols'] = [
             col for (col, added)
             in zip(payload['table']['cols'], added_columns)
             if not added
         ]
-        payload['table']['cols'].append(
-            {'id': 'count-star', 'label': self.alias, 'type': 'number'})
+
+        position, alias, expr = self.new_columns[0]
+        payload['table']['cols'].insert(
+            position, {'id': 'count-star', 'label': alias, 'type': 'number'})
 
         for row in payload['table']['rows']:
             values = [
-                value for (value, added) in zip(row['c'], added_columns)
+                value['v'] for (value, added) in zip(row['c'], added_columns)
                 if added
             ]
-            count_star = max(col['v'] for col in values)
+            count_star = max(values)
             row['c'] = [
                 value for (value, added) in zip(row['c'], added_columns)
                 if not added
             ]
-            row['c'].append({'v': count_star})
+            row['c'].insert(position, {'v': count_star})
 
         # the API returns no rows when the count is zero
         if not payload['table']['rows']:
@@ -138,4 +359,4 @@ def post_process(self, payload, aliases):
         return payload
 
 
-processors = [CountStar]
+processors = [CountStar, DateTrunc]
diff --git a/tests/context.py b/tests/context.py
@@ -16,7 +16,14 @@
     check_result,
 )
 from gsheetsdb.dialect import add_headers, GSheetsDialect
-from gsheetsdb.processors import CountStar, is_subset, Processor, SubsetMatcher
+from gsheetsdb.processors import (
+    Any,
+    CountStar,
+    DateTrunc,
+    is_subset,
+    Processor,
+    SubsetMatcher,
+)
 from gsheetsdb.query import (
     execute,
     get_column_map,

diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -6,7 +6,14 @@
 
 from moz_sql_parser import parse
 
-from .context import CountStar, is_subset, Processor, SubsetMatcher
+from .context import (
+    Any,
+    CountStar,
+    DateTrunc,
+    is_subset,
+    Processor,
+    SubsetMatcher,
+)
 
 
 class ProcessingTestSuite(unittest.TestCase):
@@ -215,3 +222,17 @@ def test_is_subset(self):
 
         other = [1, 3, 4]
         self.assertFalse(is_subset(json, other))
+
+    def test_any_match(self):
+        pattern = SubsetMatcher({'name': Any()})
+        self.assertTrue(pattern.match({'name': 'Alice'}))
+        self.assertTrue(pattern.match({'name': 'Bob'}))
+
+    def test_datetrunc_match(self):
+        self.assertTrue(DateTrunc.match(
+            {'select': {'value': {'datetrunc': [{'literal': 'month'}, 'datetime']}}}))
+        self.assertFalse(DateTrunc.match(
+            {'select': {'value': {'datetrunc': [{'literal': 'week'}, 'datetime']}}}))
+        self.assertFalse(DateTrunc.match({'select': {'value': 'datetime'}}))
+        self.assertFalse(DateTrunc.match({'select': {'value': {'year': 'datetime'}}}))
+        self.assertFalse(DateTrunc.match({'select': {'value': {'month': 'datetime'}}}))