Skip to content

Commit

Permalink
Merge pull request #251 from andialbrecht/filters_sql
Browse files Browse the repository at this point in the history
Update Filters sql
  • Loading branch information
vmuriart committed Jun 6, 2016
2 parents c6a5e7a + 5747015 commit b9d81ac
Show file tree
Hide file tree
Showing 14 changed files with 249 additions and 1,024 deletions.
7 changes: 0 additions & 7 deletions sqlparse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,4 @@ def split(sql, encoding=None):
:returns: A list of strings.
"""
stack = engine.FilterStack()
stack.split_statements = True
return [u(stmt).strip() for stmt in stack.run(sql, encoding)]


def split2(stream):
from sqlparse.engine.filter import StatementFilter
splitter = StatementFilter()
return list(splitter.process(None, stream))
10 changes: 10 additions & 0 deletions sqlparse/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def u(s, encoding=None):
return str(s)


def unicode_compatible(cls):
    """Identity class decorator.

    On this branch (where ``text_type`` is ``str``) no wrapping is
    needed, so the class is returned untouched.
    """
    return cls


text_type = str
string_types = (str,)
from io import StringIO
Expand All @@ -39,6 +43,12 @@ def u(s, encoding=None):
return unicode(s, encoding)


def unicode_compatible(cls):
    """Class decorator for text-returning ``__str__`` implementations.

    The class's original ``__str__`` (expected to return text) is kept
    as ``__unicode__``, and ``__str__`` is replaced by a wrapper that
    UTF-8-encodes that text.  The decorated class is returned.
    """
    cls.__unicode__ = cls.__str__

    def _encoded_str(self):
        return self.__unicode__().encode('utf-8')

    cls.__str__ = _encoded_str
    return cls


text_type = unicode
string_types = (basestring,)
from StringIO import StringIO
48 changes: 12 additions & 36 deletions sqlparse/engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@


class FilterStack(object):

def __init__(self):
self.preprocess = []
self.stmtprocess = []
self.postprocess = []
self.split_statements = False
self._grouping = False

def enable_grouping(self):
Expand All @@ -27,42 +25,20 @@ def enable_grouping(self):
def run(self, sql, encoding=None):
stream = lexer.tokenize(sql, encoding)
# Process token stream
if self.preprocess:
for filter_ in self.preprocess:
stream = filter_.process(self, stream)

if (self.stmtprocess or self.postprocess or
self.split_statements or self._grouping):
splitter = StatementFilter()
stream = splitter.process(self, stream)

if self._grouping:

def _group(stream):
for stmt in stream:
grouping.group(stmt)
yield stmt
stream = _group(stream)
for filter_ in self.preprocess:
stream = filter_.process(stream)

if self.stmtprocess:
stream = StatementFilter().process(stream)

def _run1(stream):
ret = []
for stmt in stream:
for filter_ in self.stmtprocess:
filter_.process(self, stmt)
ret.append(stmt)
return ret
stream = _run1(stream)
# Output: Stream processed Statements
for stmt in stream:
if self._grouping:
stmt = grouping.group(stmt)

if self.postprocess:
for filter_ in self.stmtprocess:
filter_.process(stmt)

def _run2(stream):
for stmt in stream:
stmt.tokens = list(stmt.flatten())
for filter_ in self.postprocess:
stmt = filter_.process(self, stmt)
yield stmt
stream = _run2(stream)
for filter_ in self.postprocess:
stmt = filter_.process(stmt)

return stream
yield stmt
100 changes: 53 additions & 47 deletions sqlparse/engine/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,113 +5,119 @@
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php

from sqlparse.sql import Statement, Token
from sqlparse import tokens as T
from sqlparse import sql, tokens as T


class StatementFilter(object):
"Filter that split stream at individual statements"
"""Filter that split stream at individual statements"""

def __init__(self):
self._in_declare = False
self._in_dbldollar = False
self._is_create = False
self._begin_depth = 0
self._reset()

def _reset(self):
"Set the filter attributes to its default values"
"""Set the filter attributes to its default values"""
self._in_declare = False
self._in_dbldollar = False
self._is_create = False
self._begin_depth = 0

self.consume_ws = False
self.tokens = []
self.level = 0

def _change_splitlevel(self, ttype, value):
"Get the new split level (increase, decrease or remain equal)"
"""Get the new split level (increase, decrease or remain equal)"""
# PostgreSQL
if ttype == T.Name.Builtin \
and value.startswith('$') and value.endswith('$'):
if ttype == T.Name.Builtin and value[0] == '$' and value[-1] == '$':

# 2nd dbldollar found. $quote$ completed
# decrease level
if self._in_dbldollar:
self._in_dbldollar = False
return -1
else:
self._in_dbldollar = True
return 1

# if inside $$ everything inside is defining function character.
# Nothing inside can create a new statement
elif self._in_dbldollar:
return 0

# ANSI
# if normal token return
# wouldn't parenthesis increase/decrease a level?
# no, inside a parenthesis a new statement can't start
if ttype not in T.Keyword:
return 0

# Everything after here is ttype = T.Keyword
# Note: once one of the branches below matches, we are done and the
# function returns
unified = value.upper()

# three keywords begin with CREATE, but only one of them is DDL
# DDL Create though can contain more words such as "or replace"
if ttype is T.Keyword.DDL and unified.startswith('CREATE'):
self._is_create = True
return 0

# can have nested declare inside of being...
if unified == 'DECLARE' and self._is_create and self._begin_depth == 0:
self._in_declare = True
return 1

if unified == 'BEGIN':
self._begin_depth += 1
if self._in_declare or self._is_create:
if self._is_create:
# FIXME(andi): This makes no sense.
return 1
return 0

if unified in ('END IF', 'END FOR', 'END WHILE'):
return -1

# Should this respect a preceding BEGIN?
# In CASE ... WHEN ... END this results in a split level -1.
# Would having multiple CASE WHEN END and an Assignment Operator
# cause the statement to cut off prematurely?
if unified == 'END':
# Should this respect a preceding BEGIN?
# In CASE ... WHEN ... END this results in a split level -1.
self._begin_depth = max(0, self._begin_depth - 1)
return -1

if ttype is T.Keyword.DDL and unified.startswith('CREATE'):
self._is_create = True
return 0

if unified in ('IF', 'FOR', 'WHILE') \
and self._is_create and self._begin_depth > 0:
if (unified in ('IF', 'FOR', 'WHILE') and
self._is_create and self._begin_depth > 0):
return 1

if unified in ('END IF', 'END FOR', 'END WHILE'):
return -1

# Default
return 0

def process(self, stack, stream):
"Process the stream"
consume_ws = False
splitlevel = 0
stmt = None
stmt_tokens = []
def process(self, stream):
"""Process the stream"""
EOS_TTYPE = T.Whitespace, T.Comment.Single

# Run over all stream tokens
for ttype, value in stream:
# Yield token if we finished a statement and there's no whitespaces
if consume_ws and ttype not in (T.Whitespace, T.Comment.Single):
stmt.tokens = stmt_tokens
yield stmt
# It will count newline token as a non whitespace. In this context
# whitespace ignores newlines.
# why don't multi line comments also count?
if self.consume_ws and ttype not in EOS_TTYPE:
yield sql.Statement(self.tokens)

# Reset filter and prepare to process next statement
self._reset()
consume_ws = False
splitlevel = 0
stmt = None

# Create a new statement if we are not currently in one of them
if stmt is None:
stmt = Statement()
stmt_tokens = []

# Change current split level (increase, decrease or remain equal)
splitlevel += self._change_splitlevel(ttype, value)
self.level += self._change_splitlevel(ttype, value)

# Append the token to the current statement
stmt_tokens.append(Token(ttype, value))
self.tokens.append(sql.Token(ttype, value))

# Check if we get the end of a statement
if splitlevel <= 0 and ttype is T.Punctuation and value == ';':
consume_ws = True
if self.level <= 0 and ttype is T.Punctuation and value == ';':
self.consume_ws = True

# Yield pending statement (if any)
if stmt is not None:
stmt.tokens = stmt_tokens
yield stmt
if self.tokens:
yield sql.Statement(self.tokens)
5 changes: 3 additions & 2 deletions sqlparse/engine/grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def align_comments(tlist):
token = tlist.token_next_by(i=sql.Comment, idx=token)


def group(tlist):
def group(stmt):
for func in [
group_comments,
group_brackets,
Expand All @@ -291,4 +291,5 @@ def group(tlist):
group_foreach,
group_begin,
]:
func(tlist)
func(stmt)
return stmt

0 comments on commit b9d81ac

Please sign in to comment.