bakwc · bakwc · Nov 24, 2025 · Nov 20, 2025 · Nov 24, 2025
diff --git a/mysql_ch_replicator/converter.py b/mysql_ch_replicator/converter.py
@@ -278,23 +278,67 @@ def strip_sql_name(name):
     return name
 
 
-def split_high_level(data, token):
-    results = []
-    level = 0
-    curr_data = ''
-    for c in data:
-        if c == token and level == 0:
-            results.append(curr_data.strip())
-            curr_data = ''
-            continue
-        if c == '(':
-            level += 1
-        if c == ')':
-            level -= 1
-        curr_data += c
-    if curr_data:
-        results.append(curr_data.strip())
-    return results
+def split_high_level(data, delimiter):
+    """
+    Split a string by a delimiter, ignoring delimiters inside parentheses or quotes.
+
+    This function performs a context-aware split, respecting nested structures:
+    - Delimiters inside parentheses () are ignored, at any nesting depth
+    - Delimiters inside single quotes '' are ignored
+    - A quote preceded by an odd number of backslashes is escaped and ignored;
+      after an even number the backslashes escape each other and it counts
+    - Segments that are empty after stripping are dropped
+
+    Args:
+        data (str): The string to split
+        delimiter (str): The character to split on (typically ',' or ';')
+
+    Returns:
+        list[str]: List of split segments with whitespace stripped
+
+    Examples:
+        >>> split_high_level("a,b(c,d),e", ",")
+        ['a', 'b(c,d)', 'e']
+
+        >>> split_high_level("name varchar(100) DEFAULT 'a,b',id int", ",")
+        ["name varchar(100) DEFAULT 'a,b'", 'id int']
+    """
+    if not data:
+        return []
+
+    segments = []
+    current_segment = []
+    paren_depth = 0
+    in_quotes = False
+    backslash_run = 0  # consecutive backslashes directly before the current char
+
+    for char in data:
+        # Only an odd-length backslash run escapes the character that follows it
+        is_escaped = backslash_run % 2 == 1
+        backslash_run = backslash_run + 1 if char == '\\' else 0
+        if char == "'" and not is_escaped:
+            in_quotes = not in_quotes
+        elif not in_quotes:
+            # Track parentheses depth only outside quotes
+            if char == '(':
+                paren_depth += 1
+            elif char == ')':
+                paren_depth -= 1
+            # Split only at top level (outside parentheses and quotes)
+            if char == delimiter and paren_depth == 0:
+                segment_text = ''.join(current_segment).strip()
+                if segment_text:  # Only add non-empty segments
+                    segments.append(segment_text)
+                current_segment = []
+                continue
+
+        current_segment.append(char)
+
+    # Add final segment if it exists
+    final_segment = ''.join(current_segment).strip()
+    if final_segment:
+        segments.append(final_segment)
+    return segments
 
 
 def strip_sql_comments(sql_statement):

diff --git a/tests/test_split_high_level.py b/tests/test_split_high_level.py
@@ -0,0 +1,114 @@
+import pytest
+from mysql_ch_replicator.converter import split_high_level
+
+
+@pytest.mark.parametrize("data,delimiter,expected", [
+    # Basic column definitions with no delimiter inside quotes or parentheses
+    (
+        "id int NOT NULL, name varchar(255), age int",
+        ",",
+        ['id int NOT NULL', 'name varchar(255)', 'age int']
+    ),
+
+    # Column with DEFAULT value containing comma inside single quotes
+    (
+        "status varchar(50) DEFAULT 'active,pending', id int",
+        ",",
+        ["status varchar(50) DEFAULT 'active,pending'", 'id int']
+    ),
+
+    # Multiple columns with quoted DEFAULT values containing commas
+    (
+        "col1 varchar(50) DEFAULT 'value,with,commas', col2 int, col3 varchar(100) DEFAULT 'another,comma'",
+        ",",
+        ["col1 varchar(50) DEFAULT 'value,with,commas'", 'col2 int', "col3 varchar(100) DEFAULT 'another,comma'"]
+    ),
+
+    # ENUM definition with multiple values (commas inside parentheses)
+    (
+        "status enum('active','inactive','pending'), id int",
+        ",",
+        ["status enum('active','inactive','pending')", 'id int']
+    ),
+
+    # SET type with multiple values
+    (
+        "permissions set('read','write','execute'), user_id int",
+        ",",
+        ["permissions set('read','write','execute')", 'user_id int']
+    ),
+
+    # Single-quoted DEFAULT value containing a comma followed by a space
+    (
+        "description text DEFAULT 'User, Admin', created_at datetime",
+        ",",
+        ["description text DEFAULT 'User, Admin'", 'created_at datetime']
+    ),
+
+    # DECIMAL with precision and scale (comma inside parentheses)
+    (
+        "price decimal(10,2), quantity int",
+        ",",
+        ['price decimal(10,2)', 'quantity int']
+    ),
+
+    # Complex: commas inside an ENUM list and inside a quoted DEFAULT value
+    (
+        "type enum('type1','type2') DEFAULT 'type1', description varchar(255) DEFAULT 'desc,with,comma'",
+        ",",
+        ["type enum('type1','type2') DEFAULT 'type1'", "description varchar(255) DEFAULT 'desc,with,comma'"]
+    ),
+
+    # VARCHAR with length and DEFAULT containing comma
+    (
+        "name varchar(100) DEFAULT 'Last, First', id int NOT NULL",
+        ",",
+        ["name varchar(100) DEFAULT 'Last, First'", 'id int NOT NULL']
+    ),
+
+    # Empty string should return empty list
+    (
+        "",
+        ",",
+        []
+    ),
+
+    # Single column definition
+    (
+        "id int PRIMARY KEY",
+        ",",
+        ['id int PRIMARY KEY']
+    ),
+
+    # Several parenthesized groups, one of them containing a comma
+    (
+        "data1 varchar(100), func(arg1, arg2), data2 int",
+        ",",
+        ['data1 varchar(100)', 'func(arg1, arg2)', 'data2 int']
+    ),
+
+    # ALTER TABLE with multiple actions and a comma inside a DEFAULT value
+    (
+        "ADD COLUMN status varchar(50) DEFAULT 'new,value', DROP COLUMN old_col",
+        ",",
+        ["ADD COLUMN status varchar(50) DEFAULT 'new,value'", 'DROP COLUMN old_col']
+    ),
+
+    # MySQL-style CREATE TABLE columns with backtick-quoted identifiers
+    (
+        "`id` int NOT NULL AUTO_INCREMENT, `email` varchar(255) DEFAULT 'user@example.com', `status` enum('active','inactive') DEFAULT 'active'",
+        ",",
+        ["`id` int NOT NULL AUTO_INCREMENT", "`email` varchar(255) DEFAULT 'user@example.com'", "`status` enum('active','inactive') DEFAULT 'active'"]
+    ),
+])
+def test_split_high_level(data, delimiter, expected):
+    """
+    Test the split_high_level function with SQL column definitions.
+
+    This test verifies that the function correctly splits SQL statements by the delimiter
+    while ignoring delimiters that appear inside:
+    - Parentheses (e.g., enum values, function arguments, type precision)
+    - Single quotes (e.g., DEFAULT values, string literals)
+    """
+    result = split_high_level(data, delimiter)
+    assert result == expected, f"Failed for input: {data} with delimiter: {delimiter}"