def split_high_level(data, delimiter):
    """
    Split a string on a delimiter, ignoring delimiters nested in parentheses or quotes.

    The scan is context-aware:
    - Delimiters inside parentheses () are ignored, at any nesting depth.
    - Delimiters inside single-quoted or double-quoted string literals are ignored.
    - Delimiters inside backtick-quoted identifiers are ignored.
    - Inside a string literal a backslash escapes the next character, so an
      escaped quote does not end the literal and an escaped backslash does not
      hide the closing quote. Backslash is not an escape inside backticks.
    - A doubled quote inside a literal ('it''s') closes and immediately reopens
      the literal, which leaves the splitting result unchanged.

    Args:
        data (str): The string to split.
        delimiter (str): The single character to split on (typically ',' or ';').

    Returns:
        list[str]: The segments with surrounding whitespace stripped. Segments
        that are empty after stripping are dropped; empty input returns [].

    Examples:
        >>> split_high_level("a,b(c,d),e", ",")
        ['a', 'b(c,d)', 'e']

        >>> split_high_level("name varchar(100) DEFAULT 'a,b',id int", ",")
        ["name varchar(100) DEFAULT 'a,b'", 'id int']

        >>> split_high_level("`a,b` int, c int", ",")
        ['`a,b` int', 'c int']
    """
    if not data:
        return []

    quote_chars = ("'", '"', '`')

    segments = []
    current_segment = []
    paren_depth = 0
    open_quote = None  # the quote character that opened the current literal, if any
    escaped = False    # True when the previous in-literal character was an unescaped backslash

    for char in data:
        if open_quote is not None:
            # Everything inside a quoted region is copied verbatim.
            current_segment.append(char)
            if escaped:
                # This character was escaped; it can neither close the literal
                # nor start another escape.
                escaped = False
            elif char == '\\' and open_quote != '`':
                escaped = True
            elif char == open_quote:
                open_quote = None
            continue

        if char in quote_chars:
            open_quote = char
            current_segment.append(char)
            continue

        # Track parentheses depth (we are outside quotes here).
        if char == '(':
            paren_depth += 1
        elif char == ')':
            paren_depth -= 1

        # Split only at top level (outside parentheses and quotes).
        if char == delimiter and paren_depth == 0:
            segment_text = ''.join(current_segment).strip()
            if segment_text:  # Only add non-empty segments
                segments.append(segment_text)
            current_segment = []
            continue

        current_segment.append(char)

    # Add final segment if it exists
    final_segment = ''.join(current_segment).strip()
    if final_segment:
        segments.append(final_segment)

    return segments
import pytest
from mysql_ch_replicator.converter import split_high_level


# Each entry is (input string, delimiter, expected list of segments).
SPLIT_CASES = [
    # Plain column definitions: no quotes, parentheses only around a length
    (
        "id int NOT NULL, name varchar(255), age int",
        ",",
        ['id int NOT NULL', 'name varchar(255)', 'age int'],
    ),
    # DEFAULT value with a comma inside single quotes
    (
        "status varchar(50) DEFAULT 'active,pending', id int",
        ",",
        ["status varchar(50) DEFAULT 'active,pending'", 'id int'],
    ),
    # Several columns whose quoted DEFAULT values contain commas
    (
        "col1 varchar(50) DEFAULT 'value,with,commas', col2 int, col3 varchar(100) DEFAULT 'another,comma'",
        ",",
        ["col1 varchar(50) DEFAULT 'value,with,commas'", 'col2 int', "col3 varchar(100) DEFAULT 'another,comma'"],
    ),
    # ENUM definition: commas inside parentheses
    (
        "status enum('active','inactive','pending'), id int",
        ",",
        ["status enum('active','inactive','pending')", 'id int'],
    ),
    # SET type with multiple values
    (
        "permissions set('read','write','execute'), user_id int",
        ",",
        ["permissions set('read','write','execute')", 'user_id int'],
    ),
    # DEFAULT text containing a comma followed by a space
    (
        "description text DEFAULT 'User, Admin', created_at datetime",
        ",",
        ["description text DEFAULT 'User, Admin'", 'created_at datetime'],
    ),
    # DECIMAL precision and scale: comma inside parentheses
    (
        "price decimal(10,2), quantity int",
        ",",
        ['price decimal(10,2)', 'quantity int'],
    ),
    # ENUM plus DEFAULT, commas in both
    (
        "type enum('type1','type2') DEFAULT 'type1', description varchar(255) DEFAULT 'desc,with,comma'",
        ",",
        ["type enum('type1','type2') DEFAULT 'type1'", "description varchar(255) DEFAULT 'desc,with,comma'"],
    ),
    # VARCHAR with a length and a DEFAULT containing a comma
    (
        "name varchar(100) DEFAULT 'Last, First', id int NOT NULL",
        ",",
        ["name varchar(100) DEFAULT 'Last, First'", 'id int NOT NULL'],
    ),
    # Empty input yields an empty list
    (
        "",
        ",",
        [],
    ),
    # A single column definition, no delimiter present
    (
        "id int PRIMARY KEY",
        ",",
        ['id int PRIMARY KEY'],
    ),
    # Function-call style parentheses between ordinary columns
    (
        "data1 varchar(100), func(arg1, arg2), data2 int",
        ",",
        ['data1 varchar(100)', 'func(arg1, arg2)', 'data2 int'],
    ),
    # ALTER TABLE multi-operation clause with a comma in a DEFAULT value
    (
        "ADD COLUMN status varchar(50) DEFAULT 'new,value', DROP COLUMN old_col",
        ",",
        ["ADD COLUMN status varchar(50) DEFAULT 'new,value'", 'DROP COLUMN old_col'],
    ),
    # Column list in the style of a MySQL CREATE TABLE statement
    (
        "`id` int NOT NULL AUTO_INCREMENT, `email` varchar(255) DEFAULT 'user@example.com', `status` enum('active','inactive') DEFAULT 'active'",
        ",",
        ["`id` int NOT NULL AUTO_INCREMENT", "`email` varchar(255) DEFAULT 'user@example.com'", "`status` enum('active','inactive') DEFAULT 'active'"],
    ),
]


@pytest.mark.parametrize("data,delimiter,expected", SPLIT_CASES)
def test_split_high_level(data, delimiter, expected):
    """
    Check split_high_level against SQL column-definition fragments.

    Every case asserts that splitting happens only on top-level delimiters,
    i.e. delimiters are left alone when they appear inside:
    - Parentheses (enum values, function arguments, type precision)
    - Single quotes (DEFAULT values, string literals)
    """
    actual = split_high_level(data, delimiter)
    assert actual == expected, f"Failed for input: {data} with delimiter: {delimiter}"