Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 60 additions & 16 deletions mysql_ch_replicator/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,23 +278,67 @@ def strip_sql_name(name):
return name


def split_high_level(data, delimiter):
    """
    Split a string on a delimiter, ignoring delimiters nested in parentheses or quotes.

    The split is context-aware:
    - Delimiters inside parentheses () are ignored, at any nesting depth
    - Delimiters inside single quotes '', double quotes "" or backticks `` are ignored
    - Inside single- or double-quoted text a backslash escapes the next
      character, so an escaped quote does not end the literal and an escaped
      backslash does not hide the closing quote that follows it
    - Parentheses inside quoted text do not affect the nesting depth

    Args:
        data (str): The string to split
        delimiter (str): The single character to split on (typically ',' or ';')

    Returns:
        list[str]: Split segments with surrounding whitespace stripped;
            segments that are empty after stripping are omitted

    Examples:
        >>> split_high_level("a,b(c,d),e", ",")
        ['a', 'b(c,d)', 'e']

        >>> split_high_level("name varchar(100) DEFAULT 'a,b',id int", ",")
        ["name varchar(100) DEFAULT 'a,b'", 'id int']
    """
    if not data:
        return []

    quote_chars = ("'", '"', '`')

    segments = []
    current_segment = []
    paren_depth = 0
    open_quote = None  # the quote character that opened the current region, if any
    escaped = False    # True when the previous in-quote character was an active backslash

    for char in data:
        if open_quote is not None:
            # Everything inside a quoted region is copied verbatim.
            current_segment.append(char)
            if escaped:
                # This character was escaped; it can neither close the
                # region nor start another escape.
                escaped = False
            elif char == '\\' and open_quote != '`':
                # Backslash escapes apply to string literals only, not to
                # backtick-quoted identifiers.
                escaped = True
            elif char == open_quote:
                # A doubled quote ('' inside '...') closes here and reopens
                # on the next character, which keeps the state correct.
                open_quote = None
            continue

        if char in quote_chars:
            open_quote = char
            current_segment.append(char)
            continue

        if char == '(':
            paren_depth += 1
        elif char == ')':
            paren_depth -= 1

        # Split only at top level (outside parentheses and quotes)
        if char == delimiter and paren_depth == 0:
            segment_text = ''.join(current_segment).strip()
            if segment_text:  # Only add non-empty segments
                segments.append(segment_text)
            current_segment = []
            continue

        current_segment.append(char)

    # Add final segment if it exists
    final_segment = ''.join(current_segment).strip()
    if final_segment:
        segments.append(final_segment)

    return segments


def strip_sql_comments(sql_statement):
Expand Down
114 changes: 114 additions & 0 deletions tests/test_split_high_level.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import pytest
from mysql_ch_replicator.converter import split_high_level


# Each entry is (data, delimiter, expected): the input string, the character
# to split on, and the list of segments split_high_level should return.
SPLIT_HIGH_LEVEL_CASES = [
    # Basic column definitions without quotes or parentheses
    (
        "id int NOT NULL, name varchar(255), age int",
        ",",
        ['id int NOT NULL', 'name varchar(255)', 'age int'],
    ),
    # Column with DEFAULT value containing comma inside single quotes
    (
        "status varchar(50) DEFAULT 'active,pending', id int",
        ",",
        ["status varchar(50) DEFAULT 'active,pending'", 'id int'],
    ),
    # Multiple columns with quoted DEFAULT values containing commas
    (
        "col1 varchar(50) DEFAULT 'value,with,commas', col2 int, col3 varchar(100) DEFAULT 'another,comma'",
        ",",
        ["col1 varchar(50) DEFAULT 'value,with,commas'", 'col2 int', "col3 varchar(100) DEFAULT 'another,comma'"],
    ),
    # ENUM definition with multiple values (commas inside parentheses)
    (
        "status enum('active','inactive','pending'), id int",
        ",",
        ["status enum('active','inactive','pending')", 'id int'],
    ),
    # SET type with multiple values
    (
        "permissions set('read','write','execute'), user_id int",
        ",",
        ["permissions set('read','write','execute')", 'user_id int'],
    ),
    # Column with DEFAULT containing single quote with comma
    (
        "description text DEFAULT 'User, Admin', created_at datetime",
        ",",
        ["description text DEFAULT 'User, Admin'", 'created_at datetime'],
    ),
    # DECIMAL with precision and scale (comma inside parentheses)
    (
        "price decimal(10,2), quantity int",
        ",",
        ['price decimal(10,2)', 'quantity int'],
    ),
    # Complex: ENUM + DEFAULT with commas in both
    (
        "type enum('type1','type2') DEFAULT 'type1', description varchar(255) DEFAULT 'desc,with,comma'",
        ",",
        ["type enum('type1','type2') DEFAULT 'type1'", "description varchar(255) DEFAULT 'desc,with,comma'"],
    ),
    # VARCHAR with length and DEFAULT containing comma
    (
        "name varchar(100) DEFAULT 'Last, First', id int NOT NULL",
        ",",
        ["name varchar(100) DEFAULT 'Last, First'", 'id int NOT NULL'],
    ),
    # Empty string should return empty list
    (
        "",
        ",",
        [],
    ),
    # Single column definition
    (
        "id int PRIMARY KEY",
        ",",
        ['id int PRIMARY KEY'],
    ),
    # Multiple nested parentheses
    (
        "data1 varchar(100), func(arg1, arg2), data2 int",
        ",",
        ['data1 varchar(100)', 'func(arg1, arg2)', 'data2 int'],
    ),
    # ALTER TABLE multi-statement with commas in DEFAULT values
    (
        "ADD COLUMN status varchar(50) DEFAULT 'new,value', DROP COLUMN old_col",
        ",",
        ["ADD COLUMN status varchar(50) DEFAULT 'new,value'", 'DROP COLUMN old_col'],
    ),
    # Real-world example from MySQL CREATE TABLE
    (
        "`id` int NOT NULL AUTO_INCREMENT, `email` varchar(255) DEFAULT 'user@example.com', `status` enum('active','inactive') DEFAULT 'active'",
        ",",
        ["`id` int NOT NULL AUTO_INCREMENT", "`email` varchar(255) DEFAULT 'user@example.com'", "`status` enum('active','inactive') DEFAULT 'active'"],
    ),
]


@pytest.mark.parametrize("data,delimiter,expected", SPLIT_HIGH_LEVEL_CASES)
def test_split_high_level(data, delimiter, expected):
    """
    Check split_high_level against SQL column-definition fragments.

    Every case splits on the given delimiter and must leave untouched any
    delimiter that sits inside:
    - Parentheses (e.g., enum values, function arguments, type precision)
    - Single quotes (e.g., DEFAULT values, string literals)
    """
    actual = split_high_level(data, delimiter)
    assert actual == expected, f"Failed for input: {data} with delimiter: {delimiter}"