Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 45 additions & 2 deletions mysql_ch_replicator/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1075,8 +1075,51 @@ def convert_drop_table_query(self, mysql_query):
raise Exception('not implement')

def _strip_comments(self, create_statement):
    """
    Strip COMMENT clauses from a MySQL CREATE TABLE statement.

    A clause is the whole-word keyword COMMENT (any letter case), optional
    whitespace and an optional '=', followed by a single- or double-quoted
    string. Inside that string both MySQL escaping styles are honoured:
    doubled quotes ('' or "") and backslash escapes (such as \\').

    Everything else is returned untouched. In particular:
    - backtick-quoted identifiers and ordinary string literals are copied
      verbatim, so a column called `comment` or a DEFAULT value containing
      the word COMMENT is never altered;
    - identifiers that merely contain the word (comment_id, my_comment)
      are not treated as the keyword;
    - a COMMENT keyword that is not followed by a quoted string is kept.

    :param create_statement: the CREATE TABLE statement text
    :return: the statement with every COMMENT clause removed; whitespace
        around the removed clause is left in place
    """

    def quoted(delimiter):
        # A literal enclosed in `delimiter` whose body may contain a doubled
        # delimiter, a backslash-escaped character, or any other character.
        return r'Q(?:QQ|\\.|[^Q\\])*Q'.replace('Q', delimiter)

    string_literal = '(?:' + quoted("'") + '|' + quoted('"') + ')'

    # First alternative: quoted tokens that must survive unchanged. Matching
    # them as a unit prevents the second alternative from firing on text
    # that only looks like a COMMENT clause inside a name or a literal.
    # Second alternative: the actual COMMENT clause. [\w$] covers the
    # characters allowed in unquoted identifiers, so '_' and '$' do not
    # count as word boundaries.
    pattern = (
        r'(?P<keep>`(?:``|[^`])*`|' + string_literal + r')'
        r'|(?<![\w$])COMMENT(?![\w$])\s*(?:=\s*)?' + string_literal
    )

    # DOTALL lets a backslash escape a newline inside a multi-line comment.
    return re.sub(
        pattern,
        lambda match: match.group('keep') or '',
        create_statement,
        flags=re.IGNORECASE | re.DOTALL,
    )

def parse_mysql_table_structure(self, create_statement, required_table_name=None):
create_statement = self._strip_comments(create_statement)
Expand Down
148 changes: 144 additions & 4 deletions test_mysql_ch_replicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,13 +492,13 @@ def test_multi_column_erase():

prepare_env(cfg, mysql, ch)

mysql.execute(f'''
mysql.execute(f"""
CREATE TABLE `{TEST_TABLE_NAME}` (
departments int(11) NOT NULL,
termine int(11) NOT NULL,
departments int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
termine int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
PRIMARY KEY (departments,termine)
)
''')
""")


mysql.execute(f"INSERT INTO `{TEST_TABLE_NAME}` (departments, termine) VALUES (10, 20);", commit=True)
Expand Down Expand Up @@ -3082,3 +3082,143 @@ def test_resume_initial_replication_with_ignore_deletes():
finally:
# Clean up temp config file
os.unlink(config_file)


@pytest.mark.parametrize("input_sql,expected_output", [
    # Basic single quote comment
    (
        "CREATE TABLE test (id int NOT NULL COMMENT 'Simple comment', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Basic double quote comment
    (
        "CREATE TABLE test (id int NOT NULL COMMENT \"Simple comment\", name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Comment with escaped single quotes (the original bug case)
    (
        "CREATE TABLE test (id int NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Comment with escaped double quotes
    (
        "CREATE TABLE test (id int NOT NULL COMMENT \"Value can be: \"\"ACTIVE\"\" or \"\"INACTIVE\"\"\", name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Multiple comments in same table
    (
        """CREATE TABLE test (
id int NOT NULL COMMENT 'Primary key',
name varchar(255) COMMENT 'User name',
status enum('active','inactive') COMMENT 'Status with ''quotes'''
)""",
        """CREATE TABLE test (
id int NOT NULL ,
name varchar(255) ,
status enum('active','inactive')
)"""
    ),
    # Comment with COMMENT = syntax
    (
        "CREATE TABLE test (id int NOT NULL COMMENT = 'Primary key', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))"
    ),
    # Comment with mixed quotes and special characters
    (
        "CREATE TABLE test (id int COMMENT 'Mixed: ''single'', \"double\", and `backtick`', name text)",
        "CREATE TABLE test (id int , name text)"
    ),
    # Multiline comment
    (
        """CREATE TABLE test (
id int NOT NULL COMMENT 'This is a
multiline comment
with newlines',
name varchar(255)
)""",
        """CREATE TABLE test (
id int NOT NULL ,
name varchar(255)
)"""
    ),
    # Comment with Unicode characters
    (
        "CREATE TABLE test (id int COMMENT '用户ID - 主键', name varchar(255) COMMENT 'Имя пользователя')",
        "CREATE TABLE test (id int , name varchar(255) )"
    ),
    # No comments (should remain unchanged)
    (
        "CREATE TABLE test (id int NOT NULL, name varchar(255))",
        "CREATE TABLE test (id int NOT NULL, name varchar(255))"
    ),
    # Comment at table level
    (
        "CREATE TABLE test (id int NOT NULL, name varchar(255)) COMMENT 'Table comment'",
        "CREATE TABLE test (id int NOT NULL, name varchar(255)) "
    ),
    # Complex case with multiple escaped quotes and special characters
    (
        """CREATE TABLE test (
departments int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
termine int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',
PRIMARY KEY (departments,termine)
)""",
        """CREATE TABLE test (
departments int(11) NOT NULL ,
termine int(11) NOT NULL ,
PRIMARY KEY (departments,termine)
)"""
    ),
    # Comment with JSON-like content
    (
        "CREATE TABLE test (config json COMMENT '{\"type\": \"config\", \"values\": [\"a\", \"b\"]}', id int)",
        "CREATE TABLE test (config json , id int)"
    ),
    # Comment with SQL injection-like content (should be safely handled)
    (
        "CREATE TABLE test (id int COMMENT 'DROP TABLE users; --', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))"
    ),
    # Empty comment
    (
        "CREATE TABLE test (id int COMMENT '', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))"
    ),
    # Comment with only spaces
    (
        "CREATE TABLE test (id int COMMENT ' ', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))"
    ),
    # Case insensitive COMMENT keyword
    (
        "CREATE TABLE test (id int comment 'lowercase', name varchar(255) Comment 'Mixed case')",
        "CREATE TABLE test (id int , name varchar(255) )"
    ),
])
def test_strip_comments_function(input_sql, expected_output):
    """
    Test the _strip_comments function with various realistic scenarios.

    This test covers:
    - Basic single and double quoted comments
    - Escaped quotes within comments (MySQL style with doubled quotes)
    - Multiple comments in the same table
    - COMMENT = syntax
    - Multiline comments with newlines
    - Unicode characters in comments
    - Table-level comments
    - Complex real-world scenarios
    - Edge cases like empty comments and case variations

    The comparison ignores the amount of horizontal whitespace and any
    whitespace at the start or end of a line, because stripping a clause
    leaves its surrounding spaces behind.
    """
    import re

    from mysql_ch_replicator.converter import MysqlToClickhouseConverter

    def normalize_whitespace(text):
        # Collapse runs of spaces/tabs, then trim every line. Newlines are
        # preserved, so the line structure of the statement is still checked.
        # Trimming per line keeps the test independent of invisible trailing
        # spaces (or indentation) inside the multi-line literals above.
        collapsed = re.sub(r'[ \t]+', ' ', text)
        return '\n'.join(line.strip() for line in collapsed.split('\n')).strip()

    converter = MysqlToClickhouseConverter()
    result = converter._strip_comments(input_sql)

    assert normalize_whitespace(result) == normalize_whitespace(expected_output), f"Failed for input: {input_sql}"