diff --git a/mysql_ch_replicator/converter.py b/mysql_ch_replicator/converter.py index 8fd5917..2df6781 100644 --- a/mysql_ch_replicator/converter.py +++ b/mysql_ch_replicator/converter.py @@ -1075,8 +1075,51 @@ def convert_drop_table_query(self, mysql_query): raise Exception('not implement') def _strip_comments(self, create_statement): - pattern = r'\bCOMMENT(?:\s*=\s*|\s+)([\'"])(?:\\.|[^\\])*?\1' - return re.sub(pattern, '', create_statement, flags=re.IGNORECASE) + """ + Strip COMMENT clauses from CREATE TABLE statements. + Handles MySQL-style quote escaping where quotes are doubled ('' or ""). + """ + result = [] + i = 0 + while i < len(create_statement): + # Look for COMMENT keyword (case insensitive) + if (i + 7 < len(create_statement) and + create_statement[i:i+7].upper() == 'COMMENT' and + (i == 0 or not create_statement[i-1].isalnum()) and + (i + 7 >= len(create_statement) or not create_statement[i+7].isalnum())): + + # Skip COMMENT keyword + i += 7 + + # Skip whitespace and optional '=' + while i < len(create_statement) and create_statement[i].isspace(): + i += 1 + if i < len(create_statement) and create_statement[i] == '=': + i += 1 + while i < len(create_statement) and create_statement[i].isspace(): + i += 1 + + # Find the quoted string + if i < len(create_statement) and create_statement[i] in ('"', "'"): + quote_char = create_statement[i] + i += 1 # Skip opening quote + + # Find the closing quote, handling escaped quotes + while i < len(create_statement): + if create_statement[i] == quote_char: + # Check if this is an escaped quote (doubled) + if i + 1 < len(create_statement) and create_statement[i + 1] == quote_char: + i += 2 # Skip both quotes + else: + i += 1 # Skip closing quote + break + else: + i += 1 + else: + result.append(create_statement[i]) + i += 1 + + return ''.join(result) def parse_mysql_table_structure(self, create_statement, required_table_name=None): create_statement = self._strip_comments(create_statement) diff --git 
@pytest.mark.parametrize(("input_sql", "expected_output"), [
    # Plain single-quoted comment
    (
        "CREATE TABLE test (id int NOT NULL COMMENT 'Simple comment', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))",
    ),
    # Plain double-quoted comment
    (
        'CREATE TABLE test (id int NOT NULL COMMENT "Simple comment", name varchar(255))',
        "CREATE TABLE test (id int NOT NULL , name varchar(255))",
    ),
    # Doubled single quotes inside the comment (the originally reported bug)
    (
        "CREATE TABLE test (id int NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))",
    ),
    # Doubled double quotes inside the comment
    (
        'CREATE TABLE test (id int NOT NULL COMMENT "Value can be: ""ACTIVE"" or ""INACTIVE""", name varchar(255))',
        "CREATE TABLE test (id int NOT NULL , name varchar(255))",
    ),
    # Several comments in one table
    (
        "CREATE TABLE test (\n"
        "    id int NOT NULL COMMENT 'Primary key',\n"
        "    name varchar(255) COMMENT 'User name',\n"
        "    status enum('active','inactive') COMMENT 'Status with ''quotes'''\n"
        ")",
        "CREATE TABLE test (\n"
        "    id int NOT NULL ,\n"
        "    name varchar(255) ,\n"
        "    status enum('active','inactive') \n"
        ")",
    ),
    # COMMENT = '...' form
    (
        "CREATE TABLE test (id int NOT NULL COMMENT = 'Primary key', name varchar(255))",
        "CREATE TABLE test (id int NOT NULL , name varchar(255))",
    ),
    # Mixed quote characters and backticks inside the comment
    (
        "CREATE TABLE test (id int COMMENT 'Mixed: ''single'', \"double\", and `backtick`', name text)",
        "CREATE TABLE test (id int , name text)",
    ),
    # Comment text spanning several lines
    (
        "CREATE TABLE test (\n"
        "    id int NOT NULL COMMENT 'This is a\n"
        "    multiline comment\n"
        "    with newlines',\n"
        "    name varchar(255)\n"
        ")",
        "CREATE TABLE test (\n"
        "    id int NOT NULL ,\n"
        "    name varchar(255)\n"
        ")",
    ),
    # Unicode comment text
    (
        "CREATE TABLE test (id int COMMENT '用户ID - 主键', name varchar(255) COMMENT 'Имя пользователя')",
        "CREATE TABLE test (id int , name varchar(255) )",
    ),
    # Nothing to strip: statement must come back unchanged
    (
        "CREATE TABLE test (id int NOT NULL, name varchar(255))",
        "CREATE TABLE test (id int NOT NULL, name varchar(255))",
    ),
    # Table-level comment
    (
        "CREATE TABLE test (id int NOT NULL, name varchar(255)) COMMENT 'Table comment'",
        "CREATE TABLE test (id int NOT NULL, name varchar(255)) ",
    ),
    # Real-world shape: repeated escaped quotes plus a PRIMARY KEY clause
    (
        "CREATE TABLE test (\n"
        "    departments int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',\n"
        "    termine int(11) NOT NULL COMMENT '事件类型,可选值: ''SYSTEM'', ''BUSINESS''',\n"
        "    PRIMARY KEY (departments,termine)\n"
        ")",
        "CREATE TABLE test (\n"
        "    departments int(11) NOT NULL ,\n"
        "    termine int(11) NOT NULL ,\n"
        "    PRIMARY KEY (departments,termine)\n"
        ")",
    ),
    # JSON-looking comment text
    (
        "CREATE TABLE test (config json COMMENT '{\"type\": \"config\", \"values\": [\"a\", \"b\"]}', id int)",
        "CREATE TABLE test (config json , id int)",
    ),
    # Comment text that looks like SQL injection (must simply be dropped)
    (
        "CREATE TABLE test (id int COMMENT 'DROP TABLE users; --', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))",
    ),
    # Empty comment
    (
        "CREATE TABLE test (id int COMMENT '', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))",
    ),
    # Whitespace-only comment
    (
        "CREATE TABLE test (id int COMMENT ' ', name varchar(255))",
        "CREATE TABLE test (id int , name varchar(255))",
    ),
    # Keyword spelled in lower and mixed case
    (
        "CREATE TABLE test (id int comment 'lowercase', name varchar(255) Comment 'Mixed case')",
        "CREATE TABLE test (id int , name varchar(255) )",
    ),
])
def test_strip_comments_function(input_sql, expected_output):
    """
    Check ``MysqlToClickhouseConverter._strip_comments`` on realistic DDL.

    Each case pairs a CREATE TABLE statement with the text expected once
    its COMMENT clauses are removed.  Both sides are compared after runs
    of spaces/tabs are collapsed to a single space and the ends trimmed;
    newlines are left alone, so line structure still has to match.
    """
    import re

    from mysql_ch_replicator.converter import MysqlToClickhouseConverter

    horizontal_space = re.compile(r'[ \t]+')

    def squash(text):
        # Collapse space/tab runs only; newlines are intentionally preserved.
        return horizontal_space.sub(' ', text).strip()

    actual = MysqlToClickhouseConverter()._strip_comments(input_sql)

    assert squash(actual) == squash(expected_output), f"Failed for input: {input_sql}"