In [1]:
mapping = {
    'DECIMAL\(\d+,\d+\)': 'DECIMAL(38, 18)',
    'INT': 'INTEGER',
    'SMALLINT': 'INTEGER',
    'DOUBLE': 'DECIMAL(38, 18)',
    'TIMESTAMP\(\d+\)': 'VARCHAR(26) CHARACTER SET UNICODE NOT CASESPECIFIC',
    'DATE': 'VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC',
    r'(CHAR|VARCHAR)\((\d+)\)': lambda match: f'VARCHAR({match.group(2)}) ',
    'BOOLEAN': 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC',
    'BYTE': 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC'
}

1. `r'(CHAR|VARCHAR)\((\d+)\)'`:
   - `r` before the string denotes a raw string, which means that backslashes(`\`) are treated as literal characters, not escape characters.
   - `(CHAR|VARCHAR)` is a group that matches either the word 'CHAR' or 'VARCHAR'.
   - `\(` matches an opening parenthesis.
   - `(\d+)` is another group that matches one or more digits.
   - ` \)` matches a closing parenthesis.
   - Altogether, this pattern is used to match data types like 'CHAR(20)' or 'VARCHAR(50)' where the numbers inside the parentheses represent the length.

2. `lambda match: f'VARCHAR({match.group(2)})'`:
   - `match` is a variable that represents the match found when the regular expression pattern matches a part of the input string.
   - `match.group(2)` is used to extract the content of the second group within the regular expression (the one that matched one or more digits).
   - `f'VARCHAR({match.group(2)})'` is an f-string that constructs a new string using the extracted digits.
     - For example, if the input is 'VARCHAR(50)', the lambda function will return 'VARCHAR(50)', essentially converting 'VARCHAR(50)' to 'VARCHAR(50)'.

In [2]:
import re

In [3]:
def convert_data_type(data_type):
    """
    Convert Teradata data types to equivalent foreign table data types based on mapping rules.

    This function takes a Teradata data type as input and applies a set of mapping rules to transform it into an equivalent
    foreign table data type.

    Args:
        data_type (str): A string representing a Teradata data type to be converted.

    Returns:
        str: The converted foreign table data type.

    Mapping Rules:
    - 'DECIMAL(\d+,\d+)': Converted to 'DECIMAL(38, 18)'
    - 'INT': Converted to 'INTEGER'
    - 'SMALLINT': Converted to 'INTEGER'
    - 'DOUBLE': Converted to 'DECIMAL(38, 18)'
    - 'TIMESTAMP(\d+)': Converted to 'VARCHAR(26) CHARACTER SET UNICODE NOT CASESPECIFIC'
    - 'DATE': Converted to 'VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC'
    - '(CHAR|VARCHAR)(\d+)': Converted to 'VARCHAR(\d+)'
    - 'BOOLEAN': Converted to 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC'
    - 'BYTE': Converted to 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC'

    The 'mapping' dictionary provides a set of rules to handle common Teradata data types. The function iterates through
    these rules and applies them to the input 'data_type', producing the corresponding foreign table data type.

    Examples:
    - 'DECIMAL(9,2)' is converted to 'DECIMAL(38, 18)'
    - 'INT' is converted to 'INTEGER'
    - 'VARCHAR(50)' is converted to 'VARCHAR(50) CHARACTER SET UNICODE NOT CASESPECIFIC'
    - 'BOOLEAN' is converted to 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC'

    Note:
    - The function uses regular expressions to match and replace patterns in the 'data_type'.
    - For '(CHAR|VARCHAR)(\d+)', it extracts the number inside the parentheses and uses it to construct the new data type.

    This function simplifies the process of converting Teradata data types to a format suitable for foreign tables.
    """
    mapping = {
        'DECIMAL\(\d+,\d+\)': 'DECIMAL(38, 18)',
        'INT': 'INTEGER',
        'SMALLINT': 'INTEGER',
        'DOUBLE': 'DECIMAL(38, 18)',
        'TIMESTAMP\(\d+\)': 'VARCHAR(26) CHARACTER SET UNICODE NOT CASESPECIFIC',
        'DATE': 'VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC',
        r'(CHAR|VARCHAR)\((\d+)\)': lambda match: f'VARCHAR({match.group(2)}) ',
        'BOOLEAN': 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC',
        'BYTE': 'VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC'
    }
    for pattern, replacement in mapping.items():
        data_type = re.sub(pattern, replacement, data_type)
    return data_type


In [4]:
def convert_teradata_table_ddl_to_foreign_ddl(table_ddl, domain, subdomain):
    """
    Convert Teradata table Data Definition Language (DDL) to equivalent foreign table DDL.

    This function takes a Teradata table DDL as input and converts it into a foreign table DDL suitable for external data
    storage. It processes the DDL line by line, converting data types and adding a 'Location' column. The resulting DDL
    string is configured for a specific domain and subdomain.

    Args:
        table_ddl (str): The Teradata table DDL to be converted.
        domain (str): The domain for the external table location.
        subdomain (str): The subdomain for the external table location.

    Returns:
        str: The foreign table DDL for the specified domain and subdomain.

    Conversion Steps:
    - The Teradata table DDL is split into individual lines.
    - Each line is processed:
        - Lines starting with 'SET' are skipped to exclude "SET TABLE" statements.
        - The function extracts the column name and data type from each line using regular expressions.
        - The data type is converted by calling the 'convert_data_type' function.
        - A new line is constructed with the converted column name and data type.
        - These new lines are stored in the 'new_ddl' list.
    - A 'Location' column is added at the beginning of the 'new_ddl' list.
    - The new DDL lines are joined to create the complete foreign table DDL string.
    - The foreign table DDL is structured for the specified domain and subdomain.

    Example Usage:
    - Input Teradata table DDL: 'CREATE TABLE my_table (ID INT, Name VARCHAR(50), Date DATE);'
    - domain: 'my-domain'
    - subdomain: 'my-subdomain'
    - Output Foreign Table DDL:
        CREATE MULTISET FOREIGN TABLE foreign_table_schema.foreign_table_name, FALLBACK,
            EXTERNAL STUFF,
            MAP = MY_MAP1040 (
            Location VARCHAR(2048) CHARACTER SET UNICODE CASESPECIFIC,
            ID INTEGER,
            Name VARCHAR(50),
            Date VARCHAR(10) CHARACTER SET UNICODE NOT CASESPECIFIC,
        )
        USING (
            LOCATION 's3://my-bucket/my-domain/bronze_layer/my-subdomain/foreign_table_name/'
        );

    Note:
    - This function relies on the 'convert_data_type' function for data type conversion.
    - It simplifies the process of converting Teradata table DDL for use with external data storage.
    """

    # Split the table DDL into individual lines
    lines = table_ddl.split('\n')
    new_ddl = []

    for line in lines:
        # Skip the SET TABLE line
        if 'SET' in line:
            continue
        # Extract the column name and data type
        match = re.search(r'\s+(\w+)\s+(\w+(?:\(\d+(?:,\d+)?\))?)', line)
        if match:
            column_name = match.group(1)
            data_type = match.group(2)

            # Convert the data type
            data_type = convert_data_type(data_type)

            # Construct the new line
            new_line = f'  {column_name} {data_type},'
            new_ddl.append(new_line)

    # Add the Location column
    new_ddl.insert(0, '  Location VARCHAR(2048) CHARACTER SET UNICODE CASESPECIFIC,')

    # Join the new DDL lines
    new_table_ddl = '\n'.join(new_ddl)

    # Create the foreign table DDL string
    foreign_table_ddl_string = f"""
CREATE MULTISET FOREIGN TABLE foreign_table_schema.foreign_table_name, FALLBACK,
    EXTERNAL STUFF,
    MAP = MY_MAP1040 (
{new_table_ddl}
    )
    USING (
  LOCATION 's3://my-bucket/{domain}/bronze_layer/{subdomain}/foreign_table_name/'
);

"""
    return foreign_table_ddl_string

In [5]:
domain = "domain1"
subdomain = "subdomain1"

table_ddl_string = """
CREATE SET TABLE sample_schema.sample_table (
  id INT NOT NULL GENERATED ALWAYS AS IDENTITY START WITH 1,
  name VARCHAR(255) CHARACTER SET UTF8 NOT NULL DEFAULT 'Unknown',
  age INT NOT NULL DEFAULT 0,
  created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP,
  updated_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  date_of_birth DATE NOT NULL DEFAULT CURRENT_DATE,
  decimal_value DECIMAL(10,2) NOT NULL DEFAULT 0.00,
  smallint_value SMALLINT NOT NULL DEFAULT 0,
  boolean_value BOOLEAN NOT NULL DEFAULT FALSE,
  double_value DOUBLE NOT NULL DEFAULT 0.0,
  char_value CHAR(1) NOT NULL DEFAULT 'A',
  byte_value BYTE NOT NULL DEFAULT 0
);
"""

foreign_ddl = convert_teradata_table_ddl_to_foreign_ddl(table_ddl_string, domain, subdomain)
print(foreign_ddl)


CREATE MULTISET FOREIGN TABLE foreign_table_schema.foreign_table_name, FALLBACK,
    EXTERNAL STUFF,
    MAP = MY_MAP1040 (
  Location VARCHAR(2048) CHARACTER SET UNICODE CASESPECIFIC,
  id INTEGER,
  age INTEGER,
  created_at VARCHAR(26)  CHARACTER SET UNICODE NOT CASESPECIFIC,
  updated_at VARCHAR(26)  CHARACTER SET UNICODE NOT CASESPECIFIC,
  date_of_birth VARCHAR(10)  CHARACTER SET UNICODE NOT CASESPECIFIC,
  decimal_value DECIMAL(38, 18),
  smallint_value INTEGEREGER,
  boolean_value VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC,
  double_value DECIMAL(38, 18),
  char_value VARCHAR(1) ,
  byte_value VARCHAR(1) CHARACTER SET UNICODE NOT CASESPECIFIC,
    )
    USING (
  LOCATION 's3://my-bucket/domain1/bronze_layer/subdomain1/foreign_table_name/'
);


