In [167]:
import re

def adjust_indentation(text, baseline_x=None):
    """
    Adjusts indentation to match PDF structure where:
    - Function name and location: 0 spaces (baseline X position)
    - Description and content: 4 spaces (indented X position in PDF)
    
    Key insight from PDF X positions:
    - __init__ line: X0=60.70 (baseline)
    - Location line: X0=60.70 (baseline)
    - Constructor line: X0=80.71 (indented) <- SHOULD GET 4 SPACES
    - PARAMETERS: X0=80.71 (indented)
    
    Strategy: Find the first line after location that is NOT a signature continuation.
    Signature continuations end with ) or , or match param=value pattern.
    """
    if not text:
        return text
    
    lines = text.split('\n')
    if not lines:
        return text
    
    # Find where the signature ends (look for '=' which marks location line)
    signature_end_index = -1
    for i, line in enumerate(lines):
        if '=' in line and i > 0:  # The location line has '='
            signature_end_index = i
            break
    
    if signature_end_index == -1:
        # No location line found, return as-is
        return text
    
    # Find trigger_index: first non-empty, non-signature-continuation line after location
    trigger_index = -1
    for i in range(signature_end_index + 1, len(lines)):
        stripped = lines[i].strip()
        if not stripped:
            # Empty line, keep looking
            continue
        
        # Check if it looks like a signature continuation
        # Signature continuations: end with ) or , or match param=value
        is_continuation = (
            stripped.endswith(')') or 
            stripped.endswith(',') or 
            re.match(r'^[a-z_][a-z0-9_]*\s*=', stripped, re.IGNORECASE)
        )
        
        if not is_continuation:
            # This is the first content line (description or section header)
            trigger_index = i
            break
    
    if trigger_index == -1:
        # No content found after signature, return as-is
        return text
    
    # Apply indentation: indent from trigger point onwards
    adjusted_lines = []
    for i, line in enumerate(lines):
        if i < trigger_index:
            # Before trigger: no indentation (function name and location lines)
            adjusted_lines.append(line)
        else:
            # From trigger onwards: add 4 spaces if not empty
            if line.strip():
                adjusted_lines.append('    ' + line)
            else:
                adjusted_lines.append('')
    
    return '\n'.join(adjusted_lines)

In [168]:
import os
import re
import pymupdf as fitz

# Configuration
PDF_FILE = "Teradata Package for Python Function Reference.pdf"
OUTPUT_DIR = "teradataml_function_reference"

CHAPTER_MAP_TITLES = [
    "Teradata Package for Python Function Reference",
    "teradataml: Context Manager",
    "teradataml: DataFrame",
    "teradataml: Time Series Functions",
    "teradataml: DataFrameColumn",
    "Geospatial",
    "teradataml: Window Aggregates",
    "teradataml: Series",
    "teradataml: General Functions",
    "teradataml: Plot", 
    "teradtaml: sdk",
    "Enterprise Feature Store",
    "teradataml: Bring Your Own Analytics",
    "teradataml: Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions",
    "teradataml: Database 17.10.xx Analytic Functions",
    "teradataml: Database 17.20.xx Analytic Functions",
    "teradataml: Database 20.00.xx Analytic Functions",
    "teradataml: Unbounded Array Framework Functions",
    "teradataml: Hyperparameter Tuning",
    "teradataml: AutoML",
    "teradataml: OpenSourceML",
    "teradataml: Vantage Analytics Library Functions",
    "teradataml: Formula",
    "teradataml: Data Preparation Functions",
    "Options"
]

CUSTOM_TITLE_MAP = {
    "Teradata Package for Python Function Reference": "Introduction and Reference Front Matter",
    "teradataml: Context Manager": "teradataml Context Manager",
    "teradataml: DataFrame": "teradataml DataFrame Object and Methods",
    "teradataml: Time Series Functions": "teradataml Time Series Methods",
    "teradataml: DataFrameColumn": "teradataml DataFrameColumn Expressions",
    "Geospatial": "teradataml Geospatial Types and DataFrames",
    "teradataml: Window Aggregates": "teradataml Window Aggregates",
    "teradataml: Series": "teradataml Series Object and Methods",
    "teradataml: General Functions": "teradataml General Functions (Utilities, Configuration, Versioning)",
    "teradataml: Plot": "teradataml Plotting Functions",
    "teradtaml: sdk": "teradataml SDK Functions", 
    "Enterprise Feature Store": "Enterprise Feature Store Functions",
    "teradataml: Bring Your Own Analytics": "teradataml Bring Your Own Analytics",
    "teradataml: Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions": "teradataml Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions",
    "teradataml: Database 17.10.xx Analytic Functions": "teradataml Database 17.10.xx Analytic Functions",
    "teradataml: Database 17.20.xx Analytic Functions": "teradataml Database 17.20.xx Analytic Functions",
    "teradataml: Database 20.00.xx Analytic Functions": "teradataml Database 20.00.xx Analytic Functions",
    "teradataml: Unbounded Array Framework Functions": "teradataml Unbounded Array Framework Functions",
    "teradataml: Hyperparameter Tuning": "teradataml Hyperparameter Tuning",
    "teradataml: AutoML": "teradataml AutoML",
    "teradataml: OpenSourceML": "teradataml OpenSourceML",
    "teradataml: Vantage Analytics Library Functions": "teradataml Vantage Analytics Library Functions",
    "teradataml: Formula": "teradataml Formula Functions",
    "teradataml: Data Preparation Functions": "teradataml Data Preparation Functions",
    "Options": "Options and Configuration"
}

JUNK_LINE_PATTERNS = [
    re.compile(r'^9/14/2025,\s*\d{2}:\d{2}$', re.IGNORECASE),          
    re.compile(r'^9/14/2025,\s*\d{2}:\d{2}\s+Page [\d,]+ of [\d,]+$', re.IGNORECASE),
    re.compile(r'^Page [\d,]+ of [\d,]+$', re.IGNORECASE),                             
    re.compile(r'^PDF Export$', re.IGNORECASE),                                     
    re.compile(r'^https?://docs\.teradata\.com/internal/api/webapp/print/[\w\d-]+$', re.IGNORECASE),                               
]

KNOWN_TITLE_PATTERNS = [
    re.compile(re.escape(title).replace(r'\ ', r'\s*'), re.IGNORECASE) 
    for title in CHAPTER_MAP_TITLES
]

def clean_extracted_text(text):
    """Removes metadata and normalizes whitespace from PDF text."""
    if text is None:
        return ""
    text = text.replace('\xa0', ' ')
    lines = text.split('\n')
    cleaned_lines = []
    all_junk_patterns = JUNK_LINE_PATTERNS + KNOWN_TITLE_PATTERNS

    for line in lines:
        line_stripped = line.strip().replace(u'\xa0', '').replace(u'\u200b', '')
        if not line_stripped:
            continue
        is_junk = False
        for pattern in all_junk_patterns:
            if pattern.fullmatch(line_stripped):
                is_junk = True
                break
        if not is_junk:
            cleaned_line = line
            for junk_pattern in JUNK_LINE_PATTERNS:
                embedded_pattern = re.compile(junk_pattern.pattern.replace(r'^', '').replace(r'$', ''), junk_pattern.flags)
                cleaned_line = embedded_pattern.sub('', cleaned_line)
            cleaned_lines.append(cleaned_line)
    result = '\n'.join(cleaned_lines)
    return result if result else ""

def sanitize_title(title, index):
    """Maps PDF title to filename."""
    base_title = CUSTOM_TITLE_MAP.get(title, title)
    prefixed_title = f"{index:02d} {base_title}"
    safe_title = re.sub(r'[^\w\s\(\)\[\]-]', '', prefixed_title).strip()
    safe_title = re.sub(r'\s+', '_', safe_title)
    return safe_title

def find_chapter_page_ranges(pdf_path, titles):
    """Finds page ranges for each chapter in the PDF."""
    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at '{pdf_path}'.")
        return []

    print(f"Searching for {len(titles)} section headers...")
    doc = None
    try:
        doc = fitz.open(pdf_path)
        total_pages = doc.page_count
        found_starts = {}
        title_index = 0
        
        for page_num_0idx in range(total_pages):
            page = doc.load_page(page_num_0idx)
            text = page.get_text()
            
            if title_index < len(titles):
                current_title = titles[title_index]
                escaped_title = re.escape(current_title)
                flexible_pattern_str = escaped_title.replace(r'\ ', r'[\s\n]+')
                search_pattern = re.compile(flexible_pattern_str, re.IGNORECASE | re.DOTALL)
                match = search_pattern.search(text)
                
                if match:
                    start_page_1idx = page_num_0idx + 1
                    start_offset = match.start()
                    found_starts[current_title] = (start_page_1idx, start_offset)
                    print(f"Found '{current_title}' on page {start_page_1idx}")
                    title_index += 1
        
        final_map = []
        sorted_starts = sorted(found_starts.items(), key=lambda item: item[1][0])
        
        for i, (title, (start_page, start_offset)) in enumerate(sorted_starts):
            if i < len(sorted_starts) - 1:
                next_start_page, next_start_offset = sorted_starts[i+1][1]
                end_page = next_start_page
            else:
                end_page = total_pages
            final_map.append((title, start_page, end_page, start_offset))
        
        return final_map
        
    except Exception as e:
        print(f"Error during PDF processing: {e}")
        import traceback
        traceback.print_exc()
        return []
    finally:
        if doc:
            doc.close()

def create_and_split_files(pdf_path, chapter_map):
    """Splits PDF into markdown files using the updated adjust_indentation()."""
    if not chapter_map:
        print("Cannot create files: Chapter map is empty.")
        return

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created output directory: {OUTPUT_DIR}")

    doc = None
    try:
        doc = fitz.open(pdf_path)

        for idx, (title, start_page_1idx, end_page_1idx, start_offset) in enumerate(chapter_map):
            try:
                start_page_0idx = start_page_1idx - 1
                end_page_0idx = end_page_1idx - 1
                safe_filename = sanitize_title(title, idx + 1)
                output_filename = os.path.join(OUTPUT_DIR, f"{safe_filename}.md")
                
                chapter_text = []
                
                start_page = doc.load_page(start_page_0idx)
                text = start_page.get_text()
                chapter_text.append(text[start_offset:])

                for page_num in range(start_page_0idx + 1, end_page_0idx):
                    page = doc.load_page(page_num)
                    text = page.get_text()
                    chapter_text.append(text)

                if start_page_0idx < end_page_0idx:
                    end_page_content = doc.load_page(end_page_0idx).get_text()
                    end_offset = len(end_page_content)
                    if idx + 1 < len(chapter_map):
                        next_title, next_start_page, _, next_start_offset = chapter_map[idx+1]
                        if next_start_page == end_page_1idx:
                            end_offset = next_start_offset
                    chapter_text.append(end_page_content[:end_offset])

                raw_text = "".join(chapter_text)
                cleaned_content = clean_extracted_text(raw_text)
                if cleaned_content is None:
                    print(f"WARNING: clean_extracted_text returned None for {title}")
                    cleaned_content = ""
                adjusted_content = adjust_indentation(cleaned_content)
                if adjusted_content is None:
                    print(f"WARNING: adjust_indentation returned None for {title}")
                    adjusted_content = cleaned_content

                with open(output_filename, "w", encoding="utf-8") as f:
                    f.write(f"# {CUSTOM_TITLE_MAP.get(title, title)}\n\n") 
                    f.write(adjusted_content)
                
                print(f"Created: {output_filename} (Pages {start_page_1idx}-{end_page_1idx})")
            except Exception as e:
                print(f"ERROR processing chapter '{title}': {e}")
                import traceback
                traceback.print_exc()
        
        print("\nFunction reference splitting complete.")

    except Exception as e:
        print(f"Error during file creation: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if doc:
            doc.close()

# Run the pipeline
print("=" * 70)
print("REGENERATING MARKDOWN FILES WITH FIXED INDENTATION")
print("=" * 70)
calculated_map = find_chapter_page_ranges(PDF_FILE, CHAPTER_MAP_TITLES)

if calculated_map:
    create_and_split_files(PDF_FILE, calculated_map)
    print("\n✓ Regeneration complete!")
else:
    print("ERROR: Could not generate chapter map.")


REGENERATING MARKDOWN FILES WITH FIXED INDENTATION
Searching for 25 section headers...
Found 'Teradata Package for Python Function Reference' on page 1
Found 'teradataml: Context Manager' on page 3
Found 'teradataml: DataFrame' on page 8
Found 'teradataml: Time Series Functions' on page 172
Found 'teradataml: DataFrameColumn' on page 215
Found 'Geospatial' on page 423
Found 'teradataml: Window Aggregates' on page 580
Found 'teradataml: Series' on page 623
Found 'teradataml: General Functions' on page 625
Found 'teradataml: Plot' on page 716
Found 'teradtaml: sdk' on page 725
Found 'Enterprise Feature Store' on page 737
Found 'teradataml: Bring Your Own Analytics' on page 759
Found 'teradataml: Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions' on page 897
Found 'teradataml: Database 17.10.xx Analytic Functions' on page 927
Found 'teradataml: Database 17.20.xx Analytic Functions' on page 1007
Found 'teradataml: Database 20.00.xx Analytic Functions' on page 1220
Found 'teradataml:

In [191]:
import os
import re

directory = "teradataml_function_reference"
functions = []

def is_valid_function_candidate(line):
    # Only allow valid Python identifiers (no spaces, no punctuation, no non-ASCII)
    return re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', line) is not None

def extract_from_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    i = 0
    results = []
    while i < len(lines):
        line = lines[i].rstrip('\n')
        stripped = line.strip()
        
        # FIX PART B: Proper indentation validation (updated for new 4-space indentation)
        # Count all leading whitespace (regular spaces, tabs, and non-breaking spaces)
        leading_ws = len(line) - len(line.lstrip(' \t\xa0'))
        # With new indentation: function names are at 4 spaces, skip deeply indented lines (8+)
        # that are likely code examples or parameter details
        if leading_ws >= 8 and stripped and is_valid_function_candidate(stripped):
            # Line is too deeply indented to be a function definition
            i += 1
            continue

        # Exception: If line is "Functions" and next line is a function signature, use previous line as name
        if stripped == "Functions" and i > 0 and i+1 < len(lines):
            next_line = lines[i+1].strip()
            # Check if next line is a function signature (e.g. Antiselect(data=None, ...)
            if re.match(r'^[A-Za-z_][A-Za-z0-9_]*\s*\(', next_line):
                prev_line = lines[i-1].rstrip('\n')
                prev_stripped = prev_line.strip()
                # Check previous line is a valid candidate
                if prev_stripped and is_valid_function_candidate(prev_stripped):
                    name = prev_stripped
                    location = name
                    k_start = i+2
                    k = k_start
                    while k < len(lines) and not lines[k].strip():
                        k += 1
                    while k < len(lines) and lines[k].strip() and not lines[k].strip().upper().startswith('DESCRIPTION') and lines[k].strip() not in (
                        'PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:'):
                        k += 1
                    desc_lines = []
                    if k < len(lines) and lines[k].strip().upper().startswith('DESCRIPTION'):
                        desc_line = lines[k]
                        if ':' in desc_line and desc_line.strip().endswith(':'):
                            k += 1
                        else:
                            possible_parts = desc_line.split(':', 1)
                            possible = possible_parts[1].strip() if len(possible_parts) > 1 else ''
                            if possible:
                                desc_lines = [possible]
                                k += 1
                            else:
                                k += 1
                    else:
                        while k < len(lines) and lines[k].strip() not in ('PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:'):
                            line = lines[k].strip()
                            if line:
                                desc_lines.append(line)
                            k += 1
                    while k < len(lines):
                        l = lines[k]
                        if l.strip() == '':
                            k += 1
                            continue
                        stripped_l = l.strip()
                        if stripped_l in ('PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:'):
                            break
                        # Check indentation level to determine if it's part of description
                        leading_ws_l = len(l) - len(l.lstrip(' \t\xa0'))
                        if leading_ws_l >= 4:
                            desc_lines.append(l.strip())
                            k += 1
                        else:
                            break
                    desc_text = ' '.join(desc_lines).strip()
                    first_sent = ''
                    if desc_text:
                        sentences = re.split(r'(?<=\.)\s+', desc_text)
                        first_sent = sentences[0].strip()
                        word_count = len(first_sent.split())
                        if word_count < 10 and len(sentences) > 1:
                            first_sent = (first_sent + ' ' + sentences[1].strip()).strip()
                    results.append({
                        'name': name,
                        'location': location,
                        'description': first_sent
                    })
                    i = k
                    continue
                else:
                    i += 1
                    continue
        # Normal candidate function name
        elif stripped and is_valid_function_candidate(stripped):
            name = stripped
        else:
            i += 1
            continue

        # Find the next non-empty line
        j = i + 1
        while j < len(lines) and not lines[j].strip():
            j += 1

        location = None
        k_start = None
        function_indent_level = leading_ws  # Remember indentation level of function name

        if j < len(lines):
            next_line = lines[j].strip()
            if '=' in next_line:
                location_line = next_line
                if ' = ' in location_line:
                    location = location_line.split(' = ')[0].strip()
                else:
                    location = location_line.split('=')[0].strip()
                k_start = j + 1
            elif next_line.startswith(name + '('):
                location = name
                k_start = j + 1
            else:
                # New: Accept a dotted module/function path as a location line
                if re.match(r'^[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+$', next_line):
                    # Example: teradataml.common.formula.Formula.all_columns
                    location = next_line
                    k_start = j + 1
                else:
                    i += 1
                    continue

        if location is None:
            i += 1
            continue

        # NEW: Simplified description extraction
        # Description lines come after the location line and BEFORE the first section keyword
        # (DESCRIPTION:, PARAMETERS:, RAISES:, RETURNS:, EXAMPLES:, NOTES:)
        # Description can either:
        # 1. Start with "DESCRIPTION:" header (followed by content on next lines or same line after colon)
        # 2. Start immediately as plain text (no header)
        
        # FIX: Handle BOTH adjusted (4-space) and unadjusted (0-space) files
        # If function was at 0 spaces, description is also at 0 spaces
        # If function was at 4 spaces, description is at 4+ spaces
        
        k = k_start
        desc_lines = []
        
        # Skip empty lines after location
        while k < len(lines) and not lines[k].strip():
            k += 1
        
        # Check if first content line is "DESCRIPTION:"
        if k < len(lines) and lines[k].strip().upper().startswith('DESCRIPTION'):
            desc_line = lines[k]
            if ':' in desc_line and desc_line.strip().endswith(':'):
                # DESCRIPTION: on its own line, content starts on next line
                k += 1
            else:
                # DESCRIPTION: with content on same line
                possible_parts = desc_line.split(':', 1)
                possible = possible_parts[1].strip() if len(possible_parts) > 1 else ''
                if possible:
                    desc_lines = [possible]
                k += 1
        
        # Now collect description lines until we hit a section keyword
        # For 0-space functions: collect lines at 0 spaces (same level as function)
        # For 4-space functions: collect lines at 4+ spaces
        min_desc_indent = function_indent_level
        
        while k < len(lines):
            l = lines[k]
            if l.strip() == '':
                k += 1
                continue
            
            stripped_l = l.strip()
            # Check for section keywords
            if stripped_l in ('PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:'):
                break
            
            # Check indentation: description lines should be at the same indent level as function or greater
            leading_ws_l = len(l) - len(l.lstrip(' \t\xa0'))
            
            if function_indent_level == 0:
                # For 0-space functions, accept description at 0 or 4+ spaces
                # 0 spaces = content at same level
                # 4+ spaces = indented content (parameters, etc.)
                # Stop at section keywords regardless
                if leading_ws_l in (0, 4, 8, 12):  # Normal description indentation levels
                    # But if it's a section keyword, stop
                    if stripped_l not in ('PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:'):
                        desc_lines.append(l.strip())
                        k += 1
                    else:
                        break
                else:
                    k += 1
            else:
                # For 4-space functions, accept description at 4+ spaces only
                if leading_ws_l >= function_indent_level:
                    desc_lines.append(l.strip())
                    k += 1
                else:
                    # Less indent: end of current function
                    break
        
        desc_text = ' '.join(desc_lines).strip()
        first_sent = ''
        if desc_text:
            sentences = re.split(r'(?<=\.)\s+', desc_text)
            first_sent = sentences[0].strip()
            word_count = len(first_sent.split())
            if word_count < 10 and len(sentences) > 1:
                first_sent = (first_sent + ' ' + sentences[1].strip()).strip()
        
        results.append({
            'name': name,
            'location': location,
            'description': first_sent
        })
        i = k
    return results

if not os.path.isdir(directory):
    print(f"Directory not found: {directory}")
else:
    for idx in range(2, 26):
        fname = None
        for f in os.listdir(directory):
            if f.startswith(f"{idx:02d}_") and f.endswith('.md'):
                fname = f
                break
        if not fname:
            continue
        path = os.path.join(directory, fname)
        try:
            extracted = extract_from_file(path)
        except Exception as e:
            print(f"Error parsing {path}: {e}")
            continue
        for item in extracted:
            item['file'] = fname
            functions.append(item)
table_lines = [
    "| File | Name | Location | Description |",
    "|------|------|----------|-------------|",
]
for func in functions:
    desc = (func.get('description') or '').replace('|', '\\|')
    table_lines.append(f"| {func.get('file','')} | {func.get('name','')} | {func.get('location','')} | {desc} |")
output_path = 'teradataml_function_table.md'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write("# TeradataML Function Reference Table\n\n")
    f.write('\n'.join(table_lines) + '\n')
print(f"Table created: {output_path} with {len(functions)} entries")


Table created: teradataml_function_table.md with 1179 entries


In [192]:
# Remove duplicate rows from teradataml_function_table.md
file_path = 'teradataml_function_table.md'

with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Find the header lines (first 3 lines: title, header, separator)
header_lines = lines[:3]

# Parse the data rows
data_rows = []
seen = set()

for line in lines[3:]:
    stripped = line.strip()
    if stripped:
        # Split by '|' and strip each part
        parts = [p.strip() for p in stripped.split('|')[1:-1]]  # Remove empty first and last
        row_tuple = tuple(parts)
        if row_tuple not in seen:
            seen.add(row_tuple)
            data_rows.append(line)

# Write back the file
with open(file_path, 'w', encoding='utf-8') as f:
    f.writelines(header_lines)
    f.writelines(data_rows)

print(f"Removed duplicates. Total unique rows: {len(data_rows)}")

Removed duplicates. Total unique rows: 1168


In [None]:
import os

# File paths
file1_path = 'teradataml_function_table.md'
file2_path = 'teradataml_function_table copy_synonym3.md'

# Read the files
with open(file1_path, 'r', encoding='utf-8') as f:
    lines1 = f.readlines()

with open(file2_path, 'r', encoding='utf-8') as f:
    lines2 = f.readlines()

# Build synonyms dict from file2 data rows (data starts at index 3)
synonyms_dict = {}
for line in lines2[3:]:
    if line.strip():
        parts = [p.strip() for p in line.strip().split('|')[1:-1]]
        if len(parts) >= 5:
            key = (parts[0], parts[1], parts[2])
            synonyms_dict[key] = parts[4]

# Rebuild the first file content with robust header/separator handling
new_lines = []
# Title
new_lines.append(lines1[0] if lines1 else '# TeradataML Function Reference Table\n')
# Parse header into parts
if len(lines1) > 1 and '|' in lines1[1]:
    header_parts = [p.strip() for p in lines1[1].strip().split('|')[1:-1]]
else:
    header_parts = ['File', 'Name', 'Location', 'Description']
# Ensure header has 5 columns
target_cols = ['File', 'Name', 'Location', 'Description', 'Synonyms/Keywords']
for i in range(len(target_cols)):
    if i >= len(header_parts) or not header_parts[i]:
        header_parts = header_parts[:i] + [target_cols[i]] + header_parts[i:]
header_parts = header_parts[:5]
new_lines.append('| ' + ' | '.join(header_parts) + ' |\n')
# Separator
sep_parts = ['-' * max(6, len(h)) for h in header_parts]
new_lines.append('| ' + ' | '.join(sep_parts) + ' |\n')

num_changed = 0
for i in range(3, len(lines1)):
    line = lines1[i]
    if line.strip() and not line.strip().startswith('|------'):
        parts = [p.strip() for p in line.strip().split('|')[1:-1]]
        # normalize to at least 4 cols
        while len(parts) < 4:
            parts.append('')
        key = (parts[0], parts[1], parts[2])
        syn = synonyms_dict.get(key, '')
        if syn:
            num_changed += 1
        parts = parts[:4] + [syn]
        new_lines.append('| ' + ' | '.join(parts) + ' |\n')

# Write back to file1
with open(file1_path, 'w', encoding='utf-8') as f:
    f.writelines(new_lines)

print(f"Updated {file1_path} with Synonyms/Keywords column based on File, Name, Location match.")
print(f"Number of rows changed: {num_changed}")

Updated teradataml_function_table.md with Synonyms/Keywords column based on File, Name, Location match.
Number of rows changed: 1167
