In [14]:
import pymupdf as fitz
import os
import re

# --- Configuration ---
# Path of the Function Reference PDF that will be split into chapters.
# It is a relative path, so it is resolved against the current working directory.
PDF_FILE = "Teradata Package for Python Function Reference.pdf"
# Directory that receives one Markdown file per chapter; create_and_split_files()
# creates it when it does not exist yet.
OUTPUT_DIR = "teradataml_function_reference"

# The 25 section headings to look for, listed in the order they are expected to
# appear in the PDF. find_chapter_page_ranges() searches for them strictly in
# this sequence, so a heading that never matches also prevents every later
# heading from being located.
CHAPTER_MAP_TITLES = [
    "Teradata Package for Python Function Reference",
    "teradataml: Context Manager",
    "teradataml: DataFrame",
    "teradataml: Time Series Functions",
    "teradataml: DataFrameColumn",
    "Geospatial",
    "teradataml: Window Aggregates",
    "teradataml: Series",
    "teradataml: General Functions",
    
    "teradataml: Plot", 
    # NOTE(review): "teradtaml" (sic) is kept on the assumption that the heading
    # is misspelled this way in the PDF itself - confirm against the document.
    "teradtaml: sdk",
    "Enterprise Feature Store",
    "teradataml: Bring Your Own Analytics",
    "teradataml: Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions",
    "teradataml: Database 17.10.xx Analytic Functions",
    "teradataml: Database 17.20.xx Analytic Functions",
    "teradataml: Database 20.00.xx Analytic Functions",
    "teradataml: Unbounded Array Framework Functions",
    "teradataml: Hyperparameter Tuning",
    "teradataml: AutoML",
    "teradataml: OpenSourceML",
    "teradataml: Vantage Analytics Library Functions",
    "teradataml: Formula",
    "teradataml: Data Preparation Functions",
    "Options"
]

# Maps each heading in CHAPTER_MAP_TITLES (the key, used for searching the PDF)
# to a more descriptive, search-friendly title (the value). The mapped title is
# used by sanitize_title() to build the output file name and by
# create_and_split_files() for the H1 line of each Markdown file. Both look the
# title up with .get(title, title), so a heading without an entry falls back to
# its original text.
CUSTOM_TITLE_MAP = {
    "Teradata Package for Python Function Reference": "Introduction and Reference Front Matter",
    "teradataml: Context Manager": "teradataml Context Manager",
    "teradataml: DataFrame": "teradataml DataFrame Object and Methods",
    "teradataml: Time Series Functions": "teradataml Time Series Methods",
    "teradataml: DataFrameColumn": "teradataml DataFrameColumn Expressions",
    "Geospatial": "teradataml Geospatial Types and DataFrames",
    "teradataml: Window Aggregates": "teradataml Window Aggregates",
    "teradataml: Series": "teradataml Series Object and Methods",
    "teradataml: General Functions": "teradataml General Functions (Utilities, Configuration, Versioning)",
    
    "teradataml: Plot": "teradataml Plotting Functions",
    # Key intentionally mirrors the misspelled entry in CHAPTER_MAP_TITLES.
    "teradtaml: sdk": "teradataml SDK Functions", 
    "Enterprise Feature Store": "Enterprise Feature Store Functions",
    "teradataml: Bring Your Own Analytics": "teradataml Bring Your Own Analytics",
    "teradataml: Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions": "teradataml Database 16.20.xx, 17.00.xx, 17.05.xx Analytic Functions",
    "teradataml: Database 17.10.xx Analytic Functions": "teradataml Database 17.10.xx Analytic Functions",
    "teradataml: Database 17.20.xx Analytic Functions": "teradataml Database 17.20.xx Analytic Functions",
    "teradataml: Database 20.00.xx Analytic Functions": "teradataml Database 20.00.xx Analytic Functions",
    "teradataml: Unbounded Array Framework Functions": "teradataml Unbounded Array Framework Functions",
    "teradataml: Hyperparameter Tuning": "teradataml Hyperparameter Tuning",
    "teradataml: AutoML": "teradataml AutoML",
    "teradataml: OpenSourceML": "teradataml OpenSourceML",
    "teradataml: Vantage Analytics Library Functions": "teradataml Vantage Analytics Library Functions",
    "teradataml: Formula": "teradataml Formula Functions",
    "teradataml: Data Preparation Functions": "teradataml Data Preparation Functions",
    "Options": "Options and Configuration"
}


# Regex patterns for known junk/metadata lines, compiled once at module level.
# Every pattern is anchored with `^...$`, so on its own it only matches a line
# that consists solely of the junk text. clean_extracted_text() additionally
# derives anchor-free copies of these patterns (by deleting every '^' and '$'
# from the pattern source) to strip the same text when it is embedded in a
# longer line - so a new pattern must not use '^' or '$' for anything other
# than the outer anchors.
JUNK_LINE_PATTERNS = [
    # 1. Date/time stamp such as "9/14/2025, 16:50". The date is hard-coded, so
    #    only lines carrying exactly that date are matched.
    re.compile(r'^9/14/2025,\s*\d{2}:\d{2}$', re.IGNORECASE),          
    # 2. Combined date/time and page counter, e.g.
    #    "9/14/2025, 16:50 Page 1,498 of 1,675".
    re.compile(r'^9/14/2025,\s*\d{2}:\d{2}\s+Page [\d,]+ of [\d,]+$', re.IGNORECASE),
    # 3. Page counter alone, e.g. "Page 1,457 of 1,675" (thousands separators allowed).
    re.compile(r'^Page [\d,]+ of [\d,]+$', re.IGNORECASE),                             
    # 4. The literal "PDF Export" label (matched case-insensitively).
    re.compile(r'^PDF Export$', re.IGNORECASE),                                     
    # 5. Teradata documentation print URL; only the
    #    /internal/api/webapp/print/<id> form is targeted.
    re.compile(r'^https?://docs\.teradata\.com/internal/api/webapp/print/[\w\d-]+$', re.IGNORECASE),                               
]

def _compile_title_pattern(title):
    """Return a case-insensitive regex for `title` that tolerates any amount of
    whitespace (including none) wherever the title contains a space."""
    flexible_source = re.escape(title).replace(r'\ ', r'\s*')
    return re.compile(flexible_source, re.IGNORECASE)


# One pattern per expected chapter title. clean_extracted_text() drops every
# line that consists solely of one of these titles, which removes the repeated
# header line that often follows the H1 written at the top of each file.
KNOWN_TITLE_PATTERNS = list(map(_compile_title_pattern, CHAPTER_MAP_TITLES))

def clean_extracted_text(text):
    """
    Remove print-export header/footer metadata and repeated chapter titles
    from raw text extracted from the PDF.

    Processing is line based:
      * blank lines (after normalisation) are dropped;
      * a line that, once normalised, consists solely of a junk pattern
        (JUNK_LINE_PATTERNS) or a chapter title (KNOWN_TITLE_PATTERNS) is
        dropped;
      * every other line is kept with its original leading/trailing
        whitespace, but any junk text embedded in it is removed.

    Args:
        text: Raw extracted text, lines separated by '\\n'.

    Returns:
        The cleaned text, lines joined with '\\n'.
    """
    # Anchor-free copies of the junk patterns for in-line removal. They do not
    # depend on the line being processed, so build them once per call instead
    # of once per line and pattern.
    embedded_patterns = [
        re.compile(
            junk_pattern.pattern.replace(r'^', '').replace(r'$', ''),
            junk_pattern.flags,
        )
        for junk_pattern in JUNK_LINE_PATTERNS
    ]
    whole_line_patterns = JUNK_LINE_PATTERNS + KNOWN_TITLE_PATTERNS

    cleaned_lines = []
    for line in text.split('\n'):
        # Normalise before matching: a non-breaking space becomes a regular
        # space (deleting it would glue words together, so "Page\xa01 of 10"
        # could never match the junk patterns) and zero-width spaces are
        # removed. Strip afterwards so characters exposed by the replacement
        # are trimmed as well.
        normalized = line.replace('\xa0', ' ').replace('\u200b', '').strip()

        # Skip truly empty lines
        if not normalized:
            continue

        # Skip lines that are nothing but junk or a repeated chapter title
        if any(pattern.fullmatch(normalized) for pattern in whole_line_patterns):
            continue

        # Keep the line, minus any junk fragments embedded in it
        cleaned_line = line
        for embedded_pattern in embedded_patterns:
            cleaned_line = embedded_pattern.sub('', cleaned_line)
        cleaned_lines.append(cleaned_line)

    return '\n'.join(cleaned_lines)


def sanitize_title(title, index):
    """
    Build a filesystem-safe file stem for a chapter.

    The original PDF title is translated through CUSTOM_TITLE_MAP (falling back
    to the title itself), prefixed with the zero-padded chapter index, and then
    sanitised: every character that is not a word character, whitespace,
    parenthesis, square bracket or hyphen is removed, and each run of
    whitespace becomes a single underscore.
    """
    display_title = CUSTOM_TITLE_MAP.get(title, title)
    numbered_title = f"{index:02d} {display_title}"

    # Unsupported characters are deleted outright (not replaced).
    without_illegal_chars = re.sub(r'[^\w\s\(\)\[\]-]', '', numbered_title).strip()
    return re.sub(r'\s+', '_', without_illegal_chars)

def find_chapter_page_ranges(pdf_path, titles):
    """
    Locate each title in the PDF and derive the page range of its section.

    Titles are searched strictly in the given order: page by page, only the
    next not-yet-found title is looked for, and at most one title is matched
    per page. A title that is never found therefore also prevents all later
    titles from being located; this is reported with a warning.

    Matching is case-insensitive and treats every space in a title as "one or
    more whitespace characters", so titles broken across lines by the PDF text
    extraction are still found.

    Args:
        pdf_path: Path of the PDF file to scan.
        titles: Section titles in the order they are expected to appear.

    Returns:
        A list of (title, start_page_1idx, end_page_1idx, start_offset) tuples
        ordered by start page. start_offset is the character index of the title
        within the extracted text of its start page. end_page_1idx is the page
        on which the next found section starts (that page is shared by both
        sections) or the last page of the document for the final section.
        Returns [] when the file does not exist or processing fails.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at '{pdf_path}'.")
        return []

    print(f"Searching for {len(titles)} section headers in '{pdf_path}'...")

    doc = None
    try:
        # Compile one pattern per title up front instead of once per page.
        # The title is escaped so punctuation is literal; each escaped space
        # then becomes \s+ (which already covers newlines).
        title_patterns = [
            re.compile(re.escape(title).replace(r'\ ', r'\s+'), re.IGNORECASE)
            for title in titles
        ]

        doc = fitz.open(pdf_path)
        total_pages = doc.page_count

        # Stores {title: (start_page_1idx, start_offset)}
        found_starts = {}
        title_index = 0

        for page_num_0idx in range(total_pages):
            # Nothing left to look for: skip loading the remaining pages.
            if title_index >= len(titles):
                break

            text = doc.load_page(page_num_0idx).get_text()
            match = title_patterns[title_index].search(text)
            if not match:
                continue

            current_title = titles[title_index]
            start_page_1idx = page_num_0idx + 1
            start_offset = match.start()
            found_starts[current_title] = (start_page_1idx, start_offset)
            print(f"Found '{current_title}' on page {start_page_1idx} at offset {start_offset} using resilient search.")
            title_index += 1

        # Make an incomplete search visible instead of silently returning a
        # shorter map.
        if title_index < len(titles):
            missing_count = len(titles) - title_index
            print(f"Warning: '{titles[title_index]}' was not found; {missing_count} title(s) from that point on are missing from the chapter map.")

        # (title, (start_page, start_offset)) pairs ordered by start page
        sorted_starts = sorted(found_starts.items(), key=lambda item: item[1][0])

        final_map = []
        for i, (title, (start_page, start_offset)) in enumerate(sorted_starts):
            if i + 1 < len(sorted_starts):
                # A section ends on the page where the next section starts
                end_page = sorted_starts[i + 1][1][0]
            else:
                # Last section runs to the last page of the document
                end_page = total_pages
            final_map.append((title, start_page, end_page, start_offset))

        return final_map

    except Exception as e:
        print(f"An error occurred during PDF processing: {e}")
        return []
    finally:
        # Compare against None explicitly so the document is closed
        # regardless of how the object evaluates in a boolean context.
        if doc is not None:
            doc.close()

def create_and_split_files(pdf_path, chapter_map):
    """
    Write one cleaned Markdown file per chapter into OUTPUT_DIR.

    For every (title, start_page, end_page, start_offset) entry the text is
    assembled from:
      * the start page, beginning at start_offset (where the title was found);
      * all pages strictly between start and end page, in full;
      * the end page, cut off where the next chapter begins when the next
        chapter starts on that page, otherwise in full.
    When a chapter starts and ends on the same page, that single page is cut
    at both offsets. The text is passed through clean_extracted_text() and
    written below an H1 line holding the mapped chapter title.

    Args:
        pdf_path: Path of the PDF file to read.
        chapter_map: List of (title, start_page_1idx, end_page_1idx,
            start_offset) tuples, e.g. from find_chapter_page_ranges().

    Returns:
        None. Errors during processing are reported on stdout, not raised.
    """
    if not chapter_map:
        print("Cannot create files: Chapter map is empty.")
        return

    if not os.path.exists(OUTPUT_DIR):
        # exist_ok guards against the directory appearing between the check
        # and the creation.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        print(f"Created output directory: {OUTPUT_DIR}")

    doc = None
    try:
        doc = fitz.open(pdf_path)

        for idx, (title, start_page_1idx, end_page_1idx, start_offset) in enumerate(chapter_map):
            start_page_0idx = start_page_1idx - 1
            end_page_0idx = end_page_1idx - 1

            # sanitize_title handles the title mapping and the index prefix
            safe_filename = sanitize_title(title, idx + 1)
            output_filename = os.path.join(OUTPUT_DIR, f"{safe_filename}.md")

            # Offset on the END page at which the next chapter begins; None
            # means "take the end page in full".
            end_offset = None
            if idx + 1 < len(chapter_map):
                _, next_start_page, _, next_start_offset = chapter_map[idx + 1]
                if next_start_page == end_page_1idx:
                    end_offset = next_start_offset

            start_text = doc.load_page(start_page_0idx).get_text()

            if start_page_0idx == end_page_0idx:
                # Single-page chapter: the same page is both start and end, so
                # it must also be cut where the next chapter begins.
                chapter_text = [start_text[start_offset:end_offset]]
            else:
                # 1. START page, from the title onwards
                chapter_text = [start_text[start_offset:]]

                if start_page_0idx < end_page_0idx:
                    # 2. MIDDLE pages in full
                    for page_num in range(start_page_0idx + 1, end_page_0idx):
                        chapter_text.append(doc.load_page(page_num).get_text())

                    # 3. END page, up to the start of the next chapter
                    end_page_content = doc.load_page(end_page_0idx).get_text()
                    chapter_text.append(end_page_content[:end_offset])

            # 4. Join and clean the text
            cleaned_content = clean_extracted_text("".join(chapter_text))

            # 5. Write the Markdown file, led by the mapped title as H1
            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(f"# {CUSTOM_TITLE_MAP.get(title, title)}\n\n") 
                f.write(cleaned_content)

            print(f"Created: {output_filename} (Pages {start_page_1idx}-{end_page_1idx})")

        print("\nFunction reference splitting complete.")

    except Exception as e:
        print(f"An error occurred during file creation: {e}")
    finally:
        # Compare against None explicitly so the document is closed
        # regardless of how the object evaluates in a boolean context.
        if doc is not None:
            doc.close()


if __name__ == "__main__":

    # Step 1: scan the PDF for the expected headings and derive page ranges.
    calculated_map = find_chapter_page_ranges(PDF_FILE, CHAPTER_MAP_TITLES)

    if not calculated_map:
        # Nothing usable was found, so there is nothing to split.
        print("Could not generate a chapter map. Please ensure the PDF file is correctly named and contains the expected titles.")
    else:
        # Echo the map as tuple literals, one chapter per line.
        print("\n--- Calculated Chapter Map (Title, Start Page, End Page, Start Offset) ---")
        for title, start, end, offset in calculated_map:
            print(f"(\"{title}\", {start}, {end}, {offset}),")
        print("----------------------------------------------------------\n")

        # Step 2: write one Markdown file per chapter.
        create_and_split_files(PDF_FILE, calculated_map)


Searching for 25 section headers in 'Teradata Package for Python Function Reference.pdf'...
Found 'Teradata Package for Python Function Reference' on page 1 at offset 858 using resilient search.
Found 'teradataml: Context Manager' on page 3 at offset 1113 using resilient search.
Found 'teradataml: DataFrame' on page 8 at offset 1721 using resilient search.
Found 'Teradata Package for Python Function Reference' on page 1 at offset 858 using resilient search.
Found 'teradataml: Context Manager' on page 3 at offset 1113 using resilient search.
Found 'teradataml: DataFrame' on page 8 at offset 1721 using resilient search.
Found 'teradataml: Time Series Functions' on page 172 at offset 4284 using resilient search.
Found 'teradataml: Time Series Functions' on page 172 at offset 4284 using resilient search.
Found 'teradataml: DataFrameColumn' on page 215 at offset 2266 using resilient search.
Found 'teradataml: DataFrameColumn' on page 215 at offset 2266 using resilient search.
Found 'Geospat

In [None]:
import os
import re

# Directory containing the generated Markdown chapter files (one per chapter,
# named with a two-digit index prefix such as "02_").
directory = "teradataml_function_reference"

# Accumulates one dict per parsed entry with the keys 'name', 'location' and
# 'description'; the driver loop further down adds a 'file' key to each.
functions = []

# Section markers that terminate a description block in the chapter files.
_SECTION_HEADERS = ('PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:')

# Substrings that mark a line as example code rather than an entry name.
_CODE_MARKERS = ('df', '[', 'print', 'load_example_data', '>>>')


def _is_candidate_name(raw_line):
    """Return True when raw_line looks like a bare entry name: it starts in
    column 0, is a single token without ' ', ':' or '=', is not a Markdown
    header or an 'Example...' caption, and contains no example-code marker."""
    stripped = raw_line.strip()
    if not stripped or raw_line[0].isspace():
        return False
    if stripped.startswith(('#', 'Example')):
        return False
    # Rejecting ' ' and ':' also rules out chapter titles ("teradataml: ..."),
    # section headers ("PARAMETERS:") and statements such as "import x".
    if any(ch in stripped for ch in ' :='):
        return False
    return not any(marker in stripped for marker in _CODE_MARKERS)


def _first_sentences(desc_text):
    """Return the first sentence of desc_text, plus the second one when the
    first has fewer than 10 words; '' for empty text."""
    if not desc_text:
        return ''
    sentences = re.split(r'(?<=\.)\s+', desc_text)
    summary = sentences[0].strip()
    if len(summary.split()) < 10 and len(sentences) > 1:
        summary = (summary + ' ' + sentences[1].strip()).strip()
    return summary


# Helper: extract function entries from a single markdown file
def extract_from_file(filepath):
    """
    Parse one Markdown chapter file and return its function entries.

    An entry is recognised as a bare name line (see _is_candidate_name) whose
    next non-blank line contains '='; the text left of the '=' is taken as the
    entry's location (full object path). The description is then collected
    from an optional 'DESCRIPTION:' marker and the lines that follow, and is
    reduced to its first sentence (or first two when the first is short).

    Args:
        filepath: Path of the UTF-8 encoded Markdown file to parse.

    Returns:
        A list of dicts with the keys 'name', 'location' and 'description',
        in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    total = len(lines)
    results = []
    i = 0
    while i < total:
        raw = lines[i].rstrip('\n')
        if not _is_candidate_name(raw):
            i += 1
            continue

        # The next non-empty line must be the location line (contains '=')
        j = i + 1
        while j < total and not lines[j].strip():
            j += 1
        if j >= total or '=' not in lines[j]:
            # No location found, continue scanning
            i += 1
            continue

        # Left side of the '=' is the full object path
        location_line = lines[j].strip()
        separator = ' = ' if ' = ' in location_line else '='
        location = location_line.split(separator)[0].strip()

        # Skip wrapped continuation lines of the location: non-blank lines
        # that do not start with an uppercase letter (descriptions are assumed
        # to start with one) and are neither a DESCRIPTION marker nor a
        # section header.
        k = j + 1
        while k < total:
            text = lines[k].strip()
            if (not text
                    or text[0].isupper()
                    or text.upper().startswith('DESCRIPTION')
                    or text in _SECTION_HEADERS):
                break
            k += 1
        # Skip any remaining blank lines
        while k < total and not lines[k].strip():
            k += 1

        desc_lines = []
        if k < total and lines[k].strip().upper().startswith('DESCRIPTION'):
            # A marker line ending in ':' carries no text of its own;
            # otherwise keep whatever follows the first ':' on the same line.
            marker_line = lines[k].strip()
            if not marker_line.endswith(':'):
                inline_text = marker_line.partition(':')[2].strip()
                if inline_text:
                    desc_lines = [inline_text]
            k += 1
        else:
            # No DESCRIPTION marker: everything up to the next section header
            # is description text.
            while k < total and lines[k].strip() not in _SECTION_HEADERS:
                text = lines[k].strip()
                if text:
                    desc_lines.append(text)
                k += 1

        # Collect the indented lines that follow (the body below a
        # 'DESCRIPTION:' marker). Stop at the first non-indented line and also
        # at a section header even when that header is itself indented -
        # otherwise 'PARAMETERS:' and the text below it would be appended to
        # the description.
        while k < total:
            candidate = lines[k]
            text = candidate.strip()
            if not text:
                k += 1
                continue
            if text in _SECTION_HEADERS:
                break
            # Indented means: starts with a space, a tab or a NBSP
            if not candidate.startswith((' ', '\t', '\u00A0')):
                break
            desc_lines.append(text)
            k += 1

        results.append({
            'name': raw.strip(),
            'location': location,
            'description': _first_sentences(' '.join(desc_lines).strip()),
        })

        # Resume scanning after the description block that was consumed
        i = k

    return results

# Iterate over the expected files 02..25 (chapter 01 is the front matter and is
# not parsed).
if not os.path.isdir(directory):
    print(f"Directory not found: {directory}")
else:
    # List the directory once instead of once per chapter index, and sort the
    # names so the file chosen for a prefix does not depend on the arbitrary
    # order returned by os.listdir().
    markdown_files = sorted(f for f in os.listdir(directory) if f.endswith('.md'))

    for idx in range(2, 26):
        prefix = f"{idx:02d}_"
        fname = next((f for f in markdown_files if f.startswith(prefix)), None)
        if not fname:
            continue

        path = os.path.join(directory, fname)
        try:
            extracted = extract_from_file(path)
        except Exception as e:
            # Best effort: report the failing chapter and carry on
            print(f"Error parsing {path}: {e}")
            continue

        for item in extracted:
            item['file'] = fname
            functions.append(item)


def _table_cell(value):
    """Return value as text with '|' escaped so it cannot break the Markdown
    table layout."""
    return str(value or '').replace('|', '\\|')


# Build a Markdown table and write the output file
table_lines = [
    "| File | Name | Location | Description |",
    "|------|------|----------|-------------|",
]
for func in functions:
    table_lines.append(
        f"| {_table_cell(func.get('file'))} | {_table_cell(func.get('name'))} "
        f"| {_table_cell(func.get('location'))} | {_table_cell(func.get('description'))} |"
    )

output_path = 'teradataml_function_table.md'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write("# TeradataML Function Reference Table\n\n")
    f.write('\n'.join(table_lines) + '\n')

print(f"Table created: {output_path} with {len(functions)} entries")


In [None]:
import os
import re

# Simpler variant of the function-table extraction: a name line must be
# followed IMMEDIATELY by a "<location> = ..." line, and the description runs
# up to the next section header.
# NOTE(review): this cell writes to the same output file
# (teradataml_function_table.md) as the extract_from_file-based cell, but with
# a three-column layout (no File column); whichever cell runs last wins.
directory = "teradataml_function_reference"
functions = []

section_headers = ('PARAMETERS:', 'RAISES:', 'RETURNS:', 'EXAMPLES:', 'NOTES:')

# List the directory once (sorted for a deterministic pick) and tolerate a
# missing directory instead of failing with FileNotFoundError.
if os.path.isdir(directory):
    chapter_files = sorted(f for f in os.listdir(directory) if f.endswith(".md"))
else:
    print(f"Directory not found: {directory}")
    chapter_files = []

# Chapters 02..25; chapter 01 is not parsed.
for i in range(2, 26):
    prefix = f"{i:02d}_"
    filename = next((f for f in chapter_files if f.startswith(prefix)), None)
    if not filename:
        continue

    filepath = os.path.join(directory, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    i_line = 0
    while i_line < len(lines):
        line = lines[i_line].strip()

        # Possible function name: non-empty, not a Markdown header or chapter
        # title, no '=', and not a section header.
        is_candidate = (
            bool(line)
            and not line.startswith(('#', 'teradataml:'))
            and '=' not in line
            and line not in section_headers
        )
        # The very next line must carry the location in "<path> = ..." form.
        has_location = (
            is_candidate
            and i_line + 1 < len(lines)
            and ' = ' in lines[i_line + 1].strip()
        )
        if not has_location:
            i_line += 1
            continue

        name = line
        location = lines[i_line + 1].strip().split(' = ')[0]

        # Move past the location line and an optional bare DESCRIPTION: marker
        i_line += 2
        if i_line < len(lines) and lines[i_line].strip() == "DESCRIPTION:":
            i_line += 1

        # Collect description lines up to the next section header (or EOF)
        desc_lines = []
        while i_line < len(lines):
            text = lines[i_line].strip()
            if text in section_headers:
                break
            if text:
                desc_lines.append(text)
            i_line += 1

        desc_text = ' '.join(desc_lines).strip()
        if desc_text:
            # First sentence, plus the second one when the first is short
            sentences = re.split(r'(?<=\.)\s+', desc_text)
            first_sent = sentences[0].strip() if sentences else ''
            if len(first_sent.split()) < 10 and len(sentences) > 1:
                first_sent += ' ' + sentences[1].strip()
            functions.append({'name': name, 'location': location, 'description': first_sent})
        # Entries without any description text are skipped.

# Create the table; rows are joined once instead of growing a string with +=
table_rows = ["| Name | Location | Description |\n|------|----------|-------------|\n"]
for func in functions:
    # escape | in description
    desc = func['description'].replace('|', '\\|')
    table_rows.append(f"| {func['name']} | {func['location']} | {desc} |\n")
table = "".join(table_rows)

# write to file
with open("teradataml_function_table.md", "w", encoding="utf-8") as f:
    f.write("# TeradataML Function Reference Table\n\n")
    f.write(table)

print("Table created: teradataml_function_table.md")

Table created: teradataml_function_table.md
