In [1]:
# NOTE(review): looks like a leftover kernel sanity check; it binds no names
# for later cells — consider removing it from the finished notebook.
print("Hello")

Hello


## Data Extraction with Docling

In this notebook, we'll extract content from PDFs into structured formats:

- **Markdown**: Full document text with page breaks for chunking
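- **Images**: Save pages containing large charts/diagrams (>500x500 pixels). Note: the code shown below only creates the image output folders; it does not write any page images yet.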
- **Tables**: Extract each table with the 2 preceding non-empty lines as context, plus page-number metadata

**Output Structure:**
```
data/rag-data/markdown/{company}/{document}.md
data/rag-data/images/{company}/{document}/page_5.png
data/rag-data/tables/{company}/{document}/table_1_page_5.md
```

In [2]:
from helpers.doclingg import pdf_to_docling_converter
from helpers.common import MARKDOWN_DIR, TABLES_DIR
from pathlib import Path
from typing import List, Tuple


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory paths (MARKDOWN_DIR, TABLES_DIR from helpers.common)
# Root folder of the source PDFs. NOTE(review): no live cell reads this
# constant; the run cells spell out "data/rag-data/pdfs/<company>" directly.
DATA_DIR = "data/rag-data/pdfs"
# Root under which extract_pdf_content creates {company}/{document} folders.
OUTPUT_IMAGES_DIR = "data/rag-data/images"

In [3]:
def extract_metadata_from_filename(filename: str):
    """
    Extract metadata from filename.

    Expected format: CompanyName DocType [Quarter] Year.pdf
    Examples:
        - Amazon 10-Q Q1 2024.pdf
        - Microsoft 10-K 2023.pdf

    Args:
        filename: File name or stem. Trailing ``.pdf`` / ``.md`` extensions
            are ignored regardless of letter case.

    Returns:
        dict with keys ``company_name`` (first part), ``doc_type`` (second
        part), ``fiscal_quarter`` (third part when the name has exactly four
        whitespace-separated parts, otherwise ``None``) and ``fiscal_year``
        (last part).

    Raises:
        ValueError: if the name has fewer than two whitespace-separated parts.
    """

    # Strip only *trailing* extensions, case-insensitively. A blanket
    # str.replace would also remove ".pdf"/".md" from the middle of a name
    # and would leave an upper-case ".PDF" glued to the fiscal year.
    name = filename.strip()
    while name.lower().endswith(('.pdf', '.md')):
        name = name[:name.rfind('.')]

    parts = name.split()
    if len(parts) < 2:
        raise ValueError(
            f"Cannot parse metadata from {filename!r}: "
            "expected 'CompanyName DocType [Quarter] Year'"
        )

    return {
        'company_name': parts[0],
        'doc_type': parts[1],
        'fiscal_quarter': parts[2] if len(parts) == 4 else None,
        'fiscal_year': parts[-1]
    }

extract_metadata_from_filename('apple 10-k 2023.pdf')

{'company_name': 'apple',
 'doc_type': '10-k',
 'fiscal_quarter': None,
 'fiscal_year': '2023'}

## Tables

In [4]:
def extract_context_and_table(lines: List[str], table_index: int):
    """
    Collect one table plus the lines immediately preceding it.

    Args:
        lines: The markdown document as a list of lines.
        table_index: Index of the first row of the table.

    Returns:
        A ``(content, next_index)`` pair: ``content`` is up to two preceding
        lines, a blank line, then the table rows; ``next_index`` is the index
        of the first line after the table.
    """
    table_rows = []
    cursor = table_index

    # Consume consecutive rows; a table ends at the first line not starting with '|'.
    while cursor < len(lines):
        row = lines[cursor]
        if not row.startswith('|'):
            break
        table_rows.append(row)
        cursor += 1

    # Up to two lines directly above the table serve as its context.
    context_start = max(0, table_index - 2)
    context_text = '\n'.join(lines[context_start:table_index])
    table_text = '\n'.join(table_rows)

    return context_text + '\n\n' + table_text, cursor
    
def extract_tables_with_context(markdown_text: str):
    """
    Scan markdown for tables and pair each with its context and page number.

    Page numbers start at 1 and advance on every line containing the
    ``<!-- page break -->`` marker.

    Returns:
        List of ``(content, table_name, page_number)`` tuples, where
        ``table_name`` is ``table_1``, ``table_2``, ... in document order.
    """
    # Blank lines are dropped first, so a table's context is its nearest non-empty lines.
    rows = [ln for ln in markdown_text.split('\n') if ln.strip()]

    found = []
    page = 1
    pos = 0

    while pos < len(rows):
        row = rows[pos]

        if '<!-- page break -->' in row:
            # Marker line: bump the page counter and move on.
            page += 1
            pos += 1
        elif row.startswith('|') and row.count('|') > 1:
            # Table start: the helper returns the index just past the table.
            block, pos = extract_context_and_table(rows, pos)
            found.append((block, f"table_{len(found) + 1}", page))
        else:
            pos += 1

    return found
    
def save_tables(markdown_text, tables_dir):
    """
    Write every table found in ``markdown_text`` to its own markdown file.

    Each file is named ``{table_name}_page_{page_num}.md`` and starts with a
    ``**Page:** N`` line followed by the table and its context lines.

    Args:
        markdown_text: Markdown to scan with ``extract_tables_with_context``.
        tables_dir: Directory to write into, as a ``str`` or ``Path``.
            It is not created here.
    """
    # Coerce so a plain string path (like the notebook's directory constants)
    # works with the `/` operator instead of raising TypeError.
    out_dir = Path(tables_dir)

    for table_content, table_name, page_num in extract_tables_with_context(markdown_text):
        content_with_page = f"**Page:** {page_num}\n\n{table_content}"
        (out_dir / f"{table_name}_page_{page_num}.md").write_text(content_with_page, encoding='utf-8')


## FINAL Extraction function

In [5]:
def extract_pdf_content(pdf_file):
    """
    Convert one PDF to markdown and save the markdown and its tables.

    The company name is taken from the file stem via
    ``extract_metadata_from_filename`` and used as the output sub-folder.
    The markdown goes to ``MARKDOWN_DIR/{company}/{stem}.md`` and the tables
    to ``TABLES_DIR/{company}/{stem}/``. The matching images folder under
    ``OUTPUT_IMAGES_DIR`` is created, but this function writes nothing into it.

    Args:
        pdf_file: Path to the PDF; the code reads its ``.stem``, so a
            ``pathlib.Path`` is expected.
    """
    doc_name = pdf_file.stem
    company = extract_metadata_from_filename(doc_name)['company_name']

    markdown_out = Path(MARKDOWN_DIR) / company
    images_out = Path(OUTPUT_IMAGES_DIR) / company / doc_name
    tables_out = Path(TABLES_DIR) / company / doc_name

    # Ensure all three output folders exist before any file is written.
    for target in (markdown_out, images_out, tables_out):
        target.mkdir(parents=True, exist_ok=True)

    conversion = pdf_to_docling_converter.convert(pdf_file)

    # The page-break placeholder lets the table extraction track page numbers.
    markdown_text = conversion.document.export_to_markdown(page_break_placeholder="<!-- page break -->")

    (markdown_out / f"{doc_name}.md").write_text(markdown_text, encoding='utf-8')

    save_tables(markdown_text, tables_out)


In [6]:
# pdf_file = Path("data/rag-data/pdfs/apple/apple 8-k q4 2023.pdf")
# Folder that holds the Apple PDFs.
apple_path = Path("data/rag-data/pdfs/apple")
# data_path = Path(DATA_DIR/"apple")
# NOTE(review): the commented line above would raise TypeError because DATA_DIR
# is a plain str; it would need Path(DATA_DIR) / "apple".
# Echo the path as the cell output.
apple_path

PosixPath('data/rag-data/pdfs/apple')

In [None]:
# NOTE(review): this cell is fully commented out, yet a progress-bar output is
# still stored below it, so the saved output is not reproducible as written.
# Re-enable the loop or clear the stale output.
# from tqdm import tqdm

# pdf_files = list(apple_path.rglob("*.pdf"))

# for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
#     extract_pdf_content(pdf_file)


Processing PDFs: 100%|██████████| 6/6 [07:18<00:00, 73.05s/it]


In [None]:
# NOTE(review): this cell is fully commented out, yet a progress-bar output is
# still stored below it, so the saved output is not reproducible as written.
# Re-enable the loop or clear the stale output.
# from tqdm import tqdm

# amazon_path = Path("data/rag-data/pdfs/amazon/")
# pdf_files = list[Path](amazon_path.rglob("*.pdf"))
# for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
#     extract_pdf_content(pdf_file)


Processing PDFs: 100%|██████████| 7/7 [14:52<00:00, 127.56s/it]


In [11]:
from tqdm import tqdm

# Convert every PDF found (recursively) under the Meta folder.
meta_path = Path("data/rag-data/pdfs/meta/")
# Use the plain list constructor: calling the generic alias `list[Path](...)`
# only works on Python 3.9+ and is not the intended use of a type alias.
pdf_files = list(meta_path.rglob("*.pdf"))
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    extract_pdf_content(pdf_file)

Processing PDFs: 100%|██████████| 9/9 [10:50<00:00, 72.26s/it] 


In [13]:
from tqdm import tqdm

# Convert every PDF found (recursively) under the Google folder.
google_path = Path("data/rag-data/pdfs/google/")
# Use the plain list constructor: calling the generic alias `list[Path](...)`
# only works on Python 3.9+ and is not the intended use of a type alias.
pdf_files = list(google_path.rglob("*.pdf"))
for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    extract_pdf_content(pdf_file)

Processing PDFs: 100%|██████████| 6/6 [16:53<00:00, 168.94s/it]


In [None]:

# Single-file run for a TCS analyst report.
# NOTE(review): this file name does not follow the
# "CompanyName DocType [Quarter] Year" pattern that
# extract_metadata_from_filename expects. The stem splits into five parts, so
# company_name becomes "Analyst" and the output lands under an "Analyst"
# folder rather than "tcs".
pdf_file_path = Path("data/rag-data/pdfs/tcs/Analyst Report 1- Q1 FY26.pdf")
extract_pdf_content(pdf_file_path)