In [None]:
# un-comment any of the following:
!pip install pymupdf
!pip install matplotlib
!pip install pandas
!pip install numpy
!pip install pymupdf4llm

In [43]:
from dotenv import load_dotenv
import os
import base64

# Load .env file
load_dotenv()

# Get the API key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Set it for OpenAI client
import openai
openai.api_key = openai_api_key

In [44]:
import fitz
from PIL import Image

from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)

In [23]:
file="/Users/anandpardeep/AI/CrowdStrikeGlobalThreatReport2025.pdf"
doc = fitz.open(file)


In [13]:
page.get_pixmap

In [None]:
for page_num, page in enumerate(doc, start=1):
    table_finder = page.find_tables()
    tables = table_finder.tables

    print(f"Page {page_num}: Found {len(tables)} table(s)")
    
    for i, table in enumerate(tables, start=1):
        print(f"\nTable {i} (Markdown):")
        print(table.to_pandas())


In [None]:
# Process all pages and combine blocks from each
full_combined_blocks = []

for page_num, page in enumerate(doc, start=1):
    page_blocks = []

    # Extract text blocks
    text_blocks = page.get_text("dict")["blocks"]
    for block in text_blocks:
        if "lines" in block and block["lines"]:
            y_top = block["bbox"][1]
            text = "\n".join([span["text"] for line in block["lines"] for span in line["spans"]])
            page_blocks.append({"y": y_top, "type": "text", "content": text.strip(), "page": page_num})

    # Extract table blocks (if available)
    if hasattr(page, "find_tables"):
        try:
            tables = page.find_tables().tables
            for table in tables:
                y_top = table.bbox[1]
                md_table = table.to_markdown()
                page_blocks.append({"y": y_top, "type": "table", "content": md_table, "page": page_num})
        except Exception as e:
            page_blocks.append({"y": 99999, "type": "error", "content": f"Table extraction error: {str(e)}", "page": page_num})

    # Sort blocks on this page
    page_blocks.sort(key=lambda x: x["y"])
    full_combined_blocks.extend(page_blocks)

# Combine and format the entire document preview
combined_full_preview = "\n\n---\n\n".join(
    f"[PAGE {block['page']}] [{block['type'].upper()}]\n{block['content']}" for block in full_combined_blocks
)

combined_full_preview[:4000]  # Truncate for readability


In [None]:
import pymupdf4llm

md_text = pymupdf4llm.to_markdown(file,show_progress=True,use_glyphs=True,write_images=False,image_path="/Users/anandpardeep/AI/images",page_chunks=True)

import pathlib
pathlib.Path("/Users/anandpardeep/AI/output.md").write_bytes(md_text)

In [None]:
last_page = doc[-1]

# Print raw text
print("Raw text:", last_page.get_text("text"))

# Print block structure
print("Blocks:", last_page.get_text("blocks"))

In [34]:
chunks = pymupdf4llm.to_markdown(file, page_chunks=True)

# Add page headers and combine
md_text = ""
for i, page_md in enumerate(chunks, start=1):
    md_text += f"# Page {i}\n\n{page_md}\n\n---\n\n"

# Save to file
pathlib.Path("/Users/anandpardeep/AI/chunked.md").write_text(md_text, encoding="utf-8")

177910

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

In [39]:
def describe_image_with_gpt4(base64_png: str) -> str:
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe the content and purpose of this image."},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_png}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=300
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"(Image description failed: {e})"

In [47]:
def extract_page_blocks_as_markdown(page, page_number: int) -> str:
    blocks = []

    # Extract text blocks
    for b in page.get_text("dict")["blocks"]:
        if "lines" in b:
            text = "\n".join([span["text"] for line in b["lines"] for span in line["spans"]])
            if text.strip():
                blocks.append({"type": "text", "y": float(b["bbox"][1]), "content": text.strip()})

    # Extract table blocks
    try:
        for table in page.find_tables().tables:
            blocks.append({"type": "table", "y": float(table.bbox[1]), "content": table.to_markdown()})
    except Exception:
        pass  # Safe fallback if table detection fails

    # Extract and summarize images
    for img in page.get_images(full=True):
        xref = img[0]
        base_image = page.parent.extract_image(xref)
        image_bytes = base_image["image"]
        image_b64 = base64.b64encode(image_bytes).decode("utf-8")
        caption = describe_image_with_gpt4(image_b64)
        image_md = f"![{caption}](data:image/png;base64,{image_b64})"
        blocks.append({"type": "image", "y": float(img[5]), "content": image_md})

    # Sort all blocks top-to-bottom
    sorted_blocks = sorted(blocks, key=lambda b: b["y"])

    # Build markdown for this page
    page_markdown = [f"# Page {page_number}"]
    for block in sorted_blocks:
        page_markdown.append(block["content"])
    page_markdown.append("\n---\n")

    return "\n\n".join(page_markdown)

In [41]:
def convert_pdf_to_markdown(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    all_pages_md = []

    for page_number, page in enumerate(doc, start=1):
        page_md = extract_page_blocks_as_markdown(page, page_number)
        all_pages_md.append(page_md)

    return "\n".join(all_pages_md)

In [48]:
# Call the function and save output
full_markdown = convert_pdf_to_markdown(file)

# Save to file
output_path = "/Users/anandpardeep/AI/semantic.md"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(full_markdown)

output_path  # Return the path to the saved Markdown file


ValueError: could not convert string to float: 'ICCBased'