In [16]:
import pdfplumber
import openai
import base64
import io
from PIL import Image
from dotenv import load_dotenv
from openai import OpenAI
import openai
import os
import ollama

In [17]:
# Run python-dotenv first so the environment lookup below can see its values.
load_dotenv()

# A single lookup of the key feeds both the module-level `openai` setting and
# the explicit client instance.
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = OpenAI(api_key=openai_api_key)

In [54]:
def page_to_base64(page, resolution=300):
    """Render a PDF page to PNG and return the image as base64 text.

    Args:
        page: Object exposing ``to_image(resolution=...)`` whose result has an
            ``original`` image with a PIL-style ``save(fp, format=...)``
            method (a pdfplumber page in this notebook).
        resolution: Value forwarded to ``page.to_image``. Defaults to 300,
            the value this function previously hard-coded, so existing
            callers are unaffected.

    Returns:
        str: Base64 encoding of the PNG bytes, decoded as UTF-8 text.
    """
    img = page.to_image(resolution=resolution).original
    # Keep the PNG in memory only; the buffer is released when the block exits.
    with io.BytesIO() as buffered:
        img.save(buffered, format="PNG")
        png_bytes = buffered.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")

In [78]:
%%time
pdf_path="CrowdStrikeGlobalThreatReport2025.pdf"
with pdfplumber.open(pdf_path) as pdf:
    pages = range(len(pdf.pages))


CPU times: user 90.6 ms, sys: 50 µs, total: 90.7 ms
Wall time: 89.9 ms


In [41]:
# Zero-based index into pdf.pages for the single-page run below; the matching
# output file is named with page_num + 1.
page_num = 10

In [83]:
def process_with_vision_model(page, num, model='llama3.2-vision:90b'):
    """Ask an Ollama vision model to transcribe one PDF page as markdown.

    Args:
        page: Page object accepted by ``page_to_base64`` (a pdfplumber page
            in this notebook).
        num: Page number to mention in the prompt. Callers pass the
            one-based number (index + 1).
        model: Ollama model name. Defaults to the model this function
            previously hard-coded.

    Returns:
        The ``['message']['content']`` field of the chat response.
    """
    response = ollama.chat(
        model=model,
        stream=False,
        messages=[
            {
                'role': 'system',
                'content': 'You are a markdown extractor. Do not summarize. Extract all visible text, headings, and tables as markdown.'
            },
            {
                'role': 'user',
                # Use the `num` argument here. The earlier version read the
                # notebook-level `page_num`, so every page was labelled with
                # the same number regardless of which page was sent.
                'content': (f"This is page {num} of a PDF. Convert it to clean, structured markdown. "
                            "Use tables when needed. Just return the markdown content."),
                'images': [page_to_base64(page)]
            },
        ],
        # Low-randomness sampling settings; presumably chosen to keep the
        # transcription as repeatable as possible.
        options={
            'temperature': 0.0,
            'top_p': 0.1,
        },
    )
    return response['message']['content']

In [None]:
%%time
for i in pages:
    print(f"Processing page {i + 1}...")
    page = pdf.pages[i]
    page_markdown = process_with_vision_model(page, i + 1)                                        
    md_file=f"llama-vision/page{i+1}.md"
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(page_markdown)

In [84]:
%%time
page = pdf.pages[page_num]
text = ""
try:
    text = page.extract_text()
    if not text or len(text.strip()) < 10:
        print(f"[Page {page_num}] likely complex/multimodal — no useful text extracted.")
    else:
        print(f"[Page {page_num}] extracted {len(text.split())} words of text.")
        
except Exception as e:
    print(f"[Page {page_num}] extract_text() failed: {e}")

page_markdown = process_with_vision_model(page, page_num + 1)   
md_file=f"llama-vision/page{page_num+1}.md"
if(is_similar(page_markdown,text)):
    text=''
    
with open(md_file, 'w', encoding='utf-8') as f:
    f.write(f"{page_markdown}\n\n\n{text}")

[Page 10] extract_text() failed: seek of closed file
CPU times: user 492 ms, sys: 149 µs, total: 492 ms
Wall time: 2min 50s


In [10]:
# Show the raw markdown returned by the vision model for the most recently
# processed page. `page_markdown` is assigned by the processing cells; the
# earlier `response[...]` lookup used a name that no cell defines at notebook
# scope, so it raised NameError on a fresh kernel.
page_markdown

"# NAMING CONVENTIONS\n\n| ADVERSARY | NATION-STATE OR CATEGORY |\n| --- | --- |\n| BEAR | RUSSIA |\n| BUFFALO | VIETNAM |\n| CHOLLIMA | DPRK (NORTH KOREA) |\n| CRANE | ROK (REPUBLIC OF KOREA) |\n| HAWK | SYRIA |\n| JACKAL | HACKTIVIST |\n| KITTEN | IRAN |\n| LEOPARD | PAKISTAN |\n| LYNX | GEORGIA |\n| OCELOT | COLOMBIA |\n| PANDA | PEOPLE'S REPUBLIC OF CHINA |\n| SAIGA | KAZAKHSTAN |\n| SPHINX | EGYPT |\n| SPIDER | eCRIME |\n| TIGER | INDIA |\n| WOLF | TURKEY |"

In [70]:
from difflib import SequenceMatcher

# Sanity check of SequenceMatcher.ratio() on two short strings before it is
# used to compare page text.
similarity = SequenceMatcher(
    isjunk=None,
    a="hello world",
    b="hello there",
).ratio()
print(similarity)

0.6363636363636364


In [71]:
def is_similar(a: str, b: str, threshold=0.85) -> bool:
    """Return True when two strings are more than ``threshold`` similar.

    Similarity is ``difflib.SequenceMatcher(...).ratio()``, a value in
    [0, 1]. The comparison is strict (``>``).

    Args:
        a: First string. ``None`` is treated as the empty string.
        b: Second string. ``None`` is treated as the empty string.
        threshold: Ratio that must be exceeded for a match.

    Returns:
        bool: Whether the similarity ratio is greater than ``threshold``.
    """
    # Tolerate a missing extraction result instead of raising TypeError.
    a = a or ""
    b = b or ""
    # autojunk=False: the default heuristic ignores "popular" characters once
    # the second sequence has 200+ items, which can collapse the ratio for
    # page-sized text even when the two strings are nearly identical.
    return SequenceMatcher(None, a, b, autojunk=False).ratio() > threshold