In [8]:
# Env: OpenAI310


In [17]:
import requests
import json
import time
import re
import os
import base64
import pypdfium2 as pdfium
from openai import OpenAI
from docling.document_converter import DocumentConverter

# Presumably the Ollama "generate" REST endpoint (inferred from the URL path).
# NOTE(review): no code in the visible cells reads `api_url` - confirm it is still needed.
api_url = 'https://www.chatmol.org/ollama/api/generate'

def encode_image(image_path):
    """Return the content of the file at ``image_path`` as a base64 text string.

    The file is opened in binary mode, its bytes are base64-encoded and the
    encoded bytes are decoded to ``str`` as UTF-8.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
        
# LLM_CLIENT
def get_llm_client(provider):
    """Build an ``OpenAI``-interface client for the named LLM provider.

    Args:
        provider: ``"OpenAI"``, ``"DeepSeek"`` or ``"Ollama"`` (case-sensitive).

    Returns:
        An ``OpenAI`` client configured for the provider, or ``None`` (after
        printing a message) when the provider name is not recognised.

    Raises:
        KeyError: if the environment variable holding the provider's API key
            (``OPENAI_API_KEY`` / ``DS_API_KEY``) is not set.
    """
    if provider == "OpenAI":
        return OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    if provider == "DeepSeek":
        # model: deepseek-chat, 128k context window size, 8k max output tokens
        return OpenAI(api_key=os.environ["DS_API_KEY"], base_url="https://api.deepseek.com")

    if provider == "Ollama":
        # Ollama is reached through the OpenAI-compatible interface.
        # Alternative endpoint: 'https://www.chatmol.org/ollama/v1/'
        return OpenAI(
            base_url='http://100.89.180.132:11434/v1/',
            api_key='ollama',  # required but ignored
        )

    print("Unknown LLM provider")
    return None

#OpenAI client
# client = get_llm_client(provider="OpenAI")
# client = get_llm_client(provider="DeepSeek")
# client = get_llm_client(provider="Ollama")

In [18]:
def get_correct_headings(doc_md, llm_client, llm_model):
    """Ask an LLM for the headings of ``doc_md`` with corrected levels.

    Every line of ``doc_md`` that starts with ``##`` is collected and sent,
    together with the full document, to ``llm_client.chat.completions.create``.

    Args:
        doc_md: Markdown text of the document.
        llm_client: Object exposing the OpenAI-style ``chat.completions.create`` call.
        llm_model: Model name forwarded to the completion call.

    Returns:
        The message content of the first choice in the reply.
    """
    prompt = """Your task is to correct the markdown heading levels (the number of hash symbol #). In the following markdown text, all the headings are all level 2,
    (i.e. each heading starts with ##). Let's start the top section headings with ##, and if a section has subsections, these subsection headings should
    start with ###, and so on for sub-subsections. Please set the heading levels correctly according to the content structures. For simplicity in your 
    output, you can only response with all headings. Please consider the following rules:
    
    1. Let's start from level 2, like: ## <seciton_header>
    2. If the section header has a number, please also keep the number.
    3. Please don't add anything (such as level) that is not in the original headers. 
    4. Please only export headings starting with hash symbol #
    """
    print("doc_md length = ", len(doc_md))
    doc_lines = doc_md.split("\n")
    print(len(doc_lines))

    # One heading per line, each terminated by a newline.
    old_headings = "".join(text + "\n" for text in doc_lines if text.startswith("##"))

    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"Here is the current markdown text:\n\n{doc_md}"},
        {"role": "user", "content": f"Here is list of the headings in the current markdown: \n\n{old_headings}"},
        {"role": "user", "content": "Please correct the heading levels based on the full markdown document, and export the corrected list of heading, each per line."},
    ]
    completion = llm_client.chat.completions.create(
        model=llm_model,
        messages=conversation,
        temperature=0.0,
        max_tokens=4096,
    )
    return completion.choices[0].message.content

In [19]:
def replace_headings(original_markdown, correct_headings):
    """Rewrite heading lines of ``original_markdown`` to the corrected levels.

    Args:
        original_markdown: Markdown text whose heading levels should be fixed.
        correct_headings: Newline-separated corrected headings (for example the
            reply of ``get_correct_headings``). Lines that do not start with
            ``#`` or that carry no heading text are ignored.

    Returns:
        The markdown text in which every heading line (1-6 ``#``) whose text
        equals a corrected heading's text is replaced by that corrected heading.
        If the same heading text occurs several times in ``correct_headings``,
        the last occurrence wins.
    """
    # heading text -> corrected heading line
    corrected_by_text = {}
    for raw_line in correct_headings.splitlines():
        new_header = raw_line.strip()
        # Skip blank lines and any chatter the LLM added around the headings;
        # applying those would turn real headings into plain text.
        if not new_header.startswith('#'):
            continue
        header_text = new_header.lstrip('# ').strip()
        if not header_text:
            continue
        corrected_by_text[header_text] = new_header

    updated_markdown = original_markdown
    for header_text, new_header in corrected_by_text.items():
        # [ \t]* instead of \s* so that a match can never span a line break.
        pattern = re.compile(
            r'^#{1,6}[ \t]*' + re.escape(header_text) + r'[ \t]*$',
            re.MULTILINE,
        )
        # A function replacement inserts the heading literally; a plain string
        # would be parsed as a template, so backslashes in a heading would raise
        # re.error or be expanded as group references.
        updated_markdown = pattern.sub(lambda _match, line=new_header: line, updated_markdown)

    return updated_markdown

In [20]:
def docling_pdf_parser(pdf_source):
    """Convert ``pdf_source`` to markdown with a ``DocumentConverter``.

    ``pdf_source`` is handed unchanged to ``DocumentConverter().convert``; the
    markdown export of the resulting document is returned.
    """
    conversion = DocumentConverter().convert(pdf_source)
    print("Done with docling convert")
    return conversion.document.export_to_markdown()

In [21]:
# PDF Parser: convert PDF into markdown format using visual LLMs
def llm_pdf_parser(pdf_file_path, client, model):
    """Convert a PDF to markdown by sending rendered page images to a vision LLM.

    The pages are processed in batches. Every page of a batch is rendered,
    JPEG-encoded in memory, base64-encoded and sent to
    ``client.chat.completions.create`` together with the conversion prompt.
    When a reply ends with a ``finish_reason`` other than ``"stop"``, the model
    is asked to continue; at most ``max_continuations`` such requests are made
    per batch so that a reply which never reports ``"stop"`` cannot loop forever.

    Args:
        pdf_file_path: PDF location, passed to ``pdfium.PdfDocument``.
        client: Object exposing the OpenAI-style ``chat.completions.create`` call.
        model: Model name forwarded to every completion call.

    Returns:
        ``(md_text, token_usage)``: the markdown of all batches, each followed
        by a newline, and the sum of ``usage.total_tokens`` over all calls.
    """
    import io  # only needed here; keeps the block independent of the import cell

    prompt = """
    You are an expert to convert a PDF file of a scientific paper into markdown text. This markdown text from the PDF should match the structure of the the 
    content in PDF. Only export pure markdown and nothing else. Do not explain the output. All headerings will start with ##, ###, ####, and so on. 

    A scientific paper usually includes a title of the paper, a list of authors and their affiliations. Please extract all of them

    Don't add any extra headings if not in the original PDF. For example, don't add a heading of continuation. 

    Don't add extra marks in your output, such as '```markdown'!

    Don't include page numbers in the markdown, don't use page numbers as markdown headings.

    If you see a table in PDF, convert it into a markdown table. If there is a table title, put the table content immediately after the table 
    title. If there are notes of the table, also put the notes immediately after the table without blank line. 
    
    """
    n_dpi = 108              # render resolution; pdfium's scale is relative to 72 dpi
    max_tokens = 2048        # output token limit per completion call
    batch_size = 1           # pages sent per request
    max_continuations = 10   # upper bound on "please continue" requests per batch

    pages = pdfium.PdfDocument(pdf_file_path)
    n_pages = len(pages)

    md_text = ""
    token_usage = 0
    pre_batch_text = ''

    for nstart in range(0, n_pages, batch_size):
        nend = min(nstart + batch_size, n_pages)

        image_contents = []
        for i in range(nstart, nend):
            image = pages[i].render(scale=n_dpi / 72).to_pil()
            # Encode in memory instead of going through a shared temp file.
            buffer = io.BytesIO()
            image.save(buffer, "JPEG")
            b64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
            image_contents += [
                {"type": "text", "text": f"This is page {i + 1}"},
                # The MIME type must match the JPEG encoding used above.
                {"type": "image_url", "image_url": {
                    "url": f"data:image/jpeg;base64,{b64_image}"}
                 },
            ]

        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": image_contents}]
        # Only the batch that contains the first page carries title/author data.
        if nstart == 0:
            messages.append({"role": "user", "content": "Please extract all text in each page, including the title of the paper, the author list and their contact information"})
        else:
            messages.append({"role": "user", "content": "Please extract all text in each page"})

        responses = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.0,
            max_tokens=max_tokens,
        )
        # `content` can be None; treat that as empty text.
        current_batch_text = responses.choices[0].message.content or ""
        finish_reason = responses.choices[0].finish_reason
        print("Finish reason", finish_reason)
        token_usage += responses.usage.total_tokens

        # Ask for the remainder while the reply was cut off.
        continuations = 0
        while finish_reason != "stop" and continuations < max_continuations:
            continuations += 1
            responses = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": image_contents},
                    {"role": "user", "content": "This is the markdown generated from the PDF so far:"},
                    {"role": "user", "content": f"{pre_batch_text + current_batch_text}"},
                    {"role": "user", "content": "Please complete the remaining markdown content."},
                ],
                temperature=0.0,
                max_tokens=max_tokens,
            )
            current_batch_text += responses.choices[0].message.content or ""
            finish_reason = responses.choices[0].finish_reason
            token_usage += responses.usage.total_tokens

        if finish_reason != "stop":
            print(f"Warning: pages {nstart + 1}-{nend} did not finish after "
                  f"{max_continuations} continuation requests (finish reason: {finish_reason})")

        md_text += current_batch_text + "\n"
        pre_batch_text = current_batch_text

    return md_text, token_usage

In [22]:
def pdf_to_markdown(pdf_source, method='docling', reflection_provider="Ollama",
                    reflection_model="llama33-16k:latest", correct_headings=False):
    """Convert a PDF to markdown, optionally correcting heading levels with an LLM.

    Args:
        pdf_source: PDF location handed to the selected parser.
        method: ``'docling'`` uses ``docling_pdf_parser``; any other value uses
            ``llm_pdf_parser`` with the OpenAI ``gpt-4o`` model.
        reflection_provider: Provider name passed to ``get_llm_client`` for the
            heading-correction step.
        reflection_model: Model name used for the heading-correction step.
        correct_headings: When ``True``, run ``get_correct_headings`` and
            ``replace_headings`` on the parser output. Defaults to ``False``,
            which returns the raw parser output.

    Returns:
        The markdown text.

    Raises:
        ValueError: if ``correct_headings`` is requested and
            ``reflection_provider`` is not known to ``get_llm_client``.
    """
    if method == 'docling':
        raw_md = docling_pdf_parser(pdf_source)
    else:
        parser_client = get_llm_client("OpenAI")
        raw_md, _token_usage = llm_pdf_parser(pdf_source, parser_client, 'gpt-4o')

    # The reflection client is only built when it is actually used, so plain
    # conversions do not need the reflection provider's credentials.
    if not correct_headings:
        return raw_md

    # Self-reflection for markdown heading corrections
    reflection_client = get_llm_client(reflection_provider)
    if reflection_client is None:
        raise ValueError(f"Unknown reflection provider: {reflection_provider!r}")
    new_headings = get_correct_headings(raw_md, reflection_client, reflection_model)
    return replace_headings(raw_md, new_headings)

In [7]:
# pdf_file = "2024.langmol-1.7.pdf"
# time1 = time.time()
# doc_md = pdf_to_markdown(pdf_file, 'docling', 'OpenAI', 'gpt-4o')
# print("Time = ", time.time()-time1)
# print(doc_md)

In [8]:
# #source = "https://arxiv.org/pdf/2408.09869"
# pdf_source = "https://aclanthology.org/2024.langmol-1.7.pdf"
# time1 = time.time()
# # DeepSeek V3 context window size upto 128k
# # Output size: 
# doc_md = pdf_to_markdown(pdf_source, 'docling', 'DeepSeek', 'deepseek-chat')
# print("Time = ", time.time()-time1)
# print(doc_md)

In [9]:
# #source = "https://arxiv.org/pdf/2408.09869"
# pdf_source = "https://aclanthology.org/2024.langmol-1.7.pdf"
# time1 = time.time()
# # llama3.3:70B
# doc_md = pdf_to_markdown(pdf_source, 'docling', 'Ollama', 'llama33-16k:latest')
# print("Time = ", time.time()-time1)
# print(doc_md)

In [23]:
import pandas as pd

# Batch conversion: convert every PDF in `papers_folder` once per entry of
# `models`, write the markdown to `output_folder`, and record the elapsed time.
papers_folder = "papers"
output_folder = "output"

os.makedirs(output_folder, exist_ok=True)

timing_results = []

# Sorted so the processing order is the same on every run.
pdf_files = sorted(f for f in os.listdir(papers_folder) if f.endswith(".pdf"))

models = [
    # {"platform": "OpenAI", "model": "gpt-4o"},
    {"platform": "DeepSeek", "model": "deepseek-chat"},
    #{"platform": "Ollama", "model": "llama33-32k:latest"},
    {"platform": "Ollama", "model": "qwen2.5:32b-32k"}
]

for pdf_file in pdf_files:
    pdf_path = os.path.join(papers_folder, pdf_file)

    for model in models:
        output_filename = f"{os.path.splitext(pdf_file)[0]}_{model['platform']}.md"
        output_path = os.path.join(output_folder, output_filename)

        # Skip processing if the output file already exists
        if os.path.exists(output_path):
            print(f"Skipping {output_filename}, already exists.")
            continue

        time1 = time.time()
        doc_md = pdf_to_markdown(pdf_path, 'docling', model['platform'], model['model'])
        elapsed_time = time.time() - time1

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(doc_md)

        timing_results.append({
            "PDF File": pdf_file,
            "Platform": model['platform'],
            "Model": model['model'],
            "Time (s)": elapsed_time
        })

        print(f"Processed {pdf_file} with {model['model']} on {model['platform']} in {elapsed_time:.2f} seconds.")

timing_csv_path = os.path.join(output_folder, "timing_results.csv")
new_timing_df = pd.DataFrame(timing_results)

# Timings of earlier runs: conversions skipped above exist only in this file.
try:
    previous_timing_df = pd.read_csv(timing_csv_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    previous_timing_df = pd.DataFrame()

if new_timing_df.empty:
    timing_df = previous_timing_df
elif previous_timing_df.empty:
    timing_df = new_timing_df
else:
    timing_df = pd.concat([previous_timing_df, new_timing_df], ignore_index=True)

# Only write when this run measured something, so a run in which every file
# was skipped does not replace the recorded timings with an empty table.
if not new_timing_df.empty:
    timing_df.to_csv(timing_csv_path, index=False)

timing_df

Skipping Man-Superman_DeepSeek.md, already exists.
Skipping Man-Superman_Ollama.md, already exists.
Skipping ChatMol2024ACL_DeepSeek.md, already exists.
Skipping ChatMol2024ACL_Ollama.md, already exists.
Skipping Bio_34_DeepSeek.md, already exists.
Skipping Bio_34_Ollama.md, already exists.
Skipping CS_16_DeepSeek.md, already exists.
Skipping CS_16_Ollama.md, already exists.
Skipping CS_8_DeepSeek.md, already exists.
Skipping CS_8_Ollama.md, already exists.
Skipping Sociology_26_DeepSeek.md, already exists.
Skipping Sociology_26_Ollama.md, already exists.
Skipping Linguistics_42_DeepSeek.md, already exists.
Skipping Linguistics_42_Ollama.md, already exists.
Skipping Psychology_20_DeepSeek.md, already exists.
Skipping Psychology_20_Ollama.md, already exists.
Skipping Political_18_DeepSeek.md, already exists.
Skipping Political_18_Ollama.md, already exists.
Skipping Physics_7_DeepSeek.md, already exists.
Skipping Physics_7_Ollama.md, already exists.


In [24]:
# Scratch check: print the number of lines in `doc_md` and every line that
# starts with "##". `doc_md` is not defined in this cell; it must already exist
# in the kernel.
lines = doc_md.split("\n")
print(len(lines))
old_headings = ""
for line in lines:
    if not line.startswith("##"):
        continue
    old_headings = old_headings + line + "\n"
print(old_headings)


NameError: name 'doc_md' is not defined

In [25]:
# Scratch inspection; relies on `lines` and `line` already existing in the
# kernel (neither is defined in this cell).
# Print the entry at index 14 of `lines`.
print(lines[14])
# Bare attribute access: evaluates to the `find` attribute of `line` without calling it.
line.find

NameError: name 'lines' is not defined

In [26]:
import os
import time
from pdf2image import convert_from_path  # pip install pdf2image
from PIL import Image  # pip install Pillow
from google import genai
from google.genai import types


# Load the Gemini API key from the GEMINI_API_KEY environment variable.
# SECURITY: a literal API key used to be assigned here, overriding the
# environment variable. Credentials must never be stored in the notebook; any
# key that was committed or shared this way should be revoked and replaced.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise ValueError("Please set your GEMINI_API_KEY environment variable.")

# Initialize Gemini client (using v1alpha API)
client = genai.Client(
    api_key=api_key,
    http_options={"api_version": "v1alpha"},
)

def composite_images(images):
    """Stack the given PIL images top to bottom on one white RGB canvas.

    The canvas is as wide as the widest image and as tall as all images
    together; every image is pasted at the left edge, below the previous one.
    """
    sizes = [img.size for img in images]
    widths, heights = zip(*sizes)
    canvas = Image.new('RGB', (max(widths), sum(heights)), color=(255, 255, 255))
    top = 0
    for img, height in zip(images, heights):
        canvas.paste(img, (0, top))
        top += height
    return canvas

def pdf_to_markdown_multi_page(pdf_path, pages_per_group=3, model="gemini-2.0-flash-exp"):
    """
    Convert a PDF to Markdown by sending groups of page images to Gemini.

    - Converts the PDF pages to images with ``convert_from_path``.
    - Groups consecutive pages (default 3 per group) and composites each group
      vertically with ``composite_images``.
    - Sends every composite image with a prompt through the module-level
      ``client``.
    - Concatenates the replies, each preceded by a
      ``<!-- Pages A to B -->`` marker.

    Args:
        pdf_path: Path of the PDF, passed to ``convert_from_path``.
        pages_per_group: Number of pages per composite image; must be >= 1.
        model: Model name passed to ``client.models.generate_content``.

    Returns:
        The concatenated Markdown text (empty string for a PDF without pages).

    Raises:
        ValueError: if ``pages_per_group`` is smaller than 1.
    """
    if pages_per_group < 1:
        raise ValueError("pages_per_group must be at least 1")

    # Convert PDF to images
    all_pages = convert_from_path(pdf_path)
    markdown_sections = []

    # Process pages in groups
    for group_start in range(0, len(all_pages), pages_per_group):
        group = all_pages[group_start:group_start + pages_per_group]
        first_page = group_start + 1
        last_page = group_start + len(group)
        composite_img = composite_images(group)

        # Tell the model that the composite contains consecutive pages.
        # Every sentence ends with a space so the parts do not run together.
        prompt = (
            "Please convert the following multi-page PDF section into well-formatted Markdown. "
            "Keep continuity across pages (e.g., tables or lists that span pages) intact. "
            f"This composite image includes pages {first_page} to {last_page}. "
            "Please just return pure markdown text, no header like '```markdown' is needed"
        )

        # Send the composite image and prompt to Gemini
        response = client.models.generate_content(
            model=model,
            contents=[prompt, composite_img],
            config=types.GenerateContentConfig(
                max_output_tokens=4096,
                temperature=0.5,
                top_p=0.95,
                top_k=40,
            ),
        )
        # `response.text` can be None; keep the page marker and add no text.
        section_text = response.text or ""
        markdown_sections.append(f"\n\n<!-- Pages {first_page} to {last_page} -->\n{section_text}")

    return "".join(markdown_sections)

# Convert one sample paper with Gemini, time the conversion, save and print it.
if __name__ == "__main__":
    pdf_file = "papers/ChatMol2024ACL.pdf"  # Replace with your PDF file path
    output_md_file = "ChatMol2024ACL.md"    # Where the Markdown is written
    time1 = time.time()
    md_text = pdf_to_markdown_multi_page(pdf_file, pages_per_group=3)

    print("Time = ", time.time()-time1)
    # Save the resulting Markdown to a file
    with open(output_md_file, "w", encoding="utf-8") as f:
        f.write(md_text)
    print(md_text)

    # Report the path that was actually written.
    print(f"Conversion complete. Markdown output saved to {output_md_file}")

Time =  55.52021503448486


<!-- Pages 1 to 3 -->
ChatMol Copilot: An Agent for Molecular Modeling and Computation
Powered by LLMs

Jinyuan Sun¹, Auston Li¹², Yifan Deng¹, Jiabo Li¹²
¹ChatMol Team ²Wecomput Technology Co., Ltd.
Correspondence: jinyuansun@chatmol.org; jiaboli@chatmol.org

Abstract

Large Language Models (LLMs) like ChatGPT
excel at diverse tasks when given explicit in-
structions, yet they often struggle with special-
ized domains such as molecular science, lack-
ing in-depth reasoning and sophisticated plan-
ning capabilities. To address these limitations,
we introduce ChatMol Copilot, a chatbot-like
agent specifically engineered for protein de-
sign and small molecule computations. Chat-
Mol Copilot employs a multi-level abstraction
framework to expand the LLM's capability. At
the basic level, it integrates external compu-
tational tools through function calls, thus of-
floading complex tasks and enabling a focus on
strategic decision-making. The second level is
data 

In [8]:
#!pip install google-genai

In [13]:
# Preview of the multi-page prompt text for a dummy group of three pages.
group_start = 0
group = [0] * 3
prompt = "".join((
    "Please convert the following multi-page PDF section into well-formatted Markdown. ",
    "Keep continuity across pages (e.g., tables or lists that span pages) intact. ",
    f"This composite image includes pages {group_start+1} to {group_start+len(group)}.",
))

print(prompt)

Please convert the following multi-page PDF section into well-formatted Markdown. Keep continuity across pages (e.g., tables or lists that span pages) intact. This composite image includes pages 1 to 3.


In [None]:
from openai import OpenAI
# Smoke test: send a single "hello" message to gpt-4o and keep the reply text.
# Rebinds the notebook-level names `client` and `model`.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
model = "gpt-4o"
responses = client.chat.completions.create(
    model=model,
    messages=[dict(role="user", content="hello")],
)
answer = responses.choices[0].message.content

In [16]:
# Read back and print the saved Markdown. The encoding is given explicitly
# because the file is written as UTF-8; the platform default may differ and
# would fail on (or garble) the non-ASCII characters in the text.
with open("ChatMol2024ACL.md", "r", encoding="utf-8") as f:
    x = f.read()
    print(x)



<!-- Pages 1 to 3 -->
# ChatMol Copilot: An Agent for Molecular Modeling and Computation Powered by LLMs

Jinyuan Sun¹, Auston Li¹², Yifan Deng¹, Jiabo Li¹²

¹ChatMol Team ²Wecomput Technology Co., Ltd.

Correspondence: jinyuansun@chatmol.org; jiaboli@chatmol.org

## Abstract

Large Language Models (LLMs) like ChatGPT excel at diverse tasks when given explicit instructions, yet they often struggle with specialized domains such as molecular science, lacking in-depth reasoning and sophisticated planning capabilities. To address these limitations, we introduce ChatMol Copilot, a chatbot-like agent specifically engineered for protein design and small molecule computations. Chat-Mol Copilot employs a multi-level abstraction framework to expand the LLM's capability. At the basic level, it integrates external computational tools through function calls, thus offloading complex tasks and enabling a focus on strategic decision-making. The second level is data abstraction. Large data sets (such