In [1]:
import requests
import json
import time
import re
import os
import base64
import pypdfium2 as pdfium
from openai import OpenAI
from docling.document_converter import DocumentConverter

# Raw Ollama generate endpoint (per the URL path). NOTE(review): no cell visible
# here references this constant; the Ollama client below uses the `/v1/` base URL instead.
api_url = 'https://www.chatmol.org/ollama/api/generate'

def encode_image(image_path):
    """Read a file in binary mode and return its contents as a base64 text string.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str`` via UTF-8.
    """
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
        
# LLM_CLIENT
def get_llm_client(provider):
    """Build an ``OpenAI`` SDK client configured for the named provider.

    Args:
        provider: One of ``"OpenAI"``, ``"DeepSeek"`` or ``"Ollama"``.

    Returns:
        An ``OpenAI`` client instance, or ``None`` (after printing a message)
        when the provider name is not recognised.

    Raises:
        KeyError: If the environment variable holding the API key for the
            selected provider (``OPENAI_API_KEY`` / ``DS_API_KEY``) is not set.
    """
    if provider == "OpenAI":
        return OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    if provider == "DeepSeek":
        # model: deepseek-chat, 128k context window size, 8k max output tokens
        return OpenAI(
            api_key=os.environ["DS_API_KEY"],
            base_url="https://api.deepseek.com",
        )

    if provider == "Ollama":
        # Ollama is reached through the OpenAI-compatible interface.
        # Alternative endpoint: base_url = 'http://100.89.180.132:11434/v1/'
        return OpenAI(
            base_url='https://www.chatmol.org/ollama/v1/',
            api_key='ollama',  # required but ignored
        )

    print("Unknown LLM provider")
    return None

# Example usage: uncomment the line for the provider you want to use
# client = get_llm_client(provider="OpenAI")
# client = get_llm_client(provider="DeepSeek")
# client = get_llm_client(provider="Ollama")

In [2]:
def get_correct_headings(doc_md, llm_client, llm_model):
    """Ask an LLM to assign proper levels to the headings of a markdown document.

    Args:
        doc_md: Markdown text whose headings are all on the same level.
        llm_client: Object exposing ``chat.completions.create`` (OpenAI-style).
        llm_model: Model name forwarded to the API call.

    Returns:
        The text content of the first choice in the reply, which the prompt
        asks to be the corrected headings, one per line.
    """
    # System instructions; the text is sent to the model verbatim.
    prompt = """In the following markdown text, all the headers are on the same level. The top level headers are sections. Some sections may have
    sub-sections or even sub-sub-sections. Please set the header levels correctly according to the content structures. For simplicity in your 
    output, you can only response with all headers. Please consider the following rules:
    
    1. Let's start from level 2, like: ## <seciton_header>
    2. If the section header has a number, please also keep the number.
    3. Please don't add anything (such as level) that is not in the original headers. 
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"Here is the current markdown text:\n\n{doc_md}"},
        {"role": "user", "content": "Please export correct markdow headings, each per line."},
    ]
    # temperature 0.0 and a 2048-token reply budget, as in every call in this notebook
    reply = llm_client.chat.completions.create(
        model=llm_model,
        messages=conversation,
        temperature=0.0,
        max_tokens=2048,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

In [3]:
def replace_headings(original_markdown, correct_headings):
    """Rewrite heading lines of a markdown document using LLM-corrected headings.

    Every line of ``correct_headings`` that looks like a markdown heading
    (starts with ``#``) is reduced to its text; any heading line in
    ``original_markdown`` with the same text, whatever its level, is replaced
    by the corrected heading.

    Args:
        original_markdown: The markdown document to update.
        correct_headings: Newline-separated headings carrying the desired levels.

    Returns:
        The updated markdown text. Headings that have no counterpart in
        ``correct_headings`` are left unchanged.
    """
    updated_markdown = original_markdown

    for raw_line in correct_headings.splitlines():
        # Drop trailing spaces / '\r' so they do not leak into the document.
        new_header = raw_line.strip()
        # Skip blank lines and any non-heading chatter the model may have emitted.
        if not new_header.startswith('#'):
            continue
        # Heading text without the markdown level markers.
        header_text = new_header.lstrip('# ').strip()
        if not header_text:
            # A bare '##' would otherwise match (and rewrite) every empty heading.
            continue

        # [ \t]* instead of \s* keeps the match on a single line.
        pattern = re.compile(
            r'^#{1,6}[ \t]*' + re.escape(header_text) + r'[ \t]*$',
            re.MULTILINE,
        )
        # A callable replacement is inserted literally; a plain string would be
        # parsed as a template, so backslashes in a heading would raise or be mangled.
        updated_markdown = pattern.sub(
            lambda _match, literal=new_header: literal,
            updated_markdown,
        )

    return updated_markdown

In [4]:
def docling_pdf_parser(pdf_source):
    """Convert a PDF to markdown with docling's ``DocumentConverter``.

    Args:
        pdf_source: Value handed to ``DocumentConverter.convert``; the cells in
            this notebook pass both a local file name and a URL.

    Returns:
        The markdown produced by ``result.document.export_to_markdown()``.
    """
    conversion = DocumentConverter().convert(pdf_source)
    print("Done with docling convert")
    return conversion.document.export_to_markdown()

In [5]:
# PDF Parser: convert PDF into markdown format using visual LLMs
def llm_pdf_parser(pdf_file_path, client, model, max_continuations=10):
    """Convert a PDF into markdown by sending rendered page images to a vision LLM.

    Pages are rendered with pypdfium2, JPEG-encoded in memory and sent, one
    batch at a time, to ``client.chat.completions.create`` together with the
    conversion prompt. When a reply does not end with
    ``finish_reason == "stop"``, the model is asked to continue from the
    markdown produced so far, at most ``max_continuations`` times per batch.

    Args:
        pdf_file_path: Path of the PDF, passed to ``pdfium.PdfDocument``.
        client: Object exposing the OpenAI-style ``chat.completions.create`` call.
        model: Model name forwarded to the API.
        max_continuations: Upper bound on follow-up requests per batch, so a
            reply that never reports ``"stop"`` cannot loop forever.

    Returns:
        A tuple ``(md_text, token_usage)``: the concatenated markdown of all
        batches (each followed by a newline) and the summed ``total_tokens``
        of every response that reported usage.
    """
    import io  # local import: only needed for the in-memory JPEG buffer

    prompt = """
    You are an expert to convert a PDF file of a scientific paper into markdown text. This markdown text from the PDF should match the structure of the the 
    content in PDF. Only export pure markdown and nothing else. Do not explain the output. All headerings will start with ##, ###, ####, and so on. 

    A scientific paper usually includes a title of the paper, a list of authors and their affiliations. Please extract all of them

    Don't add any extra headings if not in the original PDF. For example, don't add a heading of continuation. 

    Don't add extra marks in your output, such as '```markdown'!

    Don't include page numbers in the markdown, don't use page numbers as markdown headings.

    If you see a table in PDF, convert it into a markdown table. If there is a table title, put the table content immediately after the table 
    title. If there are notes of the table, also put the notes immediately after the table without blank line. 
    
    """
    pages = pdfium.PdfDocument(pdf_file_path)
    n_pages = len(pages)
    n_dpi = 108
    max_tokens = 2048

    batch_size = 1
    n_batch = (n_pages + batch_size - 1) // batch_size  # ceiling division

    def _tokens_used(response):
        # Some OpenAI-compatible servers may omit usage; count those as zero.
        usage = getattr(response, "usage", None)
        return usage.total_tokens if usage is not None else 0

    pre_batch_text = ''
    token_usage = 0
    md_text = ""

    for k in range(n_batch):
        nstart = k*batch_size
        nend = min(nstart + batch_size, n_pages)

        image_contents = []
        for i in range(nstart, nend):
            p_number = i + 1
            image = pages[i].render(scale=n_dpi/72).to_pil()
            # Encode in memory instead of round-tripping through a shared temp file.
            buffer = io.BytesIO()
            image.save(buffer, "JPEG")
            b64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
            image_contents += [
                {"type": "text", "text": f"This is page {p_number}"},
                # The payload is JPEG, so the data URL must say so.
                {"type": "image_url", "image_url": {
                    "url": f"data:image/jpeg;base64,{b64_image}"}
                },
            ]

        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": image_contents}]
        # Only the batch that contains page 1 carries the title/author request.
        if nstart == 0:
            messages.append({"role": "user", "content": "Please extract all text in each page, including the title of the paper, the author list and their contact information"})
        else:
            messages.append({"role": "user", "content": "Please extract all text in each page"})

        responses = client.chat.completions.create(
            model = model,
            messages = messages,
            temperature = 0.0,
            max_tokens = max_tokens,
        )
        current_batch_text = responses.choices[0].message.content or ""
        print("Finish reason", responses.choices[0].finish_reason)
        token_usage += _tokens_used(responses)

        # Keep asking for the remainder until the model reports a clean stop,
        # but never more than max_continuations times.
        continuations = 0
        while (responses.choices[0].finish_reason != "stop"
               and continuations < max_continuations):
            continuations += 1
            responses = client.chat.completions.create(
                model = model,
                messages = [
                {"role": "system", "content": prompt},
                {"role": "user", "content": image_contents},
                {"role": "user", "content": "This is the markdown generated from the PDF so far:"},
                {"role": "user", "content": f"{pre_batch_text + current_batch_text}"},
                {"role": "user", "content": "Please complete the remaining markdown content."},
                ],
                temperature = 0.0,
                max_tokens = max_tokens,
            )
            current_batch_text += responses.choices[0].message.content or ""
            token_usage += _tokens_used(responses)
        if responses.choices[0].finish_reason != "stop":
            print(f"Warning: pages {nstart + 1}-{nend} still incomplete after {max_continuations} continuation requests")

        md_text += current_batch_text + "\n"
        pre_batch_text = current_batch_text
    return md_text, token_usage

In [7]:
def pdf_to_markdown(pdf_source, method='docling', reflection_provider="Ollama", reflection_model="llama33-16k:latest"):
    """Convert a PDF to markdown and then correct its heading levels with an LLM.

    Args:
        pdf_source: PDF location handed to the selected parser.
        method: ``'docling'`` uses ``docling_pdf_parser``; any other value uses
            ``llm_pdf_parser`` with the OpenAI ``gpt-4o`` model.
        reflection_provider: Provider name passed to ``get_llm_client`` for the
            heading-correction step.
        reflection_model: Model name used for the heading-correction step.

    Returns:
        The markdown text after ``replace_headings`` has applied the corrected
        headings. The corrected headings are also printed.
    """
    if method != 'docling':
        vision_client = get_llm_client("OpenAI")
        # The token count reported by the vision parser is not used here.
        raw_md, _token_usage = llm_pdf_parser(pdf_source, vision_client, 'gpt-4o')
    else:
        raw_md = docling_pdf_parser(pdf_source)

    # Self-reflection pass: let a second model fix the heading hierarchy.
    reviewer = get_llm_client(reflection_provider)
    new_headings = get_correct_headings(raw_md, reviewer, reflection_model)
    print(new_headings)
    return replace_headings(raw_md, new_headings)

In [12]:
# Run 1: parse the local PDF with docling, then let OpenAI gpt-4o fix the heading levels.
pdf_file = "2024.langmol-1.7.pdf"
time1 = time.time()  # wall-clock start for the elapsed-time report below
doc_md = pdf_to_markdown(pdf_file, 'docling', 'OpenAI', 'gpt-4o')
print("Time = ", time.time()-time1)
print(doc_md)

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


Done with docling convert
## ChatMol Copilot: An Agent for Molecular Modeling and Computation Powered by LLMs
## Abstract
## 1 Introduction
## 2 ChatMol Copilot Architecture
### 2.1 Equipped Tools
### 2.2 Integration with Microservices
### 2.3 Code as Actions and Redis Cache
## 3 Use Cases of ChatMol Copilot for Molecular Modeling
### 3.1 General Protein Design Task
### 3.2 Peptide/MHC-II Binding Affinity Prediction
### 3.3 Molecular docking task
### 3.4 Molecule generation and filtering with generated Python code
## 4 Discussion and Conclusions
## References
## A Cases of using ChatMol Copilot
### A.1 Protein stability engineering task
### A.2 Generate a set of molecules, compute the molecular properties and display the results in a table
## B All tools
### B.1 Ligand binding pocket prediction
### B.2 Protein structure prediction
### B.3 Mutation effect prediction
### B.4 Protein structure visualisation
### B.5 Docking
### B.6 Blind Docking
### B.7 Protein sequence design
## C Other d

In [14]:
# Run 2: parse the PDF from its URL with docling, then let DeepSeek fix the heading levels.
# Alternative input: source = "https://arxiv.org/pdf/2408.09869"
pdf_source = "https://aclanthology.org/2024.langmol-1.7.pdf"
time1 = time.time()  # wall-clock start for the elapsed-time report below
# DeepSeek V3 context window size: up to 128k
# Output size: TODO(review) not recorded
doc_md = pdf_to_markdown(pdf_source, 'docling', 'DeepSeek', 'deepseek-chat')
print("Time = ", time.time()-time1)
print(doc_md)

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


Done with docling convert
## ChatMol Copilot: An Agent for Molecular Modeling and Computation Powered by LLMs  
## Abstract  
## 1 Introduction  
## 2 ChatMol Copilot Architecture  
### 2.1 Equipped Tools  
### 2.2 Integration with Microservices  
### 2.3 Code as Actions and Redis Cache  
## 3 Use Cases of ChatMol Copilot for Molecular Modeling  
### 3.1 General Protein Design Task  
### 3.2 Peptide/MHC-II Binding Affinity Prediction  
### 3.3 Molecular docking task  
### 3.4 Molecule generation and filtering with generated Python code  
## 4 Discussion and Conclusions  
## References  
## A Cases of using ChatMol Copilot  
### A.1 Protein stability engineering task  
### A.2 Generate a set of molecules, compute the molecular properties and display the results in a table  
## B All tools  
### B.1 Ligand binding pocket prediction  
### B.2 Protein structure prediction  
### B.3 Mutation effect prediction  
### B.4 Protein structure visualisation  
### B.5 Docking  
### B.6 Blind Dockin

In [9]:
# Run 3: parse the PDF from its URL with docling, then let the Ollama-hosted model fix the heading levels.
# Alternative input: source = "https://arxiv.org/pdf/2408.09869"
pdf_source = "https://aclanthology.org/2024.langmol-1.7.pdf"
time1 = time.time()  # wall-clock start for the elapsed-time report below
# Ollama/llama33-16k: 16k context window size
# Output size: TODO(review) not recorded
doc_md = pdf_to_markdown(pdf_source, 'docling', 'Ollama', 'llama33-16k:latest')
print("Time = ", time.time()-time1)
print(doc_md)

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


Done with docling convert
## ChatMol Copilot: An Agent for Molecular Modeling and Computation Powered by LLMs
## Abstract
## 1 Introduction
## 2 ChatMol Copilot Architecture
### 2.1 Equipped Tools
### 2.2 Integration with Microservices
### 2.3 Code as Actions and Redis Cache
## 3 Use Cases of ChatMol Copilot for Molecular Modeling
### 3.1 General Protein Design Task
### 3.2 Peptide/MHC-II Binding Affinity Prediction
### 3.3 Molecular docking task
### 3.4 Molecule generation and filtering with generated Python code
## 4 Discussion and Conclusions
## A Cases of using ChatMol Copilot
### A.1 Protein stability engineering task
### A.2 Generate a set of molecules, compute the molecular properties and display the results in a table
## B Other details of ChatMol Copilot
### B.1 Ligand binding pocket prediction
### B.2 Protein structure prediction
### B.3 Mutation effect prediction
### B.4 Protein structure visualisation
### B.5 Docking
### B.6 Blind Docking
### B.7 Protein sequence design
## 