# Environment Setup
#### This demonstration runs on Google Colab; if you use a different environment, the setup steps may differ. - Yichuan

In [1]:
import os
from google.colab import drive

# Mount Google Drive only when /content/drive is not already a mount point,
# so re-running this cell does not trigger a second mount.
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive')

In [2]:
# Copy the SSH directory stored on Google Drive into the VM's home directory.
# NOTE(review): the listing printed by this cell shows private key files being copied
# into the Colab VM — confirm this is acceptable and clear the output before sharing.
!cp -r /content/drive/MyDrive/.ssh/ ~/
!ls ~/.ssh/ -a
# Check that GitHub accepts the key, then clone the project repository over SSH.
!ssh -T git@github.com
!git clone git@github.com:Yichuan0712/dbl_llm.git

.  ..  github_id_rsa  hellbender_id_rsa  id_rsa  id_rsa.pub  known_hosts
Hi Yichuan0712! You've successfully authenticated, but GitHub does not provide shell access.
Cloning into 'dbl_llm'...
remote: Enumerating objects: 67, done.[K
remote: Counting objects: 100% (67/67), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 67 (delta 17), reused 61 (delta 11), pack-reused 0 (from 0)[K
Receiving objects: 100% (67/67), 2.95 MiB | 2.65 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [3]:
# Copy the two credential files from Drive: the .env file is loaded by load_dotenv()
# in a later cell, and the ngrok token file is read by the graphical-interface cell.
# The source paths are specific to the author's Drive; substitute your own files.
!cp /content/drive/MyDrive/OSU/yichuan_gemini.env /content/.env  # Get your own gemini key
!cp /content/drive/MyDrive/OSU/ngrok_authtoken.txt /content/dbl_llm/ngrok_authtoken.txt # To run the graphical interface on Colab, you’ll also need to set up an ngrok key.

In [4]:
# Install the third-party packages this notebook needs.
# %pip (rather than !pip) installs into the environment of the running kernel, and
# -q on every command keeps pip's progress bars out of the saved notebook output.
%pip install -q pyngrok python-dotenv fake_useragent shortuuid ratelimit langchain-openai
%pip install -q -U langchain-google-genai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/125.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m122.9/125.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.8/125.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ratelimit (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m420.1/420.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-google-genai
  Downloading langchain_google_genai-2.1.1-py3-none-any.whl.metadata (4.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from lan

In [5]:
from dotenv import load_dotenv
# Load key/value pairs from a .env file into the process environment; the Gemini
# API key is read back from os.environ further down.
# NOTE(review): presumably this picks up /content/.env copied in an earlier cell —
# confirm the working directory is /content when this runs.
load_dotenv()

True

In [6]:
import os
# Work from the cloned repository: the graphical-interface cell opens
# 'ngrok_authtoken.txt' with a relative path, and the `extractor` package imported
# below presumably lives in this directory.
os.chdir('/content/dbl_llm')

# Function Definitions

In [7]:
from extractor.article_retriever import ArticleRetriever
from extractor.html_table_extractor import HtmlTableExtractor
import os
import time
import re
import requests
from pydantic import BaseModel
from langchain.prompts import PromptTemplate
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI
from bs4 import BeautifulSoup

In [8]:
def extract_abstract(html):
    """Return the text of the abstract section of an article page.

    Parameters
    ----------
    html : str | bytes | None
        Raw HTML of the article page.

    Returns
    -------
    str
        The text of every ``<p>`` inside the first ``<section class="abstract">``,
        joined with newlines, or the sentinel ``"NOT FOUND"`` when the document is
        missing/empty or contains no such section.
    """
    # A failed download may hand us None or an empty body. BeautifulSoup cannot
    # parse None, so report "no abstract" instead of crashing the caller.
    if not html:
        return "NOT FOUND"

    soup = BeautifulSoup(html, 'html.parser')
    abstract_section = soup.find('section', {'class': 'abstract'})
    if abstract_section is None:
        return "NOT FOUND"

    return '\n'.join(p.get_text() for p in abstract_section.find_all('p'))

In [9]:
# Output schema for the enzyme–substrate extraction step.
# Documented with comments rather than docstrings so the models themselves are unchanged.
class EnzymeSubstratePair(BaseModel):
    # One extracted relationship, printed downstream as "<enzyme> acts on <substrate>".
    enzyme: str  # enzyme name as returned by the LLM
    substrate: str  # substrate name as returned by the LLM

class RelationshipList(BaseModel):
    # Parsed LLM answer for the extraction prompt: free-text explanation plus
    # the list of extracted enzyme/substrate pairs.
    reasoning: str  # the model's explanation of how the pairs were identified
    pairs: list[EnzymeSubstratePair]  # zero or more extracted relationships

# Prompt for the extraction step. `{text}` is the only template variable; the doubled
# braces `{{ }}` are literal braces of the JSON example, whose shape mirrors
# RelationshipList (a "reasoning" string and a list of enzyme/substrate "pairs").
prompt_extract_enzyme_substrate_pairs = PromptTemplate.from_template("""
The following text is an excerpt from a scientific paper discussing enzymatic activity:

{text}

---

### Task: Extract Enzyme–Substrate Relationships

Analyze the text and perform these steps:

1. Identify enzymes and their substrates, only if the action is clearly stated or implied (e.g., "Enzyme A phosphorylates B").
2. Normalize enzyme and substrate names using gene symbols, protein names, or EC numbers where possible.
3. Show your reasoning: briefly explain how you identified each pair (with supporting sentence).
4. Return only a single valid JSON in this example format:

{{
  "reasoning": "Step-by-step explanation here...",
  "pairs": [
    {{"enzyme": "PPM1D", "substrate": "RUNX2"}},
    {{"enzyme": "CDK1", "substrate": "Histone H1"}}
  ]
}}
""")

# Function with Retry
def extract_enzyme_substrate_pairs(llm, chain, prompt_template, full_text, retries=3, wait=2):
    """Run the extraction chain on ``full_text``, retrying with exponential backoff.

    Parameters
    ----------
    llm : object
        Model exposing ``invoke(prompt)``; only used after the last failed attempt
        to print the raw (unparsed) answer for debugging.
    chain : object
        Chain exposing ``run(text=...)``; its return value is passed through as-is.
    prompt_template : object
        Template exposing ``format(text=...)``; used for the debugging call only.
    full_text : str
        Text to analyse.
    retries : int
        Maximum number of attempts. Values below 1 are treated as 1 so the chain
        is always tried at least once instead of silently returning ``None``.
    wait : int | float
        Seconds to sleep after the first failure; doubled after every failure.

    Returns
    -------
    Whatever ``chain.run`` returns on the first successful attempt.

    Raises
    ------
    RuntimeError
        When every attempt failed. The last underlying error is attached as
        ``__cause__``.
    """
    attempts = max(1, retries)
    last_error = None

    for attempt in range(1, attempts + 1):
        try:
            return chain.run(text=full_text)
        except Exception as e:
            print(f"[Attempt {attempt}] Failed: {e}")
            last_error = e
            if attempt < attempts:
                time.sleep(wait)
                wait *= 2

    # Best-effort debugging aid: show what the model answers without the parser.
    # This extra call can fail as well (e.g. the service is down); that must not
    # replace the RuntimeError callers are told to expect.
    try:
        raw_output = llm.invoke(prompt_template.format(text=full_text)).content
        print("Final raw output:\n", raw_output)
    except Exception as debug_error:
        print(f"Could not fetch raw output for debugging: {debug_error}")

    raise RuntimeError("All retries failed.") from last_error

In [12]:
# Output schema for the UniProt matching step.
class IndexPair(BaseModel):
    # Positions chosen by the LLM among the numbered `search_result[i]` blocks
    # shown in the matching prompt.
    enzyme: int  # index into the enzyme search results
    substrate: int  # index into the substrate search results

class MatchSelection(BaseModel):
    # Parsed LLM answer for the matching prompt.
    reasoning: str  # the model's explanation of its choice
    pairs: IndexPair  # a single selection, despite the plural field name

# Prompt for the matching step. Template variables: {text}, {enzyme_name},
# {substrate_name}, {enzyme_uniprot_results}, {substrate_uniprot_results}.
# The doubled braces `{{ }}` are literal braces of the JSON example, whose shape
# mirrors MatchSelection (a "reasoning" string and one enzyme/substrate index pair).
prompt_match_best_uniprot_indices = PromptTemplate.from_template("""
The following text is an excerpt from a scientific paper discussing enzymatic activity:

{text}

From this text, we have extracted the following enzyme–substrate relationship:

{enzyme_name} – {substrate_name}

We then searched UniProt (uniprot.org) and found several possible matches for both entities:

**Enzyme search results**:
{enzyme_uniprot_results}

**Substrate search results**:
{substrate_uniprot_results}

---

### Task: Select the Best-Matching UniProt Entries

Analyze the relationship within the context of the original text and perform the following steps:

1. Examine each UniProt search result for both the enzyme and the substrate.
2. Consider organism (e.g., human preferred), gene/protein name relevance, functional description, and contextual fit.
3. Show your reasoning: briefly explain how you selected each match, and which part of the original text supports your choice.
4. Return a single valid JSON object in the following example format:

{{
  "reasoning": "Step-by-step explanation here...",
  "pairs": {{
    "enzyme": 0,
    "substrate": 1
  }}
}}
""")

# Function with Retry
def match_best_uniprot_indices(
    llm,
    chain,
    prompt_template,
    full_text,
    enzyme_name,
    substrate_name,
    enzyme_uniprot_results,
    substrate_uniprot_results,
    retries=3,
    wait=2
):
    """Run the UniProt matching chain, retrying with exponential backoff.

    Parameters
    ----------
    llm : object
        Model exposing ``invoke(prompt)``; only used after the last failed attempt
        to print the raw (unparsed) answer for debugging.
    chain : object
        Chain exposing ``run(**inputs)``; its return value is passed through as-is.
    prompt_template : object
        Template exposing ``format(**inputs)``; used for the debugging call only.
    full_text : str
        Source text the relationship was extracted from.
    enzyme_name, substrate_name : str
        Names of the extracted pair.
    enzyme_uniprot_results, substrate_uniprot_results : str
        Formatted UniProt search results shown to the model.
    retries : int
        Maximum number of attempts. Values below 1 are treated as 1 so the chain
        is always tried at least once instead of silently returning ``None``.
    wait : int | float
        Seconds to sleep after the first failure; doubled after every failure.

    Returns
    -------
    Whatever ``chain.run`` returns on the first successful attempt.

    Raises
    ------
    RuntimeError
        When every attempt failed. The last underlying error is attached as
        ``__cause__``.
    """
    # The chain and the debugging prompt take exactly the same inputs.
    inputs = dict(
        text=full_text,
        enzyme_name=enzyme_name,
        substrate_name=substrate_name,
        enzyme_uniprot_results=enzyme_uniprot_results,
        substrate_uniprot_results=substrate_uniprot_results,
    )
    attempts = max(1, retries)
    last_error = None

    for attempt in range(1, attempts + 1):
        try:
            return chain.run(**inputs)
        except Exception as e:
            print(f"[Attempt {attempt}] Failed: {e}")
            last_error = e
            if attempt < attempts:
                time.sleep(wait)
                wait *= 2

    # Fallback: dump raw LLM output to debug. This extra call can fail as well;
    # that must not replace the RuntimeError callers are told to expect.
    try:
        raw_output = llm.invoke(prompt_template.format(**inputs)).content
        print("Final raw output:\n", raw_output)
    except Exception as debug_error:
        print(f"Could not fetch raw output for debugging: {debug_error}")

    raise RuntimeError("All retries failed.") from last_error

In [11]:
def detect_query_type(identifier):
    """Build a UniProt search query string from a raw identifier.

    The identifier is classified by shape, first match wins:

    * accession-like (either of two 6-character patterns) -> ``accession:<id>``
    * four dot-separated numbers                          -> ``ec:<id>``
    * upper-case and at most 10 characters                -> ``gene_exact:<id>``
    * anything else is returned unchanged (free text or protein name)

    NOTE(review): classification is purely syntactic, so a gene symbol that happens
    to have the accession shape (e.g. "P2RY12") is treated as an accession.
    """
    accession_patterns = (
        r'^[OPQ][0-9][A-Z0-9]{3}[0-9]$',
        r'^[A-NR-Z][0-9]{5}$',
    )

    # UniProt Accession
    if any(re.match(pattern, identifier) for pattern in accession_patterns):
        return f'accession:{identifier}'

    # EC number
    if re.match(r'^\d+\.\d+\.\d+\.\d+$', identifier):
        return f'ec:{identifier}'

    # Gene symbol
    looks_like_gene_symbol = len(identifier) <= 10 and identifier.isupper()
    if looks_like_gene_symbol:
        return f'gene_exact:{identifier}'

    # Free text or protein name
    return identifier

def get_uniprot_details_from_accession(accession, timeout=30):
    """Fetch one UniProtKB entry and reduce it to the fields this notebook uses.

    Parameters
    ----------
    accession : str
        UniProtKB accession; inserted into the REST URL.
    timeout : int | float
        Seconds to wait for the HTTP response, so a stalled connection cannot
        block the notebook (or the Flask request) indefinitely.

    Returns
    -------
    dict | None
        ``None`` when the HTTP response is not OK; otherwise a dict with keys
        ``primary``, ``secondary``, ``protein``, ``genes`` and ``organism``.

    Raises
    ------
    requests.RequestException
        On network errors or timeout (raised by ``requests.get``).
    """
    url = f"https://rest.uniprot.org/uniprotkb/{accession}.json"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        return None
    data = response.json()

    # Protein name fallback logic: recommended name first, then submitted names
    # (carried by unreviewed entries in place of a recommended name), then
    # alternative names; "Unknown" when none has a usable value.
    description = data.get("proteinDescription") or {}
    name_candidates = [description.get("recommendedName")]
    name_candidates += description.get("submissionNames") or []
    name_candidates += description.get("alternativeNames") or []

    protein_name = "Unknown"
    for candidate in name_candidates:
        value = ((candidate or {}).get("fullName") or {}).get("value")
        if value:
            protein_name = value
            break

    # Genes list: keep only entries that carry a primary gene name.
    genes = [
        (gene.get("geneName") or {}).get("value")
        for gene in data.get("genes") or []
    ]
    genes = [name for name in genes if name]

    return {
        "primary": data.get("primaryAccession"),
        "secondary": data.get("secondaryAccessions", []),
        "protein": protein_name,
        "genes": genes,
        "organism": (data.get("organism") or {}).get("scientificName")
    }

def query_uniprot_all(identifier, max_results=3, timeout=30):
    """Search UniProtKB for ``identifier`` and describe the top hits.

    Parameters
    ----------
    identifier : str
        Accession, EC number, gene symbol or free text; classified by
        ``detect_query_type``.
    max_results : int
        Maximum number of hits requested from the search endpoint.
    timeout : int | float
        Seconds to wait for the search response.

    Returns
    -------
    tuple[str, list[dict]]
        ``(text, details_list)``. ``text`` contains one block per hit;
        ``search_result[i]`` in the text always refers to ``details_list[i]``.
        Hits whose details could not be fetched are mentioned in the text without
        an index and are absent from ``details_list``.

    Raises
    ------
    Exception
        When the search request is not OK or returns no hits.
    """
    query = detect_query_type(identifier)
    url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": query,
        "fields": "accession",
        "format": "tsv",
        "size": max_results
    }

    response = requests.get(url, params=params, timeout=timeout)

    # TSV response: first line is the column header, every further line is one hit.
    rows = response.text.strip().split("\n") if response.ok else []
    accessions = [row.split("\t")[0] for row in rows[1:]]
    if not accessions:
        print(f"No UniProt entry found for '{identifier}'")
        raise Exception(f"No UniProt entry found for '{identifier}'")

    output_lines = []
    details_list = []

    for acc in accessions:
        details = get_uniprot_details_from_accession(acc)

        if not details:
            # No index here: the entry is not in details_list, so it must not be
            # offered as a selectable search_result[...].
            output_lines.append(
                f"\nunavailable search result for input: {identifier}\n"
                f"Failed to get details for accession: {acc}"
            )
            continue

        # Number the block by its position in details_list so an index chosen
        # from the text is always valid for the returned list.
        position = len(details_list)
        details_list.append(details)

        gene_text = ', '.join(details['genes']) if details['genes'] else 'N/A'
        secondary_text = ', '.join(details['secondary']) if details['secondary'] else 'None'
        output_lines.append(
            f"\nsearch_result[{position}] for input: {identifier}\n"
            f"UniProt ID: {details['primary']}\n"
            f"Protein: {details['protein']}\n"
            f"Gene(s): {gene_text}\n"
            f"Organism: {details['organism']}\n"
            f"Secondary Accessions: {secondary_text}"
        )

    return "\n".join(output_lines), details_list


# Run with Scripts

In [21]:
# End-to-end run for one article: fetch abstract -> extract enzyme/substrate pairs
# with the LLM -> search UniProt for both names -> let the LLM pick the best hits.
pmid = "22065775"
retriever = ArticleRetriever()
res, html_content, code = retriever.request_article(pmid)

cleaned_text = extract_abstract(html_content)
print("\033[32mAbstract:\033[0m")
print(cleaned_text)

# os.environ only accepts strings: assigning a missing key (None) would fail with
# an unhelpful TypeError, so report the real problem instead.
gemini_api_key = os.environ.get("GEMINI_15_API_KEY")
if not gemini_api_key:
    raise RuntimeError("GEMINI_15_API_KEY is not set; load your .env file before running this cell.")
os.environ["GOOGLE_API_KEY"] = gemini_api_key

llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0)

# --- Step 1: extract enzyme/substrate pairs from the abstract ---
parser_extract_enzyme_substrate_pairs = PydanticOutputParser(pydantic_object=RelationshipList)
output_parser_extract_enzyme_substrate_pairs = OutputFixingParser.from_llm(parser=parser_extract_enzyme_substrate_pairs, llm=llm)
chain_extract_enzyme_substrate_pairs = LLMChain(llm=llm, prompt=prompt_extract_enzyme_substrate_pairs, output_parser=output_parser_extract_enzyme_substrate_pairs)

result_extract_enzyme_substrate_pairs = extract_enzyme_substrate_pairs(llm, chain_extract_enzyme_substrate_pairs, prompt_extract_enzyme_substrate_pairs, cleaned_text)

print("\n\033[32mReasoning:\033[0m")
print(result_extract_enzyme_substrate_pairs.reasoning)
print("\033[32mResult:\033[0m")
for pair in result_extract_enzyme_substrate_pairs.pairs:
    print(f"{pair.enzyme} acts on {pair.substrate}")

# --- Step 2: map every pair to UniProt entries ---
# The parser and chain do not depend on the pair, so build them once.
parser_match_best_uniprot_indice = PydanticOutputParser(pydantic_object=MatchSelection)
output_parser_match_best_uniprot_indice = OutputFixingParser.from_llm(parser=parser_match_best_uniprot_indice, llm=llm)
chain_match_best_uniprot_indice = LLMChain(llm=llm, prompt=prompt_match_best_uniprot_indices, output_parser=output_parser_match_best_uniprot_indice)

enzyme_list = []
substrate_list = []

for pair in result_extract_enzyme_substrate_pairs.pairs:
    # Query UniProt once per entity and reuse the result: the text shown to the
    # LLM and the list indexed by its answer must come from the same response.
    enzyme_text, enzyme_details = query_uniprot_all(pair.enzyme)
    substrate_text, substrate_details = query_uniprot_all(pair.substrate)

    print("\n\033[32mReasoning:\033[0m")
    print("Searched UniProt (uniprot.org) and found several possible matches for both entities:")
    print("\033[32mResult:\033[0m")
    print(
        f"\n\033[33mEnzyme search results\033[0m:\n{enzyme_text}"
        f"\n\n\033[33mSubstrate search results\033[0m:\n{substrate_text}"
    )

    result_match_best_uniprot_indice = match_best_uniprot_indices(
        llm=llm,
        chain=chain_match_best_uniprot_indice,
        prompt_template=prompt_match_best_uniprot_indices,
        full_text=cleaned_text,
        enzyme_name=pair.enzyme,
        substrate_name=pair.substrate,
        enzyme_uniprot_results=enzyme_text,
        substrate_uniprot_results=substrate_text
    )
    print("\n\033[32mReasoning:\033[0m")
    print(result_match_best_uniprot_indice.reasoning)
    print("\033[32mResult:\033[0m")
    print(result_match_best_uniprot_indice.pairs)

    # The indices come from the LLM: reject anything outside the fetched results
    # (a negative index would otherwise silently select from the end).
    selection = result_match_best_uniprot_indice.pairs
    for label, position, candidates in (
        ("enzyme", selection.enzyme, enzyme_details),
        ("substrate", selection.substrate, substrate_details),
    ):
        if not 0 <= position < len(candidates):
            raise IndexError(
                f"LLM selected {label} search_result[{position}], "
                f"but only {len(candidates)} result(s) are available."
            )

    enzyme_list.append(enzyme_details[selection.enzyme])
    substrate_list.append(substrate_details[selection.substrate])

    print(enzyme_list[-1])
    print(substrate_list[-1])


make get request to https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/22065775/
[32mAbstract:[0m
The inactivation of the p53 tumor suppressor pathway in many cancers often increases their resistance to anticancer therapy. Here we show that a previously proposed strategy directed to Wip1 inhibition could be ineffective in tumors lacking p53. On the contrary, Wip1 overexpression sensitized these tumors to chemotherapeutic agents. This effect was mediated through interaction between Wip1 and RUNX2 that resulted, in response to anticancer treatment, in RUNX2-dependent transcriptional induction of the proapoptotic Bax protein. The potentiating effects of Wip1 overexpression on chemotherapeutic agents were directed only to tumor cells lacking p53. The overexpression of Wip1 in normal tissues provided protection from cisplatin-induced apoptosis through decreased strength of upstream signaling to p53. Thus, Wip1 phosphatase promotes apoptosis in p53-negative tumors and protects normal tissues d

# Run with Graphical Interface

In [22]:
import os
from flask import Flask, request, render_template_string
from pyngrok import ngrok

# ================== 1. Read ngrok token and open tunnel ==================
# The token file is read relative to the current working directory.
with open('ngrok_authtoken.txt', 'r') as f:
    NGROK_AUTH_TOKEN = f.read().strip()

ngrok.set_auth_token(NGROK_AUTH_TOKEN)
port = 5000
public_url = ngrok.connect(port).public_url
print(f"Public URL: {public_url}")

# ================== 2. Initialize Flask app ==================
app = Flask(__name__)
# The app is reachable from the public internet through the tunnel, so do not use a
# hard-coded, guessable secret: take it from the environment when provided,
# otherwise generate a random one for this run.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or os.urandom(32)

# ================== 3. Global variables ==================
#     Store: abstract text, a combined list of reasoning+result (reason_result_list),
#     and final lists for enzyme and substrate IDs.
#     These are module-level state shared by every request handled by index().
#     NOTE(review): enzyme_list / substrate_list reuse the names of the lists built
#     by the script cell above and therefore replace them in the notebook namespace.
abstract_text = ""
reason_result_list = []  # Each element: {"reasoning": "...", "result": "..."}
enzyme_list = []
substrate_list = []

# ================== 4. Three-column layout HTML template ==================
# Jinja template rendered by index() through render_template_string.
# Variables it reads: current_pmid, abstract_text, reason_result_list (items with
# .reasoning / .result) and enz_sub_pairs (iterable of (enzyme, substrate) pairs).
# Both forms POST back to the same URL; the submit button's "action" value
# ("search" or "extract") tells index() which step to run.
HTML_TEMPLATE = r"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Flask + pyngrok Demo</title>
    <style>
        /* Overall Page & Body */
        body {
            margin: 0;
            padding: 0;
            font-family: "Segoe UI", "Roboto", "Helvetica Neue", Arial, sans-serif;
            background-color: #f0f2f5; /* Light background, akin to ChatGPT day mode */
            color: #333;
            height: 100vh; /* Ensures body fills screen height */
        }

        /* Container that holds the three columns */
        .container {
            display: flex;
            flex-direction: row;
            width: 100%;
            height: 100%;
            box-sizing: border-box;
        }

        /* Left, Middle, and Right Columns */
        .left, .middle, .right {
            padding: 20px;
            overflow-y: auto;
            box-sizing: border-box;
            background-color: #fff; /* White background for contrast */
        }
        .left {
            flex: 0 0 25%;
            border-right: 1px solid #e0e0e0;
        }
        .middle {
            flex: 1;
            border-right: 1px solid #e0e0e0;
        }
        .right {
            flex: 0 0 25%;
        }

        /* Headings in columns */
        h3 {
            margin: 0 0 10px 0;
            font-size: 18px;
            font-weight: bold;
            color: #333;
        }

        /* Abstract box and Reasoning/Result boxes */
        .abstract-box,
        .reason-result-box {
            margin-top: 5px;
            padding: 15px;
            background-color: #fff;
            border: 1px solid #e0e0e0;
            border-radius: 5px;
            white-space: pre-wrap;
        }

        /* Smaller label-like headings for "Reasoning" and "Result" */
        .title {
            font-size: 14px;
            font-weight: bold;
            margin-bottom: 5px;
            color: #555;
        }

        /* Horizontal line divider inside reason-result-box */
        hr {
            border: none;
            border-top: 1px solid #eee;
            margin: 5px 0;
        }

        /* Table for final enzyme/substrate info */
        table {
            border-collapse: collapse;
            margin-top: 10px;
            width: 100%;
            background-color: #fff;
        }
        td, th {
            border: 1px solid #e0e0e0;
            padding: 8px;
            text-align: left;
            font-size: 14px;
        }

        /* Form elements */
        form {
            margin-top: 10px;
        }
        label {
            font-size: 14px;
            margin-bottom: 5px;
            display: inline-block;
        }
        input[type="text"] {
            padding: 6px;
            font-size: 14px;
            margin-top: 5px;
            box-sizing: border-box;
            border: 1px solid #ccc;
            border-radius: 4px;
        }
        button {
            cursor: pointer;
            padding: 8px 16px;
            font-size: 14px;
            border: none;
            border-radius: 4px;
            background-color: #4caf50;
            color: #fff;
        }
        button:hover {
            background-color: #43a047;
        }
    </style>
</head>
<body>
<div class="container">

    <!-- Left Column: Enter PMID & retrieve abstract -->
    <div class="left">
        <h3>Enter PMID</h3>
        <form method="POST">
            <label for="pmid">PMID:</label><br>
            <input type="text" id="pmid" name="pmid" style="width:80%;" required
            value="{{ current_pmid }}">
            <br><br>
            <button type="submit" name="action" value="search">Search Abstract</button>
        </form>

        {% if abstract_text %}
        <div class="abstract-box">
            <strong>Abstract Retrieved:</strong><br>
            {{ abstract_text }}
        </div>
        <form method="POST">
            <input type="hidden" name="pmid" value="{{ current_pmid }}">
            <button type="submit" name="action" value="extract">Extract Information</button>
        </form>
        {% endif %}
    </div>

    <!-- Middle Column: Combined Reasoning + Result in one box -->
    <div class="middle">
        <h3>Reasoning &amp; Results</h3>
        {% for rr in reason_result_list %}
        <div class="reason-result-box">
            <div class="title">Reasoning:</div>
            {{ rr.reasoning }}
            <hr>
            <div class="title">Result:</div>
            {{ rr.result }}
        </div>
        {% endfor %}
    </div>

    <!-- Right Column: Display final enzyme_list & substrate_list -->
    <div class="right">
        <h3>Final Enzyme / Substrate Information</h3>
        <table>
            <tr>
                <th>Enzyme</th>
                <th>Substrate</th>
            </tr>
            {% for e, s in enz_sub_pairs %}
            <tr>
                <td>{{ e }}</td>
                <td>{{ s }}</td>
            </tr>
            {% endfor %}
        </table>
    </div>

</div>
</body>
</html>

"""

# ================== 5. Main route: Three-column website ==================
def _record_step(reasoning, result):
    """Append one reasoning/result entry to the middle-column log."""
    reason_result_list.append({"reasoning": reasoning, "result": result})


def _select_candidate(candidates, position, label):
    """Return candidates[position], rejecting an index outside the search results."""
    if not 0 <= position < len(candidates):
        raise IndexError(
            f"LLM selected {label} search_result[{position}], "
            f"but only {len(candidates)} result(s) are available."
        )
    return candidates[position]


def _run_extraction(pmid):
    """Run the full pipeline for one PMID and fill the global result lists.

    Steps: fetch the abstract, extract enzyme/substrate pairs with the LLM, search
    UniProt for both names of every pair and let the LLM pick the best hits.
    A failure for one pair is logged and does not stop the remaining pairs; any
    other failure propagates to the caller.
    """
    global enzyme_list, substrate_list

    # Start from a clean slate so a repeated "Extract" click does not append a
    # second copy of every step to the log.
    reason_result_list.clear()
    enzyme_list = []
    substrate_list = []

    # Step 1: Retrieve & Abstract
    retriever = ArticleRetriever()
    _, html_content, _ = retriever.request_article(pmid)
    cleaned_text = extract_abstract(html_content)
    _record_step("Extracted the abstract from article", f"Abstract:\n{cleaned_text}")

    # Step 2: Set GOOGLE_API_KEY. os.environ only accepts strings, so a missing
    # key is reported explicitly instead of failing with a TypeError.
    gemini_api_key = os.environ.get("GEMINI_15_API_KEY")
    if not gemini_api_key:
        raise RuntimeError("GEMINI_15_API_KEY is not set; load your .env file first.")
    os.environ["GOOGLE_API_KEY"] = gemini_api_key

    llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0)

    extraction_parser = OutputFixingParser.from_llm(
        parser=PydanticOutputParser(pydantic_object=RelationshipList), llm=llm
    )
    extraction_chain = LLMChain(
        llm=llm, prompt=prompt_extract_enzyme_substrate_pairs, output_parser=extraction_parser
    )
    extraction = extract_enzyme_substrate_pairs(
        llm, extraction_chain, prompt_extract_enzyme_substrate_pairs, cleaned_text
    )

    # Combine reasoning & result for the first extraction
    pairs_str = "".join(
        f"{pair.enzyme} acts on {pair.substrate}\n" for pair in extraction.pairs
    )
    _record_step(extraction.reasoning, pairs_str)

    # Step 3: The matching parser/chain do not depend on the pair: build them once.
    matching_parser = OutputFixingParser.from_llm(
        parser=PydanticOutputParser(pydantic_object=MatchSelection), llm=llm
    )
    matching_chain = LLMChain(
        llm=llm, prompt=prompt_match_best_uniprot_indices, output_parser=matching_parser
    )

    # For each pair, do UniProt searching, matching, etc.
    for pair in extraction.pairs:
        try:
            # Part A: Searching UniProt (once per entity; the details are reused
            # below so the chosen index refers to the same response the LLM saw).
            enzyme_text, enzyme_details = query_uniprot_all(pair.enzyme)
            substrate_text, substrate_details = query_uniprot_all(pair.substrate)
            _record_step(
                "Searched UniProt (uniprot.org) for possible matches",
                f"\n**Enzyme search results**:\n    {enzyme_text}"
                f"\n    \n**Substrate search results**:\n    {substrate_text}"
            )

            # Part B: Matching best UniProt indices
            match = match_best_uniprot_indices(
                llm=llm,
                chain=matching_chain,
                prompt_template=prompt_match_best_uniprot_indices,
                full_text=cleaned_text,
                enzyme_name=pair.enzyme,
                substrate_name=pair.substrate,
                enzyme_uniprot_results=enzyme_text,
                substrate_uniprot_results=substrate_text
            )
            _record_step(
                match.reasoning,
                f"Use search_result[{match.pairs.enzyme}] as the Enzyme and search_result[{match.pairs.substrate}] as the Substrate."
            )

            # Finally, store the chosen IDs. Resolve both before appending so the
            # two lists always stay the same length.
            best_enzyme = _select_candidate(enzyme_details, match.pairs.enzyme, "enzyme")
            best_substrate = _select_candidate(substrate_details, match.pairs.substrate, "substrate")
            enzyme_list.append(best_enzyme)
            substrate_list.append(best_substrate)
        except Exception as ex:
            # Keep going with the remaining pairs, but tell the user which pair
            # was dropped and why instead of hiding the failure.
            _record_step(
                f"Skipped pair '{pair.enzyme}' / '{pair.substrate}' because UniProt search or matching failed",
                str(ex)
            )

    # Summarize final enzyme_list and substrate_list
    _record_step(
        "Collected final enzyme and substrate IDs",
        f"Enzyme List: {enzyme_list}\nSubstrate List: {substrate_list}"
    )


@app.route("/", methods=["GET", "POST"])
def index():
    """Render the three-column page and handle the two form actions.

    GET renders the current state. POST with ``action=search`` fetches the
    abstract for the submitted PMID and resets earlier results; POST with
    ``action=extract`` (only when an abstract is present) runs the extraction
    pipeline. Errors are shown on the page instead of being raised.
    """
    global abstract_text

    current_pmid = ""

    if request.method == 'POST':
        action = request.form.get("action", "")
        pmid_input = request.form.get("pmid", "").strip()
        current_pmid = pmid_input  # Used for display if user re-renders the page

        if action == "search":
            # ========== 1) Retrieve and extract abstract by PMID ==========
            #     Clear old reason/result data each time a new search is done
            try:
                retriever = ArticleRetriever()
                _, html_content, _ = retriever.request_article(pmid_input)
                abstract_text = extract_abstract(html_content)

                reason_result_list.clear()
                enzyme_list.clear()
                substrate_list.clear()

            except Exception as e:
                # If retrieval fails, show it in the abstract box
                abstract_text = f"Failed to retrieve abstract: {str(e)}"

        elif action == "extract" and abstract_text:
            # ========== 2) Use the core backend code to extract/parse ==========
            try:
                _run_extraction(pmid_input)
            except Exception as ex:
                _record_step("Error occurred while extracting information", str(ex))

    # Construct the right-column pairs of (enzyme_list, substrate_list)
    enz_sub_pairs = list(zip(enzyme_list, substrate_list))

    return render_template_string(
        HTML_TEMPLATE,
        abstract_text=abstract_text,
        reason_result_list=reason_result_list,
        enz_sub_pairs=enz_sub_pairs,
        current_pmid=current_pmid
    )

# ================== 6. Entry point: Run the Flask app ==================
# In the notebook __name__ is "__main__", so this starts the server on `port`
# (the same port the ngrok tunnel above was opened for). The cell keeps running
# while the server is up.
if __name__ == "__main__":
    app.run(port=port)


Public URL: https://3e4d-35-229-229-180.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [26/Mar/2025 21:09:38] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [26/Mar/2025 21:09:38] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


make get request to https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/22065775/


INFO:werkzeug:127.0.0.1 - - [26/Mar/2025 21:09:48] "POST / HTTP/1.1" 200 -


make get request to https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/22065775/


INFO:werkzeug:127.0.0.1 - - [26/Mar/2025 21:10:13] "POST / HTTP/1.1" 200 -
