## For first try we hard coded the values, but we later extracted the species name from the biomodel(.xml)

In [38]:
# Cell 1: Imports and species list

import logging
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
# Updated import for embeddings:
from langchain_openai import OpenAIEmbeddings

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# List of species
species_list = [
    "Ab_R",
    "Ab_sR_IL6{gut}",
    "Ab_sR_IL6{liver}",
    "Ab_sR_IL6{serum}",
    "Ab_sR{gut}",
    "Ab_sR{liver}",
    "Ab_sR{serum}",
    "Ab{gut}",
    "Ab{liver}",
    "Ab{peripheral}",
    "Ab{serum}",
    "CRP (% of baseline)",
    "CRP Suppression (%)",
    "CRPExtracellular",
    "CRP{liver}",
    "CRP{serum}",
    "geneProduct",
    "gp130{gut}",
    "gp130{liver}",
    "IL6{gut}",
    "IL6{liver}",
    "IL6{serum}",
    "pSTAT3{gut}",
    "pSTAT3{liver}",
    "R",
    "R_IL6",
    "R_IL6_gp130{gut}",
    "R_IL6_gp130{liver}",
    "Reactive{gut}",
    "Reactive{liver}",
    "sgp130{gut}",
    "sgp130{liver}",
    "sgp130{serum}",
    "sR_IL6_sgp130{gut}",
    "sR_IL6_sgp130{liver}",
    "sR_IL6_sgp130{serum}",
    "sR_IL6{gut}",
    "sR_IL6{liver}",
    "sR_IL6{serum}",
    "sR{gut}",
    "sR{liver}",
    "sR{serum}",
    "STAT3{gut}",
    "STAT3{liver}"
]

print("Available species:", species_list)


Available species: ['Ab_R', 'Ab_sR_IL6{gut}', 'Ab_sR_IL6{liver}', 'Ab_sR_IL6{serum}', 'Ab_sR{gut}', 'Ab_sR{liver}', 'Ab_sR{serum}', 'Ab{gut}', 'Ab{liver}', 'Ab{peripheral}', 'Ab{serum}', 'CRP (% of baseline)', 'CRP Suppression (%)', 'CRPExtracellular', 'CRP{liver}', 'CRP{serum}', 'geneProduct', 'gp130{gut}', 'gp130{liver}', 'IL6{gut}', 'IL6{liver}', 'IL6{serum}', 'pSTAT3{gut}', 'pSTAT3{liver}', 'R', 'R_IL6', 'R_IL6_gp130{gut}', 'R_IL6_gp130{liver}', 'Reactive{gut}', 'Reactive{liver}', 'sgp130{gut}', 'sgp130{liver}', 'sgp130{serum}', 'sR_IL6_sgp130{gut}', 'sR_IL6_sgp130{liver}', 'sR_IL6_sgp130{serum}', 'sR_IL6{gut}', 'sR_IL6{liver}', 'sR_IL6{serum}', 'sR{gut}', 'sR{liver}', 'sR{serum}', 'STAT3{gut}', 'STAT3{liver}']


## Load from the unannotated biomodel

In [39]:
import basico
 
print(basico.__version__)
# load SBML model
model = basico.load_model("docs/notebooks/talk2biomodels/Dwivedi_Model537_empty.xml")
# get the model's species
# print(basico.get_species())
 
# load SBML model
model = basico.load_model("docs/notebooks/talk2biomodels/Dwivedi_Model537_empty.xml")
# get the model's species
print(basico.get_species())
 
# get model species
species = basico.get_species()
species_list = species["display_name"].tolist()
print(species_list)
 
 

0.78
                    compartment        type    unit  initial_concentration  \
name                                                                         
IL6                       serum   reactions  nmol/l           4.356289e-04   
sgp130                    serum   reactions  nmol/l           3.900000e+00   
sR_IL6_sgp130             serum   reactions  nmol/l           8.740607e-02   
CRP                       serum   reactions  nmol/l           2.210637e+02   
sR                        serum   reactions  nmol/l           4.253507e+00   
sR_IL6                    serum   reactions  nmol/l           1.094243e-03   
Ab                        serum   reactions  nmol/l           2.381820e-29   
Ab_sR                     serum   reactions  nmol/l           6.104391e-26   
Ab_sR_IL6                 serum   reactions  nmol/l          -7.413309e-29   
CRP Suppression (%)       serum  assignment  nmol/l          -0.000000e+00   
CRP (% of baseline)       serum  assignment  nmol/l        

## Getting the description for just one specifc species name from the list

In [40]:
import logging
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Initialize logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the updated OpenAI embeddings and LLM (ensure your API key is set)
text_embedding_model = OpenAIEmbeddings(openai_api_key="api_key")
llm = OpenAI(temperature=0, openai_api_key="api_key")

def query_species_description(pdf_file_name, text_embedding_model, species_name):
    """
    Load a PDF, embed its pages, perform a similarity search using a query that includes
    the species name, and then generate a concise summary description of the species in the
    context of the article using an LLM.
    """
    # Construct the retrieval query prompt (preserving curly braces in species_name)
# Construct the retrieval query prompt (preserving curly braces in species_name)
    retrieval_prompt = (
        f"Given the name of the species '{species_name}', please provide a concise description explaining its biological or functional significance. "
        "Your description should be clear, standalone, and should not reference the source article or include any citations."
    )
    logger.info("Retrieval Query Prompt: %s", retrieval_prompt)

    
    # Load the PDF pages using PyPDFLoader.
    loader = PyPDFLoader(pdf_file_name)
    pages = [page for page in loader.lazy_load()]
    logger.info("Loaded %d pages from the PDF", len(pages))
    
    # Create a vector store from the pages using the provided embedding model.
    vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
    logger.info("Performing similarity search with the embedded query")
    
    # Retrieve documents based on the query
    docs = vector_store.similarity_search(retrieval_prompt)
    retrieved_text = "\n".join([doc.page_content for doc in docs])
    
    # Truncate the retrieved text if it exceeds a reasonable length to fit the model's context window.
    max_chars = 3000
    if len(retrieved_text) > max_chars:
        retrieved_text = retrieved_text[:max_chars]
        logger.info("Retrieved text truncated to %d characters.", max_chars)
    
    # Define a prompt template for summarization.
    summary_prompt = PromptTemplate(
        input_variables=["retrieved_text", "species_name"],
        template=(
            "You are provided with the following text excerpts:\n\n"
            "{retrieved_text}\n\n"
            "Based on this information, provide a concise, clear summary that explains what the species '{species_name}' means. "
            "Focus solely on the biological or functional description of the species and do not refer to the source text, article, or citations."
        )
    )

    
    # Create a chain that uses the LLM and the prompt template.
    chain = LLMChain(llm=llm, prompt=summary_prompt)
    summary = chain.run({"retrieved_text": retrieved_text, "species_name": species_name})
    return summary

# Specify the path to your PDF file (update this path as needed)
pdf_file_path = "psp201364a.pdf"  # <-- Replace with your actual file path

# Select a species (for example, "IL6{liver}" with curly braces preserved)
species_name = "IL6{liver}"

# Query the PDF and generate the species description summary
species_summary = query_species_description(pdf_file_path, text_embedding_model, species_name)

print("Species Summary:\n", species_summary)


INFO:__main__:Retrieval Query Prompt: Given the name of the species 'IL6{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.
INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"


Species Summary:
 

IL6{liver} is a species that plays a role in immune regulation in Crohn's disease. It is a complex formed by the interaction of IL-6 and sIL-6Rα, and targeting this complex may lead to greater efficacy in modulating immunological biomarkers.


## Store the output in to a dictionary with key value pairs

In [41]:
# Create a dictionary to store species descriptions
species_descriptions = {}

# Example: Use a species from your species list
species_name = "IL6{liver}"

# Query the PDF to generate the description for the species
description = query_species_description(pdf_file_path, text_embedding_model, species_name)

# Save the result in the dictionary
species_descriptions[species_name] = description

# Print out the dictionary with the species and its corresponding description
print("Species Description Dictionary:")
for key, value in species_descriptions.items():
    print(f"{key}:\n{value}\n")


INFO:__main__:Retrieval Query Prompt: Given the name of the species 'IL6{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.
INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"


Species Description Dictionary:
IL6{liver}:


IL6{liver} is a species that plays a role in immune regulation in Crohn's disease. It is a complex formed by the interaction of IL-6 and sIL-6Rα, and targeting this complex may lead to greater efficacy in modulating immunological biomarkers.



## Now we Iterate over all the species in the list and generate description for it

In [42]:
# Create an empty dictionary to store descriptions for each species
all_species_descriptions = {}

# Iterate over each species in the species list
for species in species_list:
    try:
        print(f"Processing species: {species}")
        # Query the PDF for the description of the current species
        description = query_species_description(pdf_file_path, text_embedding_model, species)
        # Save the description in the dictionary
        all_species_descriptions[species] = description
    except Exception as e:
        print(f"Error processing {species}: {e}")
        all_species_descriptions[species] = f"Error: {e}"

# Print the dictionary of species descriptions
print("\nAll Species Descriptions:")
for species, desc in all_species_descriptions.items():
    print(f"\nSpecies: {species}\nDescription:\n{desc}\n")


INFO:__main__:Retrieval Query Prompt: Given the name of the species 'IL6{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: IL6{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sgp130{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sgp130{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR_IL6_sgp130{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR_IL6_sgp130{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'CRP{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: CRP{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR_IL6{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR_IL6{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_sR{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_sR{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_sR_IL6{serum}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_sR_IL6{serum}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'CRP Suppression (%)', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: CRP Suppression (%)


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'CRP (% of baseline)', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: CRP (% of baseline)


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'gp130{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: gp130{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'R_IL6_gp130{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: R_IL6_gp130{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR_IL6{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR_IL6{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'R', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: R


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'IL6{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: IL6{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'R_IL6', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: R_IL6


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ractive{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ractive{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'STAT3{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: STAT3{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'pSTAT3{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: pSTAT3{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'CRP{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: CRP{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'CRPExtracellular', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: CRPExtracellular


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sgp130{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sgp130{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR_IL6_sgp130{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR_IL6_sgp130{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_sR{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_sR{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_R', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_R


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_sR_IL6{liver}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_sR_IL6{liver}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR_IL6{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR_IL6{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'gp130{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: gp130{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'R_IL6_gp130{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: R_IL6_gp130{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ractive{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ractive{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'STAT3{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: STAT3{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'pSTAT3{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: pSTAT3{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'geneProduct', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: geneProduct


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'IL6{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: IL6{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sgp130{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sgp130{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'sR_IL6_sgp130{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: sR_IL6_sgp130{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_sR{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_sR{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab_sR_IL6{gut}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab_sR_IL6{gut}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
INFO:__main__:Retrieval Query Prompt: Given the name of the species 'Ab{peripheral}', please provide a concise description explaining its biological or functional significance. Your description should be clear, standalone, and should not reference the source article or include any citations.


Processing species: Ab{peripheral}


INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search with the embedded query
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"



All Species Descriptions:

Species: IL6{serum}
Description:


IL6{serum} is a complex of the cytokine IL-6 and the soluble form of its receptor (sIL-6Rα) found in the blood. It plays a role in immune regulation and has been proposed as a therapeutic target for diseases such as Crohn's disease. Targeting this complex alone may not be as effective as targeting IL-6 or sIL-6Rα individually due to the high baseline levels of sIL-6Rα in circulation.


Species: sgp130{serum}
Description:


The species 'sgp130{serum}' refers to a natural antagonist of IL-6 trans-signal-ing, which is a potential therapeutic target for Crohn's disease. It is a component of a multiscale model of IL-6-mediated immune regulation and is involved in regulating the immune response in the body.


Species: sR_IL6_sgp130{serum}
Description:


The species 'sR_IL6_sgp130{serum}' refers to a protein complex involved in the immune response in Crohn's disease. It is formed by the binding of IL-6 and sgp130, and is found in 

In [43]:
# import json

# print(json.dumps(all_species_descriptions, indent=4))
import json

# Create a new dictionary with cleaned values (removing newline characters)
cleaned_species_descriptions = {
    species: description.replace("\n", " ").strip()
    for species, description in all_species_descriptions.items()
}

# Print the cleaned dictionary in JSON format with curly braces
print(json.dumps(cleaned_species_descriptions, indent=4))


{
    "IL6{serum}": "IL6{serum} is a complex of the cytokine IL-6 and the soluble form of its receptor (sIL-6R\u03b1) found in the blood. It plays a role in immune regulation and has been proposed as a therapeutic target for diseases such as Crohn's disease. Targeting this complex alone may not be as effective as targeting IL-6 or sIL-6R\u03b1 individually due to the high baseline levels of sIL-6R\u03b1 in circulation.",
    "sgp130{serum}": "The species 'sgp130{serum}' refers to a natural antagonist of IL-6 trans-signal-ing, which is a potential therapeutic target for Crohn's disease. It is a component of a multiscale model of IL-6-mediated immune regulation and is involved in regulating the immune response in the body.",
    "sR_IL6_sgp130{serum}": "The species 'sR_IL6_sgp130{serum}' refers to a protein complex involved in the immune response in Crohn's disease. It is formed by the binding of IL-6 and sgp130, and is found in the serum. This complex is a potential target for therapeut

# 

# All the species name concatenated, getting sumary for each species

In [44]:
# # Define a function to generate one summary for all species in the list
# def query_all_species_description(pdf_file_name, text_embedding_model, species_list):
#     # Concatenate all species names into a single string, separated by commas.
#     concatenated_species = ", ".join(species_list)
    
#     # Construct a retrieval query prompt that uses the concatenated species list.
#     retrieval_prompt = (
#         f"Given the following list of species names: {concatenated_species}, "
#         "please provide a concise summary describing the roles, significance, and interrelationships "
#         "of these species, focusing solely on their biological characteristics."
#     )
#     logger.info("Retrieval Query Prompt for all species: %s", retrieval_prompt)

    
#     # Load the PDF pages using PyPDFLoader.
#     loader = PyPDFLoader(pdf_file_name)
#     pages = [page for page in loader.lazy_load()]
#     logger.info("Loaded %d pages from the PDF", len(pages))
    
#     # Create a vector store from the pages using the provided embedding model.
#     vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
#     logger.info("Performing similarity search with the embedded query for all species")
    
#     # Retrieve the relevant documents using the concatenated prompt.
#     docs = vector_store.similarity_search(retrieval_prompt)
#     retrieved_text = "\n".join([doc.page_content for doc in docs])
    
#     # If the retrieved text is too long, truncate it to fit within the model's context window.
#     max_chars = 3000
#     if len(retrieved_text) > max_chars:
#         retrieved_text = retrieved_text[:max_chars]
#         logger.info("Retrieved text truncated to %d characters.", max_chars)
    
#     # Create a prompt template for summarizing the retrieved text.
#     summary_prompt = PromptTemplate(
#         input_variables=["retrieved_text", "concatenated_species"],
#         template=(
#             "You are provided with the following information:\n\n"
#             "{retrieved_text}\n\n"
#             "Based on this information, provide a concise summary that explains the roles, significance, "
#             "and interrelationships of the following species:\n\n"
#             "{concatenated_species}\n\n"
#             "Focus solely on the biological or functional characteristics of these species. Do not mention any "
#             "article or include references or citations in your summary."
#         )
#     )
    
#     # Create an LLM chain with the LLM and the prompt template.
#     chain = LLMChain(llm=llm, prompt=summary_prompt)
#     summary = chain.run({"retrieved_text": retrieved_text, "concatenated_species": concatenated_species})
#     return summary

# # Now call the function using the entire species list.
# all_species_summary = query_all_species_description(pdf_file_path, text_embedding_model, species_list)

# print("Summary for all species:\n", all_species_summary)


# All the species name concatenated
# One long string

In [45]:
# Define a function to generate one summary for all species in the list
def query_all_species_description(pdf_file_name, text_embedding_model, species_list):
    # Concatenate all species names into a single comma-separated string.
    concatenated_species = ", ".join(species_list)
    
    # Construct the retrieval query prompt using the concatenated species list.
    retrieval_prompt = (
        f"Given the following list of species names: {concatenated_species}, "
        "please provide a concise summary describing the roles, significance, and interrelationships "
        "of these species, focusing solely on their biological characteristics."
    )
    logger.info("Retrieval Query Prompt for all species: %s", retrieval_prompt)

    # Load the PDF pages using PyPDFLoader.
    loader = PyPDFLoader(pdf_file_name)
    pages = [page for page in loader.lazy_load()]
    logger.info("Loaded %d pages from the PDF", len(pages))
    
    # Create a vector store from the pages using the provided embedding model.
    vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
    logger.info("Performing similarity search with the embedded query for all species")
    
    # Retrieve the relevant documents using the concatenated prompt.
    docs = vector_store.similarity_search(retrieval_prompt)
    retrieved_text = "\n".join([doc.page_content for doc in docs])
    
    # If the retrieved text is too long, truncate it to fit within the model's context window.
    max_chars = 3000
    if len(retrieved_text) > max_chars:
        retrieved_text = retrieved_text[:max_chars]
        logger.info("Retrieved text truncated to %d characters.", max_chars)
    
    # Create a prompt template for summarizing the retrieved text into one cohesive narrative.
    summary_prompt = PromptTemplate(
        input_variables=["retrieved_text", "concatenated_species"],
        template=(
            "You are provided with the following information:\n\n"
            "{retrieved_text}\n\n"
            "Based on this information, provide a single cohesive paragraph that explains the roles, significance, "
            "and interrelationships of the following species:\n\n"
            "{concatenated_species}\n\n"
            "Write your summary as a continuous narrative that seamlessly integrates the descriptions of each species. "
            "Focus solely on their biological or functional characteristics without mentioning the article or including citations."
        )
    )
    
    # Create an LLM chain with the LLM and the prompt template.
    chain = LLMChain(llm=llm, prompt=summary_prompt)
    summary = chain.run({"retrieved_text": retrieved_text, "concatenated_species": concatenated_species})
    return summary

# Now call the function using the entire species list.
all_species_summary = query_all_species_description(pdf_file_path, text_embedding_model, species_list)

print("Summary for all species:\n", all_species_summary)


INFO:__main__:Retrieval Query Prompt for all species: Given the following list of species names: IL6{serum}, sgp130{serum}, sR_IL6_sgp130{serum}, CRP{serum}, sR{serum}, sR_IL6{serum}, Ab{serum}, Ab_sR{serum}, Ab_sR_IL6{serum}, CRP Suppression (%), CRP (% of baseline), gp130{liver}, R_IL6_gp130{liver}, sR_IL6{liver}, R, IL6{liver}, R_IL6, Ractive{liver}, STAT3{liver}, pSTAT3{liver}, CRP{liver}, sR{liver}, CRPExtracellular, sgp130{liver}, sR_IL6_sgp130{liver}, Ab_sR{liver}, Ab{liver}, Ab_R, Ab_sR_IL6{liver}, sR_IL6{gut}, gp130{gut}, R_IL6_gp130{gut}, Ractive{gut}, STAT3{gut}, pSTAT3{gut}, geneProduct, sR{gut}, IL6{gut}, sgp130{gut}, sR_IL6_sgp130{gut}, Ab{gut}, Ab_sR{gut}, Ab_sR_IL6{gut}, Ab{peripheral}, please provide a concise summary describing the roles, significance, and interrelationships of these species, focusing solely on their biological characteristics.
INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK

Summary for all species:
 

IL6 is a cytokine that plays a crucial role in immune regulation in Crohn's disease. It can bind to its receptor, sgp130, to initiate signaling pathways that lead to the activation of STAT3. This activation can result in the production of CRP, a biomarker of inflammation. In order to regulate IL6 signaling, various therapeutic strategies have been proposed, including targeting IL6 or its receptor, using sgp130 as a natural antagonist, or inhibiting both classical and trans signaling pathways. These strategies are currently being studied in clinical trials, but data on their efficacy is limited. To better understand the effects of these strategies, a multiscale system model has been developed that integrates knowledge about IL6 signaling at the cellular, organ, and systemic levels. This model includes a simplified version of the IL6 signaling pathway, as well as the target organs relevant in Crohn's disease. It also incorporates a pharmacokinetic model for mo

# Just the PDF Embedding Description

In [46]:
import logging
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Initialize logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the OpenAI embeddings and LLM (ensure your API key is set)
text_embedding_model = OpenAIEmbeddings(openai_api_key="api_key")
llm = OpenAI(temperature=0, openai_api_key="api_key")


def query_pdf_summary(pdf_file_name, text_embedding_model):
    """
    Load a PDF, embed its pages, perform a similarity search using a generic query,
    and then generate a concise summary of the content that focuses solely on the scientific insights,
    without referring to any study, article, or publication details.
    """
    # Construct a retrieval prompt that doesn't reference any publication details.
    retrieval_prompt = (
        "Provide a concise summary of the content, highlighting the main points and findings, "
        "without mentioning any species or publication details."
    )
    logger.info("Retrieval Query Prompt for summary: %s", retrieval_prompt)
    
    # Load the PDF pages.
    loader = PyPDFLoader(pdf_file_name)
    pages = [page for page in loader.lazy_load()]
    logger.info("Loaded %d pages from the PDF", len(pages))
    
    # Create a vector store from the pages using the embedding model.
    vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
    logger.info("Performing similarity search for summary")
    
    # Retrieve relevant documents based on the generic prompt.
    docs = vector_store.similarity_search(retrieval_prompt)
    retrieved_text = "\n".join([doc.page_content for doc in docs])
    
    # Truncate the retrieved text if needed to fit within the model's context window.
    max_chars = 3000
    if len(retrieved_text) > max_chars:
        retrieved_text = retrieved_text[:max_chars]
        logger.info("Retrieved text truncated to %d characters.", max_chars)
    
    # Define an updated prompt template that instructs the LLM to provide a cohesive narrative
    # without mentioning any study, article, or publication details.
    summary_prompt = PromptTemplate(
        input_variables=["retrieved_text"],
        template=(
            "You are provided with the following excerpts:\n\n"
            "{retrieved_text}\n\n"
            "Based on this information, write a concise, standalone summary that explains the scientific insights, "
            "key points, and main findings regarding the use of systems pharmacology in understanding complex diseases such as Crohn's disease. "
            "Focus on explaining the role of interleukin-6 (IL-6), its receptor interactions, and the use of multiscale models "
            "in drug discovery and development. Do not mention that the content comes from a study, article, or any publication."
        )
    )
    
    # Create an LLM chain with the LLM and the updated prompt template.
    chain = LLMChain(llm=llm, prompt=summary_prompt)
    summary = chain.run({"retrieved_text": retrieved_text})
    return summary

# Specify the path to your PDF file (update this as needed)
pdf_file_path = "psp201364a.pdf"  # <-- Replace with your actual PDF file path

# Generate a summary description of the entire PDF without referencing study or publication details.
pdf_summary = query_pdf_summary(pdf_file_path, text_embedding_model)

print("PDF Summary:\n", pdf_summary)


INFO:__main__:Retrieval Query Prompt for summary: Provide a concise summary of the content, highlighting the main points and findings, without mentioning any species or publication details.
INFO:__main__:Loaded 9 pages from the PDF
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Performing similarity search for summary
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:__main__:Retrieved text truncated to 3000 characters.
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"


PDF Summary:
 

Systems pharmacology, a multidisciplinary approach combining systems biology and PK/PD modeling, has been increasingly utilized in drug discovery and development to understand complex diseases such as Crohn's disease. A key factor in this disease is the proinflammatory cytokine IL-6 and its receptor interactions. Multiscale models have been developed to investigate potential therapeutic strategies and demonstrate the potential of systems pharmacology in understanding the interactions between drugs and complex biological systems underlying diseases. This approach holds promise for developing a holistic understanding of drug-target interactions and improving drug discovery and development.


## Load Model the primekgb pytorch geometric model

In [47]:
# Import necessary libraries
import os
import sys
import pickle
import pandas as pd
import torch
sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.utils.embeddings.ollama import EmbeddingWithOllama
# # Set the logging level for httpx to WARNING to suppress INFO messages
import logging
logging.getLogger("httpx").setLevel(logging.WARNING)

#### Load nomic embed

In [48]:
# Using nomic-ai/nomic-embed-text-v1.5 model
emb_model = EmbeddingWithOllama(model_name='nomic-embed-text')

## Embedding for each species name - description

In [49]:
# Generate embeddings for each species description in the all_species_descriptions dictionary
outputs = {
    key: emb_model.embed_documents([description])
    for key, description in all_species_descriptions.items()
}

# Print each key and its corresponding embedding
for key, embedding in outputs.items():
    print(f"{key}: {embedding}")

IL6{serum}: [[-0.009476925, 0.03406937, -0.18136509, -0.009581383, 0.040284477, -0.070854336, 0.008647992, 0.0009830362, 0.067290165, -0.056489106, -0.019182833, 0.03509883, 0.099507295, -0.024187917, 0.0885131, -0.023229988, -0.0045579793, -0.002462297, 0.036772426, 0.077142686, -0.06051961, -0.020453246, 0.009202199, -0.032843698, 0.062223554, 0.046701524, -0.01840114, 6.740738e-05, -0.005753076, -0.022992805, 0.04727588, -0.022138152, 0.05103602, -0.057428326, 0.008659815, -0.03700168, 0.0077566174, 0.035007693, -0.03317812, 0.016082931, -0.018496666, 0.009251606, 0.07642939, -0.052299116, 0.04023288, 0.056111224, -0.017528132, 0.037925187, 0.045131955, -0.070363946, -0.038367856, -0.055483203, -0.019983774, -0.043834254, 0.06613605, -0.0013929934, -0.040008277, -0.034705214, 0.0033435244, 0.019122643, 0.071812876, 0.05414301, -0.06525527, -0.0035616437, 0.028394321, -0.04017396, -0.044178564, 0.04946491, -0.054630555, -0.027116714, 0.03577823, -0.0111757945, 0.050453823, -0.0025026

## Embedding for a concatenated species description

In [50]:
# Generate embedding for the single summary description of all species
all_species_embedding = emb_model.embed_documents([all_species_summary])

# Print the embedding for the summary
print("Embedding for all species summary:", all_species_embedding)

Embedding for all species summary: [[-0.026586989, 0.06296524, -0.17099059, -0.028831169, 0.05570798, -0.08819349, 0.00037819333, 0.04228348, 0.056750663, -0.039785273, -0.0130337, 0.04755228, 0.11318354, -0.023515489, 0.07477751, -0.03301294, -0.060176175, -0.00398064, 0.020590324, 0.07865021, -0.056615908, -0.029959075, 0.014546391, -0.05233313, 0.07302744, 0.055524837, -0.013193222, -0.001067757, -0.031178232, -0.022759298, 0.05376224, -0.02861187, 0.025374524, -0.061993454, -0.029368335, -0.02612207, 0.015708717, 0.022062488, -0.022627994, -0.010452561, 0.0056094346, 0.009893005, 0.07546864, -0.059123352, -0.004510531, 0.06866869, -0.006897894, 0.032632016, 0.04431806, -0.045187585, -0.027706424, -0.043687977, -0.024101852, -0.03220657, 0.06296168, -0.027301753, -0.043478344, -0.051422212, 0.016184902, -0.014382225, 0.04473387, 0.048960123, -0.045200102, 0.0004173814, 0.039621808, -0.008791292, -0.052421417, 0.052922007, -0.03942522, -0.048565596, 0.052738547, -0.0082830135, 0.0494

## Embedding for just the pdf

In [51]:
# Generate embedding for the PDF summary
pdf_summary_embedding = emb_model.embed_documents([pdf_summary])

# Print the embedding for the PDF summary
print("Embedding for PDF summary:", pdf_summary_embedding)


Embedding for PDF summary: [[0.008728662, 0.075855896, -0.16407794, -0.016984804, 0.027395817, -0.072660185, -0.019700421, -0.016954008, 0.01817495, -0.036783718, 0.03665849, 0.039021403, 0.14888068, -0.014098557, -0.0010082372, -0.041262686, -0.035919935, 0.01547976, -0.0064209276, 0.031283453, -0.062177986, -0.044028927, 0.0056007183, -0.034105793, 0.029923663, 0.07574344, -0.018804286, -0.039978087, -0.038132757, -0.019764429, 0.033404525, -0.010265762, -0.014922334, -0.03777369, -0.051157393, -0.039571475, 0.033884764, 0.01616549, -0.02582273, 8.784593e-05, -0.00091273215, 0.033312287, 0.044714797, -0.034856338, -0.008565525, 0.0234918, -0.017637981, 0.015961803, 0.017642768, -0.008952924, 0.0073094135, -0.010835046, -0.032119747, -0.018081054, 0.07836729, 0.007436607, 0.0074367146, -0.007850862, 0.018911919, -0.04816547, 0.087054886, 0.067486286, -0.011199943, 0.013944195, 0.07829493, -0.042779703, -0.021497468, 0.044745665, -0.021385336, -0.068284504, 0.039911088, -0.022368303, 0

In [52]:
# Load the knowledge graph
pyg_file = "aiagents4pharma/talk2knowledgegraphs/tests/files/primekg_ibd_pyg_graph.pkl"
with open(pyg_file, "rb") as f:
    pyg_data = pickle.load(f)


In [53]:
# Convert the PyG data to a pandas DataFrame for node
df_nodes = pd.DataFrame({
    "node_id": pyg_data.node_id,
    "node_name": pyg_data.node_name,
    "node_type": pyg_data.node_type,
    "enriched_node": pyg_data.enriched_node,
    "embedded_node": pyg_data.x.tolist(),
})
df_nodes.head()

Unnamed: 0,node_id,node_name,node_type,enriched_node,embedded_node
0,SMAD3_(144),SMAD3,gene/protein,SMAD3 belongs to gene/protein category. The SM...,"[0.02653600461781025, 0.05420931056141853, -0...."
1,IL10RB_(179),IL10RB,gene/protein,IL10RB belongs to gene/protein category. The p...,"[0.02476494573056698, 0.02278200164437294, -0...."
2,GNA12_(192),GNA12,gene/protein,GNA12 belongs to gene/protein category. Predic...,"[0.00479594711214304, 0.04921527951955795, -0...."
3,HNF4A_(279),HNF4A,gene/protein,HNF4A belongs to gene/protein category. The pr...,"[0.013905026949942112, 0.032602787017822266, -..."
4,VCAM1_(417),VCAM1,gene/protein,VCAM1 belongs to gene/protein category. This g...,"[0.04729974642395973, 0.03262118622660637, -0...."


## Find most similar node per specie description (embedded)

Using cosine similarity.

In [None]:
# Generate embeddings for each species description in the all_species_descriptions dictionary
outputs = {
    key: emb_model.embed_documents([description])[0]
    for key, description in all_species_descriptions.items()
}

# Convert the embeddings to a tensor
embeddings_tensor = torch.tensor([outputs[key] for key in outputs])



# Calculate cosine similarity between the embeddings and the embedded nodes in df_nodes
n_prizes = torch.nn.CosineSimilarity(dim=-1)(embeddings_tensor.unsqueeze(1), torch.tensor(df_nodes.embedded_node.tolist()).unsqueeze(0))

# Print the similarity scores
print(n_prizes)

# Find the index of the maximum similarity score
max_indices = torch.argmax(n_prizes, dim=-1)
print(max_indices)

tensor([[0.5855, 0.6659, 0.5716,  ..., 0.6241, 0.6115, 0.5575],
        [0.6558, 0.6564, 0.6380,  ..., 0.6436, 0.6418, 0.5786],
        [0.6625, 0.6632, 0.6311,  ..., 0.6656, 0.6684, 0.5876],
        ...,
        [0.6826, 0.6838, 0.6573,  ..., 0.6385, 0.6497, 0.6396],
        [0.6316, 0.6662, 0.6207,  ..., 0.6278, 0.6396, 0.6114],
        [0.6475, 0.6175, 0.5964,  ..., 0.6411, 0.6221, 0.6220]])
tensor([  14,   14,   14,  942,   14,   14, 2888,   14,   14,  791,  942,   14,
        3348, 3348, 3348,   14,  794, 2858,    7,    7,   25,   14,  942,  795,
        3348,   14,   33,  795, 3347, 3348, 3348,   14,  942,    7,    7,   14,
         935,   14, 3348,   14,   33,   33, 3348,  942])


In [113]:
embeded_outputs = {
    key: torch.tensor(outputs[key])
    for key in outputs.keys()
}

In [128]:
# Generate embeddings for each species description in the all_species_descriptions dictionary
outputs = {
    key: emb_model.embed_documents([description])[0]
    for key, description in all_species_descriptions.items()
}

# Convert the embeddings to a tensor
#embeddings_tensor = torch.tensor([outputs[key] for key in outputs])

tensored_embeddings = {
    key: torch.tensor(outputs[key])
    for key in outputs.keys()
}

computed_cosines = {
    key:  torch.nn.CosineSimilarity(dim=-1)(tensored_embeddings[key], torch.tensor(df_nodes.embedded_node.tolist()).unsqueeze(0))
    for key in tensored_embeddings.keys()
}

# Calculate cosine similarity between the embeddings and the embedded nodes in df_nodes
#n_prizes = torch.nn.CosineSimilarity(dim=-1)(embeddings_tensor.unsqueeze(1), torch.tensor(df_nodes.embedded_node.tolist()).unsqueeze(0))

# Print the similarity scores
#print(n_prizes)
best_nodes_index = {
    key: int(torch.argmax(computed_cosines[key]))
    for key in computed_cosines.keys()
}
# Find the index of the maximum similarity score
#max_indices = torch.argmax(n_prizes, dim=-1)

In [129]:
best_nodes_index

{'IL6{serum}': 14,
 'sgp130{serum}': 14,
 'sR_IL6_sgp130{serum}': 14,
 'CRP{serum}': 942,
 'sR{serum}': 14,
 'sR_IL6{serum}': 14,
 'Ab{serum}': 2888,
 'Ab_sR{serum}': 14,
 'Ab_sR_IL6{serum}': 14,
 'CRP Suppression (%)': 791,
 'CRP (% of baseline)': 942,
 'gp130{liver}': 14,
 'R_IL6_gp130{liver}': 3348,
 'sR_IL6{liver}': 3348,
 'R': 3348,
 'IL6{liver}': 14,
 'R_IL6': 794,
 'Ractive{liver}': 2858,
 'STAT3{liver}': 7,
 'pSTAT3{liver}': 7,
 'CRP{liver}': 25,
 'sR{liver}': 14,
 'CRPExtracellular': 942,
 'sgp130{liver}': 795,
 'sR_IL6_sgp130{liver}': 3348,
 'Ab_sR{liver}': 14,
 'Ab{liver}': 33,
 'Ab_R': 795,
 'Ab_sR_IL6{liver}': 3347,
 'sR_IL6{gut}': 3348,
 'gp130{gut}': 3348,
 'R_IL6_gp130{gut}': 14,
 'Ractive{gut}': 942,
 'STAT3{gut}': 7,
 'pSTAT3{gut}': 7,
 'geneProduct': 14,
 'sR{gut}': 935,
 'IL6{gut}': 14,
 'sgp130{gut}': 3348,
 'sR_IL6_sgp130{gut}': 14,
 'Ab{gut}': 33,
 'Ab_sR{gut}': 33,
 'Ab_sR_IL6{gut}': 3348,
 'Ab{peripheral}': 942}

In [130]:
specie_node = {
    key: df_nodes.loc[best_nodes_index[key], "node_id"]
    for key in best_nodes_index.keys()
}

print(specie_node)


{'IL6{serum}': 'IL6_(1567)', 'sgp130{serum}': 'IL6_(1567)', 'sR_IL6_sgp130{serum}': 'IL6_(1567)', 'CRP{serum}': 'Crohn disease_(37784)', 'sR{serum}': 'IL6_(1567)', 'sR_IL6{serum}': 'IL6_(1567)', 'Ab{serum}': 'antimicrobial humoral immune response mediated by antimicrobial peptide_(114073)', 'Ab_sR{serum}': 'IL6_(1567)', 'Ab_sR_IL6{serum}': 'IL6_(1567)', 'CRP Suppression (%)': 'Adalimumab_(17584)', 'CRP (% of baseline)': 'Crohn disease_(37784)', 'gp130{liver}': 'IL6_(1567)', 'R_IL6_gp130{liver}': 'IL-6-type cytokine receptor ligand interactions_(128815)', 'sR_IL6{liver}': 'IL-6-type cytokine receptor ligand interactions_(128815)', 'R': 'IL-6-type cytokine receptor ligand interactions_(128815)', 'IL6{liver}': 'IL6_(1567)', 'R_IL6': 'YSIL6_(17587)', 'Ractive{liver}': 'hepatic immune response_(113213)', 'STAT3{liver}': 'STAT3_(729)', 'pSTAT3{liver}': 'STAT3_(729)', 'CRP{liver}': 'CRP_(2384)', 'sR{liver}': 'IL6_(1567)', 'CRPExtracellular': 'Crohn disease_(37784)', 'sgp130{liver}': 'PN0621_(

In [131]:
import re

node_id = {
    key: int(re.sub('[()]', '', specie_node[key].split('_')[-1]))
    for key in specie_node.keys()
}

Get original prime knowledge draph

In [122]:
import sys
sys.path.append('../../..')
from aiagents4pharma.talk2knowledgegraphs.datasets.primekg import PrimeKG

In [123]:
primekg_data = PrimeKG(local_dir=r"/data/primekg")

In [124]:
# Invoke a method to load the data
primekg_data.load_data()

# Get primekg_nodes and primekg_edges
primekg_nodes = primekg_data.get_nodes()

Loading nodes of PrimeKG dataset ...
data/primekg\primekg_nodes.tsv.gz already exists. Loading the data from the local directory.
Loading edges of PrimeKG dataset ...
/data/primekg\primekg_edges.tsv.gz already exists. Loading the data from the local directory.


Get full node information

In [132]:
import pandas as pd

# Create a dataframe from the node_id dictionary
df_node_id = pd.DataFrame(list(node_id.items()), columns=['specie_name', 'node_index'])


In [135]:
df_node_id.head()

Unnamed: 0,specie_name,node_index
0,IL6{serum},1567
1,sgp130{serum},1567
2,sR_IL6_sgp130{serum},1567
3,CRP{serum},37784
4,sR{serum},1567


In [138]:
# Perform an inner join between df_node_id and primekg_nodes on node_index
merged_df = pd.merge(df_node_id, primekg_nodes, on='node_index', how='inner')

In [139]:
merged_df

Unnamed: 0,specie_name,node_index,node_name,node_source,node_id,node_type
0,IL6{serum},1567,IL6,NCBI,3569,gene/protein
1,sgp130{serum},1567,IL6,NCBI,3569,gene/protein
2,sR_IL6_sgp130{serum},1567,IL6,NCBI,3569,gene/protein
3,CRP{serum},37784,Crohn disease,MONDO_grouped,5011_5535,disease
4,sR{serum},1567,IL6,NCBI,3569,gene/protein
5,sR_IL6{serum},1567,IL6,NCBI,3569,gene/protein
6,Ab{serum},114073,antimicrobial humoral immune response mediated...,GO,61844,biological_process
7,Ab_sR{serum},1567,IL6,NCBI,3569,gene/protein
8,Ab_sR_IL6{serum},1567,IL6,NCBI,3569,gene/protein
9,CRP Suppression (%),17584,Adalimumab,DrugBank,DB00051,drug


In [None]:
import re

# Define the regular expression pattern to match content between curly braces
pattern = r'\{(.*?)\}'

# List of example strings
example_strings = [
    "STAT3{gut}",
    "Ab_sR_IL6{gut}",
    "Ab_sR_IL6{liver}",
    "Ab_sR_IL6{serum}",
    "Ab_sR{gut}",
    "Ab_sR{liver}",
    "Ab_sR{serum}",
    "Ab{gut}",
    "Ab{liver}",
    "Ab{peripheral}",
    "Ab{serum}",
    "CRP{liver}",
    "CRP{serum}",
    "gp130{gut}",
    "gp130{liver}",
    "IL6{gut}",
    "IL6{liver}",
    "IL6{serum}",
    "pSTAT3{gut}",
    "pSTAT3{liver}",
    "R_IL6_gp130{gut}",
    "R_IL6_gp130{liver}",
    "Reactive{gut}",
    "Reactive{liver}",
    "sgp130{gut}",
    "sgp130{liver}",
    "sgp130{serum}",
    "sR_IL6_sgp130{gut}",
    "sR_IL6_sgp130{liver}",
    "sR_IL6_sgp130{serum}",
    "sR_IL6{gut}",
    "sR_IL6{liver}",
    "sR_IL6{serum}",
    "sR{gut}",
    "sR{liver}",
    "sR{serum}",
    "STAT3{gut}",
    "STAT3{liver}"
]

# Find all matches in the list of strings
matches = set()
for string in example_strings:
    matches.update(re.findall(pattern, string))

# Print the unique matches
print(matches)

{'gut', 'peripheral', 'liver', 'serum'}


In [None]:
import requests 
def search_ols_term(ontology, term):
    base_url = "https://www.ebi.ac.uk/ols/api/search"
    params = {
        "q": term,
        "ontology": ontology,
        "type": "class"
    }
    response = requests.get(base_url, params=params, headers={"Accept": "application/json"}, timeout=10)
    if response.status_code == 200:
        data = response.json()
        results = data.get("response", {}).get("docs", [])
        return results
    else:
        return f"Error: {response.status_code}"

In [61]:
import pandas as pd

compartments = ['gut', 'peripheral', 'liver', 'serum']
for c in compartments:
    results = search_ols_term("bto", c)
    if "Error" not in results:
        df_result = pd.DataFrame(results)
        if not df_result.empty and c in df_result['label'].values:
            obo_id = df_result.query(f"label == '{c}'").obo_id.values[0]
            desc = df_result.query(f"label == '{c}'").description.values[0][0]
        else:
            obo_id = None
            desc = None
        print(f"{c}, obo_id: {obo_id}, description: {desc}")
    else:
        print(f"{c}, {results}")

gut, obo_id: BTO:0000545, description: 1: The alimentary canal or a portion thereof, especially the intestine or stomach. 2: The embryonic digestive tube, consisting of the foregut, the midgut, and the hindgut.
peripheral, obo_id: None, description: None
liver, obo_id: BTO:0000759, description: 1: A large very vascular glandular organ of vertebrates that secretes bile and causes important changes in many of the substances contained in the blood (as by converting sugars into glycogen which it stores up until required and by forming urea). 2: Any of various large compound glands associated with the digestive tract of invertebrate animals and probably concerned with the secretion of digestive enzymes.
serum, obo_id: BTO:0001239, description: 1: The watery portion of an animal fluid remaining after coagulation: a (1): blood serum (2): antiserum b: whey c: a normal or pathological serous fluid (as in a blister). 2: The watery part of a plant fluid.


In [66]:
import pandas as pd

# Extract only the 'compartment' and 'obo_id' from the compartments data
compartments_info = [(compartment['compartment'], compartment['obo_id']) for compartment in compartments_data]

# Convert the extracted information into a DataFrame
df_compartments_info = pd.DataFrame(compartments_info, columns=['compartment', 'obo_id'])

# Print the DataFrame
print(df_compartments_info)

  compartment       obo_id
0         gut  BTO:0000545
1  peripheral         None
2       liver  BTO:0000759
3       serum  BTO:0001239


In [63]:
import pandas as pd

# Define the compartments and their corresponding OBO IDs and descriptions
compartments_data = [
    {'compartment': 'gut', 'obo_id': 'BTO:0000545', 'description': '1: The alimentary canal or a portion thereof, especially the intestine or stomach. 2: The embryonic digestive tube, consisting of the foregut, the midgut, and the hindgut.'},
    {'compartment': 'peripheral', 'obo_id': None, 'description': None},
    {'compartment': 'liver', 'obo_id': 'BTO:0000759', 'description': '1: A large very vascular glandular organ of vertebrates that secretes bile and causes important changes in many of the substances contained in the blood (as by converting sugars into glycogen which it stores up until required and by forming urea). 2: Any of various large compound glands associated with the digestive tract of invertebrate animals and probably concerned with the secretion of digestive enzymes.'},
    {'compartment': 'serum', 'obo_id': 'BTO:0001239', 'description': '1: The watery portion of an animal fluid remaining after coagulation: a (1): blood serum (2): antiserum b: whey c: a normal or pathological serous fluid (as in a blister). 2: The watery part of a plant fluid.'}
]

# Create a DataFrame from the compartments data
df_compartments = pd.DataFrame(compartments_data)

# Filter the DataFrame to include only the 'serum' compartment
df_serum = df_compartments[df_compartments['compartment'] == 'serum'][['compartment', 'obo_id']]

# Print the resulting DataFrame
print(df_serum)

  compartment       obo_id
3       serum  BTO:0001239


In [62]:
df_result

Unnamed: 0,iri,ontology_name,ontology_prefix,short_form,description,label,obo_id,type
0,http://purl.obolibrary.org/obo/BTO_0001239,bto,BTO,BTO_0001239,[1: The watery portion of an animal fluid rema...,serum,BTO:0001239,class
1,http://purl.obolibrary.org/obo/BTO_0000133,bto,BTO,BTO_0000133,[The cell-free portion of the blood from which...,blood serum,BTO:0000133,class
2,http://purl.obolibrary.org/obo/BTO_0000448,bto,BTO,BTO_0000448,[],fetal serum,BTO:0000448,class
3,http://purl.obolibrary.org/obo/BTO_0003416,bto,BTO,BTO_0003416,[],culture condition:bovine serum albumin-grown cell,BTO:0003416,class
4,http://purl.obolibrary.org/obo/BTO_0000220,bto,BTO,BTO_0000220,"[Rat hepatoma, a clonal strain of epithelial c...",MH1C1 cell,BTO:0000220,class
5,http://purl.obolibrary.org/obo/BTO_0000011,bto,BTO,BTO_0000011,[L1 is a continuous substrain of 3T3 Swiss alb...,3T3-L1 cell,BTO:0000011,class
6,http://purl.obolibrary.org/obo/BTO_0000237,bto,BTO,BTO_0000237,[The serumlike fluid that circulates through t...,cerebrospinal fluid,BTO:0000237,class


In [None]:
# Import necessary libraries
import time
import json
import zlib
import requests
from requests.adapters import HTTPAdapter, Retry
from urllib.parse import urlparse, parse_qs, urlencode
import pandas as pd
import os
import pickle


# Define variables to perform UniProt ID mapping
# Adopted from https://www.uniprot.org/help/id_mapping
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 5
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def submit_id_mapping(from_db, to_db, ids) -> str:
    """
    Function to submit a job to perform ID mapping.

    Args:
        from_db (str): The source database.
        to_db (str): The target database.
        ids (list): The list of IDs to map.

    Returns:
        str: The job ID.
    """
    request = requests.post(f"{API_URL}/idmapping/run",
                            data={"from": from_db,
                                  "to": to_db,
                                  "ids": ",".join(ids)},)
    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    return request.json()["jobId"]

def check_id_mapping_results_ready(job_id):
    """
    Function to check if the ID mapping results are ready.

    Args:
        job_id (str): The job ID.

    Returns:
        bool: True if the results are ready, False otherwise.
    """
    while True:
        request = session.get(f"{API_URL}/idmapping/status/{job_id}")

        try:
            request.raise_for_status()
        except requests.HTTPError:
            print(request.json())
            raise

        j = request.json()
        if "jobStatus" in j:
            if j["jobStatus"] in ("NEW", "RUNNING"):
                print(f"Retrying in {POLLING_INTERVAL}s")
                time.sleep(POLLING_INTERVAL)
            else:
                raise Exception(j["jobStatus"])
        else:
            return bool(j["results"] or j["failedIds"])

def get_id_mapping_results_link(job_id):
    """
    Function to get the link to the ID mapping results.

    Args:
        job_id (str): The job ID.

    Returns:
        str: The link to the ID mapping results.
    """
    url = f"{API_URL}/idmapping/details/{job_id}"
    request = requests.Session().get(url)

    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    return request.json()["redirectURL"]

def decode_results(response, file_format, compressed):
    """
    Function to decode the ID mapping results.

    Args:
        response (requests.Response): The response object.
        file_format (str): The file format of the results.
        compressed (bool): Whether the results are compressed.

    Returns:
        str: The ID mapping results
    """

    if compressed:
        decompressed = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
        if file_format == "json":
            j = json.loads(decompressed.decode("utf-8"))
            return j
        elif file_format == "tsv":
            return [line for line in decompressed.decode("utf-8").split("\n") if line]
        elif file_format == "xlsx":
            return [decompressed]
        elif file_format == "xml":
            return [decompressed.decode("utf-8")]
        else:
            return decompressed.decode("utf-8")
    elif file_format == "json":
        return response.json()
    elif file_format == "tsv":
        return [line for line in response.text.split("\n") if line]
    elif file_format == "xlsx":
        return [response.content]
    elif file_format == "xml":
        return [response.text]
    return response.text

def get_id_mapping_results_stream(url):
    """
    Function to get the ID mapping results from a stream.

    Args:
        url (str): The URL to the ID mapping results.

    Returns:
        str: The ID mapping results.
    """
    if "/stream/" not in url:
        url = url.replace("/results/", "/results/stream/")

    request = session.get(url)

    try:
        request.raise_for_status()
    except requests.HTTPError:
        print(request.json())
        raise

    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    file_format = query["format"][0] if "format" in query else "json"
    compressed = (
        query["compressed"][0].lower() == "true" if "compressed" in query else False
    )
    return decode_results(request, file_format, compressed)

# Submit a job to perform ID mapping
inputs = ['6774', '3569','3586','1401']
job_id = submit_id_mapping(
    from_db="GeneID", to_db="UniProtKB", ids=inputs
)

# Check and get the ID mapping results
if check_id_mapping_results_ready(job_id):
    link = get_id_mapping_results_link(job_id)
    mapping_results = get_id_mapping_results_stream(link)
    print(mapping_results)
 

Retrying in 5s
{'results': [{'from': '6774', 'to': {'entryType': 'UniProtKB reviewed (Swiss-Prot)', 'primaryAccession': 'P40763', 'secondaryAccessions': ['A8K7B8', 'K7ENL3', 'O14916', 'Q9BW54'], 'uniProtkbId': 'STAT3_HUMAN', 'entryAudit': {'firstPublicDate': '1995-02-01', 'lastAnnotationUpdateDate': '2025-02-05', 'lastSequenceUpdateDate': '2004-06-07', 'entryVersion': 248, 'sequenceVersion': 2}, 'annotationScore': 5.0, 'organism': {'scientificName': 'Homo sapiens', 'commonName': 'Human', 'taxonId': 9606, 'lineage': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']}, 'proteinExistence': '1: Evidence at protein level', 'proteinDescription': {'recommendedName': {'fullName': {'evidences': [{'evidenceCode': 'ECO:0000305'}], 'value': 'Signal transducer and activator of transcription 3'}}, 'alternativeNames': [{'fullName': {'evidences': [{'evidenceCode': 'ECO:

In [None]:

# Convert mapping results to a dataframe
protein_mapped_df = pd.DataFrame(mapping_results["results"])
protein_mapped_df.head()   

Unnamed: 0,from,to
0,6774,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
1,6774,"{'entryType': 'UniProtKB unreviewed (TrEMBL)',..."
2,3569,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
3,3569,"{'entryType': 'UniProtKB unreviewed (TrEMBL)',..."
4,3569,"{'entryType': 'UniProtKB unreviewed (TrEMBL)',..."


In [None]:
# Checking duplicated entries based on their entryType
protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1).value_counts(0)

UniProtKB unreviewed (TrEMBL)      6
UniProtKB reviewed (Swiss-Prot)    4
Name: count, dtype: int64

In [None]:
# There are two entryType. We choose the reviewed one.
protein_reviewed_df = protein_mapped_df[protein_mapped_df.apply(lambda x: x['to']['entryType'], axis=1) == 'UniProtKB reviewed (Swiss-Prot)']
protein_reviewed_df.reset_index(drop=True, inplace=True)
protein_reviewed_df.head()

Unnamed: 0,from,to
0,6774,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
1,3569,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
2,3586,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...
3,1401,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...


In [None]:
for key in protein_reviewed_df['to'][0].keys():
    protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
protein_reviewed_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_df[key] = [x[key] if key in x else 'N/A' for x in protein_reviewed_df['to']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  protein_reviewed_

Unnamed: 0,from,to,entryType,primaryAccession,secondaryAccessions,uniProtkbId,entryAudit,annotationScore,organism,proteinExistence,proteinDescription,genes,comments,features,keywords,references,uniProtKBCrossReferences,sequence,extraAttributes
0,6774,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...,UniProtKB reviewed (Swiss-Prot),P40763,"[A8K7B8, K7ENL3, O14916, Q9BW54]",STAT3_HUMAN,"{'firstPublicDate': '1995-02-01', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'evidences':...,[{'geneName': {'evidences': [{'evidenceCode': ...,[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Initiator methionine', 'location': ...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '75...","[{'database': 'EMBL', 'id': 'L29277', 'propert...",{'value': 'MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLA...,"{'countByCommentType': {'FUNCTION': 1, 'SUBUNI..."
1,3569,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...,UniProtKB reviewed (Swiss-Prot),P05231,"[Q9UCU2, Q9UCU3, Q9UCU4]",IL6_HUMAN,"{'firstPublicDate': '1987-08-13', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'evidences':...,[{'geneName': {'evidences': [{'evidenceCode': ...,[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Signal', 'location': {'start': {'va...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '34...","[{'database': 'EMBL', 'id': 'X04430', 'propert...",{'value': 'MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGED...,"{'countByCommentType': {'FUNCTION': 3, 'SUBUNI..."
2,3586,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...,UniProtKB reviewed (Swiss-Prot),P22301,,IL10_HUMAN,"{'firstPublicDate': '1991-08-01', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'In...,[{'geneName': {'value': 'IL10'}}],[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Signal', 'location': {'start': {'va...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '18...","[{'database': 'EMBL', 'id': 'M57627', 'propert...",{'value': 'MHSSALLCCLVLLTGVRASPGQGTQSENSCTHFPG...,"{'countByCommentType': {'FUNCTION': 1, 'SUBUNI..."
3,1401,{'entryType': 'UniProtKB reviewed (Swiss-Prot)...,UniProtKB reviewed (Swiss-Prot),P02741,"[A8K078, D3DVD9, D3DVE0, Q08AK3, Q8WW75]",CRP_HUMAN,"{'firstPublicDate': '1986-07-21', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'C-...,"[{'geneName': {'value': 'CRP'}, 'synonyms': [{...",[{'texts': [{'value': 'Displays several functi...,"[{'type': 'Signal', 'location': {'start': {'va...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '29...","[{'database': 'EMBL', 'id': 'M11880', 'propert...",{'value': 'MEKLLCFLVLTSLSHAFGQTDMSRKAFVFPKESDT...,"{'countByCommentType': {'FUNCTION': 1, 'COFACT..."


Get the obd in addition to gene id

In [67]:
df_compartments_info

Unnamed: 0,compartment,obo_id
0,gut,BTO:0000545
1,peripheral,
2,liver,BTO:0000759
3,serum,BTO:0001239
