In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), os.pardir, os.pardir))
from src.prompts import *
from src.db.vector_store import load_config, load_vector_store, OPENAI_API_KEY, QDRANT_API_KEY
from qdrant_client import QdrantClient
from langchain_openai import ChatOpenAI

# Project settings as returned by load_config() (imported from src.db.vector_store).
config = load_config()

# Publish both API keys as environment variables under their own names.
os.environ.update(
    OPENAI_API_KEY=OPENAI_API_KEY,
    QDRANT_API_KEY=QDRANT_API_KEY,
)

# Name of the embedding model handed to OpenAIEmbeddings further down.
EMBEDDING_MODEL = "text-embedding-3-small"

# Connection details for the Qdrant collection, read from the 'qdrant' config section.
QDRANT_URL = config.get('qdrant').get('url')
QDRANT_COLLECTION_NAME = config.get('qdrant').get('collection_name')

Qdrant API Key:  f3TPb
OpenAI API Key:  sk-pr


In [2]:
# Chat model that receives the message history built in the cells below.
chat = ChatOpenAI(model='gpt-4o-mini')

In [3]:
from langchain_qdrant import Qdrant
from langchain_openai import OpenAIEmbeddings

# Embedder for incoming queries; the model name comes from EMBEDDING_MODEL.
embed_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Open the existing collection named in the config.  Both the vector name and
# the payload key that holds the document text are 'content'.
qdrant = Qdrant.from_existing_collection(
    embedding=embed_model,
    url=QDRANT_URL,
    collection_name=QDRANT_COLLECTION_NAME,
    content_payload_key='content',
    vector_name="content",
    distance_strategy='COSINE',
)

In [4]:
from langchain.schema import (
    SystemMessage,    # Generic system message
    HumanMessage,     # HumanMessage is a message from the human
    AIMessage,        # AIMessage is a message from the AI
)

def gennerate_query_to_function_prompt(
    query: str,
    qdrant: Qdrant,
    top_k: int = 5,
) -> HumanMessage:
    """Build the retrieval-augmented user message for ``query``.

    Args:
        query: Free-text request from the user.
        qdrant: Vector store queried with ``similarity_search``.
        top_k: Number of documents to retrieve (passed as ``k``).

    Returns:
        A ``HumanMessage`` whose content is ``LLM_RAG_PROMPT`` formatted with
        the query and the numbered text of the retrieved documents.
    """
    hits = qdrant.similarity_search(query=query, k=top_k)

    # One numbered entry per retrieved document, each followed by a blank line.
    numbered_docs = "".join(
        f"{rank}. Document:\n{hit.page_content}\n\n"
        for rank, hit in enumerate(hits, start=1)
    )

    rag_text = LLM_RAG_PROMPT.format(query=query, context=numbered_docs)
    return HumanMessage(content=rag_text)

In [5]:
# Request sent in the first conversation turn.
# Other phrasings tried while developing (previously dead assignments that
# were immediately overwritten): "Method to align two structures",
# "select the 1a8o protein (not fetch it)".
query = "select the 1a8o protein"

# The conversation starts with the system prompt ...
history = [
    SystemMessage(content=SYSTEM_MESSAGE_PROMPT),
    ]

# ... followed by the first retrieval-augmented user prompt (top 3 documents).
history.append(gennerate_query_to_function_prompt(query=query, qdrant=qdrant, top_k=3))


In [6]:
# Send the accumulated messages (system prompt + first user prompt) to the chat model.
result = chat.invoke(history)

In [7]:
# Show the full AIMessage, including token usage and other response metadata.
result

AIMessage(content='```python\n{\n    "thought": "The user wants to retrieve a specific protein structure (1a8o) from the Protein Data Bank (PDB) and load it into PyMOL. The appropriate function for this task is \'fetch\', which allows users to download and load structures directly into PyMOL.",\n    "explanation": "The \'fetch\' command can be used to download the protein structure with the identifier \'1a8o\' from the PDB. By default, the structure will be loaded into the current PyMOL session, allowing the user to visualize and manipulate it.",\n    "not_related": False,\n    "missing_requirements": "",\n    "usage": "cmd.fetch(\'1a8o\')",\n    "reference": "https://pymolwiki.org/index.php?title=Fetch"\n}\n```', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 172, 'prompt_tokens': 1688, 'total_tokens': 1860, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_t

In [8]:
# Print only the text of the reply so the fenced dict is readable.
print(result.content)

```python
{
    "thought": "The user wants to retrieve a specific protein structure (1a8o) from the Protein Data Bank (PDB) and load it into PyMOL. The appropriate function for this task is 'fetch', which allows users to download and load structures directly into PyMOL.",
    "explanation": "The 'fetch' command can be used to download the protein structure with the identifier '1a8o' from the PDB. By default, the structure will be loaded into the current PyMOL session, allowing the user to visualize and manipulate it.",
    "not_related": False,
    "missing_requirements": "",
    "usage": "cmd.fetch('1a8o')",
    "reference": "https://pymolwiki.org/index.php?title=Fetch"
}
```


In [9]:
import ast

def parse_dict(content: str):
    """Parse the dictionary embedded in an LLM reply.

    The reply is expected to hold a single dict literal, optionally wrapped in
    a Markdown code fence (tagged ``python``, ``py``, ``json`` or untagged).

    The model does not always stick to Python syntax: it sometimes emits
    JSON-style ``true`` / ``false`` / ``null`` instead of ``True`` / ``False``
    / ``None``, which ``ast.literal_eval`` rejects.  The payload is therefore
    parsed as a Python literal first and as JSON second.

    Args:
        content: Raw text of the model reply.

    Returns:
        The parsed object (a ``dict`` for the replies this notebook requests).

    Raises:
        ValueError, SyntaxError: The error raised by ``ast.literal_eval`` when
            the payload is neither a valid Python literal nor valid JSON.
    """
    import json
    import re

    # Remove opening fences together with their language tag, and closing fences.
    payload = re.sub(r"```(?:python3?|py|json)?", "", content, flags=re.IGNORECASE).strip()

    try:
        return ast.literal_eval(payload)
    except (ValueError, SyntaxError) as literal_error:
        # Not a Python literal -- most likely JSON booleans/null.
        try:
            return json.loads(payload)
        except json.JSONDecodeError:
            # Keep the exception type callers saw before the JSON fallback existed.
            raise literal_error from None


# Turn the fenced reply into a dict.
# NOTE(review): despite its name, `best_documents` holds the parsed model
# answer (thought, explanation, usage, ...), as the printed output shows.
best_documents = parse_dict(result.content)
print(best_documents)

{'thought': "The user wants to retrieve a specific protein structure (1a8o) from the Protein Data Bank (PDB) and load it into PyMOL. The appropriate function for this task is 'fetch', which allows users to download and load structures directly into PyMOL.", 'explanation': "The 'fetch' command can be used to download the protein structure with the identifier '1a8o' from the PDB. By default, the structure will be loaded into the current PyMOL session, allowing the user to visualize and manipulate it.", 'not_related': False, 'missing_requirements': '', 'usage': "cmd.fetch('1a8o')", 'reference': 'https://pymolwiki.org/index.php?title=Fetch'}


In [11]:
# Command string proposed by the model for this query.
best_documents['usage']

"cmd.fetch('1a8o')"

In [12]:
# Step 2: ask for chain-based coloring as the next turn of the same conversation.
# NOTE(review): the model's previous reply is never appended to `history`, so
# the model only sees the earlier prompts, not its own answers -- confirm this
# is intended.
query = "color the protein by chain"
coloring_prompt = gennerate_query_to_function_prompt(query=query, qdrant=qdrant, top_k=3)
history.append(coloring_prompt)

result = chat.invoke(history)
print(result.content)

```python
{
    "thought": "The query is asking to color the protein by chain, which can be achieved using the util.cbc function from the utilities library that colors molecules based on their chain identity.",
    "explanation": "The util.cbc function colors each chain in the protein structure with a different color, making it visually easier to distinguish between different chains. The default selection is 'all', which means it will color all chains in the loaded protein.",
    "not_related": False,
    "missing_requirements": "The command requires a protein structure to be loaded in PyMOL. If no structure is loaded, the command cannot be executed.",
    "usage": "util.cbc(selection='(all)', first_color=7, quiet=1)",
    "reference": "https://pymolwiki.org/index.php?title=CBC&oldid=7440"
}
```


In [13]:
# Parse the coloring reply; this rebinds `best_documents` from the first turn.
best_documents = parse_dict(result.content)
print(best_documents)

{'thought': 'The query is asking to color the protein by chain, which can be achieved using the util.cbc function from the utilities library that colors molecules based on their chain identity.', 'explanation': "The util.cbc function colors each chain in the protein structure with a different color, making it visually easier to distinguish between different chains. The default selection is 'all', which means it will color all chains in the loaded protein.", 'not_related': False, 'missing_requirements': 'The command requires a protein structure to be loaded in PyMOL. If no structure is loaded, the command cannot be executed.', 'usage': "util.cbc(selection='(all)', first_color=7, quiet=1)", 'reference': 'https://pymolwiki.org/index.php?title=CBC&oldid=7440'}


In [14]:
# Step 3: ask for a selection of the water molecules as a further turn.
# NOTE(review): as in the earlier turns, the previous model reply is not part
# of `history` -- confirm this is intended.
query = "select the water molecules"
water_prompt = gennerate_query_to_function_prompt(query=query, qdrant=qdrant, top_k=3)
history.append(water_prompt)

result = chat.invoke(history)
print(result.content)

```python
{
    "thought": "The query is about selecting water molecules, which can be accomplished using the 'select' command in PyMOL. Water molecules are typically represented by the residue name 'HOH' or 'WAT'.",
    "explanation": "To select water molecules in PyMOL, you can use the 'select' command along with a selection expression that identifies water molecules based on their residue name. This allows you to create a named selection of all water molecules in the currently loaded structure.",
    "not_related": false,
    "missing_requirements": "There is no missing requirement for this command as long as a structure containing water molecules is loaded in PyMOL.",
    "usage": "cmd.select('water_molecules', 'resn HOH')",
    "reference": "https://pymolwiki.org/index.php?title=Select"
}
```


: 