In [1]:
import os
import sys

# Append <cwd>/../.. to the import path so the `src.*` imports below can resolve.
# NOTE(review): this depends on the kernel's working directory; presumably the
# notebook lives two levels below the project root — confirm.
sys.path.append(os.path.join(os.getcwd(), os.pardir, os.pardir))
from src.db.vector_store import load_config, load_vector_store, OPENAI_API_KEY, QDRANT_API_KEY
from qdrant_client import QdrantClient
from langchain_openai import ChatOpenAI

# Project configuration, as returned by the vector-store module's loader.
config = load_config()

# Publish the credentials through the process environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["QDRANT_API_KEY"] = QDRANT_API_KEY

# Embedding model used for every query in this notebook.
EMBEDDING_MODEL = "text-embedding-3-small"

# Qdrant connection settings, read from the `qdrant` section of the config.
_qdrant_settings = config.get('qdrant')
QDRANT_URL = _qdrant_settings.get('url')
QDRANT_COLLECTION_NAME = _qdrant_settings.get('collection_name')

Qdrant API Key:  [REDACTED]
OpenAI API Key:  [REDACTED]


In [2]:
# Chat model used to grade the retrieved documents.
chat = ChatOpenAI(model="gpt-4o-mini")

In [3]:
from langchain_qdrant import Qdrant
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise queries at search time.
embed_model = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Attach to the existing Qdrant collection; the named vector and the payload
# key that holds the document text are both called "content".
qdrant = Qdrant.from_existing_collection(
    embedding=embed_model,
    url=QDRANT_URL,
    collection_name=QDRANT_COLLECTION_NAME,
    vector_name="content",
    content_payload_key="content",
    distance_strategy="COSINE",
)

In [88]:
from langchain.schema import (
    SystemMessage,    # Generic system message
    HumanMessage,     # HumanMessage is a message from the human
    AIMessage,        # AIMessage is a message from the AI
)

def generate_grader_prompt(query: str, qdrant: Qdrant, top_k:int=3) -> HumanMessage:
    """Build the relevance-grading prompt for a query.

    Runs a similarity search for ``query`` against ``qdrant``, numbers the
    returned documents starting at 1, and fills the grader template with the
    query and the concatenated document contents.

    Args:
        query: The user question to retrieve and grade documents for.
        qdrant: Vector store queried through ``similarity_search``.
        top_k: Number of documents requested from the vector store.

    Returns:
        A ``HumanMessage`` whose content is the formatted grader prompt.
    """
    grader_template = """
   You are provided with a query and retrieved documents from a PyMoL function database. Your task is to grade each document for relevance based on the query.

    Step-by-step instructions:

    Analyze each document and its relevance to the query. Assign a score from 0.0 to 1.0 based on how well it answers or aligns with the query. A score of 1.0 indicates high relevance, while 0.0 means no relevance.
    Explain your reasoning behind each score.
    Suggest usage of the document by identifying important Python methods (starting with cmd.) and providing optimal parameters for these methods, if applicable.
    Output format: Return a list of dictionaries with the following keys:

    document: The content of the document.
    relevance_score: A float from 0.0 to 1.0 indicating relevance.
    explanation: A brief reasoning for the assigned score.
    usage: (Optional) Suggested PyMoL functions and parameters relevant to the query.
    Query: {query}

    Context: {context}
    """
    hits = qdrant.similarity_search(query=query, k=top_k)

    # One numbered entry per retrieved document, each followed by a blank line.
    context = "".join(
        f"{position}. Document:\n{hit.page_content}\n\n"
        for position, hit in enumerate(hits, start=1)
    )

    return HumanMessage(
        content=grader_template.format(query=query, context=context)
    )

In [113]:
# Query under test; it is passed to the grader-prompt builder below.
# (A previous assignment was immediately overwritten and has been removed.)
query = "select the 1a8o protein"

# System prompt that frames the assistant as a PyMOL expert and lists two starter commands.
system_prompt = SystemMessage(content="""Your are a helpful assistant expert in the field of molecular biology, 
specifically using PyMoL (Python Molecular Viewer) to visualize and manipulate
3D molecular structures. You are tasked with helping a user with a question, you may
need to know if the user is asking for a command, a question, or a task. In that case,
use the most appropiate tool to help the user.
                  
You can use the following commands to start:
    - Load a structure from the PDB: cmd.fetch('1a8o')
    - Select the structure that has been loaded with fetch: cmd.select('1a8o')
""")

# Conversation sent to the model: the system prompt followed by the grading
# request built from the top 3 documents retrieved for `query`.
history = [
    system_prompt,
    generate_grader_prompt(query=query, qdrant=qdrant, top_k=3),
]


In [114]:
# Send the system prompt and grading request to the chat model.
result = chat.invoke(history)

In [115]:
# Display the raw message object returned by the model.
result

AIMessage(content='Here are the relevance scores and analyses for the provided documents based on the query to select the 1a8o protein:\n\n1. **Document 1:**\n   - **document:** Fetch Fetch retrieves a protein structure from the PDB and loads it into PyMOL...\n   - **relevance_score:** 1.0\n   - **explanation:** This document directly discusses the `fetch` command, which is necessary for retrieving the 1a8o protein from the PDB. It provides detailed information about how to use the command, relevant parameters, and examples that are closely related to the query.\n   - **usage:** Suggested PyMOL function: `cmd.fetch(\'1a8o\')` - This command will retrieve the specified protein structure from the PDB.\n\n2. **Document 2:**\n   - **document:** Indicate indicate shows a visual representation of an atom selection...\n   - **relevance_score:** 0.2\n   - **explanation:** This document discusses the `indicate` command, which is related to visual representation of selections in PyMOL. However, 

In [116]:
# Print only the text content so newlines render instead of showing as "\n".
print(result.content)

Here are the relevance scores and analyses for the provided documents based on the query to select the 1a8o protein:

1. **Document 1:**
   - **document:** Fetch Fetch retrieves a protein structure from the PDB and loads it into PyMOL...
   - **relevance_score:** 1.0
   - **explanation:** This document directly discusses the `fetch` command, which is necessary for retrieving the 1a8o protein from the PDB. It provides detailed information about how to use the command, relevant parameters, and examples that are closely related to the query.
   - **usage:** Suggested PyMOL function: `cmd.fetch('1a8o')` - This command will retrieve the specified protein structure from the PDB.

2. **Document 2:**
   - **document:** Indicate indicate shows a visual representation of an atom selection...
   - **relevance_score:** 0.2
   - **explanation:** This document discusses the `indicate` command, which is related to visual representation of selections in PyMOL. However, it does not directly pertain to 

In [117]:
import ast

def parse_dict(content: str):
    """Evaluate a Python literal that may be wrapped in Markdown code fences.

    Every occurrence of "```python" and "```" is removed, then the remaining
    text is parsed with ``ast.literal_eval``.

    Args:
        content: Text containing a Python literal, optionally fenced.

    Returns:
        The Python object described by the literal.

    Raises:
        ValueError / SyntaxError: If the remaining text is not a valid literal.
    """
    # The language-tagged fence must be removed before the bare one.
    for fence in ("```python", "```"):
        content = content.replace(fence, "")
    return ast.literal_eval(content)

import json

def extract_and_parse_json(response: str):
    """Extract a JSON array embedded in free-form text and parse it.

    The text between the first "[" and the last "]" is tried first. If that
    span is not valid JSON (for example because the prose contains a bracketed
    phrase before the real payload), every "[" is tried as a possible start
    and the longest array that decodes successfully is returned.

    Args:
        response: Raw text, typically an LLM reply, expected to contain a
            JSON array.

    Returns:
        The parsed list, or ``None`` when the text has no "[" / "]" pair or no
        array can be decoded. A diagnostic message is printed in those cases.
    """
    json_start = response.find("[")
    json_end = response.rfind("]")

    if json_start == -1 or json_end == -1:
        print("Could not find valid JSON in the response.")
        return None

    # Fast path: the widest bracketed span is the whole payload.
    try:
        return json.loads(response[json_start:json_end + 1])
    except json.JSONDecodeError as e:
        wide_error = e

    # Fallback: try each "[" as a start and keep the longest decodable array.
    decoder = json.JSONDecoder()
    best_value, best_length = None, -1
    position = json_start
    while position != -1:
        try:
            value, end = decoder.raw_decode(response, position)
        except json.JSONDecodeError:
            position = response.find("[", position + 1)
            continue
        if end - position > best_length:
            best_value, best_length = value, end - position
        # Skip past the decoded span so nested arrays are not revisited.
        position = response.find("[", end)

    if best_length == -1:
        print("Error parsing the response to a dictionary:", wide_error)
        return None
    return best_value

# Parse the model's reply into Python objects; the helper returns None when
# no JSON array can be decoded from the text.
best_documents = extract_and_parse_json(result.content)
print(best_documents)

[{'document': 'Fetch Fetch retrieves a protein structure from the PDB and loads it into PyMOL...', 'relevance_score': 1.0, 'explanation': 'This document directly discusses the fetch command, which is necessary for retrieving the 1a8o protein from the PDB.', 'usage': "cmd.fetch('1a8o')"}, {'document': 'Indicate indicate shows a visual representation of an atom selection...', 'relevance_score': 0.2, 'explanation': 'This document discusses the indicate command, which is related to visual representation of selections in PyMOL, but does not pertain to fetching the 1a8o protein.', 'usage': "cmd.indicate('your_selection_here')"}, {'document': 'Unpick unpick deletes the special "pk" atom selections (pk1, pk2, etc.)...', 'relevance_score': 0.1, 'explanation': 'This document describes the unpick command, which has no relevance to fetching or selecting the 1a8o protein.', 'usage': 'cmd.unpick()'}]


In [80]:
# Suggested usage string of the first parsed entry.
# NOTE(review): this fails if parsing above returned None or an empty list — confirm before relying on it.
best_documents[0]['usage']

'fetch 1a8o'