# Explicit Query: Query Uniprot using Langchain SPARQL chain
Shows how to query UniProt using an explicit schema.

## Setup

### Notebook Pre-Req

You can upload this notebook into a Jupyter environment configured to use Neptune Workbench. I tested this on an Amazon SageMaker notebook running Python 3.10.x. 

### Python Pre-Req

I tested this on an Amazon SageMaker notebook running Python 3.10.x. You need Python 3.9 or higher.

### Neptune Pre-Req

Your Neptune cluster must run engine version 1.2.x or higher.

### Anthropic

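Obtain an API key from Anthropic and store it in a `.env` file:
https://python.langchain.com/v0.1/docs/integrations/platforms/anthropic/

Note: the code in this notebook invokes the Claude model through Amazon Bedrock (`ChatBedrock` with a `bedrock-runtime` client), so the AWS credentials used by the notebook must also be able to invoke that model in Bedrock.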

## Install Langchain
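Requires Python 3.9 or greater and LangChain (originally noted as roughly version 0.0.341). The cells below upgrade `langchain` and also install `langchain-community` and `langchain-aws`, which the code in this notebook imports.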

In [None]:
%%bash 
# Print the interpreter version; this notebook needs Python 3.9 or higher.
python --version

In [None]:
!pip install --upgrade --force-reinstall langchain

In [None]:
!pip install langchain-community

In [None]:
!pip install -U langchain-aws

In [None]:
!pip show langchain

## Build Generation Prompt

We want four prompts. Each prompt will include the schema. Additionally:
- One prompt has no examples
- One prompt has few shots
- One prompt has tips
- One prompt has few shots plus tips

### Elements of the prompt
Let's first build the optional parts.

In [None]:
import yaml
from pathlib import Path

# All prompt resources live in ./resources relative to the working directory.
resources = Path.cwd() / "resources"

# Read every resource as UTF-8 explicitly so the result does not depend on
# the platform's default encoding.
# An empty tips file parses to None; treat that as "no tips".
tips = yaml.safe_load((resources / "tips.yaml").read_text(encoding="utf-8")) or []
pfx = (resources / "prefixes.txt").read_text(encoding="utf-8")
ground_truth = yaml.safe_load(
    (resources / "ground-truth.yaml").read_text(encoding="utf-8")
)

# Optional "tips" section of the prompt. Braces are doubled because this text
# is later embedded in a format-style prompt template.
_escaped_tips = (tip.replace("{", "{{").replace("}", "}}") for tip in tips)
TIPS = (
    "\nHere are additional tips that you might find helpful:\n<tips>\n"
    + "".join(f"<tip>{tip}</tip>\n" for tip in _escaped_tips)
    + "\n</tips>\n"
)

# Prefix declarations the model is asked to include in every generated query.
PREFIXES = f"""
Include all of the following prefixes in the SPARQL query you generate:

{pfx}
"""

def add_few_shot(idx):
    """Build the few-shot section of the prompt, holding out example ``idx``.

    Every entry of the module-level ``ground_truth`` list except the one at
    ``idx`` is rendered as a ``<question>`` / ``<sparql>`` pair.

    Args:
        idx: Index into ``ground_truth`` of the example to hold out.
            Negative indices count from the end of the list.

    Returns:
        The few-shot text with every ``{`` and ``}`` doubled, so it can be
        embedded in a format-style prompt template.

    Raises:
        IndexError: If ``idx`` is out of range for ``ground_truth``.
        ValueError: If the held-out SPARQL query also appears among the
            remaining examples (the hold-out would leak into the prompt).
    """
    # Normalise to a non-negative position first. With a raw idx of -1,
    # ground_truth[idx + 1:] would be the whole list and the hold-out would
    # end up among the training examples.
    position = range(len(ground_truth))[idx]
    hold_out = ground_truth[position]
    training_examples = ground_truth[:position] + ground_truth[position + 1:]

    # Explicit raise instead of assert so the check survives ``python -O``.
    if any(ex["SPARQL"] == hold_out["SPARQL"] for ex in training_examples):
        raise ValueError(
            f"Held-out SPARQL for example {idx} also appears in the few-shot examples"
        )

    header = "\nSome examples:\n\n    "
    rendered = "".join(
        f"\n<question>\n{ex['question']}\n</question>\n\n<sparql>\n{ex['SPARQL']}\n</sparql>\n"
        for ex in training_examples
    )

    return (header + rendered).replace('{', '{{').replace('}', '}}')


### Create Gen Prompt
The LangChain SPARQL chain has a default prompt, but let's create a custom one that can optionally include tips and/or few-shot examples. 


In [None]:
from langchain_core.prompts.prompt import PromptTemplate

def make_gen_prompt(arg_tips, arg_fewshot):
    """Assemble the SPARQL-generation prompt template.

    Args:
        arg_tips: Text for the optional tips section; pass "" to omit it.
        arg_fewshot: Text for the optional few-shot section; pass "" to omit it.

    Returns:
        A ``PromptTemplate`` whose input variables are ``schema`` and ``prompt``.
    """
    # First half is an f-string: PREFIXES, arg_tips and arg_fewshot are
    # substituted right away. The quadruple braces come out of the f-string as
    # doubled braces, which the prompt template then renders as literal braces.
    intro_text = f"""Task: Generate a SPARQL SELECT statement for querying a graph database.
    Convert an English language description of a question into a SPARQL query against the Uniprot knowledgebase that answers the question.
    For instance, to find all taxa from the UniProt taxonomy, the following query in backticks would be suitable:

    ```
    SELECT ?taxon
    WHERE
    {{{{
      ?taxon a up:Taxon .
    }}}}
    LIMIT 20
    ```
    
    Instructions:
    Use only the node types and properties provided in the schema.
    Do not use any node types and properties that are not explicitly provided.
    Limit the SPARQL results to 20. Add a LIMIT clause to the SPARQL query.
    Include all necessary prefixes. The following set of prefixes can be used:
    
    ```
    {PREFIXES}
    ```

    {arg_tips}


    {arg_fewshot}

    You can use the following keywords:

    <keywords>
        <keyword><ARN>keywords:5</ARN><name>Acetoin biosynthesis</name></keyword>
        <keyword><ARN>keywords:47</ARN><name>Antifreeze protein</name></keyword>
    </keywords>
    """

    # Second half is a plain string, so {schema} and {prompt} stay as
    # placeholders for the prompt template to fill in.
    full_template = intro_text + """
    Schema:
    {schema}
    Note: Be as concise as possible.
    Do not include any explanations or apologies in your responses.
    Do not respond to any questions that ask for anything else than for you to construct a SPARQL query.
    Do not include any text except the SPARQL query generated.

    The question is:
    {prompt}"""

    return PromptTemplate(
        input_variables=["schema", "prompt"],
        template=full_template,
    )


### Initialize Langchain Graph object

In [None]:
import utilities as u
from langchain_community.graphs import NeptuneRdfGraph

# Graph wrapper for the Neptune RDF endpoint; every connection setting is
# read from the local `utilities` module (imported as `u`).
graph = NeptuneRdfGraph(
    region_name=u.AWS_REGION,
    use_iam_auth=u.USE_IAM_AUTH,
    host=u.GRAPH_NOTEBOOK_HOST,
    port=int(u.GRAPH_NOTEBOOK_PORT),
)


## Create and invoke chain
### Define LLMs


In [None]:
import boto3
from langchain_aws import ChatBedrock
#from langchain.chat_models import BedrockChat
#from langchain.llms import Bedrock

from botocore.config import Config

# Bedrock model used for SPARQL generation.
MODEL_ID = "anthropic.claude-3-opus-20240229-v1:0"

# botocore retry settings: up to 5 attempts in "standard" mode.
config = Config(retries={"max_attempts": 5, "mode": "standard"})

# Bedrock runtime client in the region configured in `utilities`.
bedrock_client = boto3.client(
    "bedrock-runtime",
    region_name=u.AWS_REGION,
    config=config,
)

# LangChain chat model bound to that client.
llm = ChatBedrock(model_id=MODEL_ID, client=bedrock_client)

### The chainmakers
We'll need separate chains for each of the four options. Any few-shot option needs a chain PER query, because we exclude from the few-shot examples the query we are attempting!

In [None]:
from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain

def _build_chain(tips_text, fewshot_text):
    """Create a Neptune SPARQL QA chain whose generation prompt uses the given sections.

    Args:
        tips_text: Tips section for the prompt ("" for none).
        fewshot_text: Few-shot section for the prompt ("" for none).

    Returns:
        A ``NeptuneSparqlQAChain`` built on the shared ``llm`` and ``graph``.
    """
    return NeptuneSparqlQAChain.from_llm(
        llm=llm,
        sparql_prompt=make_gen_prompt(tips_text, fewshot_text),
        graph=graph,
        verbose=False,
        # The chain's field is spelled `top_k`; the earlier `top_K` keyword
        # did not match it.
        top_k=10,
        return_intermediate_steps=True,
        return_direct=True,
    )


# The zero-shot and tips-only prompts do not depend on the question, so one
# chain of each kind is built up front and reused.
default_chain = _build_chain("", "")
tips_chain = _build_chain(TIPS, "")


def make_default_chain(index):
    """Return the shared schema-only chain; ``index`` is ignored."""
    return default_chain


def make_tips_chain(index):
    """Return the shared schema-plus-tips chain; ``index`` is ignored."""
    return tips_chain


def make_few_chain(index):
    """Build a few-shot chain that holds out ground-truth example ``index``."""
    return _build_chain("", add_few_shot(index))


def make_tips_few_chain(index):
    """Build a tips-plus-few-shot chain that holds out ground-truth example ``index``."""
    return _build_chain(TIPS, add_few_shot(index))

### Run the test on each chain

In [None]:
import os
import json
import csv
import utilities as u
import time

# During testing our account hit a throttle limit on Bedrock runtime model
# invocations. On top of boto3's retries and backoff, we pause between calls.
# Use -1 to disable the pause.
SLEEP_INTERVAL_SECS = 70

def run_one_test(index, chain_maker, folder_name):
    """Run one ground-truth question through a chain and record the outcome.

    Args:
        index: Position of the question in the module-level ``ground_truth``.
        chain_maker: Callable that takes ``index`` and returns a chain
            exposing ``invoke``.
        folder_name: Folder passed to ``u.write_sparql_res``. When ``None``
            the question, generated query and raw response are printed
            instead of written.

    Returns:
        None. Any exception raised while building or invoking the chain is
        printed and, when ``folder_name`` is given, recorded as the error
        message for this question, so a batch run keeps going.
    """
    q = ground_truth[index]
    nlq = q['question']
    expected_sparql = q['SPARQL']
    error_msg = ""
    gen_sparql = ""

    try:
        # The response carries 'result' and 'intermediate_steps'; the first
        # intermediate step holds the generated query.
        res = chain_maker(index).invoke(nlq)
        gen_sparql = res['intermediate_steps'][0]['query'].replace("\n", " ")

        # Only a dict result can carry a 'message' entry. Guarding on the
        # type avoids a TypeError when the result is a list or a string.
        result = res['result']
        if isinstance(result, dict) and 'message' in result:
            error_msg = str(result['message']).replace("\n", " ")

        if folder_name is not None:
            u.write_sparql_res(folder_name, str(index), nlq, expected_sparql, gen_sparql, result, error_msg)
        else:
            print(nlq)
            print(gen_sparql)
            print(res)

    except Exception as e:  # deliberate best-effort: one failure must not stop the batch
        print(f"Error on {index}")
        print("Exception: {}".format(type(e).__name__))
        print("Exception message: {}".format(e))
        error_msg = "Exception message: {}".format(e).replace("\n", " ")
        if folder_name is not None:
            u.write_sparql_res(folder_name, str(index), nlq, expected_sparql, gen_sparql, [], error_msg)

def run_tests(folder_name, chain_maker):
    """Run every ground-truth question through chains from ``chain_maker``.

    Args:
        folder_name: Name of the output folder, created under the current
            directory if it does not exist yet.
        chain_maker: Callable that takes a question index and returns the
            chain to use for that question.
    """
    folder = f"./{folder_name}"
    # Creates the folder if needed without a separate existence check; still
    # raises if the path exists but is not a directory.
    os.makedirs(folder, exist_ok=True)

    for index in range(len(ground_truth)):
        # Pause before every call (including the first) so that back-to-back
        # runs also stay under the Bedrock throttle limit.
        if SLEEP_INTERVAL_SECS > 0:
            time.sleep(SLEEP_INTERVAL_SECS)
        print(f"{folder_name} {index}")
        run_one_test(index, chain_maker, folder_name)
        
def run_yourown_query(nlq):
    """Answer an ad-hoc natural-language question with the schema-plus-tips chain.

    Args:
        nlq: The natural-language question.

    Returns:
        Whatever the chain's ``invoke`` returns for the question.
    """
    # make_tips_chain ignores its index argument; -1 is just a placeholder.
    chain = make_tips_chain(-1)
    return chain.invoke(nlq)



In [None]:
# Schema only (zero-shot).
run_tests("schema_zero", make_default_chain)

# Schema plus tips.
run_tests("schema_tips", make_tips_chain)

# Schema plus few-shot examples.
run_tests("schema_few", make_few_chain)

# Schema plus few-shot examples and tips. The chain maker is defined as
# make_tips_few_chain; the previous name make_few_tips_chain does not exist.
run_tests("schema_few_tips", make_tips_few_chain)


In [None]:
# Build one report per result folder written by the test runs.
u.make_report("schema_zero")
u.make_report("schema_tips")
u.make_report("schema_few")
u.make_report("schema_few_tips")

## One-off queries
With schema and tips

In [None]:
# Ad-hoc question; kept as the cell's last expression so the notebook shows the result.
one_off_question = "Which proteins do frogs have"
run_yourown_query(one_off_question)