# Explicit Query: Query Uniprot using Langchain SPARQL chain
Shows how to query UniProt using an explicit schema.

## Setup

### Notebook Pre-Req

You can upload this notebook into a Jupyter environment configured to use Neptune Workbench. I tested this on an Amazon SageMaker notebook running Python 3.10.x.

### Python Pre-Req

I tested this on an Amazon Sagemaker notebook running Python 3.10.x. You need Python 3.9 or higher.

### Neptune Pre-Req

Your Neptune cluster must run engine version 1.2.x or higher.

### Anthropic

Obtain an API key from Anthropic and store it in a `.env` file. See:
https://python.langchain.com/v0.1/docs/integrations/platforms/anthropic/

In [None]:
# Names the LLM provider for this run; presumably consulted when the model
# is created further down -- verify against the cells that build `llm`.
MODEL_SELECTOR = "anthropic"

## Install Langchain
Requires Python 3.9 or greater and LangChain version 0.0.341 or thereabouts.

In [None]:
%%bash 
# 3.9 or higher?
python --version

In [None]:
!pip install --upgrade --force-reinstall langchain

In [None]:
!pip install langchain-community

In [None]:
!pip show langchain

## Build Generation Prompt

We want four prompts. Each prompt will include the schema. Additionally:
- One prompt has no examples
- One prompt has few shots
- One prompt has tips
- One prompt has few shots plus tips

### Elements of the prompt
Let's first build the optional parts.

In [None]:
import yaml
from pathlib import Path

# All prompt ingredients are read from ./resources under the working directory.
resources = Path.cwd() / "resources"
tips = yaml.safe_load((resources / "tips.yaml").read_text())
pfx = (resources / "prefixes.txt").read_text()
ground_truth = yaml.safe_load((resources / "ground-truth.yaml").read_text())

# Wrap each tip in <tip> tags. Braces are doubled so they come out as literal
# characters once this text is embedded in the format-style prompt template.
TIPS = """
Here are additional tips that you might find helpful:
<tips>
""" + "".join(
    "<tip>" + tip.replace('{', '{{').replace('}', '}}') + "</tip>\n"
    for tip in tips
) + """
</tips>
"""

# The prefix declarations, preceded by an instruction telling the model to use them.
PREFIXES=f"""
Include all of the following prefixes in the SPARQL query you generate:

{pfx}
"""

def add_few_shot(idx):
    """Split the ground truth into one held-out case and a few-shot prompt section.

    The entry at position ``idx`` of the module-level ``ground_truth`` list is
    held out (it is the question under test). Every other entry is rendered as
    a ``<question>`` / ``<sparql>`` example pair.

    Curly braces in the examples are doubled, the same way the tips are
    escaped, because the returned text ends up inside a format-style prompt
    template where a bare ``{`` would be read as the start of a placeholder.
    SPARQL group patterns always contain braces, so without this the template
    cannot be built.

    Args:
        idx: Index into ``ground_truth`` of the entry to hold out.

    Returns:
        A ``(hold_out, examples_text)`` tuple: the held-out ground-truth entry
        and the few-shot section as a string.

    Raises:
        AssertionError: If the held-out SPARQL also appears among the
            examples, which would leak the expected answer into the prompt.
        IndexError: If ``idx`` is out of range for ``ground_truth``.
    """
    hold_out = ground_truth[idx]
    training_examples = ground_truth[:idx] + ground_truth[idx+1:]
    # Leak check: the answer we are testing for must not be shown as an example.
    assert hold_out["SPARQL"] not in {x["SPARQL"] for x in training_examples}

    def _escape(text):
        # Double the braces so the template engine emits them literally.
        return str(text).replace('{', '{{').replace('}', '}}')

    GT="""
Some examples:

    """
    for t in training_examples:
        GT += f"""
<question>
{_escape(t['question'])}
</question>

<sparql>
{_escape(t['SPARQL'])}
</sparql>
"""

    return hold_out, GT


### Create Gen Prompt
The LangChain SPARQL chain has a default prompt, but let's create a custom one that can optionally include tips and/or few-shot examples.


In [None]:
from langchain_core.prompts.prompt import PromptTemplate

def make_gen_prompt(arg_tips, arg_fewshot):
    """Build the SPARQL-generation prompt template, with optional extras.

    The template is assembled from two pieces. The first is an f-string that
    is filled in immediately with the module-level ``PREFIXES`` text and the
    two arguments. The second is a plain string whose ``{schema}`` and
    ``{prompt}`` placeholders are left for the template to fill in later.

    Args:
        arg_tips: Text inserted verbatim into the instructions; pass ``""``
            to leave it out.
        arg_fewshot: Text inserted verbatim after the tips; pass ``""`` to
            leave it out.

    Returns:
        A ``PromptTemplate`` with input variables ``schema`` and ``prompt``.
    """
    # Quadrupled braces become doubled braces after f-string evaluation, which
    # the template then renders as single literal braces in the sample query.
    template_head = f"""Task: Generate a SPARQL SELECT statement for querying a graph database.
    Convert an English language description of a question into a SPARQL query against the Uniprot knowledgebase that answers the question.
    For instance, to find all taxa from the UniProt taxonomy, the following query in backticks would be suitable:

    ```
    SELECT ?taxon
    WHERE
    {{{{
      ?taxon a up:Taxon .
    }}}}
    LIMIT 20
    ```
    
    Instructions:
    Use only the node types and properties provided in the schema.
    Do not use any node types and properties that are not explicitly provided.
    Limit the SPARQL results to 20. Add a LIMIT clause to the SPARQL query.
    Include all necessary prefixes. The following set of prefixes can be used:
    
    ```
    {PREFIXES}
    ```

    {arg_tips}


    {arg_fewshot}

    You can use the following keywords:

    <keywords>
        <keyword><ARN>keywords:5</ARN><name>Acetoin biosynthesis</name></keyword>
        <keyword><ARN>keywords:47</ARN><name>Antifreeze protein</name></keyword>
    </keywords>
    """

    # Deliberately not an f-string: {schema} and {prompt} must survive as
    # placeholders for the template.
    template_tail = """
    Schema:
    {schema}
    Note: Be as concise as possible.
    Do not include any explanations or apologies in your responses.
    Do not respond to any questions that ask for anything else than for you to construct a SPARQL query.
    Do not include any text except the SPARQL query generated.

    The question is:
    {prompt}"""

    return PromptTemplate(
        input_variables=["schema", "prompt"],
        template=template_head + template_tail,
    )


### Initialize Langchain Graph object

In [None]:
import utilities as u
from langchain_community.graphs import NeptuneRdfGraph

# Grab Neptune cluster host/port from notebook instance environment variables
GRAPH_NOTEBOOK_HOST = u.get_neptune_env("GRAPH_NOTEBOOK_HOST")
GRAPH_NOTEBOOK_PORT = u.get_neptune_env("GRAPH_NOTEBOOK_PORT")
AWS_REGION = u.get_neptune_env("AWS_REGION")
# Any auth mode other than "DEFAULT" is treated as IAM authentication.
USE_IAM = u.get_neptune_env("GRAPH_NOTEBOOK_AUTH_MODE") != "DEFAULT"

graph = NeptuneRdfGraph(
    host=GRAPH_NOTEBOOK_HOST,
    port=int(GRAPH_NOTEBOOK_PORT),
    # Follow the detected auth mode; this was previously hard-coded to True,
    # which left USE_IAM computed but never used.
    use_iam_auth=USE_IAM,
    region_name=AWS_REGION
)


## Create and invoke chain
### Define LLMs


In [None]:
!pip install langchain-anthropic

In [None]:
from langchain_anthropic import ChatAnthropic
# Build the Anthropic-hosted model only when it is the selected provider, so
# that MODEL_SELECTOR (set in the Setup section) actually controls which
# model the chains use.
if MODEL_SELECTOR == "anthropic":
    llm = ChatAnthropic(model='claude-3-opus-20240229')

In [None]:
import boto3
from langchain.chat_models import BedrockChat
from langchain.llms import Bedrock

MODEL_ID='anthropic.claude-3-sonnet-20240229-v1:0'
bedrock_client = boto3.client('bedrock-runtime')
# Only switch `llm` to the Bedrock-hosted model when Anthropic's own API was
# not selected. Unconditionally assigning here would replace the model chosen
# via MODEL_SELECTOR whenever all cells are run in order.
if MODEL_SELECTOR != "anthropic":
    llm = BedrockChat(
        model_id = MODEL_ID,
        client = bedrock_client
    )

### The chainmakers
We'll need separate chains for each of the four options. Any few-shot option needs a separate chain PER query, because the query being attempted must be excluded from its own few-shot examples!

In [None]:
from langchain.chains.graph_qa.neptune_sparql import NeptuneSparqlQAChain

def _build_chain(tips_text, fewshot_text):
    """Create a Neptune SPARQL chain whose generation prompt carries the given extras.

    Args:
        tips_text: Tips section for the prompt, or ``""`` for none.
        fewshot_text: Few-shot examples section for the prompt, or ``""`` for none.

    Returns:
        The chain produced by ``NeptuneSparqlQAChain.from_llm`` using the
        module-level ``llm`` and ``graph``.
    """
    return NeptuneSparqlQAChain.from_llm(
        llm=llm,
        sparql_prompt=make_gen_prompt(tips_text, fewshot_text),
        graph=graph,
        verbose=False,
        # Spelled ``top_k`` to match the chain's field name (was ``top_K``).
        top_k=10,
        return_intermediate_steps=True,
        return_direct=True,
    )


# No held-out example is involved here, so one shared chain serves every question.
default_chain = _build_chain("", "")


def make_default_chain(index):
    """Return the shared schema-only chain; ``index`` is accepted but not used."""
    return default_chain


tips_chain = _build_chain(TIPS, "")


def make_tips_chain(index):
    """Return the shared schema-plus-tips chain; ``index`` is accepted but not used."""
    return tips_chain


def make_few_chain(index):
    """Build a few-shot chain whose examples exclude the question at ``index``."""
    # add_few_shot returns (held-out entry, examples text); only the text
    # belongs in the prompt.
    _, fewshot_text = add_few_shot(index)
    return _build_chain("", fewshot_text)


def make_tips_few_chain(index):
    """Build a tips-plus-few-shot chain whose examples exclude the question at ``index``."""
    _, fewshot_text = add_few_shot(index)
    return _build_chain(TIPS, fewshot_text)

### Run the test on each chain

In [None]:
import os
import json
import csv


def run_test(folder_name, chain_maker):
    """Run every ground-truth question through a chain and record the outcome.

    For each entry of the module-level ``ground_truth`` list this obtains a
    chain from ``chain_maker(index)``, invokes it with the question, saves the
    raw chain output to ``./<folder_name>/<index>.json`` and appends one
    summary row to ``<folder_name>_report.csv``.

    A failure on one question is printed and recorded in the report's
    ``errormsg`` column; it does not stop the run.

    Args:
        folder_name: Name of the output folder (created under the current
            directory if missing) and prefix of the CSV report file.
        chain_maker: Callable taking the question index and returning an
            object with an ``invoke(question)`` method.
    """
    folder = f"./{folder_name}"
    # exist_ok lets the cell be re-run without first deleting earlier output.
    os.makedirs(folder, exist_ok=True)

    def write_results(index, res):
        # One JSON file per question, holding the raw chain output.
        with open(f"{folder}/{index}.json", 'w', encoding="utf-8") as resfile:
            resfile.write(json.dumps(res))

    with open(f'{folder_name}_report.csv', 'w', newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["qfile","question", "numresults", "errormsg", "gensparql"])
        for index, q in enumerate(ground_truth):
            print(str(index))
            nlq = q['question']
            num_results = 0
            error_msg = ""
            gen_sparql = ""

            try:
                # The output is read as a mapping with 'intermediate_steps'
                # (the first step holds the generated query) and 'result'
                # (either query 'results' or an error 'message').
                genres = chain_maker(index).invoke(nlq)
                gen_sparql = genres['intermediate_steps'][0]['query'].replace("\n", " ")
                if 'results' in genres['result']:
                    num_results = len(genres['result']['results']['bindings'])
                if 'message' in genres['result']:
                    error_msg = genres['result']['message'].replace("\n", " ")
                write_results(index, genres)

            except Exception as e:
                # Best effort: report the failure and carry on with the next question.
                print(f"Error on {index}")
                print("Exception: {}".format(type(e).__name__))
                print("Exception message: {}".format(e))
                error_msg = "Exception message: {}".format(e).replace("\n", " ")

            writer.writerow([str(index), nlq, num_results, error_msg, gen_sparql])
  


In [None]:
run_test("schema", make_default_chain)

run_test("schema_tips", make_tips_chain)

run_test("schema_few", make_few_chain)

# The tips-plus-few-shot factory is defined as make_tips_few_chain; the
# previous name make_few_tips_chain does not exist and raised NameError.
run_test("schema_few_tips", make_tips_few_chain)