# Init

In [1]:
%load_ext autoreload

In [2]:
from llmlib.app import app
# Create an application context from the project's `app` object and push it.
# No matching pop() is visible in this notebook, so the context stays pushed
# for the cells that follow.
context = app.app_context()
context.push()

## Imports

In [3]:
from develop.utils import formatters

## Inputs

In [4]:
# Reference question: sent both to the plain chat model (chatGPTQA) and to the
# retrieval QA chain so the two answers can be compared.
query = "What is the relationship between Zn2+ and glycolate?"

## Utils

In [5]:
import openai

def chatGPTQA(query: str, model: str = "gpt-3.5-turbo", temperature: float = 0):
    """Send ``query`` as a single user message to the OpenAI chat completion API.

    The request contains only that one message: no system prompt and no
    retrieved context are added by this function.

    Args:
        query: Text placed in the ``content`` of the single ``user`` message.
        model: Model name forwarded to the API. Defaults to ``"gpt-3.5-turbo"``,
            the value that was previously hard-coded.
        temperature: Sampling temperature forwarded to the API. Defaults to ``0``,
            the value that was previously hard-coded.

    Returns:
        The object returned by ``openai.ChatCompletion.create``, unmodified.
    """
    return openai.ChatCompletion.create(
        messages=[{"role": "user", "content": query}],
        model=model,
        temperature=temperature,
    )

## Initial check

Current response:

In [6]:
chatGPTQA(query)

# Compose chain

## Init llm model

In [7]:
from langchain.chat_models import ChatOpenAI
# Chat model shared by the term-extraction retriever and the QA chains.
# temperature=0 matches the setting used by chatGPTQA.
# NOTE(review): no model name is passed here, so the library default applies —
# confirm it is the same "gpt-3.5-turbo" that chatGPTQA uses, otherwise the
# side-by-side comparison is not like-for-like.
llm = ChatOpenAI(temperature=0, verbose=True)
llm

## Prepare prompts for core term extraction

In [8]:
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
# Name of the single input variable the human message expects.
QUERY_KEY = 'query'

# Two-message chat prompt: a fixed system instruction followed by the user's query.
core_terms_prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate(
        prompt=PromptTemplate(
            # Static instruction with no placeholders, hence the empty input_variables.
            template="Always response with comma separated list of chemical or biological terms identified in prompt.",
            input_variables=[]
        )
    ),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            # The f-string evaluates to "{query}": the doubled braces emit literal
            # braces around the value of QUERY_KEY, giving one template placeholder.
            template=f"{{{QUERY_KEY}}}",
            input_variables=[QUERY_KEY],
        )
    )
])
# Bare expression: show the constructed template as the cell output.
core_terms_prompt_template

## Define embedding model

In [9]:
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
embeddings

## Define vectorstore

In [10]:
from langchain.vectorstores import Chroma

In [11]:
# Chroma vector store that uses the `embeddings` object defined above and the
# "./chroma_db_oai" directory (a path relative to the current working directory).
# NOTE(review): nothing visible in this notebook adds documents to the store, so it
# presumably relies on a previously populated directory — confirm.
vectorstore = Chroma(embedding_function=embeddings, persist_directory="./chroma_db_oai")
vectorstore

<langchain.vectorstores.chroma.Chroma at 0x7f9c56197f50>

## Get graph reference

### Define neo4j connector

In [12]:
from llmlib.database import Neo4j

In [13]:
# Obtain the graph object from the project's Neo4j connector.
graph = Neo4j().graph()
graph.schema = ''  # Overwrite the schema so it is not loaded from the db (a lengthy process that is not used here)

schema setter 
schema setter 


#### Init graph search API wrapper

In [14]:
from llmlib.utils.search.cypher_search_api_wrapper import CypherSearchAPIWrapper

In [15]:
# Wrap the graph in the project's Cypher search API wrapper, with verbose output
# and intermediate steps requested.
search = CypherSearchAPIWrapper(
    graph=graph,
    verbose=True,
    return_intermediate_steps=True
)
search

## Compose QA chain with graph retriever

### Init graph retriever

In [16]:
from llmlib.utils.retrievers.graph_search_retriever import GraphSearchRetriever

In [17]:
# Assemble the retriever from the objects built in the earlier cells: the vector
# store, the chat model, the graph search wrapper and the term-extraction prompt.
retriever = GraphSearchRetriever.from_llm(
    vectorstore=vectorstore,
    llm=llm,
    graph_search=search,
    prompt=core_terms_prompt_template,
    verbose=True,
    return_intermediate_steps=True
)
retriever

### Define callback handler

In [18]:
from langchain.callbacks.stdout import StdOutCallbackHandler
handler = StdOutCallbackHandler()
handler

<langchain.callbacks.stdout.StdOutCallbackHandler at 0x7f9c5248a8d0>

### Init QA chain

In [19]:
from langchain.chains import RetrievalQA

In [20]:
# First QA chain variant: chain_type="stuff", answering over the graph retriever.
# NOTE(review): the name `qa` is rebound further down to a "map_reduce" variant,
# so which chain `qa` refers to depends on the order in which cells were executed.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    callbacks=[handler]
)
qa

# Test

## Initial query

In [34]:
# NOTE(review): this cell's execution count (In [34]) is higher than that of the
# later cell that rebinds `qa` to the "map_reduce" chain (In [31]), so the output
# recorded below may come from that chain rather than the "stuff" one — confirm
# with Restart Kernel -> Run All.
qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['Zn2+', 'glycolate']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'Zn2+', 'matches': [{'node': {'original_entity_types': ['Compound'], 'eid': 'ZN+2', 'inchi': 'InChI=1S/Zn/q+2', 'abbrev_name': '', 'smiles': '[Zn+2]', 'entityType': 'Chemical', 'displayName': 'Zn2+', 'inchi_key': 'PTFCDOFLOPIGGS-UHFFFAOYSA-N', 'name': 'Zn2+', 'biocyc_id': 'ZN+2', 'data_source': 'BioCyc'}, 'similarity': 1.0, 'node_id': 3703030}]}, {'term': 'glycolate', 'matches': [{'node': {'original_entity_types': ['Compound'], 'eid': 'GLYCOLLATE', 'inchi': 'InChI=1S/C2H4O3/c3-1-2(4)5/h3H,1H2,(H,4,5)/p-1', 'abbrev_name': '', 'smiles': 'C(C(=O)[O-])O', 'entityType': 'Chemical', 'displayName': 'glycolate', 'inchi_key': 'AEMRFAOFKBGASW-UHFFFAOYSA-M', 'name': 'glycolate', 'biocyc_id': 'GLYCOLLATE', 'data_source': 'BioCyc'}, 'similarity': 1.0, 'node_id': 3703369}, {'node': {'origina

In [35]:
chatGPTQA(query)

## Other checks

In [23]:
query1 = "What is the association between putrescine and zinT?"
qa.run(query1, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['putrescine', 'zinT']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'putrescine', 'matches': [{'node': {'original_entity_types': ['Chemical'], 'eid': '17148', 'inchi_key': 'KIDHWZJUCRJVML-UHFFFAOYSA-N', 'name': 'putrescine', 'data_source': 'ChEBI', 'alt_id': 'CHEBI:26405|CHEBI:14972|CHEBI:8650|CHEBI:45092'}, 'similarity': 1.0, 'node_id': 11208475}, {'node': {'original_entity_types': ['Compound'], 'eid': 'PUTRESCINE', 'inchi': 'InChI=1S/C4H12N2/c5-3-1-2-4-6/h1-6H2/p+2', 'abbrev_name': '', 'smiles': 'C([NH3+])CCC[NH3+]', 'entityType': 'Chemical', 'displayName': 'putrescine', 'inchi_key': 'KIDHWZJUCRJVML-UHFFFAOYSA-P', 'name': 'putrescine', 'biocyc_id': 'PUTRESCINE', 'data_source': 'BioCyc'}, 'similarity': 1.0, 'node_id': 3743644}, {'node': {'original_entity_types': ['Chemical'], 'eid': '17148', 'inchi_key': 'KIDHWZJUCRJVML-UHFFFAOYSA-N', 'name':

In [24]:
chatGPTQA(query1)

In [25]:
query2 = "What is the association between putrescine and zinc?"
qa.run(query2, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['putrescine', 'zinc']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'putrescine', 'matches': [{'node': {'original_entity_types': ['Chemical'], 'eid': '17148', 'inchi_key': 'KIDHWZJUCRJVML-UHFFFAOYSA-N', 'name': 'putrescine', 'data_source': 'ChEBI', 'alt_id': 'CHEBI:26405|CHEBI:14972|CHEBI:8650|CHEBI:45092'}, 'similarity': 1.0, 'node_id': 11208475}, {'node': {'original_entity_types': ['Compound'], 'eid': 'PUTRESCINE', 'inchi': 'InChI=1S/C4H12N2/c5-3-1-2-4-6/h1-6H2/p+2', 'abbrev_name': '', 'smiles': 'C([NH3+])CCC[NH3+]', 'entityType': 'Chemical', 'displayName': 'putrescine', 'inchi_key': 'KIDHWZJUCRJVML-UHFFFAOYSA-P', 'name': 'putrescine', 'biocyc_id': 'PUTRESCINE', 'data_source': 'BioCyc'}, 'similarity': 1.0, 'node_id': 3743644}, {'node': {'original_entity_types': ['Chemical'], 'eid': '17148', 'inchi_key': 'KIDHWZJUCRJVML-UHFFFAOYSA-N', 'name':

In [26]:
chatGPTQA(query2)

In [27]:
query3 = "What is the association between calcium and zinT?"
# NOTE(review): the recorded output lists the identified terms as ['calcium', 'zinc'],
# i.e. "zinT" from the question was not kept verbatim by the term-extraction step,
# so the graph search below ran for "zinc" instead.
qa.run(query3, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['calcium', 'zinc']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'calcium', 'matches': [{'node': {'original_entity_types': ['Chemical'], 'eid': 'D002118', 'obsolete': 0, 'name': 'Calcium', 'mesh_id': 'D002118', 'data_source': 'MeSH'}, 'similarity': 0.8571428571428571, 'node_id': 83342363}, {'node': {'original_entity_types': ['Chemical'], 'eid': '29320', 'inchi_key': 'OYPRJOBELJOOCE-UHFFFAOYSA-N', 'name': 'calcium(0)', 'data_source': 'ChEBI'}, 'similarity': 0.7, 'node_id': 11346629}, {'node': {'original_entity_types': ['Chemical'], 'eid': '22984', 'inchi_key': 'OYPRJOBELJOOCE-UHFFFAOYSA-N', 'name': 'calcium atom', 'data_source': 'ChEBI'}, 'similarity': 0.5833333333333334, 'node_id': 11332843}, {'node': {'original_entity_types': ['Chemical'], 'eid': '22984', 'inchi_key': 'OYPRJOBELJOOCE-UHFFFAOYSA-N', 'name': 'calcium atom', 'data_source': 'ChEB

In [28]:
chatGPTQA(query3)

## Check background noise

In [29]:
qa.run("What is the relationship between INHBA and MTMR4?", callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['INHBA', 'MTMR4']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'INHBA', 'matches': [{'node': {'original_entity_types': ['Gene'], 'eid': '121028933', 'full_name': 'inhibin subunit beta A', 'name': 'INHBA', 'locus_tag': '', 'tax_id': '1608482', 'data_source': 'NCBI Gene'}, 'similarity': 1.0, 'node_id': 97704812}, {'node': {'original_entity_types': ['Gene'], 'eid': '119698030', 'full_name': 'inhibin subunit beta A', 'name': 'INHBA', 'locus_tag': '', 'tax_id': '1094192', 'data_source': 'NCBI Gene'}, 'similarity': 1.0, 'node_id': 97138396}, {'node': {'original_entity_types': ['Gene'], 'eid': '121333580', 'full_name': 'inhibin subunit beta A', 'name': 'INHBA', 'locus_tag': '', 'data_source': 'NCBI Gene', 'tax_id': '356909'}, 'similarity': 1.0, 'node_id': 96113143}, {'node': {'original_entity_types': ['Gene'], 'eid': '120398268', 'full_name': 'inhib

## Check different qa stuffing techniques

In [30]:
from langchain.chains import RetrievalQA

In [31]:
# Same construction as the earlier QA chain, but with chain_type="map_reduce".
# NOTE(review): this rebinds `qa`, so the "stuff" chain built earlier is no longer
# reachable under that name and every cell executed after this one uses the
# map_reduce chain. Consider a distinct name (e.g. qa_map_reduce) to avoid
# hidden-state confusion when cells are run out of order.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=retriever,
    verbose=True,
    callbacks=[handler]
)
qa

In [32]:
qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['Zn2+', 'glycolate']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'Zn2+', 'matches': [{'node': {'original_entity_types': ['Compound'], 'eid': 'ZN+2', 'inchi': 'InChI=1S/Zn/q+2', 'abbrev_name': '', 'smiles': '[Zn+2]', 'entityType': 'Chemical', 'displayName': 'Zn2+', 'inchi_key': 'PTFCDOFLOPIGGS-UHFFFAOYSA-N', 'name': 'Zn2+', 'biocyc_id': 'ZN+2', 'data_source': 'BioCyc'}, 'similarity': 1.0, 'node_id': 3703030}]}, {'term': 'glycolate', 'matches': [{'node': {'original_entity_types': ['Compound'], 'eid': 'GLYCOLLATE', 'inchi': 'InChI=1S/C2H4O3/c3-1-2(4)5/h3H,1H2,(H,4,5)/p-1', 'abbrev_name': '', 'smiles': 'C(C(=O)[O-])O', 'entityType': 'Chemical', 'displayName': 'glycolate', 'inchi_key': 'AEMRFAOFKBGASW-UHFFFAOYSA-M', 'name': 'glycolate', 'biocyc_id': 'GLYCOLLATE', 'data_source': 'BioCyc'}, 'similarity': 1.0, 'node_id': 3703369}, {'node': {'origina

In [33]:
qa.run("What is the relationship between INHBA and MTMR4?", callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m

	[1m>> Generating list of revelant terms[0m
	Identified Terms:
	['INHBA', 'MTMR4']

	[1m>> Searching for related graph nodes[0m
	Identified Nodes:
	[{'term': 'INHBA', 'matches': [{'node': {'original_entity_types': ['Gene'], 'eid': '121028933', 'full_name': 'inhibin subunit beta A', 'name': 'INHBA', 'locus_tag': '', 'tax_id': '1608482', 'data_source': 'NCBI Gene'}, 'similarity': 1.0, 'node_id': 97704812}, {'node': {'original_entity_types': ['Gene'], 'eid': '119698030', 'full_name': 'inhibin subunit beta A', 'name': 'INHBA', 'locus_tag': '', 'tax_id': '1094192', 'data_source': 'NCBI Gene'}, 'similarity': 1.0, 'node_id': 97138396}, {'node': {'original_entity_types': ['Gene'], 'eid': '121333580', 'full_name': 'inhibin subunit beta A', 'name': 'INHBA', 'locus_tag': '', 'data_source': 'NCBI Gene', 'tax_id': '356909'}, 'similarity': 1.0, 'node_id': 96113143}, {'node': {'original_entity_types': ['Gene'], 'eid': '120398268', 'full_name': 'inhib