In [57]:
import os
import json
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List, Optional
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
# from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough
# from llama_index.readers.json import JSONReader
from langchain.schema import Document
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed

In [58]:


graph = Neo4jGraph()

In [59]:
mongo_uri = "uri"  

client = MongoClient(mongo_uri)

db = client['research_articles']
collection = db['gut_microbiome']

# Retrieve all documents from the collection
papers = collection.find()




In [60]:
# reader = JSONReader()
# documents = reader.load_data(input_file="mock_research_papers.json", extra_info={})

def process_graph_conversion(batch):
    return llm_transformer.convert_to_graph_documents(batch)

documents = []
for json_data in papers:
    print(papers)
    print(json_data)
# with open('test_document.json', 'r', encoding='utf-8') as f:
#     json_data = json.load(f)

    json_data_lower = {key.lower(): value for key, value in json_data.items()}
    content = (
        # json_data_lower.get("introduction", "") +
        json_data_lower.get("methods", "") +
        json_data_lower.get("results", "") +
        json_data_lower.get("discussion", "") +
        json_data_lower.get("conclusion", "")
    )       

    metadata = {
        "title": json_data_lower.get("title", "Untitled"),
        "authors": json_data_lower.get("authors", "Unknown"),
        "publication_date": json_data_lower.get("publication_date", "Unknown"),
        "journal_name": json_data_lower.get("journal_name", "Unknown")
    }

    document = Document(page_content=content, metadata=metadata)
    documents.append(document)

# text_splitter = TokenTextSplitter(chunk_size=4000, chunk_overlap=200)
# texts = text_splitter.split_text(content)

# for chunk in texts:
#     document = Document(page_content=chunk, metadata=metadata)
#     documents.append(document)


llm=ChatOpenAI(temperature=0, model_name="gpt-4o-mini-2024-07-18") 
llm_transformer = LLMGraphTransformer(llm=llm)


batch_size = 10  # Adjust the batch size based on API limits and performance testing
batched_documents = [documents[i:i+batch_size] for i in range(0, len(documents), batch_size)]

with ThreadPoolExecutor(max_workers=13) as executor:  # Adjust max_workers based on system resources
    future_to_batch = {executor.submit(process_graph_conversion, batch): batch for batch in batched_documents}

    for future in as_completed(future_to_batch):
        graph_documents = future.result()
        graph.add_graph_documents(
            graph_documents,
            baseEntityLabel=True,
            include_source=True
        )

# graph_documents = llm_transformer.convert_to_graph_documents(documents)
# graph.add_graph_documents(
#     graph_documents,
#     baseEntityLabel=True,
#     include_source=True
# )

client.close()

<pymongo.synchronous.cursor.Cursor object at 0x00000231822E5750>
{'_id': ObjectId('66f3fd777027ab88318f65d6'), 'pmc_id': 'PMC11417245', 'title': 'MBCN: A novel reference database for Effcient Metagenomic analysis of human gut microbiome', 'abstract': 'No Abstract Available', 'authors': ['Zheng B', 'Xu J', 'Zhang Y', 'Qin J', 'Yuan D', 'Fan T', 'Wu W', 'Chen Y', 'Jiang Y'], 'publication_date': '2024 Sep 6', 'journal_name': 'Heliyon', 'doi': '10.1016/j.heliyon.2024.e37422', 'keywords': [], 'score': 10, 'filled_sections_count': 0.8333333333333334, 'Introduction': 'IntroductionThe human gut microbiome,  which is now recognized as a complex ecosystem that plays a critical role in human health and disease,  has been the subject of extensive research in recent years.  Prokaryotic microorganisms perform a variety of functions,  such as breaking down complex carbohydrates,  producing vitamins,  and modulating the immune system,  so much so that most of the functions are unclear.  However,  it i

In [47]:
# directly show the graph resulting from the given Cypher query
default_cypher ="MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

def fetchGraphData(cypher: str = default_cypher):
    # create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
    )
    session = driver.session()

    # Run the query and fetch results
    result = session.run(cypher)

    # Convert results to a format suitable for JSON
    graph_data = []
    for record in result:
        source = record['s']
        relationship = record['r']
        target = record['t']
        
        graph_data.append({
            'source': dict(source),  # Convert Node to dict
            'relationship': {
                'type': relationship.type,  # Extract relationship type
                'properties': dict(relationship)  # Convert Relationship properties to dict
            },
            'target': dict(target)  # Convert Node to dict
        })

    session.close()
    return graph_data


def saveGraphDataToJSON(graph_data, filename='graph_data.json'):
    # Save the graph data to a JSON file
    with open(filename, 'w') as json_file:
        json.dump(graph_data, json_file, indent=4)


def showGraph(cypher: str = default_cypher):
    # create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    #display(widget)
    return widget

showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [None]:
graph_data = fetchGraphData()
saveGraphDataToJSON(graph_data) 

In [51]:
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [5]:
class Entities(BaseModel):
    """Extracting information about research papers and related entities."""
    names: List[str] = Field(
        ...,
        description="All the people, group, entities, methods and organization that appear in the text",
    )
    tools: List[str] = Field(
        ...,
        description="Tools, techniques, or software mentioned in the text",
    )
    topics: List[str] = Field(
        ...,
        description="Relevant topics or keywords from the text",
    )

# Edit your prompt to focus on extracting more information from papers
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting people, groups, entities, results, methods, topics and tools from research papers for a knowledge graph.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

entity_chain = prompt | llm.with_structured_output(Entities)

NameError: name 'llm' is not defined

In [53]:
entity_chain.invoke({"question": "What are the different tools for gut microbiome?"})

Entities(names=['gut microbiome', 'tools'])

In [54]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question
    """
    result = ""
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
                YIELD node, score
                CALL (node) {
                WITH node
                MATCH (node)-[r:MENTIONS]->(neighbor)
                RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
                UNION ALL
                WITH node
                MATCH (node)<-[r:MENTIONS]-(neighbor)
                RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
                }
                RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        result += "\n".join([el['output'] for el in response])
    return result

In [55]:
print(structured_retriever("What is gut microbiome?"))




In [56]:
def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ". join(unstructured_data)}
    """
    return final_data

In [57]:
# Condense a chat history and follow-up question into a standalone question
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [58]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [59]:
chain.invoke({"question": "What is gut microbiome?"})

Search query: What is gut microbiome?




'The gut microbiome refers to the complex community of microorganisms, including bacteria, fungi, and viruses, that inhabit the digestive tract. It plays a crucial role in digestion, immune function, and has been linked to various health conditions, including mental health issues like depression.'