In [19]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain_community.chat_models import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_community.document_loaders import PyPDFLoader

from dotenv import load_dotenv

load_dotenv()

True

In [20]:

from langchain_core.documents import Document
file_doc = []
# Load PDF
for file in os.listdir("documents"):
    if file.endswith(".pdf"):
        file_doc.extend(PyPDFLoader("documents/" + file).load())

In [21]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents=file_doc)

In [22]:
docs[:5]

[Document(metadata={'source': 'documents/AttentionisAllYouNeed.pdf', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architectu

In [23]:
from langchain_experimental.llms.ollama_functions import OllamaFunctions
llm = OllamaFunctions(model="llama3.1", temperature=0, format="json")

llm_graph_transformer = LLMGraphTransformer(
    llm=llm
)

In [None]:
from typing import List
from pydantic import BaseModel, Field
from tqdm import tqdm

class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person and any entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)
entity_chain = prompt | llm.with_structured_output(Entities)


100%|██████████| 127/127 [11:17<00:00,  5.33s/it]

{'documents/AttentionisAllYouNeed.pdf': Entities(names=['Google Brain', 'Ashish Vaswani', 'Noam Shazeer', 'Niki Parmar', 'Jakob Uszkoreit', 'Llion Jones', 'Aidan N. Gomez', 'Łukasz Kaiser', 'Illia Polosukhin', 'University of Toronto', 'Jakob', 'Ashish', 'Illia', 'Noam', 'Niki', 'Llion', 'Lukasz', 'Aidan', 'Google Brain', 'Google Research', 'Recurrent neural networks', 'Long short-term memory', 'Gated recurrent neural networks', 'Attention mechanisms', 'Extended Neural GPU', 'ByteNet', 'ConvS2S', 'Transformer', 'Self-attention', 'Transformer', 'neural sequence transduction models', 'encoder-decoder structure', 'Transformer', 'decoder', 'Scaled Dot-Product Attention', 'Multi-Head Attention', 'Attention', 'Q', 'K', 'V', 'softmax', 'additive attention', 'dot-product (multiplicative) attention', 'algorithm', 'feed-forward network', 'matrix multiplication code', 'dk', 'gradients', 'Transformer', 'multi-head attention', 'encoder-decoder attention', 'decoder layer', 'encoder output', 'self-att




In [28]:
graph_documents = llm_graph_transformer.convert_to_graph_documents(docs)
graph_documents

[GraphDocument(nodes=[], relationships=[], source=Document(metadata={'source': 'documents/AttentionisAllYouNeed.pdf', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmech

In [66]:
doc_entities = []

for graph_doc in tqdm(graph_documents):
    source = graph_doc.source.metadata.get('source')  # Get the document source
    nodes = graph_doc.nodes  # Get nodes within the graph document

    for node in nodes:
        # Collect node details: ID, type, and the document source
        doc_entity = {
            "source": source,
            "node_id": node.id,
            "type": node.type,
        }
        doc_entities.append(doc_entity)  # Append to the overall list
        
(doc_entities)


100%|██████████| 127/127 [00:00<00:00, 105043.70it/s]


[{'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Ashish',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Illia',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Noam',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Niki',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Llion',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Lukasz',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Aidan',
  'type': 'Person'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Our Research',
  'type': 'Research'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Google Brain',
  'type': 'Organization'},
 {'source': 'documents/AttentionisAllYouNeed.pdf',
  'node_id': 'Google Research',
  'type': 'Organization'},
 {'source': 'documents

In [69]:
from neo4j import GraphDatabase
import os

driver = GraphDatabase.driver(
    uri=os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
)

def insert_document_with_entities(document_data):
    """
    Inserts documents and their associated entities with specific labels into the Neo4j database.

    Args:
        document_data (list[dict]): A list of dictionaries where each dictionary contains:
                                    - source: str
                                    - node_id: str
                                    - type: str
    """
    with driver.session() as session:
        for entity_dict in document_data:
            # Extract data from the entity dictionary
            print(entity_dict)
            source = entity_dict.get('source')
            node_id = entity_dict.get('node_id')
            entity_type = entity_dict.get('type')

            # Ensure all required fields are present
            if not (source and node_id and entity_type):
                raise ValueError(
                    f"Missing required fields in entity_dict: {entity_dict}"
                )

            # Insert Document node
            session.run(
                """
                MERGE (d:Document {source: $source})
                """,
                source=source
            )

            # Insert Entity node with __Entity__ label and specific type label
            session.run(
                """
                MERGE (e:__Entity__ {{id: $node_id}})
                ON CREATE SET e:`{entity_type}`, e.type = $entity_type
                ON MATCH SET e:`{entity_type}`, e.type = $entity_type
                """.format(entity_type=entity_type),
                node_id=node_id,
                entity_type=entity_type
            )

            # Create relationship between Document and Entity
            session.run(
                """
                MATCH (d:Document {source: $source})
                MATCH (e:__Entity__ {id: $node_id})
                MERGE (d)-[:CONTAINS]->(e)
                """,
                source=source,
                node_id=node_id
            )

# Close the driver connection when done
driver.close()
insert_document_with_entities(doc_entities)

  with driver.session() as session:


{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Ashish', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Illia', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Noam', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Niki', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Llion', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Lukasz', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Aidan', 'type': 'Person'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Our Research', 'type': 'Research'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Google Brain', 'type': 'Organization'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Google Research', 'type': 'Organization'}
{'source': 'documents/AttentionisAllYouNeed.pdf', 'node_id': 'Recurrent Neural Net

In [70]:
url = os.getenv("NEO4J_URI")
user = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")

graph = Neo4jGraph(
    url=url,
    password=password
)
graph.add_graph_documents(graph_documents,
                          baseEntityLabel=True,
                          include_source=True
                          )

Failed to write data to connection ResolvedIPv4Address(('34.78.76.49', 7687)) (ResolvedIPv4Address(('34.78.76.49', 7687)))
Failed to write data to connection IPv4Address(('6cb3f5fb.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('34.78.76.49', 7687)))


In [72]:
def ShowGraph():
    driver = GraphDatabase.driver(url, auth=(user, password))
    session = driver.session()
    widget = GraphWidget(graph = session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t").graph())
    widget.node_label_mapping = 'id'
    return widget

In [73]:
ShowGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [74]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()

Failed to write data to connection ResolvedIPv4Address(('34.78.76.49', 7687)) (ResolvedIPv4Address(('34.78.76.49', 7687)))
Failed to write data to connection IPv4Address(('6cb3f5fb.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('34.78.76.49', 7687)))
  from tqdm.autonotebook import tqdm, trange
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [75]:
from typing import List
from pydantic import BaseModel, Field

class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

entity_chain = llm.with_structured_output(Entities)


In [76]:

entity_chain = prompt | llm.with_structured_output(Entities)
result = entity_chain.invoke({"question": "who is Nana Lucia?"}).names

In [77]:
driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
def create_fulltext_index(tx):
    query = '''
    CREATE FULLTEXT INDEX `fulltext_entity_id` 
    FOR (n:__Entity__) 
    ON EACH [n.id];
    '''
    tx.run(query)

# Function to execute the query
def create_index():
    with driver.session() as session:
        session.execute_write(create_fulltext_index)
        print("Fulltext index created successfully.")

# Call the function to create the index
try:
    create_index()
except:
    pass

# Close the driver connection
driver.close()

In [80]:

def generate_full_text_query(input: str) -> str:
    words = [el for el in remove_lucene_chars(input).split() if el]
    if not words:
        return ""
    full_text_query = " AND ".join([f"{word}~2" for word in words])
    print(f"Generated Query: {full_text_query}")
    return full_text_query.strip()

def graph_retriever(question: str) -> str:
    entities = entity_chain.invoke({"question": question}).names
    print(entities)
    final_results = []

    for entity in entities:
        query = """
        CALL db.index.fulltext.queryNodes("fulltext_entity_id", $query) YIELD node, score
        CALL (node, score) {
            WITH node
            MATCH (node)-[r:!MENTIONS]->(neighbour)
            RETURN node.id + ' - ' + type(r) + ' -> ' + neighbour.id AS output
            UNION ALL
            WITH node
            MATCH (node)<-[r:!MENTIONS]-(neighbour)
            RETURN neighbour.id + ' <- ' + type(r) + ' - ' + node.id AS output
        }
        RETURN output LIMIT 50;
        """
        response = graph.query(query, {"query": (entity)})
        final_results.extend(record['output'] for record in response)
        # filter the None
        final_results = list(filter(None, final_results))

    return "\n".join(final_results)

In [81]:
print(graph_retriever("Who is Ashish Vaswani?"))

['Ashish Vaswani']
Ashish - COLLABORATOR -> Illia
Noam <- CONTRIBUTOR - Ashish
Bert <- BASEDON - Vaswani Et Al.


In [82]:
def full_retriever(question: str):
    graph_data = graph_retriever(question)
    print(graph_data)
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    final_data = f"""Graph data:
{graph_data}
vector data:
{"#Document ". join(vector_data)}
    """
    return final_data

In [91]:
llm = OllamaFunctions(model="llama3.2", temperature=0.2, format="json")

template = """ 
    Answer the question based on only the following context provided:
    {context}
    
    Question: {question}
    Use natural language to answer the question.
    Answer:
"""

prompt = ChatPromptTemplate.from_template(
    template=template
)

chain = (
        {
            "context": full_retriever,
            "question": RunnablePassthrough(),
        }
    | prompt
    | llm
    | StrOutputParser()
)

In [92]:
result = chain.invoke(input="Summarize the paper Attention is All You Need?")
print(result)

['Attention is All You Need']
Attention - INPUT -> Q
Attention - INPUT -> K
Attention - INPUT -> V
Dropout <- USED_IN - Attention
Self-Attention - SYNONYM -> Intra-Attention
Self-Attention - SEQUENTIAL -> Recurrent
Self-Attention - COMPARISON -> Recurrent Layers
Self-Attention - COMPARISON -> Convolutional Layers
Self-Attention - RESTRICTED_TO_CONSIDERING_ONLY_A -> Neighborhood
Transformer <- USES - Self-Attention
Self-Attention (Restricted) <- ALTERNATIVE - Self-Attention
Rin <- CENTERED_AROUND_THE_RESPECTIVE_OUTPUT_POSITION - Self-Attention
Attention Visualizations - RELATED_TO -> Input Layer5
Attention Mechanisms - USED_WITH -> Recurrent Network
Transformer <- USES - Attention Mechanisms
Self-Attention <- SYNONYM - Intra-Attention
Self-Attention Layer - CONNECTS -> Long-Range Dependencies
Separable Convolutions <- EQUIVALENCE - Self-Attention Layer
Encoder-Decoder Attention - USES -> Memory Keys And Values
Encoder-Decoder Attention - ATTENDS_TO -> Queries
Self-Attention (Restricted)



The paper 'Attention Is All You Need' by Ashish Vaswani et al. proposes a new simple network architecture, the Transformer, based solely on attention mechanisms. This model replaces complex recurrent or convolutional neural networks with an encoder and decoder, instead using self-attention to connect the two components. The authors argue that this approach can lead to more interpretable models, as individual attention heads appear to learn different tasks and exhibit behavior related to sentence structure. The Transformer is trained on the WMT 2014 English-German dataset and demonstrates improved performance compared to existing models.
