## Integrating Unstructured and Graph Knowledge with Neo4j and LangChain for Enhanced Question Answering



#### Installing Dependencies

In [1]:
! pip install -qU \
       transformers \
       datasets \
       langchain \
       openai \
       wikipedia \
       tiktoken \
       neo4j \
       python-dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.4/192.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?

#### Importing Packanges

In [26]:
import os
import re
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv

#### Setting API's in Environment Variable[link text](https://)

In [27]:
load_dotenv()
os.environ["OPENAI_API_KEY"] = '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'
os.environ["NEO4J_URI"] = 'neo4j+s://817ac93a.databases.neo4j.io'
os.environ["NEO4J_USERNAME"] = 'neo4j'
os.environ["NEO4J_PASSWORD"] = 'CN1zhoj9bQwUc4JpfRk6hufP9Muojw_bTmaYMcxJXg4'

In [28]:
print(os.getenv('OPENAI_API_KEY'))
print(os.getenv("NEO4J_URI"))
print(os.getenv("NEO4J_USERNAME"))
print(os.getenv('NEO4J_PASSWORD'))

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
neo4j+s://817ac93a.databases.neo4j.io
neo4j
CN1zhoj9bQwUc4JpfRk6hufP9Muojw_bTmaYMcxJXg4


#### Data Preprocessing

In [29]:
from transformers import AutoTokenizer

# Define the tokenizer using "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to calculate the number of tokens in a text
def bert_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)

# Example usage
input_text = "This is a sample sentence for tokenization."
num_tokens = bert_len(input_text)
print(f"Number of tokens: {num_tokens}")

Number of tokens: 11


In [30]:
# Load Wikipedia articles related to "Leonhard Euler"
raw_documents = WikipediaLoader(query="Leonhard Euler").load()

# Define a text splitter with specific parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=20, length_function=bert_len, separators=['\n\n', '\n', ' ', '']
)

# Split the content of the first Wikipedia article into smaller documents
documents = text_splitter.create_documents([raw_documents[0].page_content])

Token indices sequence length is longer than the specified maximum sequence length for this model (736 > 512). Running this sequence through the model will result in indexing errors


In [31]:
print(len(documents))

18


#### Initializing Graph Database Neo4j [link text](https://)

In [32]:
# Instantiate Neo4j vector from documents
neo4j_vector = Neo4jVector.from_documents(
    documents,
    OpenAIEmbeddings(),
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

#### Peroforming Similarity Search on Ingested Documents

In [33]:
# Define the query.
query = "Who were the siblings of Leonhard Euler?"

# Execute the query, get top 2 results.
vector_results = neo4j_vector.similarity_search(query, k=2)

# Print search results with separation.
for i, res in enumerate(vector_results):
    print(res.page_content)
    if i != len(vector_results) - 1:
        print()

# Store the content of the most similar result.
vector_result = vector_results[0].page_content

== Early life ==
Leonhard Euler was born on 15 April 1707, in Basel to Paul III Euler, a pastor of the Reformed Church, and Marguerite (née Brucker), whose ancestors include a number of well-known scholars in the classics. He was the oldest of four children, having two younger sisters, An

== Early life ==
Leonhard Euler was born on 15 April 1707, in Basel to Paul III Euler, a pastor of the Reformed Church, and Marguerite (née Brucker), whose ancestors include a number of well-known scholars in the classics. He was the oldest of four children, having two younger sisters, An


#### Building Knowledge Graph

In [34]:
# Necessary Libraries to setup the Neo4j DB QuestionAnswering Chain
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

In [35]:
# Create a Neo4jGraph object by connecting to a Neo4j database.
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [36]:
# Print the schema of the Neo4j graph.
print(graph.schema)


        Node properties are the following:
        [{'labels': 'Chunk', 'properties': [{'property': 'id', 'type': 'STRING'}, {'property': 'embedding', 'type': 'LIST'}, {'property': 'text', 'type': 'STRING'}]}]
        Relationship properties are the following:
        []
        The relationships are the following:
        []
        


In [37]:
# Create a question-answering chain using GPT-3 and a Neo4j graph, with verbose mode enabled.
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0), graph=graph, verbose=True
)

In [38]:
# Use the question-answering chain to query the Neo4j graph.
graph_result = chain.run("Who were the siblings of Leonhard Euler?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (euler:Chunk {text: 'Leonhard Euler'})-[:SIBLING]->(sibling:Chunk)
RETURN sibling.text[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


In [39]:
graph_result

"I'm sorry, but I don't have any information about the siblings of Leonhard Euler."