## Integrating Unstructured and Graph Knowledge with Neo4j and LangChain for Enhanced Question Answering



#### Installing Dependencies

In [None]:
# !pip install -qU \
#        transformers \
#        datasets \
#        langchain \
#        openai \
#        wikipedia \
#        tiktoken \
#        neo4j \
#        python-dotenv

#### Importing Packanges

In [2]:
import os
import re
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


#### Setting API's in Environment Variable[link text](https://)

In [3]:
load_dotenv()
# os.environ["OPENAI_API_KEY"] = ''
os.environ["NEO4J_URI"] = 'bolt://localhost:7687'
os.environ["NEO4J_USERNAME"] = 'neo4j'
os.environ["NEO4J_PASSWORD"] = 'docdb@123'

In [4]:
# print(os.getenv('OPENAI_API_KEY'))
print(os.getenv("NEO4J_URI"))
print(os.getenv("NEO4J_USERNAME"))
print(os.getenv('NEO4J_PASSWORD'))

bolt://localhost:7687
neo4j
docdb@123


#### Data Preprocessing

In [4]:
from transformers import AutoTokenizer

# Define the tokenizer using "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to calculate the number of tokens in a text
def bert_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)

# Example usage
input_text = "This is a sample sentence for tokenization."
num_tokens = bert_len(input_text)
print(f"Number of tokens: {num_tokens}")

Number of tokens: 11


In [5]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("./docs/YouCanHaveAnAmazingMemoryLearn.pdf")
pages = loader.load_and_split()

In [None]:
# # Load Wikipedia articles related to "Leonhard Euler"
# raw_documents = WikipediaLoader(query="Sachin Tendulkar").load()

# # Define a text splitter with specific parameters
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=200, chunk_overlap=20, length_function=bert_len, separators=['\n\n', '\n', ' ', '']
# )

# # Split the content of the first Wikipedia article into smaller documents
# documents = text_splitter.create_documents([raw_documents[0].page_content])

In [None]:
print(len(documents))

In [6]:
# Define a text splitter with specific parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, length_function=bert_len, separators=['\n\n', '\n', ' ', '']
)

# Split the content of the first Wikipedia article into smaller documents
documents = text_splitter.create_documents([pages[4].page_content])

#### Initializing Graph Database Neo4j [link text](https://)

In [7]:
# Instantiate Neo4j vector from documents
neo4j_vector = Neo4jVector.from_documents(
    documents,
    OpenAIEmbeddings(),
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

#### Peroforming Similarity Search on Ingested Documents

In [8]:
# Define the query.
query = "What is the introduction on book?"

# Execute the query, get top 2 results.
vector_results = neo4j_vector.similarity_search(query, k=2)

# Print search results with separation.
for i, res in enumerate(vector_results):
    print(res.page_content)
    if i != len(vector_results) - 1:
        print()

# Store the content of the most similar result.
vector_result = vector_results[0].page_content

Chapter 26:
 Using the tools: Study and learning
Chapter 27:
 Using the tools: Everyday ways to train your
memory
Chapter 28:
 Using the tools: Just for fun
Chapter 29:
 Age equals experience, not forgetfulness!
Chapter 30:
 I’ve done all that, now what can I expect?
Chapter 31:
 Look at what you can do now!
 
Afterword: The champions of the future
 
Index

Chapter 26:
 Using the tools: Study and learning
Chapter 27:
 Using the tools: Everyday ways to train your
memory
Chapter 28:
 Using the tools: Just for fun
Chapter 29:
 Age equals experience, not forgetfulness!
Chapter 30:
 I’ve done all that, now what can I expect?
Chapter 31:
 Look at what you can do now!
 
Afterword: The champions of the future
 
Index


#### Building Knowledge Graph

In [None]:
# Necessary Libraries to setup the Neo4j DB QuestionAnswering Chain
from langchain.chat_models import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

In [None]:
# Create a Neo4jGraph object by connecting to a Neo4j database.
graph = Neo4jGraph(
    url="bolt://localhost:7687", username="neo4j", password="docdb@123"
)
# from py2neo import Graph
# graph = Graph(os.environ["NEO4J_URI"],
#               auth = (os.environ["NEO4J_USERNAME"], 
#                       os.environ["NEO4J_PASSWORD"]))

In [None]:
# Print the schema of the Neo4j graph.
print(graph.schema)

In [None]:
# Create a question-answering chain using GPT-3 and a Neo4j graph, with verbose mode enabled.
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0.9), graph=graph, verbose=True
)

In [None]:
# Use the question-answering chain to query the Neo4j graph.
graph_result = chain.run("What is the book about?")

In [None]:
graph_result