In [1]:
!pip install langchain neo4j langchain_openai wikipedia
!pip install -U langchain-community

Collecting openai<2.0.0,>=1.26.0 (from langchain_openai)
  Downloading openai-1.35.3-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.35.3-py3-none-any.whl (327 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.4/327.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.35.3




In [2]:
from langchain.graphs import Neo4jGraph

url = "neo4j+s://ba730d0a.databases.neo4j.io"
username ="neo4j"
password = "2h1rYLEA0kvD6hwv7kTyWWTS3ymOQWBi2KzTopQsH5o"
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [3]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [4]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [5]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

#os.environ["OPENAI_API_KEY"] = "sk-"
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [6]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    
    data = extract_chain.invoke(document.page_content)['function']
    # Construct a graph document
    graph_document = GraphDocument(
      nodes = [map_to_base_node(node) for node in data.nodes],
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document
    )
    # Store information into a graph
    graph.add_graph_documents([graph_document])

In [7]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Read the wikipedia article
raw_documents = WikipediaLoader(query="Walt Disney").load()
# Define chunking strategy
text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

# Only take the first the raw_documents
documents = text_splitter.split_documents(raw_documents[:3])

In [8]:
from tqdm import tqdm

for i, d in tqdm(enumerate(documents), total=len(documents)):
    extract_and_store_graph(d)

  warn_deprecated(
100%|██████████| 3/3 [01:36<00:00, 32.19s/it]


In [9]:
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

In [10]:
cypher_chain.invoke({"query": "When was Walter Elias Disney born?"})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Person {name: "Walter Elias Disney"}) RETURN p.birthdate[0m
Full Context:
[32;1m[1;3m[{'p.birthdate': 'December 5, 1901'}][0m

[1m> Finished chain.[0m


{'query': 'When was Walter Elias Disney born?',
 'result': 'Walter Elias Disney was born on December 5, 1901.'}

In [12]:
graph.schema.visualization()

AttributeError: 'str' object has no attribute 'visualization'

In [65]:
!pip install llama-index llama_index.llms.fireworks llama-index-llms-groq groq llama-index-embeddings-huggingface ipywidgets

Collecting llama_index.llms.fireworks
  Downloading llama_index_llms_fireworks-0.1.5-py3-none-any.whl.metadata (619 bytes)




Downloading llama_index_llms_fireworks-0.1.5-py3-none-any.whl (4.4 kB)
Installing collected packages: llama_index.llms.fireworks
Successfully installed llama_index.llms.fireworks-0.1.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [58]:
!pip install neo4j langchain-experimental spacy llama-index-llms-langchain

Collecting llama-index-llms-langchain
  Downloading llama_index_llms_langchain-0.1.4-py3-none-any.whl.metadata (751 bytes)
Collecting langchain<0.2.0,>=0.1.3 (from llama-index-llms-langchain)
  Downloading langchain-0.1.20-py3-none-any.whl.metadata (13 kB)
Collecting llama-index-llms-anyscale<0.2.0,>=0.1.1 (from llama-index-llms-langchain)
  Downloading llama_index_llms_anyscale-0.1.4-py3-none-any.whl.metadata (647 bytes)
INFO: pip is looking at multiple versions of langchain to determine which version is compatible with other requirements. This could take a while.
Collecting langchain<0.2.0,>=0.1.3 (from llama-index-llms-langchain)
  Downloading langchain-0.1.19-py3-none-any.whl.metadata (13 kB)
  Downloading langchain-0.1.17-py3-none-any.whl.metadata (13 kB)
  Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)
  Downloading langchain-0.1.15-py3-none-any.whl.metadata (13 kB)
  Downloading langchain-0.1.14-py3-none-any.whl.metadata (13 kB)
  Downloading langchain-0.1.13-py3

Collecting langchain-experimental
  Using cached langchain_experimental-0.0.61-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain-community<0.1,>=0.0.38 (from langchain<0.2.0,>=0.1.3->llama-index-llms-langchain)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.52 (from langchain<0.2.0,>=0.1.3->llama-index-llms-langchain)
  Downloading langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain<0.2.0,>=0.1.3->llama-index-llms-langchain)
  Downloading langchain_text_splitters-0.0.2-py3-none-any.whl.metadata (2.2 kB)
INFO: pip is looking at multiple versions of langchain-experimental to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-experimental
  Downloading langchain_experimental-0.0.60-py3-none-any.whl.metadata (2.1 kB)
  Downloading langchain_experimental-0.0.59-py3-none-any.whl.metadata (2.1 kB

Downloading llama_index_llms_langchain-0.1.4-py3-none-any.whl (4.8 kB)
Downloading langchain-0.1.20-py3-none-any.whl (1.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m0m
[?25hDownloading langchain_community-0.0.38-py3-none-any.whl (2.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hDownloading langchain_experimental-0.0.58-py3-none-any.whl (199 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.1.52-py3-none-any.whl (302 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading llama_index_llms_anyscale-0.1.4-py3-none-any.whl 

In [59]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [71]:
from IPython.display import display
import ipywidgets as widgets
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, ServiceContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.groq import Groq
import warnings
import os
from neo4j import GraphDatabase
#import openai
import spacy
import chromadb
from chromadb.config import Settings
from chromadb import Client, Settings
from chromadb.utils import embedding_functions

warnings.filterwarnings('ignore')

# ---- NEO4J SETUP ----
neo4j_uri = "neo4j+s://ba730d0a.databases.neo4j.io"
neo4j_user = "neo4j"
neo4j_password = "2h1rYLEA0kvD6hwv7kTyWWTS3ymOQWBi2KzTopQsH5o"
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

In [72]:
# ---- PROMPT TEMPLATE ----
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Graph Insights: {graph_insights}
Question: {question}

Answer the question and provide additional helpful information,
based on the pieces of information and graph insights, if applicable. Be succinct.

Responses should be properly formatted to be easily read.
"""

# Define the context for your prompt
context = "This directory contains multiple documents providing examples and solutions for various programming tasks."

# Data ingestion: load all files from a directory
directory_path = os.environ.get("CHATBOT_PATH") + '/ChatbotScraper/test_txts'
reader = SimpleDirectoryReader(input_dir=directory_path)
documents = reader.load_data()

# Load spacy model (you can choose a different model)
nlp = spacy.load("en_core_web_sm")

In [73]:
def populate_graph(documents, driver, nlp):
    with driver.session() as session:
        for doc in documents:
            doc_text = doc.text  # Assuming each document has a 'text' attribute
            #print(doc_text)
            
            nlp_doc = nlp(doc_text)
            concepts = [ent.text for ent in nlp_doc.ents if ent.label_ == "ORG" or ent.label_ == "PRODUCT"] # Adjust entity types as needed

            for concept in concepts:
                session.run("MERGE (:Concept {name: $concept})", concept=concept)

            for i, concept in enumerate(concepts):
                if i + 1 < len(concepts):
                    next_concept = concepts[i + 1]
                    session.run(
                        """
                        MATCH (c1:Concept {name: $concept}), (c2:Concept {name: $next_concept})
                        MERGE (c1)-[:RELATED_TO]->(c2)
                        """,
                        concept=concept, next_concept=next_concept
                    )
            
# Populate the Neo4j graph
populate_graph(documents, driver, nlp)

In [74]:
# Split the documents into nodes
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
nodes = text_splitter.get_nodes_from_documents(documents, show_progress=True)

# Set up embedding model and LLM
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
#llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY)
#embed_model = embedding_functions.OpenAIEmbeddingFunction( # Using openai 
#                api_key=os.environ.get('OPENAI_API_KEY'),
#                model_name="text-embedding-ada-002"
#            )
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Create service context
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Create vector store index
vector_index = VectorStoreIndex.from_documents(documents, show_progress=True, service_context=service_context, node_parser=nodes)
vector_index.storage_context.persist(persist_dir="./storage_mini")

# Load the index from storage
storage_context = StorageContext.from_defaults(persist_dir="./storage_mini")
index = load_index_from_storage(storage_context, service_context=service_context)

driver.close()

Parsing nodes:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/8 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/9 [00:00<?, ?it/s]

In [75]:
# Create the interactive widgets
input_box = widgets.Text(
    value='Explain Python?',
    placeholder='Type your question here',
    description='Question:',
    disabled=False
)

output_area = widgets.Output()

In [78]:
#Query Enhancement with Neo4j

def get_graph_insights(question):
    with driver.session() as session:
        result = session.run(
            """
                MATCH (c:Concept)
                WHERE toLower(c.name) CONTAINS toLower($question)
                OPTIONAL MATCH (c)-[r:RELATED_TO]->(other:Concept)
                RETURN c.name AS concept, collect(other.name) AS related_concepts
            """,
            question=question
            )
        insights = []
        for record in result:
            insights.append(f"Concept: {record['concept']}, Related Concepts: {', '.join(record['related_concepts'])}")
            return "\n".join(insights) if insights else "No relevant graph insights found."

In [81]:
def on_button_click(b):
    with output_area:
        output_area.clear_output()
        question = input_box.value
        graph_insights = get_graph_insights(question)
        print(graph_insights)
        query_prompt = prompt_template.format(context=context, graph_insights=graph_insights, question=question)
        resp = query_engine.query(query_prompt)
        print(resp.response)


button = widgets.Button(
    description='Ask',
    disabled=False,
    button_style='',
    tooltip='Ask the question',
    icon='check'
)

button.on_click(on_button_click)

display(input_box, button, output_area)

#Query Engine Setup
query_engine = index.as_query_engine(service_context=service_context)

Text(value='coronavirus', description='Question:', placeholder='Type your question here')

Button(description='Ask', icon='check', style=ButtonStyle(), tooltip='Ask the question')

Output(outputs=({'output_type': 'stream', 'text': "I'm sorry, I don't have information related to the coronavi…