In [60]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [61]:
import os
from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username="neo4j",
    password="neo4j"
)

In [62]:
from langchain_community.chat_models import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_experimental.llms.ollama_functions import OllamaFunctions

# llm = ChatOllama(model="mistral", temperature=0)
llm = ChatOpenAI(model="gpt-4", temperature=0)
model= OllamaFunctions(model="mistral", verbose=True)

In [63]:
# loading the documents
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader('./input/', glob="**/*.txt", use_multithreading=True, show_progress=True)
docs = loader.load()
len(docs)

100%|██████████| 3/3 [00:00<00:00, 43.71it/s]


3

In [64]:
# splitting the documents
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(
  chunk_size=1000,
  chunk_overlap=200,
  length_function=len,
  is_separator_regex=False
)
split_docs = splitter.split_documents(docs)
len(split_docs)

12

In [65]:
# setup pydantic classes
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship
)
from langchain.pydantic_v1 import Field, BaseModel
from typing import List, Optional

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")


class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")


class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(..., description="List of relationships in the knowledge graph")

In [66]:
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
  sys_prompt = SystemMessage(content=f"""# Knowledge Graph Instructions for GPT-4
  ## 1. Overview
  You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
  - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
  - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
  ## 2. Labeling Nodes
  - **Consistency**: Ensure you use basic or elementary types for node labels.
    - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
  {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
  {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
  ## 3. Handling Numerical Data and Dates
  - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
  - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
  - **Property Format**: Properties must be in a key-value format.
  - **Quotation Marks**: Never use escaped single or double quotes within property values.
  - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
  ## 4. Coreference Resolution
  - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
  If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
  always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
  Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 
  ## 5. Strict Compliance
  Adhere to the rules strictly. Non-compliance will result in termination.""")

  from langchain.output_parsers import PydanticOutputParser
  parser = PydanticOutputParser(pydantic_object=KnowledgeGraph)

  prompt = ChatPromptTemplate.from_messages([
    sys_prompt,
    SystemMessagePromptTemplate.from_template(
        "{format_instructions}"),
    HumanMessagePromptTemplate.from_template("Extract information from the following input and ONLY respond in the given format: {input}"),
    HumanMessage(content="Tip: Make sure to answer in the correct format"),
  ]).partial(format_instructions=parser.get_format_instructions())
  
  # use LCEL to pass the prompt to the model and force a structured response
  extraction_chain = prompt | llm | parser

  return extraction_chain

In [67]:
from langchain.graphs.graph_document import GraphDocument
from langchain_core.documents import Document


def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
      return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

def extract_and_store_graph(
        document: Document,
        nodes: Optional[List[str]] = None,
        rels: Optional[List[str]] = None) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke({"input": document.page_content})
    print(data)
    # for chunk in extract_chain.stream({"input": document.page_content}):
    #     print(chunk.content, end="", flush=True)
    # Construct a graph document
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in data.nodes],
        relationships=[map_to_base_relationship(rel) for rel in data.rels],
        source=document
    )
    # Store information into a graph
    graph.add_graph_documents([graph_document])

In [68]:
# clear the graph
graph.query("MATCH (n) DETACH DELETE n")

[]

In [69]:
from tqdm import tqdm

for i, d in tqdm(enumerate(split_docs), total=len(split_docs)):
    extract_and_store_graph(d)

  8%|▊         | 1/12 [00:19<03:31, 19.25s/it]

nodes=[Node(id='Elon Musk', type='person', properties=[Property(key='name', value='Elon Musk')]), Node(id='Windows Laptop', type='product', properties=[Property(key='name', value='Windows Laptop')]), Node(id='Microsoft', type='company', properties=[Property(key='name', value='Microsoft')]), Node(id='X', type='socialMediaPlatform', properties=[Property(key='name', value='X')])] rels=[Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='Windows Laptop', type='product'), type='bought', properties=[]), Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='Microsoft', type='company'), type='tweetedAt', properties=[]), Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='X', type='socialMediaPlatform'), type='activeOn', properties=[])]
nodes=[Node(id='Musk', type='person', properties=[Property(key='name', value='Musk')]), Node(id='Microsoft', type='organization', properties=[Property(key='name', value='Microsoft')]), Node(id='Microso

 17%|█▋        | 2/12 [00:42<03:36, 21.62s/it]

nodes=[Node(id='Community Notes', type='software', properties=[Property(key='status', value='failing')]), Node(id='Elon Musk', type='person', properties=[Property(key='profession', value='tech mogul')]), Node(id='Microsoft', type='company', properties=[Property(key='relationWithOpenAI', value='investor')]), Node(id='OpenAI', type='company', properties=[Property(key='origin', value='open source'), Property(key='currentStatus', value='closed source, maximum-profit company'), Property(key='control', value='Microsoft')]), Node(id='Google', type='company', properties=[])] rels=[Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='Community Notes', type='software'), type='commented', properties=[Property(key='comment', value='This option no longer exists')]), Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='Microsoft', type='company'), type='accused', properties=[Property(key='accusation', value='controlling OpenAI')]), Relationship(source=Node(id=

 33%|███▎      | 4/12 [01:14<02:14, 16.86s/it]

nodes=[Node(id='Elon Musk', type='person', properties=[]), Node(id='OpenAI', type='company', properties=[])] rels=[Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='OpenAI', type='company'), type='founder', properties=[Property(key='endDate', value='2018')])]


 42%|████▏     | 5/12 [01:34<02:05, 17.91s/it]

nodes=[Node(id='Elon Musk', type='person', properties=[Property(key='role', value='CEO of Tesla and xAI')]), Node(id='Google', type='organization', properties=[Property(key='location', value='Mountain View, California')]), Node(id='Gemini AI chatbot', type='product', properties=[Property(key='feature', value='text-to-image generation'), Property(key='owner', value='Google')]), Node(id='X', type='platform', properties=[Property(key='previousName', value='Twitter')])] rels=[Relationship(source=Node(id='Elon Musk', type='person'), target=Node(id='Google', type='organization'), type='criticized', properties=[Property(key='reason', value='overplaying hand with AI image generation of Gemini'), Property(key='platform', value='X')]), Relationship(source=Node(id='Gemini AI chatbot', type='product'), target=Node(id='Google', type='organization'), type='ownedBy')]


 50%|█████     | 6/12 [02:04<02:11, 21.99s/it]

nodes=[Node(id='Musk', type='person', properties=[Property(key='quote', value='I’m glad that Google overplayed their hand with their AI image generation, as it made their insane racist, anti-civilizational programming clear to all.')]), Node(id='Google', type='organization', properties=[Property(key='accusation', value='overplayed their hand with their AI image generation, insane racist, anti-civilizational programming')]), Node(id='Vivek Ramaswamy', type='person', properties=[Property(key='quote', value='The globally embarrassing rollout of Google’s LLM has proves that James Damore was 100% correct about Google’s descent into an ideological echo chamber. Employees working on Gemini surely realized it was a mistake to make it so blatantly racist, but they likely kept their mouths shut because they didn’t want to get fired like Damore. These companies program their employees with broken incentives, and those employees then program the AI with the same biases.'), Property(key='position',

 58%|█████▊    | 7/12 [02:18<01:37, 19.48s/it]

nodes=[Node(id='Google', type='organization', properties=[Property(key='location', value='Mountain View, California')]), Node(id='Gemini', type='AI', properties=[Property(key='creator', value='Google'), Property(key='feature', value='image-generation'), Property(key='status', value='paused'), Property(key='issue', value='inaccuracies in some historical depictions')])] rels=[Relationship(source=Node(id='Google', type='organization'), target=Node(id='Gemini', type='AI'), type='created', properties=[]), Relationship(source=Node(id='Google', type='organization'), target=Node(id='Gemini', type='AI'), type='paused', properties=[Property(key='reason', value="address recent issues with Gemini's image generation feature")])]


 67%|██████▋   | 8/12 [02:38<01:18, 19.56s/it]

nodes=[Node(id='Google', type='organization', properties=[]), Node(id='Gemini', type='ai', properties=[Property(key='ownedBy', value='Google')]), Node(id='Elon Musk', type='person', properties=[Property(key='activity', value='tweeting memes')]), Node(id='Adolf Hitler', type='person', properties=[Property(key='activity', value='ordering the deaths of millions of people')]), Node(id='Nate Silver', type='person', properties=[Property(key='profession', value='Psephologist')])] rels=[Relationship(source=Node(id='Nate Silver', type='person'), target=Node(id='Google', type='organization'), type='criticizes', properties=[]), Relationship(source=Node(id='Gemini', type='ai'), target=Node(id='Elon Musk', type='person'), type='compares', properties=[]), Relationship(source=Node(id='Gemini', type='ai'), target=Node(id='Adolf Hitler', type='person'), type='compares', properties=[])]


 75%|███████▌  | 9/12 [03:11<01:11, 23.97s/it]

nodes=[Node(id='Google DeepMind', type='organization', properties=[Property(key='location', value='New York, US'), Property(key='date', value='Dec. 8, 2023')]), Node(id="Alphabet's Google", type='organization', properties=[Property(key='aiModel', value='Gemini'), Property(key='previousAiModel', value='PaLM 2'), Property(key='aiModelReleaseDate', value='May')]), Node(id='Gabby Jones', type='person', properties=[Property(key='profession', value='Photographer'), Property(key='affiliation', value='Bloomberg')]), Node(id='Silver', type='person', properties=[Property(key='previousPosition', value='head of data and polling news site FiveThirtyEight')]), Node(id='Elon', type='person', properties=[]), Node(id='Hitler', type='person', properties=[]), Node(id='Hindustan Times', type='organization', properties=[Property(key='description', value='your fastest source for breaking news')])] rels=[Relationship(source=Node(id="Alphabet's Google", type='organization'), target=Node(id='Google DeepMind', 

 83%|████████▎ | 10/12 [03:35<00:47, 23.80s/it]

nodes=[Node(id='Hindustan Times', type='organization', properties=[Property(key='description', value='fastest source for breaking news')]), Node(id='Elon', type='person', properties=[Property(key='description', value='known for tweeting memes')]), Node(id='Hitler', type='person', properties=[Property(key='description', value='actions led to the deaths of millions of people')]), Node(id='Silver', type='person', properties=[Property(key='description', value='able to replicate this')]), Node(id='Gemini', type='product', properties=[Property(key='description', value='several months away from being ready for prime time'), Property(key='status', value='needs to be shut down'), Property(key='producer', value='Google')]), Node(id='Musk', type='person', properties=[Property(key='description', value='commented on the post')])] rels=[Relationship(source=Node(id='Silver', type='person'), target=Node(id='Gemini', type='product'), type='criticizes', properties=[]), Relationship(source=Node(id='Musk'

 92%|█████████▏| 11/12 [04:09<00:27, 27.07s/it]

nodes=[Node(id='Musk', type='person', properties=[Property(key='comment', value='It’s scary!')]), Node(id='Gemini', type='product', properties=[]), Node(id='Google', type='organization', properties=[Property(key='marketCap', value='$1.8 trillion')]), Node(id='Wuhan Institute of Virology', type='organization', properties=[Property(key='productRelease', value='SARS-CoV-2 in 2019')]), Node(id='SARS-CoV-2', type='product', properties=[]), Node(id='User1', type='person', properties=[Property(key='comment', value="It would be almost impossible to make a product less useful and more destructive than Google Gemini. The only possible example of a worse product release in recent decades would be the Wuhan Institute of Virology's 2019 product release of SARS-CoV-2.")]), Node(id='User2', type='person', properties=[Property(key='comment', value='Google may work hard to lead in AI, but with this they have ensured that a large segment of the population will never trust or use their product')]), Node(

 92%|█████████▏| 11/12 [04:24<00:24, 24.03s/it]


OutputParserException: Failed to parse KnowledgeGraph from completion {'nodes': [{'id': 'Gemini', 'type': 'concept', 'properties': []}, {'id': 'User1', 'type': 'person', 'properties': []}, {'id': 'User2', 'type': 'person', 'properties': []}, {'id': 'User3', 'type': 'person', 'properties': []}, {'id': 'SpeechIsViolence', 'type': 'concept', 'properties': []}, {'id': 'IndefensiblePosition', 'type': 'concept', 'properties': []}], 'rels': [{'source': 'User1', 'target': 'Gemini', 'type': 'criticizes', 'properties': []}, {'source': 'User2', 'target': 'SpeechIsViolence', 'type': 'believes', 'properties': []}, {'source': 'User2', 'target': 'IndefensiblePosition', 'type': 'holds', 'properties': []}, {'source': 'User3', 'target': 'Gemini', 'type': 'criticizes', 'properties': []}]}. Got: 8 validation errors for KnowledgeGraph
rels -> 0 -> source
  value is not a valid dict (type=type_error.dict)
rels -> 0 -> target
  value is not a valid dict (type=type_error.dict)
rels -> 1 -> source
  value is not a valid dict (type=type_error.dict)
rels -> 1 -> target
  value is not a valid dict (type=type_error.dict)
rels -> 2 -> source
  value is not a valid dict (type=type_error.dict)
rels -> 2 -> target
  value is not a valid dict (type=type_error.dict)
rels -> 3 -> source
  value is not a valid dict (type=type_error.dict)
rels -> 3 -> target
  value is not a valid dict (type=type_error.dict)