In [1]:
from dotenv import load_dotenv
import os

# Load Neo4j credentials from a local .env file (the OpenAI API key is expected
# to come from the same file). override=True lets values in .env replace
# variables that are already set in the process environment.
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

# os.getenv() returns None for unset variables; fail here with a clear message
# instead of letting a None value reach the Neo4j driver further down.
_missing_neo4j_vars = [
    var_name
    for var_name, var_value in (
        ('NEO4J_URI', NEO4J_URI),
        ('NEO4J_USERNAME', NEO4J_USERNAME),
        ('NEO4J_PASSWORD', NEO4J_PASSWORD),
    )
    if not var_value
]
if _missing_neo4j_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_neo4j_vars)
    )

# Uncomment this line if you aren't using a .env file
# os.environ['OPENAI_API_KEY'] = 'copy_paste_the_openai_key_here'

In [2]:
import neo4j
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# Open the connection to the Neo4j instance using the credentials loaded above.
driver = neo4j.GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

# Parameters for the LLM that is later handed to the KG builder pipeline.
extraction_model_params = {
    # use json_object formatting for best results
    "response_format": {"type": "json_object"},
    # turning temperature down for more deterministic results
    "temperature": 0,
}
ex_llm = OpenAILLM(model_name="gpt-4o-mini", model_params=extraction_model_params)

# Text embedder, constructed with the library's default settings.
embedder = OpenAIEmbeddings()

In [3]:
# Generic node labels, kept broad so entities from different sources can connect.
basic_node_labels = [
    "Object",
    "Entity",
    "Group",
    "Person",
    "Organization",
    "Place",
]

# Node labels specific to sentiment analysis of movie discussions.
sentiment_node_labels = [
    "Review",
    "Comment",
    "Sentiment",
    "User",
    "Rating",
    "Emotion",
    "Topic",
    "Movie",
    "Genre",
    "Tag",
]

# Full label set: generic labels first, then the sentiment-specific ones.
node_labels = [*basic_node_labels, *sentiment_node_labels]

# Relationship types allowed in the sentiment analysis knowledge graph.
rel_types = [
    "REVIEWS",
    "COMMENTS_ON",
    "EXPRESSES",
    "RATED",
    "BELONGS_TO",
    "MENTIONS",
    "HAS_EMOTION",
    "DISCUSSES",
    "IS_WRITTEN_BY",
]

# Echo both lists so the schema can be checked at a glance.
for heading, values in (("Node Labels:", node_labels), ("Relationship Types:", rel_types)):
    print(heading, values)

Node Labels: ['Object', 'Entity', 'Group', 'Person', 'Organization', 'Place', 'Review', 'Comment', 'Sentiment', 'User', 'Rating', 'Emotion', 'Topic', 'Movie', 'Genre', 'Tag']
Relationship Types: ['REVIEWS', 'COMMENTS_ON', 'EXPRESSES', 'RATED', 'BELONGS_TO', 'MENTIONS', 'HAS_EMOTION', 'DISCUSSES', 'IS_WRITTEN_BY']


In [4]:
# Entity/relationship extraction prompt handed to the KG builder.
# The {schema}, {examples} and {text} placeholders are left for the consumer of
# the template to fill in; the literal JSON braces are doubled ({{ }}) so that
# str.format-style substitution leaves them intact.
prompt_template = '''
You are an agent in a talent agency tasked with extracting information from reddit posts
and structuring it in a property graph to help your clients understand what people think about movies.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text.  Do not add any additional information.
- If the input text is empty, return empty Json.
- Make sure to create as many nodes and relationships as needed to offer rich context about movies and audience sentiment for further research.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions.
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general.

Use only the following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [15]:
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

# Splitter configured with chunk_size=500 and chunk_overlap=100.
chunk_splitter = FixedSizeSplitter(chunk_size=500, chunk_overlap=100)

# Knowledge-graph builder wired to the extraction LLM, the Neo4j driver, the
# embedder and the schema (node labels / relationship types) defined earlier.
# from_pdf=False: later cells call it with already-read text (run_async(text=...)).
kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    embedder=embedder,
    text_splitter=chunk_splitter,
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=False,
)

In [18]:
import os
file_name = []
for files in os.listdir():
    if files.endswith(".txt"):
        file_name.append(files)
   
for path in file_name:
    print(f"Processing : {path}")
    with open(path, 'r', encoding='utf-8') as file:
        file_contents = file.read()     
    pdf_result = await kg_builder_pdf.run_async(text=file_contents)
    print(f"Result: {pdf_result}")

Processing : jurassic_park_reddit_discussion.txt




Result: run_id='b358d3be-7389-4259-8b81-ad0a2ac15723' result={'resolver': {'number_of_nodes_to_resolve': 11280, 'number_of_created_nodes': 6089}}


In [20]:
from neo4j_graphrag.indexes import create_vector_index

# Vector index over the `embedding` property of `Chunk` nodes; the retrievers
# below look it up by the name "text_embeddings".
create_vector_index(
    driver,
    name="text_embeddings",
    label="Chunk",
    embedding_property="embedding",
    dimensions=1536,  # NOTE(review): must match the embedder's output size - confirm
    similarity_fn="cosine",
)

In [21]:
from neo4j_graphrag.retrievers import VectorRetriever

# Plain vector-similarity retriever over the "text_embeddings" index; only the
# `text` property of each matched node is returned.
vector_retriever = VectorRetriever(
    driver=driver,
    index_name="text_embeddings",
    embedder=embedder,
    return_properties=["text"],
)

In [23]:
import json

# Fetch the three best matches for the sample question.
vector_res = vector_retriever.get_search_results(
    query_text="What do people think of Jurassic Park?",
    top_k=3,
)

# Print each hit as indented JSON, preceded by a separator line.
for record in vector_res.records:
    print("====\n" + json.dumps(record.data(), indent=4))

====
{
    "node": {
        "text": "me people in it. \n- I don't know, I guess it depends on his mindset for the film. If \"*Aliens* with Dinos\" was his mindset, I could see how this would *not* be a good fit for him. What I loved about Spielberg's *JP* was the wonder and amazement you felt in seeing the dinosaurs for the first time. I'll be goddamned if I still don't get a little teary eyed when they first see the Brontosaurus(?) on the island and John William's score ramps up and I just feel like a little kid again, amazed at the "
    },
    "score": 0.9253692626953125
}
====
{
    "node": {
        "text": "- God I love Jurassic Park. Most of the reviews at the time were lukewarm and a lot of the people I talk to about it seem to think it's not much more than a great popcorn movie but I think it's honestly one of the best and most important movies in cinema.\n\nFrom the technological revolution that it helped to kick-start in the industry to the brilliant way it builds suspense 

In [24]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

# Retriever that combines the "text_embeddings" vector index with a Cypher
# traversal supplied as `retrieval_query`.
#
# What the query below does, step by step:
#   1) aliases the incoming `node` as `chunk` (presumably `node` is the vector
#      search hit bound by the retriever - confirm against the library docs),
#      finds nodes linked to it through FROM_CHUNK, and follows 1 to 2
#      relationships of any type other than FROM_CHUNK from those nodes;
#   2) aggregates the distinct chunks and distinct relationships;
#   3) returns a single string column `info`: the chunk texts under a
#      '=== text ===' header, then the relationships under '=== kg_rels ==='.
#
# The query calls apoc.text.join, so the APOC procedures must be available in
# the database.
vc_retriever = VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
//1) Go out 2-3 hops in the entity graph and get relationships
WITH node AS chunk
MATCH (chunk)<-[:FROM_CHUNK]-()-[relList:!FROM_CHUNK]-{1,2}()
UNWIND relList AS rel

//2) collect relationships and text chunks
WITH collect(DISTINCT chunk) AS chunks, 
  collect(DISTINCT rel) AS rels

//3) format and return context
RETURN '=== text ===\n' + apoc.text.join([c in chunks | c.text], '\n---\n') + '\n\n=== kg_rels ===\n' +
  apoc.text.join([r in rels | startNode(r).name + ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + endNode(r).name ], '\n---\n') AS info
"""
)

In [26]:
vc_res = vc_retriever.get_search_results(query_text = "What do people think of Jurassic Park?", top_k=3)

# The retrieval query returns one `info` string: the text chunks first, then
# the KG relationships, separated by this marker.
KG_RELS_MARKER = '\n\n=== kg_rels ===\n'

# Guard against an empty result set (records[0] would raise IndexError) and
# against a null `info` value.
info = (vc_res.records[0]['info'] or '') if vc_res.records else ''

kg_rel_pos = info.find(KG_RELS_MARKER)
if kg_rel_pos == -1:
    # str.find() returns -1 when the marker is absent; slicing with -1 would
    # cut the last character off the text and print it as "KG context".
    # Treat the whole string as text context instead.
    kg_rel_pos = len(info)

# print output
print("# Text Chunk Context:")
print(info[:kg_rel_pos])
print("# KG Context From Relationships:")
print(info[kg_rel_pos:])

# Text Chunk Context:
=== text ===
me people in it. 
- I don't know, I guess it depends on his mindset for the film. If "*Aliens* with Dinos" was his mindset, I could see how this would *not* be a good fit for him. What I loved about Spielberg's *JP* was the wonder and amazement you felt in seeing the dinosaurs for the first time. I'll be goddamned if I still don't get a little teary eyed when they first see the Brontosaurus(?) on the island and John William's score ramps up and I just feel like a little kid again, amazed at the 
---
- God I love Jurassic Park. Most of the reviews at the time were lukewarm and a lot of the people I talk to about it seem to think it's not much more than a great popcorn movie but I think it's honestly one of the best and most important movies in cinema.

From the technological revolution that it helped to kick-start in the industry to the brilliant way it builds suspense to the fantastic acting. God, the music. Even the script is great at doing a lot wit

In [27]:
from neo4j_graphrag.llm import OpenAILLM as LLM
from neo4j_graphrag.generation import RagTemplate
from neo4j_graphrag.generation.graphrag import GraphRAG

# Answer-generation model, with temperature pinned to 0.0.
llm = LLM(model_name="gpt-4o", model_params={"temperature": 0.0})

# Prompt text shared by both RAG pipelines; it exposes the two placeholders
# listed in `expected_inputs` below.
rag_prompt_text = '''Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned. 

# Question:
{query_text}
 
# Context:
{context}

# Answer:
'''
rag_template = RagTemplate(
    template=rag_prompt_text,
    expected_inputs=['query_text', 'context'],
)

# Same LLM and prompt, different retrievers: plain vector search versus vector
# search plus the Cypher graph traversal.
v_rag = GraphRAG(llm=llm, retriever=vector_retriever, prompt_template=rag_template)
vc_rag = GraphRAG(llm=llm, retriever=vc_retriever, prompt_template=rag_template)

In [28]:
# Ask both pipelines the same question and print the answers one after the
# other; each search runs just before its answer is printed.
q = "What do people think of the movie Deadpool? provide in list format."

vector_answer = v_rag.search(q, retriever_config={'top_k': 5}).answer
print(f"Vector Response: \n{vector_answer}")

print("\n===========================\n")

graph_answer = vc_rag.search(q, retriever_config={'top_k': 5}).answer
print(f"Vector + Cypher Response: \n{graph_answer}")

Vector Response: 
- Many people thought the human story in Deadpool was weak.
- Deadpool had a lower budget of ~$58 million due to expectations of lower returns for R-rated movies.
- Deadpool is from Fox, not Marvel/Disney.
- The movie had to fit with the adult-oriented source material of the comics.
- Deadpool is compared to characters like Blade, Punisher, and Wolverine, who are similarly violent.
- R-rated adaptations of Deadpool are considered better than PG-13-rated ones.
- Deadpool needed to be rated R, but not for the same reasons as other R-rated movies like Kick-Ass and Kingsman.




LLMGenerationError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}