Example using neo4j driver. This creates Movie nodes, Person nodes (actors+directors), Genre nodes, and relationships.

Use MERGE to avoid duplicates.

Batch commits if the dataset is big (several rows per transaction instead of one commit per row).

Add indexes for performance: CREATE INDEX movie_title IF NOT EXISTS FOR (m:Movie) ON (m.title)

In [4]:
pip install langchain_huggingface

Collecting langchain_huggingface
  Using cached langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Using cached langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain_huggingface
Successfully installed langchain_huggingface-0.3.1
Note: you may need to restart the kernel to use updated packages.


In [None]:

import os

import pandas as pd
from neo4j import GraphDatabase
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import GraphCypherQAChain, RetrievalQA, LLMChain
from langchain.prompts import FewShotPromptTemplate, PromptTemplate


In [6]:
# Remote location of the small movies dataset; it is downloaded into a DataFrame.
csv_url = (
    "https://raw.githubusercontent.com/tomasonjo/blog-datasets/"
    "main/movies/movies_small.csv"
)
df = pd.read_csv(filepath_or_buffer=csv_url)

def split_list(x):
    """Split a pipe-delimited string into a list of trimmed, non-empty items.

    Args:
        x: A string such as "Tom Hanks|Tim Allen", or a missing value
            (None / NaN, as detected by ``pd.isna``).

    Returns:
        list[str]: The stripped items in their original order. Missing values,
        empty strings, and blank segments (for example from a trailing "|")
        contribute no items.
    """
    if pd.isna(x):
        return []
    # Skip blank segments so "" or "a||b" never yields an empty-string entry,
    # which would otherwise become a node with an empty name downstream.
    return [part.strip() for part in x.split("|") if part.strip()]

## (:Person)-[:DIRECTED]->(:Movie)
## (:Person)-[:ACTED_IN]->(:Movie)
## (:Movie)-[:IN_GENRE]->(:Genre)

In [9]:
# Connection settings come from the environment so that credentials are never
# stored in the notebook. NEO4J_URI and NEO4J_USERNAME fall back to the values
# this notebook has always used; NEO4J_PASSWORD has no fallback on purpose and
# must be set before this cell runs (a missing value raises KeyError).
# NOTE(review): the password that was previously committed in this cell should
# be rotated, since it is part of the notebook's history.
URI = os.getenv("NEO4J_URI", "neo4j+s://896e87f9.databases.neo4j.io")
USER = os.getenv("NEO4J_USERNAME", "neo4j")
PASS = os.environ["NEO4J_PASSWORD"]
driver = GraphDatabase.driver(URI, auth=(USER, PASS))

def ingest_row(tx, row):
    """Upsert one movie together with its directors, actors, and genres.

    Every statement uses MERGE, so running this function again for the same
    row does not create duplicate nodes or relationships.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        row: Row of the movies DataFrame (a pandas Series). Must contain
            "movieId", "title", "released" and "imdbRating"; "director",
            "actors" and "genres" are optional pipe-delimited strings.
    """
    movie_id = int(row["movieId"])  # computed once; reused by every statement
    released = row["released"]
    rating = row["imdbRating"]

    # Upsert the movie node keyed by its unique movieId. Missing values are
    # sent as None (property left unset) rather than as "nan" / NaN.
    tx.run("""
    MERGE (m:Movie {movieId: $id})
    SET m.title = $title, m.released = $released, m.imdbRating = $rating
    """, id=movie_id, title=row["title"],
       released=None if pd.isna(released) else str(released),
       rating=None if pd.isna(rating) else float(rating))

    # Directors go through split_list like actors do: a missing (NaN) value
    # then yields no Person node at all, and a value holding several
    # "|"-separated names produces one Person per name.
    for name in split_list(row.get("director")):
        tx.run("""
        MERGE (d:Person {name: $name})
        WITH d
        MATCH (m:Movie {movieId: $id})
        MERGE (d)-[:DIRECTED]->(m)
        """, name=name, id=movie_id)

    # One Person node per actor, linked to the movie with ACTED_IN.
    for actor in split_list(row.get("actors")):
        tx.run("""
        MERGE (p:Person {name: $actor})
        WITH p
        MATCH (m:Movie {movieId: $id})
        MERGE (p)-[:ACTED_IN]->(m)
        """, actor=actor, id=movie_id)

    # One Genre node per genre, linked from the movie with IN_GENRE.
    for genre in split_list(row.get("genres")):
        tx.run("""
        MERGE (g:Genre {name: $genre})
        WITH g
        MATCH (m:Movie {movieId: $id})
        MERGE (m)-[:IN_GENRE]->(g)
        """, genre=genre, id=movie_id)

# Number of CSV rows written per transaction. Committing in batches avoids one
# network round-trip and commit per row.
BATCH_SIZE = 500


def _ingest_batch(tx, rows):
    """Ingest several DataFrame rows inside a single write transaction."""
    for row in rows:
        ingest_row(tx, row)


with driver.session() as sess:
    # Index the properties used as MERGE keys so each lookup does not have to
    # scan every node with that label. IF NOT EXISTS keeps the cell re-runnable.
    for statement in (
        "CREATE INDEX movie_id IF NOT EXISTS FOR (m:Movie) ON (m.movieId)",
        "CREATE INDEX person_name IF NOT EXISTS FOR (p:Person) ON (p.name)",
        "CREATE INDEX genre_name IF NOT EXISTS FOR (g:Genre) ON (g.name)",
    ):
        sess.run(statement).consume()

    rows = [row for _, row in df.iterrows()]
    for start in range(0, len(rows), BATCH_SIZE):
        # execute_write may retry the batch on transient errors; that is safe
        # here because ingest_row only issues MERGE/SET statements.
        sess.execute_write(_ingest_batch, rows[start:start + BATCH_SIZE])

# LangChain wrapper around the same database, used later by the Cypher QA chain.
graph = Neo4jGraph(url=URI, username=USER, password=PASS)


  graph = Neo4jGraph(url=URI, username=USER, password=PASS)


In [None]:
from langchain.schema import Document


def _field(value):
    """Return the cell value for display, or "N/A" when it is missing (NaN/None)."""
    return "N/A" if pd.isna(value) else value


# One Document per movie: page_content is the text that gets embedded,
# metadata carries the identifiers needed to trace a hit back to its movie.
docs = []
for _, r in df.iterrows():
    # Key access (r["title"]) instead of attribute access avoids any clash
    # between column names and Series attributes.
    text = (
        f"Title: {_field(r['title'])}\n"
        f"Released: {_field(r['released'])}\n"
        f"Director: {_field(r['director'])}\n"
        f"Actors: {_field(r['actors'])}\n"
        f"Genres: {_field(r['genres'])}\n"
        f"IMDB: {_field(r['imdbRating'])}"
    )
    docs.append(
        Document(
            page_content=text,
            # Plain int/str values keep the metadata independent of pandas/NumPy types.
            metadata={"movieId": int(r["movieId"]), "title": str(_field(r["title"]))},
        )
    )

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(docs, embeddings, collection_name="movies_kg_rag")

# Retrieve the three closest documents for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
# Few-shot (question, Cypher) pairs. Braces in the Cypher text are doubled
# because the assembled few-shot template is formatted once more after the
# examples have been rendered into it.
examples = [
    {"question": user_question, "query": cypher_text}
    for user_question, cypher_text in (
        (
            "Who acted in Casino?",
            "MATCH (m:Movie {{title: 'Casino'}})<-[:ACTED_IN]-(p:Person) RETURN p.name",
        ),
        (
            "Who directed Toy Story?",
            "MATCH (m:Movie {{title: 'Toy Story'}})<-[:DIRECTED]-(d:Person) RETURN d.name",
        ),
        (
            "How many movies has Tom Hanks acted in?",
            "MATCH (p:Person {{name: 'Tom Hanks'}})-[:ACTED_IN]->(m:Movie) RETURN COUNT(m)",
        ),
    )
]

# How a single example is rendered inside the final prompt.
example_prompt = PromptTemplate(
    template="User input: {question}\nCypher query: {query}",
    input_variables=["question", "query"],
)

# Instruction block placed before the examples; {schema} is filled in with the
# graph schema when the prompt is formatted.
prefix = (
    "You are a Cypher expert. "
    "Use only MATCH/WITH/RETURN/COUNT. "
    "Never generate CREATE/DELETE/SET. "
    "Schema:\n{schema}\n"
    "Generate ONLY the Cypher query.\n"
)

# Full prompt: instructions, then the rendered examples, then the new question.
cypher_prompt = FewShotPromptTemplate(
    prefix=prefix,
    examples=examples,
    example_prompt=example_prompt,
    suffix="User input: {question}\nCypher query:",
    input_variables=["schema", "question"],
)


In [12]:
import os

from dotenv import load_dotenv
from langchain_groq import ChatGroq

# Load environment variables (via python-dotenv) before reading the Groq key.
load_dotenv()
groq_api_key = os.environ.get("GROQ_API_KEY")

# Chat model shared by all three chains below.
llm = ChatGroq(model_name="Gemma2-9b-It", groq_api_key=groq_api_key)

# Graph chain: turns a question into Cypher using the few-shot prompt, runs it
# against the graph, and answers from the query result.
graph_chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    cypher_prompt=cypher_prompt,
    allow_dangerous_requests=True,
    verbose=True,
)

# RAG chain: answers from the documents returned by the vector-store retriever.
rag_chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm)

# Composer chain: merges the graph answer and the RAG answer into one reply.
compose_prompt = PromptTemplate(
    template="\n".join(
        [
            "",
            "Answer the question using both graph results and supporting documents.",
            "",
            "Question:",
            "{question}",
            "",
            "Graph results:",
            "{graph_results}",
            "",
            "Docs:",
            "{docs}",
            "",
            "Return:",
            "- answer (1-2 sentences)",
            "- sources: whether from graph, docs, or both",
            "",
        ]
    ),
    input_variables=["question", "graph_results", "docs"],
)
composer = LLMChain(prompt=compose_prompt, llm=llm)

  composer = LLMChain(llm=llm, prompt=compose_prompt)


In [None]:
def hybrid_answer(question):
    """Answer a question by combining the graph chain and the RAG chain.

    The question is sent to both chains; their two answers are then handed to
    the composer chain, which produces the final reply.

    Args:
        question: Natural-language question as a string.

    Returns:
        The composer chain's text output.
    """
    # Chain.run is deprecated in LangChain; invoke takes the inputs as a dict
    # and returns a dict, so the output key is read explicitly.
    graph_out = graph_chain.invoke({"query": question})["result"]
    docs_out = rag_chain.invoke({"query": question})["result"]

    composed = composer.invoke(
        {
            "question": question,
            "graph_results": graph_out,
            "docs": docs_out,
        }
    )
    return composed["text"]


In [14]:
# Run the hybrid pipeline on two sample questions and print each answer.
for sample_question in (
    "Who acted in the movie Casino?",
    "List all genres of Schindler's List",
):
    print(hybrid_answer(sample_question))

  graph_out = graph_chain.run(question)




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (m:Movie {title: 'Casino'})<-[:ACTED_IN]-(p:Person) RETURN p.name 
[0m
Full Context:
[32;1m[1;3m[{'p.name': 'Robert De Niro'}, {'p.name': 'Joe Pesci'}, {'p.name': 'Sharon Stone'}, {'p.name': 'James Woods'}][0m

[1m> Finished chain.[0m
- Robert De Niro, Joe Pesci, Sharon Stone, and James Woods acted in the movie Casino. 
- sources: both graph and docs 



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (m:Movie {title: 'Schindler\'s List'})-[r:IN_GENRE]->(g:Genre) RETURN g.name 
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m
Schindler's List is primarily considered a historical drama.  

- sources: Docs 

