In [None]:
import os
import json
import numpy as np
import polars as pl

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import GraphCypherQAChain
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

from openai import OpenAI

import pprint

In [None]:
# Connection settings come from the environment so that credentials are never
# stored in the notebook. Non-secret values fall back to the local development
# defaults this notebook previously hard-coded.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# Deliberately no default: export NEO4J_PASSWORD before starting the kernel.
# NOTE(review): the password that used to be committed here should be rotated.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "jobs")

In [None]:
# Graph handle reused by later cells for schema inspection and direct Cypher queries.
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD, database=NEO4J_DATABASE,
)

In [None]:
kg.refresh_schema()
print(kg.schema)

## Questions

In [None]:
# Benchmark questions that every retrieval strategy below is run against.
USER_QUESTIONS = ["Summarize all e-commerce jobs", "What is the most common job?"]

## Vector RAG (no Graphs)

In [None]:
db_indices = kg.query("SHOW INDEXES")

[index["name"] for index in db_indices]

In [None]:
# Attach to the existing "vector_jd_chunk" index, using `jd_chunk` as the text
# property and the text-embedding-3-small model for embeddings.
neo4j_vector_store = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name="vector_jd_chunk",
    text_node_property="jd_chunk",
)

In [None]:
# Vanilla RAG (no graph traversal involved).
# Search options must go through `search_kwargs`; a bare `k=10` keyword is not a
# retriever field, so it was dropped and the default k=4 stayed in effect.
retriever = neo4j_vector_store.as_retriever(search_kwargs={"k": 10})

In [None]:
retriever

In [None]:
ir_result = retriever.invoke(
    USER_QUESTIONS[0]
)

In [None]:
print(USER_QUESTIONS[0])
for doc in ir_result:
    print(doc.__dict__)

In [None]:
llm = ChatOpenAI(
        model="gpt-4o", # "gpt-3.5-turbo-0125",
        temperature=0,
    )

# template = (
#     "Use the given context to answer the question. "
#     "If you don't know the answer, say you don't know. "
#     "Use three sentence maximum and keep the answer concise. "
#     "Context: {context}"
# )
# prompt = ChatPromptTemplate.from_template(template)

In [None]:
# chain = prompt | llm

# chain = (
#             {"context": retriever}
#             | prompt
#             | llm
# )

In [None]:
# chain.invoke(USER_QUESTIONS[0])

In [None]:
def create_rag_chain(llm, retriever):
    """Assemble a retrieval chain that answers a question from retrieved documents.

    Args:
        llm: Model handed to ``create_stuff_documents_chain``.
        retriever: Retriever handed to ``create_retrieval_chain``.

    Returns:
        Whatever ``create_retrieval_chain`` returns. The prompt's system message
        carries the ``{context}`` placeholder and the human message is ``{input}``.
    """
    # Joining with single spaces yields the same text as the sentence-by-sentence prompt.
    instructions = " ".join(
        [
            "Use the given context to answer the question.",
            "If you don't know the answer, say you don't know.",
            "Use three sentence maximum and keep the answer concise.",
            "Context: {context}",
        ]
    )
    messages = [("system", instructions), ("human", "{input}")]
    answer_chain = create_stuff_documents_chain(
        llm, ChatPromptTemplate.from_messages(messages)
    )
    return create_retrieval_chain(retriever, answer_chain)

In [None]:
rag_chain = create_rag_chain(llm, retriever)

In [None]:
for question in USER_QUESTIONS:
    chain_result = rag_chain.invoke({"input": question})
    pprint.pp(chain_result)

## Vector RAG with JD chunk and extra text

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
# Retrieval query that prefixes every matched chunk with a fixed intro sentence
# and returns the chunk's job_id and jd_chunk_seq_id as metadata.
# This is a plain (non-raw, non-f) Python string, so the "\n" below is sent to
# Neo4j as a real line break inside the Cypher string literal.
retrieval_query_extra_text = """
WITH node, score, "Below is a part of a job description of 2023 jobs." as intro_text
RETURN intro_text + "\n" + node.jd_chunk as text,
    score,
    node {.job_id, .jd_chunk_seq_id} AS metadata
"""

In [None]:
# Same "vector_jd_chunk" index as before, with the intro-text retrieval query
# shaping each returned document.
neo4j_vector_store_extra_text = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name="vector_jd_chunk",
    text_node_property="jd_chunk",
    retrieval_query=retrieval_query_extra_text,
)

In [None]:
# `k` has to be passed via `search_kwargs`; a bare `k=10` keyword is ignored and
# the retriever would fall back to its default number of hits.
retriever_extra_text = neo4j_vector_store_extra_text.as_retriever(search_kwargs={"k": 10})

In [None]:
ir_result = retriever_extra_text.invoke(
    USER_QUESTIONS[0]
)

In [None]:
ir_result

In [None]:
print(USER_QUESTIONS[0])
for doc in ir_result:
    print(doc.__dict__)

In [None]:
rag_chain = create_rag_chain(llm, retriever_extra_text)

In [None]:
for question in USER_QUESTIONS:
    chain_result = rag_chain.invoke({"input": question})
    pprint.pp(chain_result)

## Vector RAG with window of JD and job titles

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
N_NEIGHBOR_CHUNKS = 5

In [None]:
# Expand every vector hit into its surrounding window of JD chunks.
# For each hit, candidate paths reach up to N_NEIGHBOR_CHUNKS NEXT hops on either
# side; they are ordered longest-first and only the first one per hit is kept.
# The previous global `LIMIT 100` kept many overlapping paths per hit, so the same
# chunk text was joined repeatedly and hits with shorter windows could be dropped.
# Doubled braces are literal Cypher braces (the string is an f-string).
retrieval_query_window = f"""
MATCH window=
    (:JD_Chunk)-[:NEXT*0..{N_NEIGHBOR_CHUNKS}]->(node)-[:NEXT*0..{N_NEIGHBOR_CHUNKS}]->(:JD_Chunk)
WITH node, score, window as longestWindow 
  ORDER BY length(window) DESC
WITH DISTINCT node, collect(score)[0] AS score, collect(longestWindow)[0] AS longestWindow
WITH nodes(longestWindow) as chunkList, node, score
  UNWIND chunkList as chunkRows
WITH collect(chunkRows.jd_chunk) as textList, node, score
RETURN apoc.text.join(textList, " \\n ") as text,
    score,
    node {{.job_id, .jd_chunk_seq_id}} AS metadata
"""


In [None]:
print(retrieval_query_window)

In [None]:
# "vector_jd_chunk" index again; the window retrieval query widens each hit to
# its neighbouring chunks.
neo4j_vector_store_window = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name="vector_jd_chunk",
    text_node_property="jd_chunk",
    retrieval_query=retrieval_query_window,
)

In [None]:
# `k` has to be passed via `search_kwargs`; a bare `k=10` keyword is ignored and
# the retriever would fall back to its default number of hits.
retriever_window = neo4j_vector_store_window.as_retriever(search_kwargs={"k": 10})

In [None]:
ir_result = retriever_window.invoke(
    USER_QUESTIONS[0]
)

ir_result

In [None]:
rag_chain = create_rag_chain(llm, retriever_window)

In [None]:
for question in USER_QUESTIONS:
    chain_result = rag_chain.invoke({"input": question})
    pprint.pp(chain_result)

In [None]:
N_NEIGHBOR_CHUNKS = 10

In [None]:
# Window retrieval that also attaches the job title.
# Steps, in query order:
#   1. match paths of up to N_NEIGHBOR_CHUNKS NEXT hops on each side of the hit chunk;
#   2. order them longest-first and keep the first collected window (and score) per hit;
#   3. collect the jd_chunk text of every chunk in that window;
#   4. follow PART_OF to the Job and keep one row per job (first collected values);
#   5. return "Job title: ..." + the joined chunk text, the score, and the hit's
#      job_id / jd_chunk_seq_id as metadata.
# Doubled braces are literal Cypher braces (the string is an f-string).
retrieval_query_window_title = f"""
MATCH window =
    (:JD_Chunk)-[:NEXT*0..{N_NEIGHBOR_CHUNKS}]->(node)-[:NEXT*0..{N_NEIGHBOR_CHUNKS}]->(:JD_Chunk)
WITH node, score, window as longestWindow 
  ORDER BY length(window) DESC
WITH DISTINCT node, collect(score)[0] AS score, collect(longestWindow)[0] AS longestWindow
WITH nodes(longestWindow) as chunkList, node, score
  UNWIND chunkList as chunkRows
WITH collect(chunkRows.jd_chunk) as textList, node, score
MATCH (node)-[:PART_OF]->(job:Job)
WITH DISTINCT job, collect(textList)[0] AS textList, collect(node)[0] AS node, collect(score)[0] AS score
RETURN "Job title: " + job.job_title + "\\n Job description: " + apoc.text.join(textList, " \\n ") as text,
    score,
    node {{.job_id, .jd_chunk_seq_id}} AS metadata
"""

In [None]:
print(retrieval_query_window_title)

In [None]:
# "vector_jd_chunk" index combined with the window-plus-title retrieval query.
neo4j_vector_store_window_title = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name="vector_jd_chunk",
    text_node_property="jd_chunk",
    retrieval_query=retrieval_query_window_title,
)

In [None]:
# `k` has to be passed via `search_kwargs`; a bare `k=10` keyword is ignored and
# the retriever would fall back to its default number of hits.
retriever_window_title = neo4j_vector_store_window_title.as_retriever(search_kwargs={"k": 10})

In [None]:
ir_result = retriever_window_title.invoke(
    USER_QUESTIONS[1]
)

ir_result

In [None]:
print(ir_result[0].page_content)

In [None]:
rag_chain = create_rag_chain(llm, retriever_window_title)

In [None]:
for question in USER_QUESTIONS:
    chain_result = rag_chain.invoke({"input": question})
    pprint.pp(chain_result)

In [None]:
chain_result = rag_chain.invoke({"input": "Summarize all Salesforce jobs"})
# pprint.pp(chain_result)
pprint.pp(chain_result["answer"])

## Vector RAG on Job titles and all available information

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
db_indices = kg.query("SHOW INDEXES")

[(index["name"], index["type"]) for index in db_indices]

In [None]:
# Retrieval query for hits on the job-title index: follows SECTION from the hit
# node to a JD_Chunk and builds one text blob from the title, that chunk, the
# skills list, vertical, posting date, client name and client type.
# The f-prefix has no placeholders here; it only makes the doubled braces necessary.
# NOTE(review): presumably any null property makes the whole concatenation null in
# Cypher — confirm every property is populated, or wrap the operands in coalesce().
retrieval_query_job_title = f"""
MATCH
  (node)-[:SECTION]->(jd_first_chunk:JD_Chunk)
WITH node, score, jd_first_chunk
RETURN
  "Job title: " + node.job_title
    + "\\n Job description: " + jd_first_chunk.jd_chunk
    + "\\n Job skills: " + apoc.text.join(node.skills, ", ")
    + "\\n Job vertical: " + node.job_vertical
    + "\\n Job posted at: " + node.job_posted_at
    + "\\n Job client: " + node.client_name
    + "\\n Job client type: " + node.client_type
    AS text,
  score,
  node {{.job_id, .client_id}} AS metadata
"""

In [None]:
print(retrieval_query_job_title)

In [None]:
# Switch to the "vector_job_title" index, with `job_title` as the text property
# and the job-title retrieval query shaping each document.
neo4j_vector_store_job_title = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name="vector_job_title",
    text_node_property="job_title",
    retrieval_query=retrieval_query_job_title,
)

In [None]:
# `k` has to be passed via `search_kwargs`; a bare `k=10` keyword is ignored and
# the retriever would fall back to its default number of hits.
retriever_job_title = neo4j_vector_store_job_title.as_retriever(search_kwargs={"k": 10})

In [None]:
ir_result = retriever_job_title.invoke(
    USER_QUESTIONS[0]
)

ir_result

In [None]:
print(ir_result[1].page_content)

In [None]:
rag_chain = create_rag_chain(llm, retriever_job_title)

In [None]:
for question in USER_QUESTIONS:
    chain_result = rag_chain.invoke({"input": question})
    pprint.pp(chain_result)

In [None]:
# Job-title retrieval that attaches a longer run of the job description instead
# of a single chunk. Steps, in query order:
#   1. follow SECTION from the hit node to a JD_Chunk;
#   2. match NEXT paths of 0..100 hops starting at that chunk;
#   3. order them longest-first and keep the first collected path (and score) per hit;
#   4. collect the jd_chunk text of every chunk on that path;
#   5. return title + joined description + skills, vertical, posting date, client
#      name and client type, with job_id / client_id as metadata.
# NOTE(review): presumably any null property makes the whole concatenation null in
# Cypher — confirm every property is populated, or wrap the operands in coalesce().
retrieval_query_job_title_jd_window = f"""
MATCH
  (node)-[:SECTION]->(jd_first_chunk:JD_Chunk)
WITH node, score, jd_first_chunk

MATCH jd_window =
    (jd_first_chunk)-[:NEXT*0..100]->(:JD_Chunk)

WITH node, score, jd_window AS longest_jd_window 
  ORDER BY length(jd_window) DESC

WITH DISTINCT node, collect(score)[0] AS score, collect(longest_jd_window)[0] AS longest_jd_window

WITH nodes(longest_jd_window) AS jd_chunk_list, node, score
  UNWIND jd_chunk_list AS jd_chunks

WITH collect(jd_chunks.jd_chunk) AS jd_chunks, node, score

RETURN
  "Job title: " + node.job_title
    + "\\n Job description: " + apoc.text.join(jd_chunks, " \\n ")
    + "\\n Job skills: " + apoc.text.join(node.skills, ", ")
    + "\\n Job vertical: " + node.job_vertical
    + "\\n Job posted at: " + node.job_posted_at
    + "\\n Job client: " + node.client_name
    + "\\n Job client type: " + node.client_type
    AS text,
  score,
  node {{.job_id, .client_id}} AS metadata
"""

In [None]:
print(retrieval_query_job_title_jd_window)

In [None]:
# "vector_job_title" index combined with the retrieval query that attaches the
# chained job-description chunks.
neo4j_vector_store_job_title_jd_window = Neo4jVector.from_existing_index(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name="vector_job_title",
    text_node_property="job_title",
    retrieval_query=retrieval_query_job_title_jd_window,
)

In [None]:
# `k` has to be passed via `search_kwargs`; a bare `k=10` keyword is ignored and
# the retriever would fall back to its default number of hits.
retriever_job_title_jd_window = neo4j_vector_store_job_title_jd_window.as_retriever(
    search_kwargs={"k": 10}
)

In [None]:
ir_result = retriever_job_title_jd_window.invoke(
    USER_QUESTIONS[0]
)

ir_result

In [None]:
print(ir_result[1].page_content)

In [None]:
rag_chain = create_rag_chain(llm, retriever_job_title_jd_window)

In [None]:
for question in USER_QUESTIONS:
    chain_result = rag_chain.invoke({"input": question})
    pprint.pp(chain_result)

In [None]:
chain_result = rag_chain.invoke({"input": "Summarize all Salesforce jobs"})
# pprint.pp(chain_result)
pprint.pp(chain_result["answer"])

## Full text search

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
db_indices = kg.query("SHOW INDEXES")

[(index["name"], index["type"]) for index in db_indices]

In [None]:
# Full-text lookup against the "fulltext_job_title" index, returning title and score.
# NOTE(review): the term is spelled "Scuence"; presumably a deliberate probe of how
# the index copes with misspellings — confirm.
# NOTE(review): the term is interpolated straight into the Cypher text; fine for a
# hard-coded literal, but use a query parameter if it ever comes from user input.
job_title_contains = "Data Scuence"

cypher = f"""
  CALL db.index.fulltext.queryNodes("fulltext_job_title", 
      "{job_title_contains}") YIELD node, score
  RETURN node.job_title, score
"""

print(cypher)

In [None]:
kg.query(cypher)

In [None]:
# Full-text lookup against the "fulltext_skills" index, returning skills and score.
# The query must interpolate `job_skills_contains`; it previously reused the
# job-title term from the cell above, so this variable was never applied.
job_skills_contains = "Data Scyence"

cypher = f"""
  CALL db.index.fulltext.queryNodes("fulltext_skills", 
      "{job_skills_contains}") YIELD node, score
  RETURN node.skills, score
"""

print(cypher)

In [None]:
kg.query(cypher)

## Generate Cypher query by LLM 

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
# Base prompt for the Cypher-generating LLM. `{schema}` and `{question}` are the
# two placeholders declared as input variables in `query_graph_db`.
# The text ends with an EXAMPLES header and no examples of its own: cells below
# either pass it as is or append few-shot examples to it first.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to 
query a graph database.

Instructions:
Use only the provided relationship types and properties in the 
schema. Do not use any other relationship types or properties that 
are not provided.

Schema:
{schema}

Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than 
for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

The question is:
{question}


EXAMPLES:
Here are a few examples of generated Cypher
statements for particular questions:
"""

In [None]:
# CYPHER_GENERATION_PROMPT = PromptTemplate(
#     input_variables=["schema", "question"], 
#     template=CYPHER_GENERATION_TEMPLATE
# )

In [None]:
# cypherChain = GraphCypherQAChain.from_llm(
#     llm,
#     graph=kg,
#     verbose=True,
#     cypher_prompt=CYPHER_GENERATION_PROMPT,
# )

In [None]:
# CYPHER_GENERATION_TEMPLATE += """
# # What jobs are related to Salesforce?
# CALL db.index.fulltext.queryNodes("fulltext_job_title", 
#       "Salesforce") YIELD node, score
#   RETURN node.job_title
# """

In [None]:
def query_graph_db(cypher_generation_template, llm, kg, question):
    """Answer ``question`` by letting an LLM write and run a Cypher query.

    Args:
        cypher_generation_template: Prompt text with ``{schema}`` and
            ``{question}`` placeholders, used for the Cypher-generation step.
        llm: Model passed to ``GraphCypherQAChain.from_llm``.
        kg: Graph object passed to the chain as ``graph``.
        question: Natural-language question to answer.

    Returns:
        The chain's single output value, i.e. what the former
        ``cypher_chain.run(question)`` call returned.
    """
    cypher_generation_prompt = PromptTemplate(
        input_variables=["schema", "question"],
        template=cypher_generation_template,
    )
    cypher_chain = GraphCypherQAChain.from_llm(
        llm,
        graph=kg,
        verbose=True,
        cypher_prompt=cypher_generation_prompt,
        # Set at construction instead of mutating the chain afterwards.
        top_k=100,
    )
    # `Chain.run` is deprecated; `invoke` takes a dict keyed by the chain's input
    # key and returns a dict from which the answer is read by its output key.
    outputs = cypher_chain.invoke({cypher_chain.input_key: question})
    return outputs[cypher_chain.output_key]

In [None]:
USER_QUESTIONS

In [None]:
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
)

In [None]:
result = query_graph_db(
    CYPHER_GENERATION_TEMPLATE,
    llm,
    kg,
    "What jobs contain E-commerce in title?",
)

In [None]:
result = query_graph_db(
    CYPHER_GENERATION_TEMPLATE,
    llm,
    kg,
    USER_QUESTIONS[0],
)

In [None]:
# result = query_graph_db(
#     CYPHER_GENERATION_TEMPLATE,
#     llm,
#     kg,
#     "What jobs are related to Salesforce?",
# )

In [None]:
result = query_graph_db(
    CYPHER_GENERATION_TEMPLATE,
    llm,
    kg,
    "What jobs are related to E-commerce?",
)

In [None]:
# CYPHER_GENERATION_TEMPLATE += """
# # What jobs are related to Salesforce?
# CALL db.index.fulltext.queryNodes("fulltext_job_title", 
#       "Salesforce") YIELD node, score
#   RETURN node.job_title
# """

In [None]:
template = CYPHER_GENERATION_TEMPLATE + """
# What jobs are related to Salesforce?
CALL db.index.fulltext.queryNodes("fulltext_job_title", 
      "Salesforce") YIELD node, score
  RETURN node.job_title
"""


context = query_graph_db(
    template,
    llm,
    kg,
    "What jobs are related to E-Commerce?",
)

In [None]:
context

In [None]:
# def call_rag(llm, question, context):
#     system_prompt = (
#         "Use the given context to answer the question. "
#         "If you don't know the answer, say you don't know. "
#         "Use three sentence maximum and keep the answer concise. "
#         "Context: {context}"
#     )
#     prompt = ChatPromptTemplate.from_messages(
#         [
#             ("system", system_prompt),
#             ("human", "{question}"),
#         ]
#     )
#     return llm.invoke(prompt.invoke({"question": question, "context": context}))

In [None]:
# for question in USER_QUESTIONS:
#     result = call_rag(llm, question, context)
#     pprint.pp(result)

In [None]:
print(CYPHER_GENERATION_TEMPLATE)

In [None]:
for question in USER_QUESTIONS:
    print(question)
    query_graph_db(
        CYPHER_GENERATION_TEMPLATE,
        llm,
        kg,
        question,
    )
    # pprint.pp(result)

In [None]:
template = CYPHER_GENERATION_TEMPLATE + """
# all e-commerce jobs
CALL db.index.fulltext.queryNodes("fulltext_job_title", 
      "e-commerce") YIELD node, score
  RETURN node.job_title
"""

for question in USER_QUESTIONS:
    print(question)
    result = query_graph_db(
        template,
        llm,
        kg,
        question,
    )
    print(result)
    print()

In [None]:
question = "Summarize all Salesforce jobs"

print(question)
result = query_graph_db(
    template,
    llm,
    kg,
    question,
)

pprint.pp(result)