In [None]:
# !pip install langchain langchain-community neo4j openai tiktoken

In [None]:
import os
import json
import numpy as np
import polars as pl

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import GraphCypherQAChain

from openai import OpenAI

In [None]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook. Non-secret values keep their previous defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# No default on purpose: export NEO4J_PASSWORD before starting the kernel.
# NOTE(review): a literal password used to be committed here; it should be
# considered exposed and rotated.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "jobs")

if not NEO4J_PASSWORD:
    # Surface the problem here rather than as an opaque error at connect time.
    print("NEO4J_PASSWORD is not set; export it before connecting to Neo4j.")

## Connect to GDB

In [None]:
# Open the graph handle used by every `kg.query(...)` / `kg.refresh_schema()`
# call in the rest of this notebook, bound to the configured database.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE
)

In [None]:
cypher = """
  MATCH (n) 
  RETURN count(n)
  """

In [None]:
result = kg.query(cypher)
result

In [None]:
[idx["name"] for idx in kg.query("SHOW INDEXES")]

## Read source data

In [None]:
jobs = pl.read_parquet("job_data/jobs_2023_all_2024-02-23.parquet")

In [None]:
for job in jobs.sample(3).to_dicts():
    print(job)

## Add Jobs to GDB

In [None]:
jobs.shape

In [None]:
# Upper bound on the number of jobs loaded into the graph database.
N_JOBS_IN_GBD = 2000
# Fixed seed: without it every kernel restart draws a different sample, and
# because nodes are MERGEd by job_id the graph would keep growing across reruns.
SAMPLE_SEED = 42

# Clamp to the frame size so sampling without replacement cannot fail when the
# source file holds fewer than N_JOBS_IN_GBD rows.
jobs_sample = jobs.sample(n=min(N_JOBS_IN_GBD, len(jobs)), seed=SAMPLE_SEED)

# list(jobs_sample.to_dicts()[0].keys())

In [None]:
len(jobs_sample)

In [None]:
#  'job_id',
#  'job_title',
#  'job_type',
#  'job_specialization',
#  'job_posted_at',
#  'skills',
#  'client_id',
#  'client_name',
#  'client_type',

In [None]:
# jobs_sample.to_dicts()

In [None]:
# Cypher template that upserts one `:Job` node keyed by `job_id`.
# Expects a `$job_param` map (one row of `jobs_sample.to_dicts()`).
# Properties are written only under ON CREATE SET, so a node that already
# exists is matched and returned with its stored properties left as they are.
# Note that the source field `job_type` is stored under the property name
# `job_vertical`.
merge_job_node_query = """
MERGE(mergedJob:Job {job_id: $job_param.job_id})
    ON CREATE SET 
        mergedJob.job_title = $job_param.job_title, 
        mergedJob.job_vertical = $job_param.job_type, 
        mergedJob.job_posted_at = $job_param.job_posted_at,
        mergedJob.skills = $job_param.skills,
        mergedJob.client_id = $job_param.client_id,
        mergedJob.client_name = $job_param.client_name,
        mergedJob.client_type = $job_param.client_type
RETURN mergedJob
"""

In [None]:
len(jobs_sample)

In [None]:
kg.query(merge_job_node_query, 
         params={'job_param': jobs_sample.to_dicts()[0]})

In [None]:
kg.query("""
CREATE CONSTRAINT unique_job IF NOT EXISTS 
    FOR (j:Job) REQUIRE j.job_id IS UNIQUE
""")


In [None]:
kg.query("SHOW INDEXES")[-1:]

In [None]:
# Upsert one `:Job` node per sampled row; `job_count` ends up as the number of
# rows sent to the database (0 when the sample is empty).
job_rows = jobs_sample.to_dicts()
job_count = 0
for job_count, job in enumerate(job_rows, start=1):
    print(f"Creating `:Job` node for job_id {job['job_id']}")
    query_params = {'job_param': job}
    kg.query(merge_job_node_query, params=query_params)
print(f"Created {job_count} job nodes")

In [None]:
kg.query("""
         MATCH (j:Job)
         RETURN count(j) as job_count
         """)

## Add chunks to GDB

In [None]:
# Splitter used to cut job descriptions into small chunks for the graph.
text_splitter = RecursiveCharacterTextSplitter(
    # Sizes are measured with `len`, i.e. in characters, not tokens.
    chunk_size = 100,
    chunk_overlap  = 20,
    length_function = len,
    # Separators below are literal strings, not regular expressions.
    is_separator_regex = False,
    # Listed from coarsest (blank line) to finest (single character).
    # NOTE(review): presumably tried in this order - confirm in the splitter docs.
    separators = ["\n\n", "\n", ". ", "; ", ", ", " ", ""],
    keep_separator  = True,
)

# text_splitter.__dict__
# Show the separators stored on the splitter (private attribute, inspection only).
text_splitter._separators

In [None]:
job_sample_index = 0

In [None]:
len(jobs_sample.to_dicts()[job_sample_index]["job_description"])

In [None]:
jd_chunks = text_splitter.split_text(jobs_sample.to_dicts()[job_sample_index]["job_description"])
len(jd_chunks)

In [None]:
jd_chunks

In [None]:
#  'client_id',
#  'client_name',
#  'client_type',
#  'job_id',
#  'job_title',
#  'job_description',
#  'job_type',
#  'job_specialization',
#  'job_posted_at',
#  'skills',

In [None]:
def split_job_description(job):
    """Split one job's description into chunk records for `:JD_Chunk` nodes.

    Args:
        job: Mapping with the keys ``job_id`` and ``job_description``
            (one row of ``jobs_sample.to_dicts()``). A null or empty
            description yields no chunks instead of being passed to the
            splitter.

    Returns:
        list[dict]: One dict per chunk, in splitter order, with the keys
        ``jd_chunk_id`` (``"<job_id>-chunk<4-digit seq>"``), ``job_id``,
        ``jd_chunk_seq_id`` (0-based) and ``jd_chunk`` (the chunk text).

    Raises:
        KeyError: If ``job_id`` or ``job_description`` is missing from ``job``.
    """
    job_id = job["job_id"]
    # Null cells come back as None from `to_dicts()`; treat them as empty text.
    description = job["job_description"] or ""
    job_descr_chunks = text_splitter.split_text(description) if description else []

    chunks_with_metadata = [
        {
            'jd_chunk_id': f'{job_id}-chunk{chunk_seq_id:04d}',
            'job_id': job_id,
            'jd_chunk_seq_id': chunk_seq_id,
            'jd_chunk': chunk,
        }
        for chunk_seq_id, chunk in enumerate(job_descr_chunks)
    ]
    print(f'\tSplit into {len(chunks_with_metadata)} chunks')
    return chunks_with_metadata

In [None]:
jd_chunks = split_job_description(jobs_sample.to_dicts()[0])

In [None]:
jd_chunks[:3]

In [None]:
jd_chunks[0].keys()

In [None]:
# Cypher template that upserts one `:JD_Chunk` node keyed by `jd_chunk_id`.
# Expects a `$chunkParam` map shaped like the dicts returned by
# `split_job_description`. Properties are written only under ON CREATE SET, so
# an already existing chunk node is matched and returned unchanged.
merge_jd_chunk_node_query = """
MERGE(mergedChunk:JD_Chunk {jd_chunk_id: $chunkParam.jd_chunk_id})
    ON CREATE SET 
        mergedChunk.job_id = $chunkParam.job_id, 
        mergedChunk.jd_chunk_seq_id = $chunkParam.jd_chunk_seq_id, 
        mergedChunk.jd_chunk = $chunkParam.jd_chunk
RETURN mergedChunk
"""

In [None]:
kg.query(merge_jd_chunk_node_query, 
         params={'chunkParam': jd_chunks[0]})

In [None]:
kg.query("""
CREATE CONSTRAINT unique_jd_chunk IF NOT EXISTS 
    FOR (c:JD_Chunk) REQUIRE c.jd_chunk_id IS UNIQUE
""")


In [None]:
kg.query("SHOW INDEXES")[-1:]

In [None]:
def add_jd_chunks_to_gdb(jd_chunks, verbose=False):
    """Upsert every chunk record as a `:JD_Chunk` node, one query per chunk.

    Args:
        jd_chunks: Iterable of chunk dicts as produced by
            ``split_job_description``; each is sent as ``$chunkParam`` to
            ``merge_jd_chunk_node_query``.
        verbose: When true, print one line per chunk before it is sent.

    Returns:
        None. A summary line with the number of chunks processed is printed.
        Because the query uses MERGE, that number counts chunks sent, which
        can exceed the number of nodes actually created on a rerun.
    """
    node_count = 0
    for chunk in jd_chunks:
        if verbose:
            print(f"Creating `:JD_Chunk` node for JD chunk ID {chunk['jd_chunk_id']}")
        kg.query(merge_jd_chunk_node_query, params={'chunkParam': chunk})
        node_count += 1
    # Word order fixed: the message used to read "Created JD chunk N nodes".
    print(f"Created {node_count} JD chunk nodes")

In [None]:
add_jd_chunks_to_gdb(jd_chunks, verbose=True)

In [None]:
kg.query("""
         MATCH (jdc:JD_Chunk)
         RETURN count(jdc) as jd_chunks_count
         """)

In [None]:
len(jobs_sample)

In [None]:
# Split every sampled job description and store its chunks; the progress line
# reports how many jobs were finished before the current one.
job_rows = jobs_sample.to_dicts()
for job_count, job in enumerate(job_rows):
    print(f"Creating `:JD_Chunk` nodes for job_id {job['job_id']}. Processed {job_count} jobs.")
    jd_chunks = split_job_description(job)
    add_jd_chunks_to_gdb(jd_chunks)
job_count = len(job_rows)
print(f"---------\nCreated `:JD_Chunk` nodes for {job_count} jobs")

In [None]:
kg.query("""
         MATCH (jdc:JD_Chunk)
         RETURN count(jdc) as jd_chunks_count
         """)

## Add relationships

In [None]:
# 'jd_chunk': 'Summary:',
# 'job_id': 369533,
# 'jd_chunk_id': '369533-chunk0000',
# 'jd_chunk_seq_id': 0,

In [None]:
cypher = """
  MATCH (of_same_job:JD_Chunk)
    WHERE of_same_job.job_id = $job_id
  RETURN of_same_job {.job_id, .jd_chunk_id, .jd_chunk_seq_id } as id_chunk_info
    LIMIT 10
"""

kg.query(cypher, params={"job_id": jobs_sample.to_dicts()[0]["job_id"]})[:3]

In [None]:
cypher = """
  MATCH (of_same_job:JD_Chunk)
    WHERE of_same_job.job_id = $job_id
  RETURN of_same_job {.job_id, .jd_chunk_id, .jd_chunk_seq_id } as id_chunk_info 
    ORDER BY of_same_job.jd_chunk_seq_id ASC
    LIMIT 10
"""

kg.query(cypher, params={"job_id": jobs_sample.to_dicts()[0]["job_id"]})[:3]

In [None]:
# Collect the chunk summaries of one job into a single list, ordered by
# sequence id. The map projection is applied inside collect(): an un-aliased
# projection in WITH is not valid Cypher, and the ORDER BY must run while
# `of_same_job` is still bound to the node.
cypher = """
  MATCH (of_same_job:JD_Chunk)
    WHERE of_same_job.job_id = $job_id
  WITH of_same_job
    ORDER BY of_same_job.jd_chunk_seq_id ASC
  RETURN collect(of_same_job {.job_id, .jd_chunk_id, .jd_chunk_seq_id }) as jd_chunk_list
"""

kg.query(cypher, params={"job_id": jobs_sample.to_dicts()[0]["job_id"]})

In [None]:
# Cypher template that turns the chunks of one job (`$job_id`) into a linked
# list: the chunks are ordered by `jd_chunk_seq_id`, collected, and passed to
# `apoc.nodes.link` with relationship type "NEXT". Returns the list size.
# NOTE(review): `apoc.*` procedures presumably require the APOC plugin on the
# server, and `avoidDuplicates: true` is presumably what keeps a rerun from
# adding duplicate NEXT relationships - confirm against the APOC docs.
cypher_linked_list_of_jd_chinks = """
  MATCH (of_same_job:JD_Chunk)
    WHERE of_same_job.job_id = $job_id
  WITH of_same_job
    ORDER BY of_same_job.jd_chunk_seq_id ASC
  WITH collect(of_same_job) as jd_chunk_list
    CALL apoc.nodes.link(
        jd_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )
  RETURN size(jd_chunk_list)
"""

# Try the query on the first sampled job before running it for all jobs.
kg.query(cypher_linked_list_of_jd_chinks, params={"job_id": jobs_sample.to_dicts()[0]["job_id"]})

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
# Chain the chunks of every sampled job with NEXT relationships;
# `job_count` ends up as the number of jobs processed (0 for an empty sample).
job_rows = jobs_sample.to_dicts()
job_count = 0
for job_count, job in enumerate(job_rows, start=1):
    print(f"Creating a linked list of `:JD_Chunk` nodes for job_id {job['job_id']}")
    link_params = {"job_id": job["job_id"]}
    kg.query(cypher_linked_list_of_jd_chinks, params=link_params)
print(f"---------\nCreated linked list of `:JD_Chunk` nodes for {job_count} jobs")

In [None]:
# Connect every chunk to its job: for each (:JD_Chunk, :Job) pair sharing the
# same `job_id`, MERGE a (chunk)-[:PART_OF]->(job) relationship. MERGE makes
# the cell safe to rerun; the returned count covers matched and new relationships.
cypher = """
  MATCH (jdc:JD_Chunk), (j:Job)
    WHERE jdc.job_id = j.job_id
  MERGE (jdc)-[newRelationship:PART_OF]->(j)
  RETURN count(newRelationship)
"""

kg.query(cypher)

In [None]:
# Give every job an entry point into its chunk list: MERGE a
# (job)-[:SECTION]->(chunk) relationship to the chunk whose
# `jd_chunk_seq_id` is 0, i.e. the first chunk of that job's description.
cypher = """
  MATCH (first:JD_Chunk), (j:Job)
    WHERE first.job_id = j.job_id
      AND first.jd_chunk_seq_id = 0
  WITH first, j
    MERGE (j)-[r:SECTION]->(first)
  RETURN count(r)
"""
# TODO: rename: SECTION --> DESCRIPTION

kg.query(cypher)

In [None]:
cypher = """
  MATCH (j:Job)-[r:SECTION]->(first:JD_Chunk)
    WHERE j.job_id = $job_id
  RETURN first.jd_chunk_id, first.jd_chunk
"""

first_jd_chunk_info = kg.query(
    cypher,
    params = {"job_id": jobs_sample.to_dicts()[0]["job_id"]})

first_jd_chunk_info


In [None]:
cypher = """
  MATCH (first:JD_Chunk)-[:NEXT]->(nextChunk:JD_Chunk)
    WHERE first.jd_chunk_id = $jd_chunk_id
  RETURN nextChunk.jd_chunk_id, nextChunk.jd_chunk
"""

next_chunk_info = kg.query(
    cypher,
    params = {"jd_chunk_id": first_jd_chunk_info[0]["first.jd_chunk_id"]},
)

next_chunk_info


In [None]:
cypher = """
    MATCH (c1:JD_Chunk)-[:NEXT]->(c2:JD_Chunk)-[:NEXT]->(c3:JD_Chunk) 
        WHERE c2.jd_chunk_id = $jd_chunk_id
    RETURN c1.jd_chunk_id, c2.jd_chunk_id, c3.jd_chunk_id
    """

kg.query(cypher,
         params={"jd_chunk_id": next_chunk_info[0]["nextChunk.jd_chunk_id"]})

In [None]:
cypher = """
    MATCH window = (c1:JD_Chunk)-[:NEXT]->(c2:JD_Chunk)-[:NEXT]->(c3:JD_Chunk) 
        WHERE c1.jd_chunk_id = $jd_chunk_id
    RETURN length(window) as windowPathLength
    """

kg.query(cypher,
         params={"jd_chunk_id": next_chunk_info[0]["nextChunk.jd_chunk_id"]})

In [None]:
first_jd_chunk_info[0]["first.jd_chunk_id"]

In [None]:
cypher = """
  MATCH window=
      (:JD_Chunk)-[:NEXT*0..1]->(c:JD_Chunk)-[:NEXT*0..1]->(:JD_Chunk) 
    WHERE c.jd_chunk_id = $jd_chunk_id
  RETURN length(window)
  """

kg.query(cypher,
         params={"jd_chunk_id": first_jd_chunk_info[0]["first.jd_chunk_id"]})

In [None]:
# Find the widest window of up to three chunks centred on chunk `c`.
# Each side uses `NEXT*0..1` (zero or one hop), so a chunk at the start or end
# of the list still matches with a shorter path; ordering by path length
# descending and taking one row keeps the longest window found.
cypher = """
  MATCH window=
      (:JD_Chunk)-[:NEXT*0..1]->(c:JD_Chunk)-[:NEXT*0..1]->(:JD_Chunk)
    WHERE c.jd_chunk_id = $jd_chunk_id
  WITH window as longestChunkWindow
      ORDER BY length(window) DESC LIMIT 1
  RETURN length(longestChunkWindow)
  """

# Queried for the first chunk of the sample job, which has no predecessor.
kg.query(cypher,
         params={"jd_chunk_id": first_jd_chunk_info[0]["first.jd_chunk_id"]})

## Create text vectors (embeddings)

In [None]:
# kg.query("""
#          CREATE VECTOR INDEX `job_chunks` IF NOT EXISTS
#           FOR (c:Chunk) ON (c.textEmbedding) 
#           OPTIONS { indexConfig: {
#             `vector.dimensions`: 1536,
#             `vector.similarity_function`: 'cosine'
#          } }
# """)

# kg.query("""
#          CREATE VECTOR INDEX `job_chunks_test_3dim` IF NOT EXISTS
#           FOR (c:Chunk) ON (c.textEmbedding_3dim) 
#           OPTIONS { indexConfig: {
#             `vector.dimensions`: 3,
#             `vector.similarity_function`: 'cosine'
#          } }
# """)

In [None]:
# db_indices = kg.query("SHOW INDEXES")

In [None]:
# db_indices

In [None]:
# kg.query("""
#     MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
#     WITH chunk, genai.vector.encode(
#       chunk.text, 
#       "OpenAI", 
#       {
#         token: $openAiApiKey, 
#         endpoint: $openAiEndpoint
#       }) AS vector
#     CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
#     """, 
#     params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

In [None]:
# kg.query("""
#     CALL apoc.ml.openai.embedding(['Text to create embedding'], $openAiApiKey) YIELD embedding
#     """, 
#     params={"openAiApiKey":OPENAI_API_KEY})

In [None]:
# kg.query("""
#     MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
#     WITH
#         chunk,
#         CALL apoc.ml.openai.embedding([chunk.text], $openAiApiKey) YIELD embedding AS vector
#     CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
#     """, 
#     params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

# # CALL apoc.ml.openai.embedding(['Knowledge Graphs work well with LLMs'], $apiKey, {}) yield index, text, embedding;

In [None]:
# kg.query("""
#     MATCH (chunk:Chunk) WHERE chunk.textEmbedding_3dim IS NULL
#     WITH
#         chunk,
#         [0.1, 0.2, 0.3] AS vector
#     CALL db.create.setNodeVectorProperty(chunk, "textEmbedding_3dim", vector)
#     """, 
#     params={})

In [None]:
# result = kg.query("""
#     MATCH (chunk:Chunk) 
#     WHERE chunk.textEmbedding_3dim IS NOT NULL
#     RETURN chunk.job_descr_chunk, chunk.textEmbedding_3dim
#     LIMIT 20
#     """
# )

In [None]:
# result

In [None]:
## SEARCH

# kg.query("""
#     CALL db.index.vector.queryNodes(
#         $index_name, 
#         $top_k,
#         $question_embedding
#         ) YIELD node AS chunk, score
#     RETURN chunk.job_id, chunk.job_descr_chunk, score
#     """, 
#     params={"index_name": "job_chunks_test_3dim",
#             "top_k": 5,
#             "question_embedding": [0.099, 0.21, 0.29],
#             })

In [None]:
# os.environ

In [None]:
embeddings = OpenAIEmbeddings(
    model = "text-embedding-3-small",
    # openai_api_key = OPENAI_API_KEY,
)

In [None]:
emb = embeddings.embed_query("Text to vectorize")
len(emb), emb[:5]

In [None]:
# openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# len(
#     openai_client.embeddings.create(
#         input=[result[0]["chunk.job_descr_chunk"]],
#         model="text-embedding-3-small",
#     ).data[0].embedding
# )

In [None]:
# Generate embeddings (create the index if it does not exist).
# Targets `:JD_Chunk` nodes: the `jd_chunk` text is embedded with
# "text-embedding-3-small" and the vector is kept in `jd_chunk_embedding`,
# under the index name `vector_jd_chunk`.
# NOTE(review): the exact create/populate behaviour is that of
# `Neo4jVector.from_existing_graph` - confirm against its documentation.

Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(
        model = "text-embedding-3-small",    
    ),
    url = NEO4J_URI,
    username = NEO4J_USERNAME,
    password = NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name = "vector_jd_chunk",
    node_label="JD_Chunk",
    text_node_properties=["jd_chunk"],
    embedding_node_property='jd_chunk_embedding',
)

In [None]:
db_indices = kg.query("SHOW INDEXES")

[index["name"] for index in db_indices]

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
result = kg.query("""
    MATCH (chunk:JD_Chunk) 
    WHERE chunk.jd_chunk_embedding IS NOT NULL
    RETURN chunk.jd_chunk, chunk.jd_chunk_embedding
    LIMIT 3
    """
)

In [None]:
len(result[0]["chunk.jd_chunk_embedding"]), result[0]["chunk.jd_chunk_embedding"][:5]

In [None]:
# Generate embeddings (create the index if it does not exist).
# Targets `:Job` nodes: the `job_title` text is embedded with
# "text-embedding-3-small" and the vector is kept in `job_title_embedding`,
# under the index name `vector_job_title`.
# NOTE(review): the exact create/populate behaviour is that of
# `Neo4jVector.from_existing_graph` - confirm against its documentation.

Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(
        model = "text-embedding-3-small",    
    ),
    url = NEO4J_URI,
    username = NEO4J_USERNAME,
    password = NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name = "vector_job_title",
    node_label="Job",
    text_node_properties=["job_title"],
    embedding_node_property="job_title_embedding",
)

In [None]:
db_indices = kg.query("SHOW INDEXES")

[index["name"] for index in db_indices]

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
result = kg.query("""
    MATCH (j:Job) 
    WHERE j.job_title_embedding IS NOT NULL
    RETURN j.job_title, j.job_title_embedding
    LIMIT 3
    """
)

In [None]:
result[0]["j.job_title"], len(result[0]["j.job_title_embedding"]), result[0]["j.job_title_embedding"][:5]

## Full text index

In [None]:
kg.refresh_schema()
print(kg.schema)

In [None]:
kg.query("""
CREATE FULLTEXT INDEX fulltext_job_title
  IF NOT EXISTS
  FOR (job:Job) 
  ON EACH [job.job_title]
""")

In [None]:
kg.query("""
CREATE FULLTEXT INDEX fulltext_skills
  IF NOT EXISTS
  FOR (job:Job) 
  ON EACH [job.skills]
""")

## Vector/semantic retrieval from GDB

In [None]:
# Vector store backing the retriever. It has to point at the label, text
# property, embedding property and index that the embedding step of this
# notebook populated: `:JD_Chunk` / `jd_chunk` / `jd_chunk_embedding` /
# `vector_jd_chunk`. The earlier `:Chunk` / `job_descr_chunk` / `textEmbedding`
# / `job_chunks` settings referred to nodes this notebook never creates, so a
# fresh run had nothing to retrieve.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(
        model = "text-embedding-3-small",
        # openai_api_key = OPENAI_API_KEY,
    ),
    url = NEO4J_URI,
    username = NEO4J_USERNAME,
    password = NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name = "vector_jd_chunk",
    node_label="JD_Chunk",
    text_node_properties=["jd_chunk"],
    embedding_node_property='jd_chunk_embedding',
)

In [None]:
retriever = neo4j_vector_store.as_retriever()  # k=4 <-- default

In [None]:
# chain = RetrievalQAWithSourcesChain.from_chain_type(
#     ChatOpenAI(
#         temperature=0,
#         openai_api_key=OPENAI_API_KEY,
#     ), 
#     chain_type="stuff", 
#     retriever=retriever
# )

In [None]:
# chain(
#     {"question": "automation of the processes"},
#     return_only_outputs=False,
# )

In [None]:
retriever

In [None]:
ir_result = retriever.invoke("automation of the processes")    

In [None]:
ir_result[0].__dict__

In [None]:
# ChatOpenAI(
#     model="gpt-3.5-turbo-0125",
#     temperature=0,
#     openai_api_key=OPENAI_API_KEY,
# )

In [None]:
# chain = GraphCypherQAChain.from_llm(
#     ChatOpenAI(
#         model="gpt-3.5-turbo-0125",
#         temperature=0,
#         # openai_api_key=OPENAI_API_KEY,
#     ),
#     graph=kg,
#     verbose=True,
# )

In [None]:
# chain.run("""
# automation of the processes
# """)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# Chat model used to answer questions; the API key is taken from the
# environment rather than passed explicitly.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# System instructions; `{context}` is the slot for the retrieved documents.
system_prompt = "".join([
    "Use the given context to answer the question. ",
    "If you don't know the answer, say you don't know. ",
    "Use three sentence maximum and keep the answer concise. ",
    "Context: {context}",
])

# Two-message template: system instructions plus the user's `{input}`.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

# Build the document-combining chain from the model and prompt, then wrap it
# together with the retriever into the final retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
chain.invoke({"input": "automation of the processes"})

In [None]:
chain.invoke({"input": "What are the main skills?"})