# This notebook uses the NEO4J graph database to associate acts and regulations 
### The purpose of using a graph database is to understand how graphs can be connected for better retrieval. 
#### A graph database has a few advantages over traditional databases, and this notebook sets out to explore them further:
- It is easier to grow the data in this database without any complex migration scripts or ORM
- Much easier to link different data

## Please run s3_v2.py before running this notebook. That script downloads all the Regulations and Acts in HTML format and stores them in the appropriate directories.

In [None]:
!pip install langchain
!pip install neo4j
!pip install bs4
!pip install llama-index

In [None]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import SimpleDirectoryReader, StorageContext
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# Notebook-level placeholder, initialised to None.
# NOTE(review): get_definitions() assigns a *local* variable of the same name
# and never writes to this global, so this value stays None unless a cell
# assigns it explicitly.
definition = None

In [None]:
# Neo4j connection settings, read from environment variables.
# NEO4J_HOST falls back to 'localhost' when it is unset or empty; previously an
# unset variable made the string concatenation fail with an opaque TypeError.
NEO4J_URI = 'bolt://' + (os.getenv('NEO4J_HOST') or 'localhost') + ':7687'
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# The database name is fixed here; read it from os.getenv('NEO4J_DB') instead
# if it needs to be configurable.
NEO4J_DATABASE = 'neo4j'
# Echo the non-secret settings so the target server is visible in the output.
print(NEO4J_URI)
print(NEO4J_DATABASE)

In [None]:
# Open the graph connection used by every query in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Cypher query that counts every node currently stored in the graph.
cypher = """
  MATCH (n) 
  RETURN count(n)
  """

In [None]:
# Run the node-count query; the bare `result` expression makes the notebook
# display the returned rows.
result = kg.query(cypher)
result

In [None]:
# Read every file found in the Acts and Regulations directories, then report
# how many documents each directory produced (one count per line).
Acts_documents = SimpleDirectoryReader("./HTML_Acts").load_data()
Regulations_documents = SimpleDirectoryReader("./HTML_Regulations").load_data()
print(len(Acts_documents), len(Regulations_documents), sep="\n")

In the next section we try to loop through all the Acts

In [None]:
# Print the full text of the second loaded regulation (index 1) as a sanity
# check of what was loaded. Raises IndexError when fewer than two regulation
# documents were loaded.
print(Regulations_documents[1].get_text())

In [None]:
def get_title(soup):
    """Return the text of the first ``<h2>`` element of a parsed document.

    Args:
        soup: Parsed HTML document exposing ``find`` (e.g. a BeautifulSoup
            object).

    Returns:
        The text of the first ``<h2>`` element, exactly as ``get_text()``
        yields it (no stripping is applied).

    Raises:
        IndexError: If the document contains no ``<h2>`` element. The exception
            type matches what indexing an empty result list used to raise, but
            the message now names the actual problem.
    """
    heading = soup.find("h2")
    if heading is None:
        raise IndexError("document has no <h2> element to use as its title")
    return heading.get_text()

In [None]:
def get_definitions(sections):
    """Return the first section whose ``<h4>`` heading mentions 'Definition'.

    The text of every ``<h4>`` heading inspected is printed along the way.

    Args:
        sections: Iterable of parsed elements, each exposing ``find``.

    Returns:
        The first section whose heading text contains ``'Definition'``, or
        ``None`` when no section matches.
    """
    for section in sections:
        heading = section.find("h4")
        if heading is None:
            # A section without an <h4> heading cannot be the definitions
            # section; calling get_text() on None used to raise AttributeError.
            continue
        heading_text = heading.get_text()
        print(heading_text)
        if 'Definition' in heading_text:
            return section
    return None

In [None]:
def get_preamble(soup):
    """Print the text of the first ``<div class="preamble">`` in the document.

    Nothing is printed when the document has no such element. The function
    always returns ``None``.
    """
    preamble_divs = soup.find_all("div", class_='preamble')
    if not preamble_divs:
        return
    first_preamble = preamble_divs[0]
    print(first_preamble.get_text())

In [None]:
def split_data_from_file(file, soup, splitter=None):
    """Split a document's text into chunk records with identifying metadata.

    Args:
        file: The text to split. Despite the name this is the document text
            itself, not a path; it is passed straight to ``split_text``.
        soup: Parsed document handed to ``get_title`` to obtain the title used
            in the chunk identifiers.
        splitter: Optional object exposing ``split_text(text)``. When omitted,
            the notebook-level ``text_splitter`` is used, so existing
            two-argument calls behave as before.

    Returns:
        A list with one dict per chunk, in order, with the keys ``'text'``,
        ``'chunkSeqId'`` (0-based position), ``'chunkId'``
        (``'<title>-chunk<NNNN>'``) and ``'ActId'`` (the title).
    """
    if splitter is None:
        # Fall back to the global splitter; it must have been defined by the
        # time this function is called.
        splitter = text_splitter
    item_text_chunks = splitter.split_text(file)
    title = get_title(soup)
    return [
        {
            'text': chunk,
            'chunkSeqId': chunk_seq_id,
            'chunkId': f'{title}-chunk{chunk_seq_id:04d}',
            'ActId': f'{title}',
        }
        for chunk_seq_id, chunk in enumerate(item_text_chunks)
    ]

In [None]:
# Cypher template that upserts one `:Chunk` node keyed by `chunkId`.
# `$chunkParam` is a map supplying chunkId, chunkSeqId, text and ActId.
# The properties are written only ON CREATE, so a node that already exists
# with the same chunkId is returned unchanged.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.ActId = $chunkParam.ActId
RETURN mergedChunk
"""

In [None]:
# Embedding model used both for stored chunks and for search questions.
# NOTE(review): the `Acts_chunks` vector index in this notebook is created with
# `vector.dimensions` 384, so the vectors this model produces must have that
# length — confirm if the model name is ever changed.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Ingest every Act: split its text into chunks, store each chunk as a `:Chunk`
# node, and attach an embedding to every chunk that does not have one yet.

# The splitter configuration never changes, so build it once. It must remain a
# notebook-level name because split_data_from_file() looks up `text_splitter`.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

# Both schema statements use IF NOT EXISTS, so running them once before the
# loop is sufficient; they used to be re-issued for every document.
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")
kg.query("""
         CREATE VECTOR INDEX `Acts_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 384,
            `vector.similarity_function`: 'cosine'    
         }}
""")

for Acts in Acts_documents:
    soup = BeautifulSoup(Acts.get_text(), 'html.parser')
    item1_text = soup.get_text()
    first_file_chunks = split_data_from_file(item1_text, soup)

    # Upsert one node per chunk. A document that yields no chunks is simply
    # skipped (indexing first_file_chunks[0] used to raise IndexError here).
    for chunk in first_file_chunks:
        print(f"Creating `:Chunk` node for chunk ID {chunk['chunkSeqId']}")
        kg.query(merge_chunk_node_query, params={'chunkParam': chunk})

    # Create the embeddings
    for chunk in first_file_chunks:
        chunk_key = {"chunkSeqId": chunk['chunkSeqId'], "chunkId": chunk['chunkId']}
        # Only chunks still lacking an embedding are updated below, so check
        # first and avoid computing an embedding that would be thrown away.
        needs_embedding = kg.query("""
        MATCH (chunk:Chunk) WHERE
        chunk.textEmbedding IS NULL
        AND chunk.chunkId = $chunkId
        AND chunk.chunkSeqId = $chunkSeqId
        RETURN chunk.chunkId AS chunkId
        """,
        params=chunk_key)
        if not needs_embedding:
            continue
        query_result = embeddings.embed_query(chunk['text'])
        kg.query("""
        MATCH (chunk:Chunk) WHERE
        chunk.textEmbedding IS NULL
        AND chunk.chunkSeqId = $chunkSeqId
        AND chunk.chunkId = $chunkId
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", $vector)
    """,
        params={**chunk_key, "vector": query_result})

In [None]:
# Call refresh_schema() on the graph wrapper, then print its `schema`
# attribute to inspect what the graph currently contains.
kg.refresh_schema()
print(kg.schema)

In [None]:
def neo4j_vector_search(question, top_k=10, index_name='Acts_chunks'):
    """Search for similar nodes using the Neo4j vector index.

    Args:
        question: Natural-language question; it is embedded with the
            notebook-level ``embeddings`` model before querying.
        top_k: Number of nearest neighbours requested from the index.
            Defaults to 10, the value that was previously hard-coded.
        index_name: Name of the vector index to query. Defaults to
            ``'Acts_chunks'``, the value that was previously hard-coded.

    Returns:
        The rows returned by ``kg.query``; each row carries ``score``,
        ``node.ActId`` and ``text``.
    """
    query_embedding = embeddings.embed_query(question)
    vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question) yield node, score
    RETURN score, node.ActId, node.text AS text
  """
    return kg.query(
        vector_search_query,
        params={
            'question': query_embedding,
            'index_name': index_name,
            'top_k': top_k,
        },
    )

In [None]:
# Example query: run the vector search for a sample question and keep the
# returned rows for inspection in the next cell.
search_results = neo4j_vector_search(
    'When an employee is fired what needs to be done next?'
)

In [None]:
# Display the text of the first returned row.
# Raises IndexError when the search returned no rows.
search_results[0]['text']

### Loop through all laws, associate all the chunks, make a parent Act node, and attach the child chunks to their corresponding parent Act node

In [None]:
def act_info_list_fn(actId):
    """Fetch the ``ActInfo`` map for an Act from one of its stored chunks.

    Args:
        actId: Value compared against the ``ActId`` property of ``:Chunk``
            nodes.

    Returns:
        The rows returned by ``kg.query``. The query is limited to a single
        chunk, and each row has an ``ActInfo`` entry holding that chunk's
        ``ActId``.
    """
    one_chunk_query = """
      MATCH (anyChunk:Chunk) 
      WHERE anyChunk.ActId = $ActId
      WITH anyChunk LIMIT 1
      RETURN anyChunk { .ActId } as ActInfo
    """
    return kg.query(one_chunk_query, params={'ActId': actId})

### Connect chunks to their parent Act with a PART_OF relationship

In [None]:
def create_parent_act_node(act_info):
    """Create the parent ``:Act`` node for an Act unless it already exists.

    Args:
        act_info: Mapping with an ``'ActId'`` entry; it is passed to the query
            as ``$formInfoParam``.

    Returns:
        None. The result of the query is not used.
    """
    # MERGE creates the node with the ActId from its property map, so the
    # former `ON CREATE SET f.ActId = ...` clause was a no-op and is dropped.
    cypher = """
        MERGE (f:Act {ActId: $formInfoParam.ActId })
            """
    kg.query(cypher, params={'formInfoParam': act_info})

### Add a NEXT relationship between subsequent chunks
- Use the `apoc.nodes.link` function from Neo4j to link ordered list of `Chunk` nodes with a `NEXT` relationship
- Do this separately for each Act, so that chunks are only linked to other chunks of the same Act

In [None]:
def create_chunk_relationship(act_info):
    """Chain the chunks of one Act together with ``NEXT`` relationships.

    The query collects every ``:Chunk`` whose ``ActId`` equals
    ``act_info['ActId']``, ordered by ``chunkSeqId``, and hands the list to
    ``apoc.nodes.link`` with ``avoidDuplicates`` enabled.

    Args:
        act_info: Mapping with an ``'ActId'`` entry.

    Returns:
        None. The result of the query is not used.
    """
    link_chunks_query = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.ActId = $ActParam
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )  // NEW!!!
  RETURN size(section_chunk_list)
"""
    query_params = {'ActParam': act_info['ActId']}
    kg.query(link_chunks_query, params=query_params)

In [None]:
def connect_chunk_to_parent():
    """Link every chunk to the Act that shares its ``ActId``.

    The query pairs each ``:Chunk`` with each ``:Act`` having the same
    ``ActId`` and MERGEs a ``PART_OF`` relationship from the chunk to the Act.
    It is not restricted to a single Act.

    Returns:
        None. The result of the query is not used.
    """
    kg.query("""
      MATCH (c:Chunk), (f:Act)
        WHERE c.ActId = f.ActId
      MERGE (c)-[newRelationship:PART_OF]->(f)
      RETURN count(newRelationship)
    """)

In [None]:
# Build the Act layer of the graph: for every loaded Act, create its parent
# node and chain its chunks together in reading order.
for Acts in Acts_documents:
    soup = BeautifulSoup(Acts.get_text(), 'html.parser')
    title = get_title(soup)
    # The lookup query is limited to one chunk, so this inner loop runs at
    # most once; it runs zero times when no chunk is stored under this title.
    for act_info_list in act_info_list_fn(title):
        act_info = act_info_list['ActInfo']
        create_parent_act_node(act_info)
        create_chunk_relationship(act_info)

# connect_chunk_to_parent() matches all chunks against all Acts, so a single
# call after every Act node exists builds the same PART_OF relationships as
# calling it once per Act did, without repeating the graph-wide match.
connect_chunk_to_parent()

In [None]:
# List the indexes defined in the database; as the last expression of the
# cell, the returned rows are displayed by the notebook.
kg.query("SHOW INDEXES")