In [29]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import SimpleDirectoryReader, StorageContext
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import numpy as np
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
from llmlingua import PromptCompressor
import re
from IPython.core.display import display, HTML
import json

In [112]:
# Sentence-embedding model shared by the indexing code (chunk text) and the
# search code (question text) further down in this notebook.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [113]:
# Character-based splitter (length is measured with len): chunks of at most
# 1000 characters with 200 characters of overlap. Separators are listed from the
# coarsest (blank line) to the finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)

In [114]:
# Connection settings come from the environment so that no credentials are
# hard-coded in the notebook.
# NEO4J_HOST falls back to "localhost" when it is unset or empty; previously an
# unset variable made the string concatenation fail with an opaque
# TypeError (str + None). The resolved URI is printed below so the fallback is
# visible to whoever runs the cell.
NEO4J_HOST = os.getenv('NEO4J_HOST') or 'localhost'
NEO4J_URI = f'bolt://{NEO4J_HOST}:7687'
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = 'neo4j'  # os.getenv('NEO4J_DB')
print(NEO4J_URI)
print(NEO4J_DATABASE)

bolt://neo4j:7687
neo4j


In [5]:
# Neo4j connection object; the kg.query(...) calls in the cells below all go
# through it.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

## Download the data, which should be in JSON format

In [54]:
def file_metadata(x):
    """Return the metadata dict attached to a loaded file: ``{"filename": x}``."""
    return {"filename": x}


# Load every file found under ./graphicdata; each loaded document gets the
# metadata produced by file_metadata.
graphic_documents = SimpleDirectoryReader(
    "./graphicdata",
    file_metadata=file_metadata,
).load_data()

In [159]:
# Sanity check: show the raw text of the first loaded document.
first_document_text = graphic_documents[0].get_text()
print(first_document_text)

{
	"Url": "https://www.bclaws.gov.bc.ca/civix/document/id/complete/statreg/208_2018_FormQ-1.gif",
	"Title": "Form Q.1",
	"RegulationInfo": "[en. B.C. Reg. 208/2018, App. 2, s. 2.]",
	"Description": "This form is used to register a dispute of a ticket.",
	"RegulationId": "Offence Act Forms Regulation",
	"ActId": "Offence Act",
	"Text": [
		{
			"Title": "HOW TO PAY THE TICKET",
			"Description": [
				{
					"Title": "REDUCE TICKETED AMOUNT(S) BY PAYING EARLY",
					"Description": "If a ticketed amount is over $58 and you pay in full on or before the 30th day from the date of service of the ticket, the ticketed amount is reduced by $25."
				},
				{
					"Title": "PAY ONLINE",
					"Description": "Paying online is your quickest option. Visit: pay.gov.bc.ca Payment by credit card and other methods can be accepted at the website above."
				},
				{
					"Title": "PAY BY MAIL",
					"Description": "Mail your payment by cheque or money order in Canadian funds payable to the Insurance Corp

In [160]:
# Cypher: MERGE an :UpdatedChunk node keyed on chunkId.
# All other properties are written only ON CREATE, so running the query again
# for a chunkId that already exists returns the existing node unchanged.
# $chunkParam is a map that must carry every key referenced below.
merge_chunk_node_query = """
MERGE(mergedChunk:UpdatedChunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.ActId = $chunkParam.ActId,
        mergedChunk.sectionName = $chunkParam.sectionName,
        mergedChunk.url = $chunkParam.url,
        mergedChunk.regulationinfo = $chunkParam.regulationinfo,
        mergedChunk.regulationSeqId = $chunkParam.regulationSeqId,
        mergedChunk.parentRetrieval = $chunkParam.parentRetrieval,
        mergedChunk.formtitle = $chunkParam.formtitle,
        mergedChunk.RegId = $chunkParam.RegId
RETURN mergedChunk
"""

In [161]:
# Cypher: find the :UpdatedChunk whose listed properties equal the values in
# $chunkParam and which has no textEmbedding yet, then store $vector on it as
# the "textEmbedding" vector property.
# Because of the `textEmbedding is NULL` filter, the query returns no rows for a
# chunk that already has an embedding (or whose stored properties differ from
# $chunkParam).
create_embeddings = """
        MATCH (chunk:UpdatedChunk) WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.ActId = $chunkParam.ActId
        AND chunk.sectionName = $chunkParam.sectionName
        AND chunk.text = $chunkParam.text
        AND chunk.regulationinfo = $chunkParam.regulationinfo
        AND chunk.regulationSeqId = $chunkParam.regulationSeqId
        AND chunk.url = $chunkParam.url
        AND chunk.formtitle = $chunkParam.formtitle
        AND chunk.textEmbedding is NULL
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", $vector)
        RETURN chunk
    """

In [162]:
def create_chunk_relationship(act_info):
    """Chain the chunks of one subsection together with ``NEXT`` relationships.

    The query selects every ``:UpdatedChunk`` whose ``ActId``, ``sectionName``,
    ``RegId``, ``url``, ``formtitle`` and ``regulationSeqId`` equal the values
    in ``act_info``, orders them by ``chunkSeqId`` and hands the ordered list to
    ``apoc.nodes.link`` (so the APOC procedures must be available on the server).

    Args:
        act_info: mapping providing the six keys listed above; in this notebook
            it is one of the chunk records built by ``create_metadata``.

    Returns:
        None. The row returned by the query is discarded.
    """
    link_chunks_query = """
  MATCH (from_same_section:UpdatedChunk)
  WHERE from_same_section.ActId = $ActParam['ActId']
  AND from_same_section.sectionName = $ActParam['sectionName']
  AND from_same_section.RegId = $ActParam['RegId']
  AND from_same_section.url = $ActParam['url']
  AND from_same_section.formtitle = $ActParam['formtitle']
  AND from_same_section.regulationSeqId = $ActParam['regulationSeqId']
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )  // NEW!!!
  RETURN size(section_chunk_list)
"""
    query_params = {'ActParam': act_info}
    kg.query(link_chunks_query, params=query_params)

In [163]:
def create_chunk_embeddings(tokens):
    """Store chunk records in Neo4j, embed them, and link them in sequence.

    For every record the function:
      1. merges an ``:UpdatedChunk`` node (``merge_chunk_node_query``),
      2. embeds ``chunk['text']`` with the shared ``embeddings`` model,
      3. writes the vector with the ``create_embeddings`` query and prints
         whether that query returned a row.
    Afterwards the chunks are chained via ``create_chunk_relationship``, using
    the first record to identify the subsection.

    Args:
        tokens: list of chunk records as produced by ``create_metadata``. An
            empty list is a no-op.

    Returns:
        None.
    """
    if not tokens:
        # Nothing to store. Without this guard the tokens[0] lookup at the end
        # raised IndexError for an empty chunk list.
        return

    for chunk in tokens:
        print(f"Creating `:Chunk` node for chunk ID {chunk['chunkSeqId']}")
        kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
        vector = embeddings.embed_query(chunk['text'])
        result = kg.query(create_embeddings, params={'chunkParam': chunk, 'vector': vector})
        if result:
            print("Embedding created")
        else:
            # An empty result means the embedding query matched no node, e.g.
            # because the node already carries a textEmbedding.
            print(result)
            print("Embedding not created")

    # All records of one call share the same subsection metadata, so the first
    # one is enough to select the chunks to link.
    create_chunk_relationship(tokens[0])

In [164]:
def create_metadata(token_split_texts, title, section_heading, regulationinfo, url, regid, seq_id, formtitle):
    """Wrap each chunk text in a record carrying the metadata stored on the node.

    Args:
        token_split_texts: iterable of chunk texts, in reading order.
        title: value stored as ``ActId`` and used as the first part of ``chunkId``.
        section_heading: value stored as ``sectionName``.
        regulationinfo: value stored as ``regulationinfo``.
        url: value stored as ``url``.
        regid: value stored as ``RegId``.
        seq_id: value stored unchanged as ``regulationSeqId``.
        formtitle: value stored unchanged as ``formtitle``.

    Returns:
        A list with one dict per chunk. ``chunkSeqId`` is the zero-based
        position of the chunk, and ``chunkId`` is
        ``"{title}-{regid}-chunk-{section_heading}-{position:04d}"``.
    """
    # NOTE(review): chunkId does not include formtitle, so two forms that share
    # Act, regulation and subsection heading would produce the same chunkId --
    # confirm whether that can happen in the data.
    return [
        {
            'text': chunk_text,
            'chunkSeqId': position,
            'chunkId': f'{title}-{regid}-chunk-{section_heading}-{position:04d}',
            'ActId': f'{title}',
            'RegId': f'{regid}',
            'sectionName': f'{section_heading}',
            'url': f'{url}',
            'regulationinfo': f'{regulationinfo}',
            'regulationSeqId': seq_id,
            'parentRetrieval': True,
            'formtitle': formtitle,
        }
        for position, chunk_text in enumerate(token_split_texts)
    ]

In [172]:
# Ingest every loaded JSON document: each subsection of each section is split
# into chunks, stored as :UpdatedChunk nodes and embedded.
#
# Changes compared with the previous version of this cell:
#   * the token splitter is built once instead of once per subsection
#     (constructing it is loop-invariant);
#   * the list of sections is no longer called `text`, a name the innermost
#     loop used to overwrite with a chunk string;
#   * unused enumerate() indices were removed.

# Second-stage splitter: caps every chunk at 256 tokens with a 20-token overlap.
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=20, tokens_per_chunk=256)

for graphic_document in graphic_documents:
    json_obj = json.loads(graphic_document.get_text())
    url = json_obj['Url']
    title = json_obj['Title']
    regulation_info = json_obj['RegulationInfo']
    description = json_obj['Description']  # read but not used in this cell
    actid = json_obj['ActId']
    regid = json_obj['RegulationId']
    sections = json_obj['Text']

    # Accumulates the readable text of the whole form. It is reset for every
    # document, so after the loop it holds the text of the LAST document only
    # (a later cell relies on that, together with json_obj).
    parent_retrieval = ""

    for section in sections:
        sectiontitle = section['Title']
        subsections = section['Description']
        if not subsections:
            continue

        for subsection_index, subsection in enumerate(subsections):
            subsectiontitle = subsection['Title']
            subsectiondescription = subsection['Description']

            # Avoid repeating the heading when section and subsection share it.
            if sectiontitle == subsectiontitle:
                item_text = sectiontitle + ' \n' + subsectiondescription
            else:
                item_text = sectiontitle + ': ' + subsectiontitle + ' \n' + subsectiondescription
            print("\n\n")

            # Two-stage split: by characters first, then by tokens.
            item_text_chunks = text_splitter.split_text(item_text)
            token_split_texts = []
            for item_text_chunk in item_text_chunks:
                token_split_texts.extend(token_splitter.split_text(item_text_chunk))
            print(token_split_texts)

            token = create_metadata(
                token_split_texts,
                actid,
                subsectiontitle,
                regulation_info,
                url,
                regid,
                subsection_index,
                title,
            )
            print(token)
            create_chunk_embeddings(token)
            parent_retrieval += item_text + "\n\n"




['how to pay the ticket : reduce ticketed amount ( s ) by paying early if a ticketed amount is over $ 58 and you pay in full on or before the 30th day from the date of service of the ticket, the ticketed amount is reduced by $ 25.']
[{'text': 'how to pay the ticket : reduce ticketed amount ( s ) by paying early if a ticketed amount is over $ 58 and you pay in full on or before the 30th day from the date of service of the ticket, the ticketed amount is reduced by $ 25.', 'chunkSeqId': 0, 'chunkId': 'Offence Act-Offence Act Forms Regulation-chunk-REDUCE TICKETED AMOUNT(S) BY PAYING EARLY-0000', 'ActId': 'Offence Act', 'RegId': 'Offence Act Forms Regulation', 'sectionName': 'REDUCE TICKETED AMOUNT(S) BY PAYING EARLY', 'url': 'https://www.bclaws.gov.bc.ca/civix/document/id/complete/statreg/208_2018_FormQ-1.gif', 'regulationinfo': '[en. B.C. Reg. 208/2018, App. 2, s. 2.]', 'regulationSeqId': 0, 'parentRetrieval': True, 'formtitle': 'Form Q.1'}]
Creating `:Chunk` node for chunk ID 0
[]
Em

In [187]:
def create_parent_form_node(form_info):
    """Create (MERGE) the parent ``:form`` node that holds a form's full text.

    The Cypher reads ``Title``, ``text``, ``url``, ``regulationinfo``, ``RegId``
    and ``ActId`` from ``$chunkParam``. The parsed JSON documents in this
    notebook use the keys ``Url``, ``RegulationInfo`` and ``RegulationId``
    instead, and Cypher map keys are case-sensitive, so passing the document
    straight through left ``url``, ``regulationinfo`` and ``RegId`` null on the
    node. The values are therefore normalised here; both spellings are accepted.

    Args:
        form_info: dict describing the form. Needs ``Title`` (merge key), and
            should provide ``text``, ``ActId`` plus ``url``/``Url``,
            ``regulationinfo``/``RegulationInfo`` and ``RegId``/``RegulationId``.

    Returns:
        None.
    """
    def first_present(*keys):
        # Value of the first key that is present and not None, else None.
        for key in keys:
            value = form_info.get(key)
            if value is not None:
                return value
        return None

    cypher = """
        MERGE (mergedChunk:form {formtitle: $chunkParam.Title })
          ON CREATE SET
        mergedChunk.text = $chunkParam.text,
        mergedChunk.url = $chunkParam.url,
        mergedChunk.regulationinfo = $chunkParam.regulationinfo,
        mergedChunk.formtitle = $chunkParam.Title,
        mergedChunk.RegId = $chunkParam.RegId,
        mergedChunk.LawId = $chunkParam.ActId
            """
    # Only the keys the query actually reads are sent to the server.
    chunk_param = {
        'Title': form_info.get('Title'),
        'text': form_info.get('text'),
        'ActId': form_info.get('ActId'),
        'url': first_present('url', 'Url'),
        'regulationinfo': first_present('regulationinfo', 'RegulationInfo'),
        'RegId': first_present('RegId', 'RegulationId'),
    }
    kg.query(cypher, params={'chunkParam': chunk_param})

In [193]:
def connect_form_parentNode():
    """Link first chunks and their form node in both directions.

    Every ``:UpdatedChunk`` with ``chunkSeqId = 0`` is connected to the
    ``:form`` node that has the same ``formtitle``: ``PARENT`` from chunk to
    form and ``CHILD`` from form to chunk. Chunks with another ``chunkSeqId``
    are not linked by this query.

    Returns:
        None. The count returned by the query is discarded.
    """
    link_parent_query = """
      MATCH (c:UpdatedChunk), (f:form)
        WHERE c.formtitle = f.formtitle
        AND c.chunkSeqId = 0
      MERGE (c)-[newRelationship:PARENT]->(f)
      MERGE (f)-[newRelationship2:CHILD]->(c)
      RETURN count(newRelationship)
    """
    kg.query(link_parent_query)

In [194]:
# json_obj still refers to the last document parsed by the ingestion loop.
print(json_obj["ActId"])

Offence Act


In [195]:
# Create the parent :form node holding the full form text and link it to the
# first chunk of each subsection.
#
# The six bare `json_obj[...]` expression statements that used to open this cell
# were removed: they had no effect, and the ingestion loop already reads the
# same keys.
#
# NOTE(review): json_obj and parent_retrieval are left over from the ingestion
# loop, so only the LAST processed document gets a parent node here. If
# ./graphicdata holds more than one file, the earlier forms get none -- confirm
# whether this step should run once per document.
json_obj['text'] = parent_retrieval
create_parent_form_node(json_obj)
connect_form_parentNode()
print(parent_retrieval)

HOW TO PAY THE TICKET: REDUCE TICKETED AMOUNT(S) BY PAYING EARLY 
If a ticketed amount is over $58 and you pay in full on or before the 30th day from the date of service of the ticket, the ticketed amount is reduced by $25.

HOW TO PAY THE TICKET: PAY ONLINE 
Paying online is your quickest option. Visit: pay.gov.bc.ca Payment by credit card and other methods can be accepted at the website above.

HOW TO PAY THE TICKET: PAY BY MAIL 
Mail your payment by cheque or money order in Canadian funds payable to the Insurance Corporation of British Columbia at the following address: Ticket Payment Processing, BAG #3505, VICTORIA, B.C. V8W 3N9. Do not send cash. A receipt will not be mailed. Your payment must be accompanied by a copy of the ticket or a note that contains the violation ticket number, your name, address, driver's licence number and date of birth, the violation date and the name of the Act or regulation and section contravened for each alleged offence to which the payment relates. I

In [367]:
def get_parent_query(node):
    """Fetch the text of the parent node linked to a chunk via ``PARENT``.

    The chunk is selected by exact element id. The previous ``contains``
    comparison was a substring match, so an id ending in ``:15`` also matched
    ``:150`` or ``:156399`` and could return parents of unrelated chunks.

    Args:
        node: search hit (dict) with at least an ``elementId`` entry.

    Returns:
        The query result: a list of ``{'text': ...}`` rows, empty when the
        chunk has no outgoing ``PARENT`` relationship.
    """
    query = """
    MATCH (n:UpdatedChunk)-[:PARENT]->(p)
    WHERE elementId(n) = $node.elementId
    RETURN p.text AS text
    """
    element_id = node['elementId']
    print(element_id)
    # Send only the id: the full search hit also carries the embedding vector,
    # which the query never reads.
    return kg.query(query, params={'node': {'elementId': element_id}})

In [368]:
def neo4j_vector_search(question, index_name, top_k=10):
    """Search for similar nodes using the Neo4j vector index.

    Args:
        question: natural-language question; it is embedded with the shared
            ``embeddings`` model before querying.
        index_name: name of the Neo4j vector index to query.
        top_k: number of nearest neighbours requested from the index.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The query result: a list of dicts with the keys ``elementId``,
        ``score``, ``node.ActId``, ``node.sectionId``, ``node.sectionName``,
        ``parentretrieval``, ``node.url``, ``node.formtitle``, ``text``,
        ``Regulations`` and ``embedding``.
    """
    query_embedding = embeddings.embed_query(question)
    vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question) yield node, score
        RETURN elementId(node) AS elementId, score, node.ActId, node.sectionId, node.sectionName, node.parentRetrieval AS parentretrieval, node.url, node.formtitle, node.text AS text, node.RegId AS Regulations, node.textEmbedding AS embedding
  """
    # The Cypher parameter called $question carries the embedding vector, not
    # the question string.
    similar = kg.query(
        vector_search_query,
        params={
            'question': query_embedding,
            'index_name': index_name,
            'top_k': top_k,
        },
    )
    return similar

In [369]:
# Example question, answered from the 'Acts_Updatedchunks' vector index.
query = 'How to dispute a ticket?'
search_results = neo4j_vector_search(question=query, index_name='Acts_Updatedchunks')
print(search_results)

[{'elementId': '4:2645744a-9791-405a-8926-775c3f1e9540:156399', 'score': 0.9270763397216797, 'node.ActId': 'Offence Act', 'node.sectionId': None, 'node.sectionName': 'WHAT HAPPENS AFTER YOUR NOTICE OF DISPUTE HAS BEEN RECEIVED?', 'parentretrieval': True, 'node.url': 'https://www.bclaws.gov.bc.ca/civix/document/id/complete/statreg/208_2018_FormQ-1.gif', 'node.formtitle': 'Form Q.1', 'text': 'how to dispute the ticket : what happens after your notice of dispute has been received?', 'Regulations': 'Offence Act Forms Regulation', 'embedding': [-0.0038532104808837175, 0.07855070382356644, 0.02137164957821369, 0.0214228518307209, 0.038318466395139694, 0.0366387702524662, 0.05188072845339775, -0.03631860390305519, 0.004664931911975145, 0.03205491229891777, 0.026079896837472916, -0.08831565082073212, 0.03592817112803459, 0.03129422664642334, -0.05647681653499603, -0.030610088258981705, 0.006810715422034264, 0.04181034490466118, -0.055031098425388336, -0.072739377617836, 0.018179219216108322, 0

In [374]:
def get_parent(search_retrieval):
    """Print the parent text of a search hit flagged for parent retrieval.

    Args:
        search_retrieval: one search hit (dict). Nothing happens unless its
            ``parentretrieval`` entry is truthy.

    Returns:
        None. The parent text is printed, not returned.
    """
    if not search_retrieval.get('parentretrieval'):
        return

    print("parent retrieval")
    parent = get_parent_query(search_retrieval)
    if not parent:
        # Only chunks that have a PARENT relationship yield a row; indexing
        # parent[0] on an empty result used to raise IndexError.
        print("No parent found for this chunk")
        return
    print(parent[0]['text'])

In [375]:
def search_result_output(search_results):
    """Print a plain-text report for every hit of a vector search.

    For each hit the score, Act id, section name and section id are printed
    unconditionally. The URL, regulation and form title are printed only when
    truthy. The chunk text follows, then ``get_parent`` is called for the hit,
    and a blank separator ends the entry.

    Args:
        search_results: iterable of hit dicts as returned by
            ``neo4j_vector_search``.

    Returns:
        None.
    """
    always_shown = ('score', 'node.ActId', 'node.sectionName', 'node.sectionId')
    for hit in search_results:
        for field in always_shown:
            print(hit[field])

        link = hit['node.url']
        if link:
            print(link)

        regulation = hit['Regulations']
        if regulation:
            print(f"Regulation: {regulation}")

        form_title = hit['node.formtitle']
        if form_title:
            print(f"Form title: {form_title}")

        print(hit['text'])

        get_parent(hit)
        print("\n\n")
# Print the report for the hits of the most recent vector search; get_parent is
# called for each hit as part of the report.
search_result_output(search_results)

0.9270763397216797
Offence Act
WHAT HAPPENS AFTER YOUR NOTICE OF DISPUTE HAS BEEN RECEIVED?
None
https://www.bclaws.gov.bc.ca/civix/document/id/complete/statreg/208_2018_FormQ-1.gif
Regulation: Offence Act Forms Regulation
Form title: Form Q.1
how to dispute the ticket : what happens after your notice of dispute has been received?
parent retrieval
4:2645744a-9791-405a-8926-775c3f1e9540:156399
HOW TO PAY THE TICKET: REDUCE TICKETED AMOUNT(S) BY PAYING EARLY 
If a ticketed amount is over $58 and you pay in full on or before the 30th day from the date of service of the ticket, the ticketed amount is reduced by $25.

HOW TO PAY THE TICKET: PAY ONLINE 
Paying online is your quickest option. Visit: pay.gov.bc.ca Payment by credit card and other methods can be accepted at the website above.

HOW TO PAY THE TICKET: PAY BY MAIL 
Mail your payment by cheque or money order in Canadian funds payable to the Insurance Corporation of British Columbia at the following address: Ticket Payment Processin

[
    {
        "title": "how to pay",
        "subsection": [
            {
                "title": "",
                "Description": ""
            },
            {
                "title": "",
                "subsection": [
                    {
                        "title": ""
                    }
                ]
            }
        ]
    },
    {
        "title": "how to dispute",
        "Description": ""
    }
]

In [220]:
# Re-run the example search so that search_results is fresh for the next cell.
query = 'How to dispute a ticket?'
search_results = neo4j_vector_search(question=query, index_name='Acts_Updatedchunks')

In [None]:
get_parent(search