In [None]:
!pip install langchain
!pip install neo4j
!pip install bs4
!pip install llama-index
!pip uninstall -y trulens_eval
!pip install trulens-eval==0.25.1
!pip install llmlingua

In [None]:
!pip uninstall -y trulens_eval
!pip install trulens-eval

In [None]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import SimpleDirectoryReader, StorageContext
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import numpy as np
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
from llmlingua import PromptCompressor
import re
from IPython.core.display import display, HTML

In [None]:
def word_wrap(string, n_chars=72):
    """Wrap ``string`` into lines joined by newline characters.

    Each line is cut at the last space found inside the next ``n_chars``
    characters; the space itself is replaced by the line break. When that
    window contains no space at all, the line is hard-broken after
    ``n_chars`` characters without discarding any character (a space
    sitting exactly at the break position is still consumed).

    Args:
        string: Text to wrap.
        n_chars: Window size used to look for a break position (>= 1).

    Returns:
        The wrapped text. Strings shorter than ``n_chars`` are returned
        unchanged.

    Raises:
        ValueError: If ``n_chars`` is smaller than 1.
    """
    if n_chars < 1:
        raise ValueError("n_chars must be a positive integer")

    lines = []
    remaining = string
    # Iterative on purpose: one recursion level per output line overflows
    # the interpreter's recursion limit on long documents.
    while len(remaining) >= n_chars:
        window = remaining[:n_chars]
        head = window.rsplit(' ', 1)[0]
        broke_at_space = len(head) < len(window)
        # Only skip a character when it really is the separating space;
        # skipping unconditionally loses a letter of long unbroken tokens.
        if broke_at_space or remaining[n_chars:n_chars + 1] == ' ':
            consumed = len(head) + 1
        else:
            consumed = len(head)
        lines.append(head)
        remaining = remaining[consumed:]
    lines.append(remaining)
    return '\n'.join(lines)

In [None]:
# Neo4j connection settings come from the environment so that no credentials
# are stored in the notebook.
# os.getenv returns None for an unset variable, and 'bolt://' + None fails with
# an opaque TypeError; fall back to a local instance when NEO4J_HOST is unset
# or empty. The resolved URI is printed below so the fallback is visible.
NEO4J_URI = 'bolt://' + (os.getenv('NEO4J_HOST') or 'localhost') + ':7687'
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = 'neo4j' #os.getenv('NEO4J_DB')
print(NEO4J_URI)
print(NEO4J_DATABASE)

In [None]:
# Graph handle used by every kg.query(...) call in this notebook; it is built
# from the connection settings defined in the configuration cell.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Sanity check: count every node currently stored in the graph.
cypher = """
  MATCH (n) 
  RETURN count(n)
  """
result = kg.query(cypher)
# Bare last expression so the notebook renders the query result.
result

In [None]:
# Character-based splitter applied first by create_chunks: pieces of up to
# 1000 characters (measured with len) with 200 characters of overlap, trying
# the separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [None]:
def file_metadata(x):
    """Build the metadata dict for one loaded file: its name under "filename"."""
    return {"filename": x}


# Load every file found under ./XML, tagging each document with file_metadata.
Acts_documents = SimpleDirectoryReader(
    "./XML",
    file_metadata=file_metadata,
).load_data()

In [None]:
# Raw text of the first loaded document; the exploratory cells parse this.
act_data = Acts_documents[0].get_text()

In [None]:
def get_title(soup):
    """Return the whitespace-stripped text of the first ``<h2>`` in ``soup``.

    Raises:
        IndexError: If ``soup`` contains no ``<h2>`` element.
    """
    first_heading = soup.find_all("h2")[0]
    return first_heading.get_text().strip()

In [None]:
def get_definitions(sections):
    """Return the first section whose ``<h4>`` heading mentions 'Definition'.

    Each inspected heading is printed, as before. Sections without an
    ``<h4>`` heading are skipped instead of raising ``AttributeError``.

    Args:
        sections: Iterable of parsed section elements exposing ``find``.

    Returns:
        The matching section, or ``None`` when no heading matches.
    """
    for section in sections:
        heading = section.find("h4")
        if heading is None:
            # find() returns None when the section has no <h4>; nothing to test.
            continue
        heading_text = heading.get_text()
        print(heading_text)
        if 'Definition' in heading_text:
            return section
    return None

In [None]:
def get_preamble(soup):
    """Print the text of the first ``<div class="preamble">`` in ``soup``.

    Prints nothing when there is no such element. Always returns ``None``.
    """
    matches = soup.find_all("div", class_='preamble')
    if not matches:
        return
    print(matches[0].get_text())

In [None]:
# Parse the sample document with the "lxml" parser for the exploratory cells.
# NOTE(review): the ingestion loops later in this notebook parse with 'xml'
# instead — confirm the difference is intended.
file = BeautifulSoup(act_data, "lxml")

In [None]:
# Dump the whole parsed document for inspection.
print(file)

In [None]:
# All <act:title> elements of the sample document (a list, not a string).
title = file.find_all('act:title')
print(title)

In [None]:
# All <bcl:definition> elements of the sample document.
definitions = file.find_all('bcl:definition')
print(definitions)

In [None]:
# Despite its name, num_sections holds the <bcl:section> elements themselves.
num_sections = file.find_all('bcl:section')
print(num_sections)
# len() returns an int, which cannot be indexed; print the count directly.
print(len(num_sections))

## Working with the 1st section

#### bcl - B.C. Laws
`bcl:num` gives the number of a section, subsection, or other numbered element

bcl:section gives all the sections and subsections

bcl:marginalnote gives the section heading

If the section is a definitions section, then each definition is wrapped in its own `bcl:definition` tag

In [None]:
# Pull one section apart into the pieces the ingestion code looks for:
# marginal note (heading), definitions and subsections.
# NOTE(review): index 2 is the third entry of num_sections although the
# variable is called section_1 — confirm which section is meant.
section_1 = num_sections[2]
section_heading = section_1.find_all('bcl:marginalnote')
section_definitions = section_1.find_all('bcl:definition')
section_subsection = section_1.find_all('bcl:subsection')
print(section_1)
print(len(section_subsection))

In [None]:
# Module-level scratch list. In the visible notebook it is only printed; the
# one statement that would append to it is commented out.
cache = []

In [None]:
#def link_references(str, index):
    

In [None]:
# Cypher used when ingesting a chunk: MERGE an :UpdatedChunk node keyed by
# chunkId. The other properties are set only when the node is created
# (ON CREATE), so merging an existing chunkId again leaves its properties as is.
# Expects a $chunkParam map with chunkId, chunkSeqId, text, ActId, sectionId
# and sectionName.
merge_chunk_node_query = """
MERGE(mergedChunk:UpdatedChunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.ActId = $chunkParam.ActId,
        mergedChunk.sectionId = $chunkParam.sectionId,
        mergedChunk.sectionName = $chunkParam.sectionName
RETURN mergedChunk
"""

In [None]:
# Cypher used in search mode: look up an already-stored :UpdatedChunk node whose
# chunkId, chunkSeqId, ActId, sectionId and sectionName all equal the values in
# the $chunkParam map. The chunk text is not part of the match.
match_chunk_node_query =  """
        MATCH (chunk:UpdatedChunk) WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.ActId = $chunkParam.ActId
        AND chunk.sectionId = $chunkParam.sectionId
        AND chunk.sectionName = $chunkParam.sectionName
        RETURN chunk
        """

In [None]:
## This function will search for any reference that has the word section or subsection followed by a number
def extract_references(str, index):
    """Return the section numbers referenced in a piece of text.

    The pattern is unanchored, so "subsection 3" matches as well as
    "section 3". Previously the matches were computed and then discarded;
    they are now returned so callers can use them (callers that ignore the
    return value are unaffected).

    Args:
        str: Text to scan. The name shadows the built-in ``str`` but is kept
            so existing callers keep working.
        index: Caller-supplied position; currently unused.

    Returns:
        List of the matched numbers as strings, in order of appearance.
    """
    return re.findall(r"section (\d+)", str)
# Module-level statement (not part of extract_references): runs when the cell
# is executed and shows the current contents of the `cache` list.
print(cache)        

In [None]:
def find_links(subsection, index=None):
    """Render the first cross-reference link of an element and return its text.

    If the element contains ``bcl:link`` tags, the first one is displayed in
    the notebook as an HTML anchor. The element's text is then passed to
    ``extract_references`` and returned with line breaks turned into spaces.

    Args:
        subsection: Parsed element (section, subsection, definition, ...).
        index: Optional position forwarded to ``extract_references``. It is an
            explicit parameter so the function no longer depends on a
            module-level ``index`` happening to exist.

    Returns:
        The element's text with ``\\n`` and ``\\r`` replaced by spaces, or an
        empty string when ``subsection`` is falsy (previously ``None``, which
        broke callers that concatenate the result).
    """
    if not subsection:
        return ""

    # Local import: the notebook's import cell does not provide html.escape.
    from html import escape

    links = subsection.find_all("bcl:link")
    if links:
        first_link = links[0]
        href = first_link.get('xlink:href')
        if href is not None:
            # Escape both parts: they come straight from the source document
            # and are interpolated into HTML.
            anchor = f'<a href="{escape(href, quote=True)}">{escape(first_link.get_text())}</a>'
            display(HTML(anchor))

    raw_text = subsection.get_text()
    extract_references(raw_text.replace("\n\n", "").replace("\r", ""), index)
    return raw_text.replace("\n", " ").replace("\r", " ")

In [None]:
#get subsection
def subsection(section_definitions, index):
    """Concatenate the text of several elements, one per line.

    Each element is passed through ``find_links`` and its text is appended
    after a leading newline.

    Args:
        section_definitions: Iterable of parsed elements (definitions,
            subsections, preamble, ...).
        index: Section position supplied by the caller; currently unused.

    Returns:
        A single string; empty when ``section_definitions`` is empty.
    """
    # `or ""` keeps the concatenation valid if find_links yields no text.
    return "".join(
        "\n" + (find_links(element) or "")
        for element in section_definitions
    )

In [None]:
def create_metadata(token_split_texts, title, section_heading, section_id):
    """Wrap every text chunk in a record carrying its Act/section metadata.

    Args:
        token_split_texts: Chunk texts, in document order.
        title: Act title; stored as ``ActId`` and used in ``chunkId``.
        section_heading: Section heading; stored as ``sectionName`` and used
            in ``chunkId``.
        section_id: Section identifier; stored as a string in ``sectionId``.

    Returns:
        One dict per chunk (all chunks are kept), numbered from 0 via
        ``chunkSeqId``.
    """
    return [
        {
            'text': piece,
            # position of the chunk within this section
            'chunkSeqId': position,
            'chunkId': f'{title}-chunk-{section_heading}-{position:04d}',
            'ActId': f'{title}',
            'sectionId': f'{section_id}',
            'sectionName': f'{section_heading}',
        }
        for position, piece in enumerate(token_split_texts)
    ]

In [None]:
# Token splitter shared by all create_chunks calls. It is built on first use
# because its constructor loads a sentence-transformers model, which is far too
# costly to repeat for every section.
_token_splitter = None


def create_chunks(item_text, title, section_heading, section_id):
    """Split a section's text into chunks and attach metadata to each.

    The text is first split by the module-level ``text_splitter`` and each
    piece is then split again by token count (256 tokens per chunk, 20 tokens
    of overlap).

    Args:
        item_text: Text of the section (or preamble).
        title: Act title.
        section_heading: Heading of the section.
        section_id: Identifier of the section.

    Returns:
        The chunk records produced by ``create_metadata``; an empty list when
        ``item_text`` is empty or ``None``.
    """
    global _token_splitter
    if not item_text:
        return []
    if _token_splitter is None:
        _token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=20, tokens_per_chunk=256)

    token_split_texts = []
    for text in text_splitter.split_text(item_text):
        token_split_texts.extend(_token_splitter.split_text(text))
    return create_metadata(token_split_texts, title, section_heading, section_id)

In [None]:
# Cypher that links one chunk to the opening chunk of a section it refers to.
# `chunk` is identified by the same five properties as in the match query; `f`
# is the chunk of section $chunkParam.connectnedsectionId in the same Act.
# Chunk sequence ids are numbered from 0 within each section, so the opening
# chunk is chunkSeqId = 0; matching on 1 skipped every single-chunk section.
connect_chunks = """
      MATCH (chunk:UpdatedChunk), (f:UpdatedChunk)
      WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.ActId = $chunkParam.ActId
        AND chunk.sectionId = $chunkParam.sectionId
        AND chunk.sectionName = $chunkParam.sectionName
        AND f.ActId = $chunkParam.ActId
        AND f.sectionId = $chunkParam.connectnedsectionId
        AND f.chunkSeqId = 0
      MERGE (chunk)-[newRelationship:REFERENCE]->(f)
      RETURN count(newRelationship)
    """

In [None]:
def search_create_reference(match):
    """Create REFERENCE edges from a stored chunk to the sections it cites.

    The text of the first matched chunk is scanned for "section <n>" style
    references (case-insensitive; "subsection (2)" also matches and is
    captured with its parentheses). One ``connect_chunks`` query is issued
    for every distinct reference found, not only for the first one.

    Args:
        match: Rows returned by ``match_chunk_node_query``; each row has a
            ``'chunk'`` entry with the node's properties. Only the first row
            is used.

    Returns:
        None. The chunk, the references and each query result are printed.
    """
    if not match:
        # Nothing stored for this chunk, so there is nothing to link from.
        return

    node = match[0]['chunk']
    text = node['text']
    references = re.findall(r"(?i)(?:section)\s+(\d+|\(\d+\))(?:\s*\([a-z]\))?", text)
    if not references:
        return

    print(node)
    print(references)
    # create the edges
    print("Match found - creating references")
    # dict.fromkeys drops repeated references while keeping their order.
    for referenced_section in dict.fromkeys(references):
        chunk = {
            'text': text,
            'chunkSeqId': node['chunkSeqId'],
            'chunkId': node['chunkId'],
            'ActId': node['ActId'],
            'sectionId': node['sectionId'],
            'sectionName': node['sectionName'],
            # key spelling must match the parameter name used in connect_chunks
            'connectnedsectionId': referenced_section
        }
        result = kg.query(connect_chunks,
                params={
                    'chunkParam': chunk
                }
                )
        print(result)

In [None]:
def create_chunk_neo4j(tokens, search=False):
    """Write chunk records to the graph, or look them up and link them.

    Args:
        tokens: Chunk records as produced by ``create_metadata``.
        search: When falsy, every record is merged into the graph with
            ``merge_chunk_node_query``. When truthy, every record is looked
            up with ``match_chunk_node_query`` and non-empty results are
            handed to ``search_create_reference``.

    Returns:
        In search mode, the lookup result of every record (empty results
        included), in input order. Otherwise an empty list.
    """
    if not search:
        for record in tokens:
            print(f"Creating `:Chunk` node for chunk ID {record['chunkSeqId']}")
            kg.query(merge_chunk_node_query, params={'chunkParam': record})
        return []

    lookups = []
    for record in tokens:
        print('search')
        rows = kg.query(match_chunk_node_query, params={'chunkParam': record})
        if rows:
            search_create_reference(rows)
        lookups.append(rows)
    return lookups

In [None]:
def extract_data(file, search=False):
    """Chunk one parsed Act and write (or look up) its chunks in the graph.

    The preamble, if present, is processed as section 0 named 'preamble'.
    Every ``bcl:section`` is then processed in document order with ids
    starting at 1. For each section the text is taken from its definitions
    if it has any, otherwise from its subsections, otherwise from the whole
    section.

    Fixes over the previous version:
      * search mode no longer returns after the first section;
      * preamble chunks are actually sent to the graph instead of being
        overwritten by the first section's chunks;
      * a section without a marginal note no longer raises IndexError.

    Args:
        file: Parsed Act document.
        search: Forwarded to ``create_chunk_neo4j``; False creates chunk
            nodes, True looks them up and creates reference edges.

    Returns:
        In search mode, the lookup results of all chunks of the Act in
        processing order. Otherwise ``None``.
    """
    #get the ACT's title
    title = file.find_all('act:title')[0].get_text()
    print(title)

    found = []

    preamble = file.find_all('bcl:preamble')
    if preamble:
        item_text = subsection(preamble, 0)
        token = create_chunks(item_text, title, 'preamble', 0)
        found.extend(create_chunk_neo4j(token, search))

    #get all the sections
    for index, section in enumerate(file.find_all('bcl:section')):
        section_id = index + 1
        headings = section.find_all('bcl:marginalnote')
        if headings:
            section_heading = headings[0].get_text()
        else:
            # Keep chunk ids distinct for sections that carry no heading.
            section_heading = f'section-{section_id}'

        # Definitions take precedence; fall back to subsections (an empty
        # result list is falsy, so `or` picks the next candidate).
        parts = section.find_all('bcl:definition') or section.find_all('bcl:subsection')
        if parts:
            item_text = subsection(parts, section_id)
        else:
            item_text = find_links(section)

        token = create_chunks(item_text, title, section_heading, section_id)
        found.extend(create_chunk_neo4j(token, search))

    if search:
        return found

In [None]:
# First pass: parse each Act as XML and create its chunk nodes in the graph.
# NOTE(review): `index` is not used in this loop body, but it becomes a
# module-level name — check whether any helper reads it before removing it.
for index, Acts in enumerate(Acts_documents):
    soup = BeautifulSoup(Acts.get_text(), 'xml')
    extract_data(soup)
    #sections = soup.find_all("div", class_='section')

    # Stops after the first iteration, so only Acts_documents[0] is ingested.
    break

In [None]:
# Second pass: now let's match the stored chunks and create the links within the document.
# search_found collects one extract_data(..., True) result per processed Act.
search_found = []
for index, Acts in enumerate(Acts_documents):
    soup = BeautifulSoup(Acts.get_text(), 'xml')
    search_found.append(extract_data(soup, True))
    #sections = soup.find_all("div", class_='section')
    # Stops after the first iteration, so only Acts_documents[0] is searched.
    break

In [None]:
# Show the raw lookup results collected by the search pass.
print(search_found)

In [None]:
# Manual check of the reference pattern on one stored chunk:
# first Act -> first lookup result -> first returned row.
# Raises IndexError if any of those levels is empty.
text= search_found[0][0][0]['chunk']['text']
print(text)
# Same pattern as in search_create_reference; keep the two in sync.
references = re.findall(r"(?i)(?:section)\s+(\d+|\(\d+\))(?:\s*\([a-z]\))?", text)
print(references)