In [None]:
!pip install langchain
!pip install neo4j
!pip install bs4
!pip install llama-index
!pip uninstall -y trulens_eval
#!pip install trulens-eval==0.25.1
!pip install llmlingua
!pip install regex
!pip uninstall -y trulens_eval
!pip install trulens-eval

In [34]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import SimpleDirectoryReader, StorageContext
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
from llmlingua import PromptCompressor
import re
from IPython.core.display import display, HTML

In [20]:
def file_metadata(x):
    """Return the metadata dict for one file: the given value stored under "filename"."""
    return {"filename": x}


# Load everything under ./xml; `file_metadata` is handed to the reader as its
# per-file metadata callback.
Acts_documents = SimpleDirectoryReader("./xml", file_metadata=file_metadata).load_data()

In [103]:
# Recursive character splitter configured for chunks of up to 1000 units (measured with
# `len`, i.e. characters) and an overlap of 200, using the listed separators.
# NOTE(review): `text_splitter1` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
text_splitter1 = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
    separators=["\n\n", "\n", ". ", " ", ""],
)

In [71]:
# Embedding model used by create_chunk_embeddings() / create_chunk_neo4j() through
# `embeddings.embed_query(...)` to produce the vectors stored on the chunk nodes.
embeddings = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

In [171]:
# Connection settings for the Neo4j instance. Each value can be overridden with an
# environment variable of the same name; the literals are the notebook's original
# values and are kept only as fallbacks so existing runs behave the same.
# NOTE(review): the fallback password is still committed in plain text — prefer
# exporting NEO4J_PASSWORD and rotating this credential.
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://8.tcp.us-cal-1.ngrok.io:10651')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', '12345678')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE', 'neo4j')

# connect with the graph
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [54]:
# Cypher: create a :SummaryChunk node keyed by chunkId if it does not exist yet.
# The remaining properties are written only when the node is created (ON CREATE SET),
# so re-running the query for an existing chunkId leaves that node's properties as they are.
merge_chunk_node_query = """
MERGE(mergedChunk:SummaryChunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.ActId = $chunkParam.ActId,
        mergedChunk.sectionId = $chunkParam.sectionId,
        mergedChunk.sectionName = $chunkParam.sectionName,
        mergedChunk.summary = $chunkParam.summary
RETURN mergedChunk
"""

# Cypher: look up a :SummaryChunk whose id, sequence number, act, section and text all
# equal the values in $chunkParam.
# NOTE(review): not executed by any cell visible in this notebook.
match_chunk_node_query =  """
        MATCH (chunk:SummaryChunk) WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.ActId = $chunkParam.ActId
        AND chunk.sectionId = $chunkParam.sectionId
        AND chunk.sectionName = $chunkParam.sectionName
        AND chunk.text = $chunkParam.text
        RETURN chunk
        """

# Cypher: find the same chunk as above, but only while its `textEmbedding` is still NULL,
# and store $vector on it through db.create.setNodeVectorProperty. A chunk that already
# has an embedding is not matched, so the query then returns no rows.
create_embeddings = """
        MATCH (chunk:SummaryChunk) WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.ActId = $chunkParam.ActId
        AND chunk.sectionId = $chunkParam.sectionId
        AND chunk.sectionName = $chunkParam.sectionName
        AND chunk.text = $chunkParam.text
        AND chunk.textEmbedding is NULL
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", $vector)
        RETURN chunk
    """

# Cypher: add a REFERENCE relationship from the given chunk to the first chunk
# (chunkSeqId = 0) of another section of the same act.
# NOTE(review): the target section is read from the parameter key `connectnedsectionId`
# (spelled exactly like that). create_metadata() in this notebook does not emit that key
# and no visible cell runs this query — confirm the key name before using it.
connect_chunks = """
      MATCH (chunk:SummaryChunk), (f:SummaryChunk)
      WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.ActId = $chunkParam.ActId
        AND chunk.sectionId = $chunkParam.sectionId
        AND chunk.sectionName = $chunkParam.sectionName
        AND f.ActId = $chunkParam.ActId
        AND f.sectionId = $chunkParam.connectnedsectionId
        AND f.chunkSeqId = 0
      MERGE (chunk)-[newRelationship:REFERENCE]->(f)
      RETURN count(newRelationship)
    """

In [10]:
def create_chunk_relationship(act_info):
    """Chain the chunks of one section together with NEXT relationships.

    The query selects the :SummaryChunk nodes whose ActId, sectionName and sectionId
    equal the corresponding entries of `act_info`, orders them by chunkSeqId and
    passes the ordered list to `apoc.nodes.link` (with avoidDuplicates enabled).

    Args:
        act_info: mapping providing the 'ActId', 'sectionName' and 'sectionId' keys
            read by the query (a chunk record from create_metadata has them).

    Returns:
        None; the query result is not returned to the caller.
    """
    link_section_chunks = """
  MATCH (from_same_section:SummaryChunk)
  WHERE from_same_section.ActId = $ActParam['ActId']
  AND from_same_section.sectionName = $ActParam['sectionName']
  AND from_same_section.sectionId = $ActParam['sectionId']
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )  // NEW!!!
  RETURN size(section_chunk_list)
"""
    query_params = {'ActParam': act_info}
    kg.query(link_section_chunks, params=query_params)

In [28]:
## Find every reference written as "section <number>" in a piece of text.
## "subsection <number>" matches as well, because the word contains "section".
def extract_references(str):
    """Return the section numbers referenced in `str`.

    Args:
        str: text to scan (the parameter name is kept for existing callers even
            though it shadows the builtin).

    Returns:
        A list of the referenced numbers as strings, in order of appearance, e.g.
        ["14", "83.1"]. Decimal section numbers such as 83.1 are captured whole;
        a trailing sentence period is not included. Empty list when nothing matches.
    """
    # Previously the matches were computed and then dropped (the function returned None).
    return re.findall(r"section (\d+(?:\.\d+)*)", str)
      
def find_links(subsection):
    """Return the text of an XML element on one line, showing its first link if any.

    When the element contains at least one `bcl:link` tag, the first one is rendered
    in the notebook as an HTML anchor built from its `xlink:href` attribute and text.
    The element text is also passed to extract_references().

    Args:
        subsection: a parsed XML element exposing `find_all` and `get_text`
            (a BeautifulSoup tag in this notebook), or a falsy value.

    Returns:
        The element text with every newline and carriage return replaced by a space.
        An empty string for a falsy argument, so callers that concatenate the result
        never receive None.
    """
    if not subsection:
        return ""
    # Look the links up once instead of re-running find_all for every use.
    links = subsection.find_all("bcl:link")
    if links:
        first_link = links[0]
        xml_link = first_link['xlink:href']
        display(HTML(f'<a href="{xml_link}">{first_link.get_text()}</a>'))
    text = subsection.get_text()
    extract_references(text.replace("\n\n", "").replace("\r", ""))
    return text.replace("\n", " ").replace("\r", " ")

In [12]:

#get subsection
def subsection(section_definitions, index):
    """Concatenate the text of several XML elements, one per line.

    Args:
        section_definitions: iterable of elements accepted by find_links().
        index: unused; kept so existing calls keep working.

    Returns:
        A string in which each element's text (as returned by find_links) is
        preceded by a newline. An element for which find_links returns None
        contributes an empty line instead of raising a TypeError. Empty string
        for an empty iterable.
    """
    # The loop variable no longer reuses the name `subsection`, which hid this function
    # inside its own body.
    return "".join("\n" + (find_links(element) or "") for element in section_definitions)

In [53]:
def create_metadata(token_split_texts,title, section_heading, section_id, summary):
    """Wrap every text chunk in a record carrying its section metadata.

    Args:
        token_split_texts: iterable of chunk texts; all of them are used.
        title: act title, stored as 'ActId' and used as the chunkId prefix.
        section_heading: stored as 'sectionName' and embedded in the chunkId.
        section_id: stored (formatted as a string) as 'sectionId'.
        summary: stored (formatted as a string) as 'summary' on every record.

    Returns:
        A list with one dict per chunk, in input order. 'chunkSeqId' counts from 0
        and 'chunkId' is '<title>-chunk-<section_heading>-<4-digit sequence>'.
    """
    # NOTE(review): chunkId does not include section_id, so two sections of one act
    # that share a heading would produce identical ids — confirm this is intended.
    return [
        {
            'text': chunk_text,
            'chunkSeqId': sequence_number,
            'chunkId': f'{title}-chunk-{section_heading}-{sequence_number:04d}',
            'ActId': f'{title}',
            'sectionId': f'{section_id}',
            'sectionName': f'{section_heading}',
            'summary': f'{summary}',
        }
        for sequence_number, chunk_text in enumerate(token_split_texts)
    ]

In [111]:
# LlamaIndex wrapper for the same model name that `embeddings` is built with.
# NOTE(review): `embed_model` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="mixedbread-ai/mxbai-embed-large-v1")

In [218]:
from llama_index.core.node_parser import SentenceSplitter
import nltk
import time
import random

def create_chunks(item_text, title, section_heading, section_id):
    """Summarise a section with the chat model and split its text into chunk records.

    Relies on the module-level `chat` session, which must exist before this function
    is called (in this notebook it is created in a later cell).

    Args:
        item_text: full text of the section.
        title: act title, forwarded to create_metadata.
        section_heading: heading text; also prepended to the summarisation prompt.
        section_id: section identifier, forwarded to create_metadata.

    Returns:
        The list of chunk records built by create_metadata; every record carries the
        same summary.

    Raises:
        Whatever `chat.send_message` raises if the single retry fails as well.
    """
    nltk_tokens = nltk.word_tokenize(item_text)
    print(len(nltk_tokens))
    # Fixed pause before every request — presumably to stay under an API rate limit.
    time.sleep(10)
    prompt = "summarize this text in detail using simple language:" + section_heading + "\n" + item_text
    try:
        summary = chat.send_message(prompt).text
    except Exception as error:
        # `except Exception` instead of a bare `except`, so that KeyboardInterrupt and
        # SystemExit stop the run instead of triggering a sleep and a retry.
        print("summary failed trying again in 20 sec")
        print(f"reason: {error!r}")
        time.sleep(20)
        summary = chat.send_message(prompt).text

    print("got summary")
    token_split_texts = []
    # Texts with more than 511 NLTK tokens are split into sentence-based chunks;
    # shorter texts are stored as a single chunk.
    if len(nltk_tokens) > 511:
        token_splitter = SentenceSplitter(chunk_size=512,chunk_overlap=20)
        token_split_texts += token_splitter.split_text(item_text)
    else:
        token_split_texts.append(item_text)

    meta_data = create_metadata(token_split_texts,title, section_heading, section_id, summary)
    return meta_data

In [51]:
def create_chunk_embeddings(tokens):
    """Persist chunk records in Neo4j, embed them and link them in order.

    For every record: merge the :SummaryChunk node, compute the embedding of its
    'text' with `embeddings.embed_query`, and store it with the `create_embeddings`
    query. Afterwards the chunks of the section are chained via
    create_chunk_relationship, using the first record to identify the section.

    Args:
        tokens: list of chunk records as produced by create_metadata. All records
            are expected to belong to the same section.

    Returns:
        None. An empty `tokens` list is a no-op (it used to raise IndexError on
        `tokens[0]`).
    """
    if not tokens:
        return
    for chunk in tokens:
        print(f"Creating `:Chunk` node for chunk ID {chunk['chunkSeqId']}")
        kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
        vector = embeddings.embed_query(chunk['text'])
        result = kg.query(create_embeddings, params={'chunkParam':chunk, 'vector':vector})
        if result:
            print("Embedding created")
        else:
            # No row came back: the query only matches chunks whose textEmbedding is
            # still NULL, so this also happens when the chunk was embedded earlier.
            print(result)
            print("Embedding not created")
    create_chunk_relationship(tokens[0])

In [50]:
def create_chunk_neo4j(tokens, search=False):
    """Either create the chunk nodes or add embeddings to existing ones.

    Args:
        tokens: list of chunk records as produced by create_metadata.
        search: when False (default), merge a :SummaryChunk node for every record and
            then chain the section's chunks via create_chunk_relationship. When True,
            compute and store an embedding for every record instead; processing stops
            at the first record for which the `create_embeddings` query returns no row.

    Returns:
        None. An empty `tokens` list is a no-op (with search=False it used to raise
        IndexError on `tokens[0]`).
    """
    if not tokens:
        return
    for chunk in tokens:
        if search:
            vector = embeddings.embed_query(chunk['text'])
            result = kg.query(create_embeddings, params={'chunkParam':chunk, 'vector':vector})
            if result:
                print("Embedding creating")
            else:
                print(result)
                return
        else:
            print(f"Creating `:Chunk` node for chunk ID {chunk['chunkSeqId']}")
            kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
    if not search:
        create_chunk_relationship(tokens[0])

In [216]:
from tqdm import tqdm 
def extract_data(file, index,  search=False, left_sections=None):
    """Summarise, chunk and persist the selected sections of one parsed Act.

    Args:
        file: parsed XML document exposing `find` / `find_all` (a BeautifulSoup
            object in this notebook).
        index: unused; kept so existing calls keep working.
        search: unused; kept so existing calls keep working.
        left_sections: numeric section numbers to process; a section is handled only
            when `float(<its number text>)` is in this collection. Defaults to the
            list that was previously hardcoded: [14, 15, 16, 22, 24.2, 82].

    Returns:
        None. Returns early, doing nothing, when the document has no `act:title`.
    """
    if left_sections is None:
        left_sections = [14, 15, 16, 22, 24.2, 82]

    #get the ACT's title
    title = file.find('act:title')
    if not title:
        return
    title = title.get_text()
    print(title)

    preamble = file.find_all('bcl:preamble')
    if preamble:
        item_text = subsection(preamble, 0)
        token = create_chunks(item_text, title, 'preamble', 0)
        # The preamble chunks used to be built (including the summary request) and then
        # dropped; store them like any other section.
        create_chunk_embeddings(token)

    #get all the sections
    sections = file.find_all('bcl:section')
    # `position` is a separate name so the loop no longer overwrites the `index` parameter.
    for position, section in enumerate(tqdm(sections)):
        heading_tag = section.find('bcl:marginalnote')
        section_heading = heading_tag.get_text() if heading_tag else ""

        # Sections holding more than one definition are skipped.
        if len(section.find_all('bcl:definition')) > 1:
            continue

        #find the section number
        number_tag = section.find('bcl:num')
        if number_tag is None:
            # Without a number the section cannot be matched against left_sections.
            continue
        section_number = number_tag.get_text()
        print("section number is:" + section_number)
        try:
            selected = float(section_number) in left_sections
        except ValueError:
            # A non-numeric section number can never equal one of the numeric entries.
            selected = False
        if not selected:
            continue

        #find the remaining subsection
        section_subsection = section.find_all('bcl:subsection')
        if len(section_subsection):
            item_text = subsection(section_subsection, position + 1)
            print(item_text)
        else:
            item_text = find_links(section)
        token = create_chunks(item_text, title, section_heading, section_number)
        create_chunk_embeddings(token)

In [None]:
# Parse the text of the first loaded document with BeautifulSoup's 'xml' parser.
soup = BeautifulSoup(Acts_documents[0].get_text(), 'xml') 


In [219]:
# Run the extraction on the parsed Act. The function uses the global `chat` session,
# so the Gemini setup cell further down must have been executed first.
extract_data(soup, 0,  False)

Offence Act


  0%|          | 0/155 [00:00<?, ?it/s]

section number is:2
section number is:3
section number is:4
section number is:5
section number is:6
section number is:7
section number is:8
section number is:9
section number is:10
section number is:10.1
section number is:11
section number is:12
section number is:13
section number is:13.1
section number is:14
section number is:14



 1 An enforcement officer may  a sign and issue a violation ticket, other than an eTicket, for contravention of an enactment referred to in the regulations, or   b issue an eTicket for contravention of an enactment referred to in the regulations.  
 2 For the purpose of a violation ticket issued to the owner of a motor vehicle in respect of an offence under section 83.1 (2) or (2.1) of the Motor Vehicle Act but without limiting the application of subsection (1) of this section,  a the violation ticket may be created, completed and signed in electronic format by electronic or any other means that allows the violation ticket to be reproduced in intelligible form including, without limitation, electronically,   b in place of signing the violation ticket under subsection (1), the enforcement officer may identify himself or herself as the person making and authenticating the violation ticket by means of an electronic reproduction of his or her signature that is capable of being assigned to

 10%|█         | 16/155 [00:40<05:47,  2.50s/it]

Embedding created
section number is:14.01
section number is:14.1
section number is:15
section number is:15

 1 A person on whom a violation ticket has been served may, within 30 days of being served, dispute the allegation or the fine portion of the ticketed amount indicated on the ticket by  a delivering or causing to be delivered to the address set out in the ticket or to an address set out in the instructions prescribed under section 132 (2) (a.3) a written notice of dispute, or   b appearing in person at the location set out in the ticket or at a location set out in the instructions prescribed under section 132 (2) (a.3) to give notice of dispute.  
 2 A person to whom a violation ticket has been mailed under section 14 (6) (a) but on whom the ticket has not been served may, within 45 days from the date the ticket is completed, dispute the allegation or the fine portion of the ticketed amount indicated on the ticket by  a delivering or causing to be delivered to the address set out

 12%|█▏        | 19/155 [01:22<11:17,  4.98s/it]

Embedding created
section number is:15.1
section number is:15.2
section number is:15.3
section number is:15.4
section number is:16
section number is:16

 1 If a person who has been served with a violation ticket under section 14 has not, within 30 days after the ticket was served on the person, either paid all or a portion of the ticketed amount or disputed the allegation or the fine portion of the ticketed amount in accordance with section 15,  a the person is deemed to have pleaded guilty to the alleged contravention, and   b the ticketed amount indicated on the ticket is immediately payable to the government.  
 2 Subject to subsection (3), if a person who  a is served with a violation ticket, and   b is convicted under section 15 or deemed to have pleaded guilty under subsection (1)  wishes to dispute the allegation or the amount of the fine, the person must apply to a justice and the justice, on being satisfied of the matters in subsection (2.1) by affidavit in the prescribed form

 15%|█▌        | 24/155 [02:07<13:43,  6.29s/it]

Embedding created
section number is:17
section number is:18
section number is:18.1
section number is:19
section number is:20
section number is:21
section number is:22
section number is:22

 1 If a peace officer believes that an offence punishable on conviction has been committed and that it would be impracticable to appear personally before a justice to apply for a warrant in accordance with section 21, the peace officer may submit an information on oath by telephone or other means of telecommunication to a justice designated for that purpose by the chief judge of the Provincial Court. 
 2 An information submitted by telephone or other means of telecommunication must be on oath and must be recorded word for word by the justice, who must, as soon as practicable, cause the record or a transcription of it, certified by the justice as to time, date and contents, to be filed with the clerk of the court for the territorial division in which the warrant is intended for execution. 
 3 An oath 

 20%|██        | 31/155 [02:56<13:39,  6.61s/it]

Embedding created
section number is:23
section number is:24.1
section number is:24.2
section number is:24.2



 1 Unless otherwise provided by another enactment, an order for the disposition of a thing seized may be made by application under this section  a to a Provincial Court judge, in the case of a thing ordered detained by a judge of that court, or   b to a justice in any other case.  
 1.1 Subsection (1) of this section does not apply if, under section 14.04 (1) (b) (iv) or 23.01 of the Civil Forfeiture Act, the director under that Act serves notice of forfeiture under Part 3.1 of that Act or notice of intent to commence proceedings, as the case may be, on the responsible official having custody of the thing seized, unless  a that notice is withdrawn under section 14.08 (a) or 23.01 (4) (a) (ii) of that Act, or   b in the case of notice of intent to commence proceedings, the 30 day period referred to in section 23.01 (4) (a) of that Act expires and no proceedings are commenced under section 3 of that Act in relation to the thing seized.  
 2 An application under this section may be made 

 23%|██▎       | 35/155 [03:45<15:56,  7.97s/it]

Embedding created
section number is:25
section number is:26
section number is:27
section number is:28
section number is:29
section number is:30
section number is:31
section number is:31.01
section number is:31.1
section number is:32
section number is:33
section number is:34
section number is:35
section number is:36
section number is:37
section number is:38
section number is:39
section number is:40
section number is:41
section number is:42
section number is:43
section number is:44
section number is:45
section number is:46
section number is:47
section number is:48
section number is:49
section number is:50
section number is:51
section number is:52
section number is:53
section number is:54
section number is:55
section number is:56
section number is:57
section number is:58
section number is:59
section number is:60
section number is:61
section number is:62
section number is:63
section number is:63.1
section number is:64
section number is:65
section number is:66
section number is:67
section n


 1 Subject to subsection (7), but despite any other provision of this Act, any other Act, regulation, municipal bylaw or order made by a justice, a justice must not, except under the Small Claims Act, order that a person be imprisoned merely because he or she defaults in paying a fine. 
 2 If a justice imposes a fine authorized by this or any other Act, the justice, despite this Act or the Act under which the fine is imposed, may order that the fine and the victim surcharge levy that is, under section 8.1 of the Victims of Crime Act, to be paid with that fine, be paid  a subject to subsection (3), at once, or   b at a time, in instalments and subject to terms and conditions the justice considers appropriate.  
 3 A justice must not make an order under subsection (2) (a) unless  a the justice is satisfied that the person against whom the fine is imposed has sufficient means and ability to enable the person to pay the fine and the victim surcharge levy that is, under section 8.1 of the 

100%|██████████| 155/155 [04:18<00:00,  1.67s/it]

Embedding created
section number is:83
section number is:84
section number is:85
section number is:86
section number is:87
section number is:88
section number is:89
section number is:89.1
section number is:89.2
section number is:89.3
section number is:89.4
section number is:89.5
section number is:89.6
section number is:89.7
section number is:89.8
section number is:90
section number is:91
section number is:92
section number is:93
section number is:94
section number is:95
section number is:96
section number is:97
section number is:98
section number is:99
section number is:100
section number is:101
section number is:102
section number is:103
section number is:104
section number is:105
section number is:106
section number is:107
section number is:108
section number is:109
section number is:110
section number is:111
section number is:112
section number is:113
section number is:114
section number is:115
section number is:116
section number is:117
section number is:118
section number is:119
s




In [None]:
import google.generativeai as genai

# The API key is read from the GOOGLE_API_KEY environment variable so it never has to be
# written into the notebook; when the variable is unset the original empty string is used.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Set up the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 0,
  "max_output_tokens": 8192
}


model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest",
                              generation_config=generation_config)

# `chat` is the session create_chunks() sends its summarisation prompts to, so this cell
# has to run before create_chunks() / extract_data() are called.
chat = model.start_chat(history=[])

In [161]:
# Scratch check that the chat session answers; binds the global name `summary`.
summary = chat.send_message("what is life?")

In [162]:
# Show the text of the response obtained in the previous cell.
print(summary.text)

"What is life?" is a vast and complex question that has captivated philosophers and scientists for centuries.  There's no single, definitive answer, as the concept of life can be explored through various lenses:

**Biology:** 

* **Defining Life:** From a biological standpoint, life is often characterized by a set of properties such as organization, metabolism, growth, adaptation, response to stimuli, and reproduction.  Living organisms maintain homeostasis, meaning they regulate their internal environment to remain stable.
* **Origins:** The emergence of life on Earth is a captivating mystery. Leading theories point to abiogenesis, where life arose from non-living matter, potentially in environments like primordial soup or near hydrothermal vents. 
* **Diversity:** Life on Earth showcases astounding diversity, encompassing simple single-celled organisms to complex multicellular life forms like animals, plants, and fungi. 

**Philosophy:**

* **Meaning and Purpose:** Philosophers grapp

In [163]:
# Scratch cell: send a similar question through the MetaAI client and print the
# "message" field of its response. `ai` is only referenced elsewhere in commented-out code.
from meta_ai_api import MetaAI
ai = MetaAI()
response = ai.prompt(message="what is life")
print(response["message"])


The age-old question!
Life is a complex and multifaceted concept that has been debated and explored by philosophers, scientists, and many other thinkers throughout human history. Here's a brief overview:
Biological perspective:
Life refers to the characteristic that distinguishes living organisms (such as animals, plants, and microorganisms) from non-living matter. It encompasses the functional, organizational, and developmental processes that occur within an organism, including growth, reproduction, metabolism, and evolution.
Philosophical perspective:
Life can be seen as a journey, an experience, or a state of being. It encompasses our individual and collective existence, our consciousness, and our relationships with others and the world around us. Philosophers have long grappled with questions about the meaning, purpose, and value of life.
Cultural and personal perspective:
Life is a rich tapestry of experiences, emotions, and connections. It's shaped by our cultural backgrounds, be

In [197]:
# Scratch check of the membership test used in extract_data(): a section number parsed
# with float() that is absent from the list must be reported as "not in".
left_sections = [14, 15, 16, 22, 24.2, 82]

if left_sections.count(float("14.2")) == 0:
    print("yey")

yey
