In [None]:
!pip install sentence_transformers
!pip install langchain_community
!pip install neo4j

In [26]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import numpy as np
import re
from IPython.core.display import display, HTML
import json

In [27]:
# Connection settings for the Neo4j instance that stores the glossary chunks.
# Every value can be overridden through an environment variable so that
# credentials do not have to be edited into the notebook; the defaults are the
# original local-development values.
# In-cluster alternative for the URI: 'bolt://citz-imb-ai-neo4j-svc:7687'
NEO4J_URI = os.getenv('NEO4J_URI', 'bolt://localhost:7687')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME', 'neo4j')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD', 'neo4j')
NEO4J_DATABASE = os.getenv('NEO4J_DB', 'neo4j')
# Echo only the non-secret settings so the target instance is visible in the output.
print(NEO4J_URI)
print(NEO4J_DATABASE)

bolt://localhost:7687
neo4j


In [62]:
# Graph handle used by every query in this notebook; it is built from the
# NEO4J_* connection settings.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [60]:
# Sanity check of the connection: count every node currently in the database.
cypher = """
  MATCH (n) 
  RETURN count(n)
  """
result = kg.query(cypher)
# Bare expression so the notebook displays the query result.
result

[{'count(n)': 156403}]

In [30]:
# Character-based splitter: chunks of up to 1000 characters (lengths measured
# with len) that overlap by 200 characters. The separators are listed from the
# coarsest (blank line) to the finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

## Please download the glossary file from S3 before running the code below
If you have the S3 credentials, you can execute these commands in the notebook itself:
```
from utility.s3_glossary import download_data
bucket_name = "IMBAIPilot"
download_data("bclaws/glossary", "JSON_glossary/", bucket_name)
```

You can also download the file manually by executing the script in the utility folder.

In [47]:
def file_metadata(x):
    """Return a metadata dict that records ``x`` under the ``filename`` key."""
    return {"filename": x}


# Location of the downloaded glossary JSON. Set the GLOSSARY_PATH environment
# variable to point at your own copy; the default is the original author's path.
GLOSSARY_PATH = os.getenv(
    'GLOSSARY_PATH',
    '/Users/msihag/repos/citz-imb-ai/JSON_glossary/glossary.json',
)

# The context manager closes the file handle as soon as the JSON is parsed;
# the encoding is explicit so decoding does not depend on the platform locale.
with open(GLOSSARY_PATH, encoding='utf-8') as f:
    glossaries = json.load(f)

In [48]:
# Dump the parsed glossary to inspect its structure: a dict with a 'terms' list
# whose entries carry 'term', 'description' and 'related_terms'.
print(glossaries)

{'terms': [{'term': 'Act', 'description': 'Also called a statute. When a Bill (proposed law) passes third reading in the Legislative Assembly, and receives Royal Assent, it is thereby enacted and becomes an Act or law. Public Statutes generally deal with issues of significance for the whole province. Private, Local and Special Statutes are enacted by the Legislature on behalf of a person, a group, a municipality or a corporation, and affect only the interests of that person or group.', 'related_terms': ['Bill', 'Regulation']}, {'term': 'Adjournment', 'description': 'An adjournment temporarily ends a sitting or a Session. All business not concluded at the time of adjournment is resumed at the next sitting or Session.', 'related_terms': ['Dissolution', 'Prorogation']}, {'term': 'Amendment', 'description': 'A modification made to the text of a Bill, Act or regulation by adding, removing or substituting text.', 'related_terms': []}, {'term': 'Amended Bill', 'description': 'See Report Bill.

In [49]:
# Token-based splitter: at most 256 tokens per chunk, with 20 tokens of
# overlap between consecutive chunks.
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=20,
)

# Embedding model used to vectorise each chunk's text.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [50]:
# Cypher: get-or-create an :UpdatedChunk node keyed by chunkId.
# The remaining properties are written only in the ON CREATE branch, so a node
# that already exists with this chunkId is returned unchanged.
# Expects a $chunkParam map with chunkId, chunkSeqId, text, type, url and
# glossaryTerm.
merge_chunk_node_query = """
MERGE(mergedChunk:UpdatedChunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.type = $chunkParam.type,
        mergedChunk.url = $chunkParam.url,
        mergedChunk.glossaryTerm = $chunkParam.glossaryTerm
RETURN mergedChunk
"""

In [51]:
# Cypher: store $vector as the textEmbedding property of the matching chunk.
# A node is matched only when every property in $chunkParam is equal AND it has
# no textEmbedding yet, so chunks that already carry an embedding are skipped
# and the query returns no rows for them.
create_embeddings = """
        MATCH (chunk:UpdatedChunk) WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.text = $chunkParam.text
        AND chunk.type = $chunkParam.type
        AND chunk.url = $chunkParam.url
        AND chunk.glossaryTerm = $chunkParam.glossaryTerm
        AND chunk.textEmbedding is NULL
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", $vector)
        RETURN chunk
    """

In [52]:
# How many glossary terms were loaded from the BC government glossary file
term_count = len(glossaries['terms'])
print(term_count)

67


## The code below creates the chunks and embeddings for the glossary terms

In [63]:
# Create one :UpdatedChunk node, plus its embedding, for every chunk of every
# glossary term.
data_type = 'glossary'
for glossary in glossaries['terms']:
    term = glossary['term']
    # The indexed text is "<term>: <description>".
    text = term + ': ' + glossary['description']
    # Split the text so that each piece fits the token budget of token_splitter;
    # a short entry yields a single chunk.
    token_split_texts = token_splitter.split_text(text)

    for chunk_seq, token in enumerate(token_split_texts):
        # Metadata stored on the node; chunkId is the key used by the MERGE.
        chunk = {
            'type': data_type,
            'text': token,
            'chunkSeqId': chunk_seq,
            'chunkId': f'{data_type}-{term}-seq-{chunk_seq}',
            'url': 'https://www.bclaws.gov.bc.ca/glossary.html',
            'glossaryTerm': term,
        }
        print(chunk)
        # Step 1: get-or-create the node for this chunk.
        kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
        # Step 2: embed the chunk text and attach the vector to that node.
        vector = embeddings.embed_query(chunk['text'])
        kg.query(create_embeddings, params={'chunkParam': chunk, 'vector': vector})

{'type': 'glossary', 'text': 'act : also called a statute. when a bill ( proposed law ) passes third reading in the legislative assembly, and receives royal assent, it is thereby enacted and becomes an act or law. public statutes generally deal with issues of significance for the whole province. private, local and special statutes are enacted by the legislature on behalf of a person, a group, a municipality or a corporation, and affect only the interests of that person or group.', 'chunkSeqId': 0, 'chunkId': 'glossary-Act-seq-0', 'url': 'https://www.bclaws.gov.bc.ca/glossary.html', 'glossaryTerm': 'Act'}
{'type': 'glossary', 'text': 'adjournment : an adjournment temporarily ends a sitting or a session. all business not concluded at the time of adjournment is resumed at the next sitting or session.', 'chunkSeqId': 0, 'chunkId': 'glossary-Adjournment-seq-0', 'url': 'https://www.bclaws.gov.bc.ca/glossary.html', 'glossaryTerm': 'Adjournment'}
{'type': 'glossary', 'text': 'amendment : a mod

## Now that we have indexed all the glossary terms, we need to attach the references between related terms

In [54]:
# Cypher: link a glossary chunk to the first chunk (chunkSeqId = 0) of a
# related term with a RELATED_TERMS relationship.
# `chunk` is the source node (identified by chunkId, chunkSeqId, text,
# glossaryTerm1 and type); `f` is the target node (identified by chunkId2,
# glossaryTerm2 and the same type). MERGE avoids creating the relationship
# twice. When either node is missing, nothing matches and the count is 0.
connect_chunks = """
      MATCH (chunk:UpdatedChunk), (f:UpdatedChunk)
      WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.text = $chunkParam.text
        AND chunk.glossaryTerm = $chunkParam.glossaryTerm1
        AND chunk.type = $chunkParam.type
        AND f.type = $chunkParam.type
        AND f.glossaryTerm = $chunkParam.glossaryTerm2
        AND f.chunkId = $chunkParam.chunkId2
        AND f.chunkSeqId = 0
      MERGE (chunk)-[newRelationship:RELATED_TERMS]->(f)
      RETURN count(newRelationship)
    """

In [55]:
# Attach a RELATED_TERMS relationship from every chunk of a glossary term to
# the first chunk (seq-0) of each of its related terms.
data_type = 'glossary'
for glossary in glossaries['terms']:
    term = glossary['term']
    # Rebuild the same "<term>: <description>" text and chunks that were used
    # when the nodes were created, so the query can match on the chunk text.
    text = term + ': ' + glossary['description']
    token_split_texts = token_splitter.split_text(text)

    for chunk_seq, token in enumerate(token_split_texts):
        for related_term in glossary['related_terms']:
            chunk = {
                'type': data_type,
                'text': token,
                'chunkSeqId': chunk_seq,
                'chunkId': f'{data_type}-{term}-seq-{chunk_seq}',
                'url': 'https://www.bclaws.gov.bc.ca/glossary.html',
                'glossaryTerm1': term,
                'glossaryTerm2': related_term,
                # Related terms are always linked through their first chunk.
                'chunkId2': f'{data_type}-{related_term}-seq-0',
            }
            print(chunk)
            ret = kg.query(connect_chunks, params={'chunkParam': chunk})
            # ret holds the count returned by the query (0 when nothing matched).
            print(ret)
            print('\n\n')

{'type': 'glossary', 'text': 'act : also called a statute. when a bill ( proposed law ) passes third reading in the legislative assembly, and receives royal assent, it is thereby enacted and becomes an act or law. public statutes generally deal with issues of significance for the whole province. private, local and special statutes are enacted by the legislature on behalf of a person, a group, a municipality or a corporation, and affect only the interests of that person or group.', 'chunkSeqId': 0, 'chunkId': 'glossary-Act-seq-0', 'url': 'https://www.bclaws.gov.bc.ca/glossary.html', 'glossaryTerm1': 'Act', 'glossaryTerm2': 'Bill', 'chunkId2': 'glossary-Bill-seq-0'}


ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to read any data from server ResolvedIPv6Address(('::1', 7687, 0, 0)) after connected (deadline Deadline(timeout=60.0))
Failed to read any data from server ResolvedIPv4Address(('127.0.0.1', 7687)) after connected (deadline Deadline(timeout=60.0))

In [56]:
# Link each chunk of every glossary term to the first chunk (seq-0) of each
# related term, printing the parameters and the query result for every link.
data_type = 'glossary'
for glossary in glossaries['terms']:
    text = glossary['term'] + ': ' + glossary['description']

    # Pieces of the entry text as produced by token_splitter
    token_split_texts = list(token_splitter.split_text(text))
    related_terms = glossary['related_terms']

    for chunk_seq in range(len(token_split_texts)):
        # Properties identifying the source chunk; shared by all of its links
        source_fields = {
            'type': data_type,
            'text': token_split_texts[chunk_seq],
            'chunkSeqId': chunk_seq,
            'chunkId': f'{data_type}-{glossary["term"]}-seq-{chunk_seq}',
            'url': 'https://www.bclaws.gov.bc.ca/glossary.html',
            'glossaryTerm1': glossary['term'],
        }

        for glossary_term in related_terms:
            # Add the target side; seq-0 is assumed for the related term
            chunk = {
                **source_fields,
                'glossaryTerm2': glossary_term,
                'chunkId2': f'{data_type}-{glossary_term}-seq-0',
            }

            # Run the linking query, then report what was sent and returned
            ret = kg.query(connect_chunks, params={'chunkParam': chunk})
            print(chunk, ret, '\n\n', sep='\n')

{'type': 'glossary', 'text': 'act : also called a statute. when a bill ( proposed law ) passes third reading in the legislative assembly, and receives royal assent, it is thereby enacted and becomes an act or law. public statutes generally deal with issues of significance for the whole province. private, local and special statutes are enacted by the legislature on behalf of a person, a group, a municipality or a corporation, and affect only the interests of that person or group.', 'chunkSeqId': 0, 'chunkId': 'glossary-Act-seq-0', 'url': 'https://www.bclaws.gov.bc.ca/glossary.html', 'glossaryTerm1': 'Act', 'glossaryTerm2': 'Bill', 'chunkId2': 'glossary-Bill-seq-0'}
[{'count(newRelationship)': 1}]



{'type': 'glossary', 'text': 'act : also called a statute. when a bill ( proposed law ) passes third reading in the legislative assembly, and receives royal assent, it is thereby enacted and becomes an act or law. public statutes generally deal with issues of significance for the whole provi

In [None]:
# Cypher: fetch every :UpdatedChunk node whose type is "glossary".
# The query is only defined here; it is not executed in this cell.
get_glossary = """
MATCH (n:UpdatedChunk) WHERE n.type="glossary" RETURN n 
"""