In [None]:
!pip install sentence_transformers
!pip install langchain_community
!pip install neo4j

In [None]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import numpy as np
import re
from IPython.core.display import display, HTML
import json

In [None]:
# Neo4j connection settings. Host and credentials are read from environment
# variables so that they are not stored in the notebook.
# NEO4J_URI = 'bolt://citz-imb-ai-neo4j-svc:7687'
_neo4j_host = os.getenv('NEO4J_HOST')
if not _neo4j_host:
    # Without this guard an unset host fails on the string concatenation
    # below with a TypeError that does not mention the missing variable.
    raise EnvironmentError('NEO4J_HOST environment variable is not set')
NEO4J_URI = 'bolt://' + _neo4j_host + ':7687'
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = 'neo4j' #os.getenv('NEO4J_DB')
# Echo only the non-secret settings as a quick visual check.
print(NEO4J_URI)
print(NEO4J_DATABASE)

In [None]:
# Graph handle that the kg.query(...) calls in this notebook go through.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Sanity check: count every node currently stored in the graph.
cypher = """
  MATCH (n) 
  RETURN count(n)
  """
result = kg.query(cypher)
# Bare last expression so the notebook displays the query result.
result

In [None]:
# Character-based splitter: chunk length is measured with len(), with a
# 1000-character chunk size and a 200-character overlap. Separators are listed
# from paragraph breaks down to the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

## Please download the glossary file from S3 before running the code below
If you have the S3 credentials, you can execute these commands in the notebook itself:
```
from utility.s3_glossary import download_data
bucket_name = "IMBAIPilot"
download_data("bclaws/glossary", "JSON_glossary/", bucket_name)
```

You can also download the file manually by running the script in the `utility` folder.

In [None]:
def file_metadata(x):
    """Return the metadata dict for a file: ``{"filename": x}``."""
    return {"filename": x}


# Load the glossary JSON. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open for the kernel's lifetime;
# the explicit encoding avoids depending on the platform default.
with open('./utility/JSON_glossary/glossary.json', encoding='utf-8') as f:
    glossaries = json.load(f)

In [None]:
# Preview only the first few terms; printing the whole glossary floods the
# cell output and bloats the saved notebook.
print(glossaries['terms'][:3])

In [None]:
# Token-based splitter configured for 256 tokens per chunk with an overlap of 20.
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=20,
)
# Embedding model wrapper; embed_query() on it produces the chunk vectors.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Cypher: MERGE an UpdatedChunk node keyed by $chunkParam.chunkId.
# The remaining properties sit under ON CREATE SET, so they are written only
# when the node is first created; an already existing node is returned as is.
merge_chunk_node_query = """
MERGE(mergedChunk:UpdatedChunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text,
        mergedChunk.type = $chunkParam.type,
        mergedChunk.url = $chunkParam.url,
        mergedChunk.glossaryTerm = $chunkParam.glossaryTerm
RETURN mergedChunk
"""

In [None]:
# Cypher: find the UpdatedChunk whose chunkId, chunkSeqId, text, type, url and
# glossaryTerm all equal the values in $chunkParam and whose textEmbedding is
# still NULL, then store $vector on it as the "textEmbedding" vector property.
# Nodes that already have an embedding are not matched, so they are left alone.
create_embeddings = """
        MATCH (chunk:UpdatedChunk) WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.text = $chunkParam.text
        AND chunk.type = $chunkParam.type
        AND chunk.url = $chunkParam.url
        AND chunk.glossaryTerm = $chunkParam.glossaryTerm
        AND chunk.textEmbedding is NULL
        CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", $vector)
        RETURN chunk
    """

In [None]:
# Number of entries under the "terms" key of the loaded glossary.
print(len(glossaries["terms"]))

## The code below creates the chunks and embeddings for the glossary terms

In [None]:
# Index every glossary term: build "<term>: <description>", split it into
# token-bounded chunks, MERGE one UpdatedChunk node per chunk and then attach
# the chunk's embedding vector to that node.
data_type = 'glossary'
for glossary in glossaries['terms']:
    text = glossary['term'] + ': ' + glossary['description']
    # Split so that every piece fits the token splitter's per-chunk limit
    # before it is embedded.
    token_split_texts = token_splitter.split_text(text)
    for chunk_seq, token in enumerate(token_split_texts):
        # Node properties; chunkId combines the type, the term and the
        # position of the chunk within the term's text.
        chunk = {
            'type': data_type,
            'text': token,
            'chunkSeqId': chunk_seq,
            'chunkId': f'{data_type}-{glossary["term"]}-seq-{chunk_seq}',
            'url': 'https://www.bclaws.gov.bc.ca/glossary.html',
            'glossaryTerm': glossary['term']
        }
        print(chunk)
        kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
        vector = embeddings.embed_query(chunk['text'])
        result = kg.query(create_embeddings, params={'chunkParam': chunk, 'vector': vector})

## Now that we have indexed all the glossary terms, we need to attach the related-term references

In [None]:
# Cypher: link a glossary chunk to the first chunk of a related term.
# `chunk` is matched on chunkId, chunkSeqId, text, glossaryTerm1 and type;
# `f` is matched on the same type, glossaryTerm2, chunkId2 and chunkSeqId = 0.
# MERGE creates the RELATED_TERMS relationship only if it does not exist yet,
# and the query returns how many relationships were matched or created.
connect_chunks = """
      MATCH (chunk:UpdatedChunk), (f:UpdatedChunk)
      WHERE
        chunk.chunkId = $chunkParam.chunkId
        AND chunk.chunkSeqId = $chunkParam.chunkSeqId
        AND chunk.text = $chunkParam.text
        AND chunk.glossaryTerm = $chunkParam.glossaryTerm1
        AND chunk.type = $chunkParam.type
        AND f.type = $chunkParam.type
        AND f.glossaryTerm = $chunkParam.glossaryTerm2
        AND f.chunkId = $chunkParam.chunkId2
        AND f.chunkSeqId = 0
      MERGE (chunk)-[newRelationship:RELATED_TERMS]->(f)
      RETURN count(newRelationship)
    """

In [None]:
# Link every chunk of a glossary term to the first chunk (seq 0) of each of
# its related terms through the connect_chunks query.
data_type = 'glossary'
for glossary in glossaries['terms']:
    text = glossary['term'] + ': ' + glossary['description']
    # Rebuild the chunks from the same text with the same splitter so that the
    # chunk ids and texts line up with what the query matches on.
    token_split_texts = token_splitter.split_text(text)
    # A term whose related_terms is missing or null simply creates no links.
    related_terms = glossary.get('related_terms') or []
    for chunk_seq, token in enumerate(token_split_texts):
        for glossary_terms in related_terms:
            chunk = {
                'type': data_type,
                'text': token,
                'chunkSeqId': chunk_seq,
                'chunkId': f'{data_type}-{glossary["term"]}-seq-{chunk_seq}',
                'url': 'https://www.bclaws.gov.bc.ca/glossary.html',
                'glossaryTerm1': glossary['term'],
                'glossaryTerm2': glossary_terms,
                # Related terms are always linked at their first chunk.
                'chunkId2': f'{data_type}-{glossary_terms}-seq-0'
            }
            print(chunk)
            ret = kg.query(connect_chunks, params={'chunkParam': chunk})
            print(ret)
            print('\n\n')

In [None]:
# NOTE(review): this cell appears to repeat the work of the previous cell
# (same query, same parameters). Because connect_chunks uses MERGE, running it
# again should not create duplicate relationships - confirm whether both cells
# are still needed.
data_type = 'glossary'
for glossary in glossaries['terms']:
    text = glossary['term'] + ': ' + glossary['description']

    # Split the text into chunks with the token splitter, so that the chunk
    # ids and texts line up with what the query matches on.
    token_split_texts = token_splitter.split_text(text)

    # A term whose related_terms is missing or null simply creates no links.
    related_terms = glossary.get('related_terms') or []

    for chunk_seq, token in enumerate(token_split_texts):
        for glossary_term in related_terms:
            chunk = {
                'type': data_type,
                'text': token,
                'chunkSeqId': chunk_seq,
                'chunkId': f'{data_type}-{glossary["term"]}-seq-{chunk_seq}',
                'url': 'https://www.bclaws.gov.bc.ca/glossary.html',
                'glossaryTerm1': glossary['term'],
                'glossaryTerm2': glossary_term,
                'chunkId2': f'{data_type}-{glossary_term}-seq-0'  # Assuming seq-0 for related terms
            }

            # Execute the query with the current chunk parameters
            ret = kg.query(connect_chunks, params={'chunkParam': chunk})
            print(chunk)
            print(ret)
            print('\n\n')