In [1]:
import nest_asyncio
import os
import pickle
import re
import spacy
import torch
import uuid

from alive_progress import alive_bar
from dotenv import load_dotenv
from fastembed import TextEmbedding
from graphdatascience import GraphDataScience
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core import Settings, Document, PropertyGraphIndex 
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.extractors.relik.base import RelikPathExtractor
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.llms.anthropic import Anthropic
from llama_parse import LlamaParse
from transformers import pipeline
from typing import List, Optional, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load API credentials from a local .env file (if present) into the process environment.
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CLAUDE_API_KEY = os.getenv('CLAUDE_API_KEY')
LLAMA_API_KEY = os.getenv('LLAMA_API_KEY')

# Fail fast with a readable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError below.
_missing_keys = [
    name
    for name, value in (
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('CLAUDE_API_KEY', CLAUDE_API_KEY),
        ('LLAMA_API_KEY', LLAMA_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

# Re-export the keys under the variable names the client libraries are
# presumably reading (the Claude and Llama keys are stored under different names).
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ["ANTHROPIC_API_KEY"] = CLAUDE_API_KEY
os.environ["LLAMA_CLOUD_API_KEY"] = LLAMA_API_KEY

In [3]:
# LangChain chat client for Claude; temperature 0.0 for repeatable output.
# Generation stops when the model emits the "\n\nHuman" sequence.
llm = ChatAnthropic(
    model="claude-3-5-sonnet-20240620",
    max_tokens=4096,
    temperature=0.0,
    stop=["\n\nHuman"],
)

# LlamaIndex client for the same Claude model; this is the one passed to the
# node parser and to the property-graph index further down.
llama_llm = Anthropic(
    model="claude-3-5-sonnet-20240620",
    max_tokens=4096,
    temperature=0.0
)

In [4]:
# Local fastembed model (BGE large, English); not referenced by the later visible cells.
bge_embed_model = TextEmbedding(model_name="BAAI/bge-large-en-v1.5")
# OpenAI embedding model used by LlamaIndex.
llama_openai_embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Make the OpenAI embedder the LlamaIndex-wide default.
Settings.embed_model = llama_openai_embed_model

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 33554.43it/s]


In [5]:
# Relation extractor (small Relik relation-extraction model); supplied below as the
# only kg_extractor when the property-graph index is built.
relik = RelikPathExtractor(
    model="relik-ie/relik-relation-extraction-small"
)

                ___              __         
               /\_ \      __    /\ \        
 _ __     __   \//\ \    /\_\   \ \ \/'\    
/\`'__\ /'__`\   \ \ \   \/\ \   \ \ , <    
\ \ \/ /\  __/    \_\ \_  \ \ \   \ \ \\`\  
 \ \_\ \ \____\   /\____\  \ \_\   \ \_\ \_\
  \/_/  \/____/   \/____/   \/_/    \/_/\/_/
                                            
                                            







In [6]:
# spaCy pipeline with the coreferee component appended; coref_text() relies on the
# `doc._.coref_chains` extension that this component provides.
coref_nlp = spacy.load('en_core_web_lg')
# add_pipe returns the component, which is why the cell echoes a CorefereeBroker repr.
coref_nlp.add_pipe('coreferee')



<coreferee.manager.CorefereeBroker at 0x3fd9e0c50>

In [7]:
# Document parser: LlamaParse configured to return markdown for English
# documents, using 8 workers and verbose logging.
parser = LlamaParse(
    result_type="markdown",
    num_workers=8,
    verbose = True,
    language="en",
)

# Node parser: splits the parsed markdown into text nodes and object nodes
# (the latter are treated as tables downstream), using the LlamaIndex Claude client.
node_parser = MarkdownElementNodeParser(llm=llama_llm, num_workers=8)

In [8]:
# Allow nested asyncio event loops; presumably required because the parsers run
# async work while Jupyter's own loop is already running — TODO confirm.
nest_asyncio.apply()

In [9]:
def coref_text(text):
    """
    Rewrites text with coreferences replaced by the mentions they refer to

    The stripped text is run through `coref_nlp`. Every token for which
    `coref_chains.resolve` returns a non-empty result is replaced by its
    referents joined with " and "; a referent that is part of a named entity
    is rendered as the full entity text. All tokens are re-joined with single
    spaces, so the original whitespace is not preserved.

    Args:
        text (str): Raw text to resolve

    Returns:
        str: Space-joined, coreference-resolved text
    """
    doc = coref_nlp(text.strip())

    def _surface(tok):
        # Tokens outside any entity are emitted as-is; otherwise use the
        # text of the first entity span that contains the token.
        if tok.ent_type_ == "":
            return tok.text
        return [ent.text for ent in doc.ents if tok in ent][0]

    pieces = []
    for tok in doc:
        referents = doc._.coref_chains.resolve(tok)
        if referents:
            pieces.append(" and ".join(_surface(ref) for ref in referents))
        else:
            pieces.append(tok.text)

    return " ".join(pieces).strip()

def remove_table_of_contents(text):
    """
    Strips table-of-contents sections from text

    Everything from the literal "TABLE OF CONTENTS" up to (but not including)
    the next "#" character is removed, across line breaks. If no "#" follows,
    nothing is removed. The result is stripped of surrounding whitespace.

    Args:
        text (str): Text that may contain a table of contents

    Returns:
        str: Text without the table-of-contents sections
    """
    toc_block = re.compile(r"TABLE OF CONTENTS.*?(?=#)", re.DOTALL)
    return toc_block.sub("", text).strip()

def convert_nodes_to_documents(text_nodes, object_nodes, source):
    """
    Builds coreference-resolved Documents from parsed nodes

    Text nodes are converted first, then object nodes; the `is_table`
    metadata flag is False for the former and True for the latter.

    Args:
        text_nodes (List[Nodes]): List of text nodes
        object_nodes (List[Nodes]): List of object nodes
        source (str): Source of the document, stored in each Document's metadata

    Returns:
        documents (List[Documents]): List of Documents
    """
    documents = []
    for nodes, is_table in ((text_nodes, False), (object_nodes, True)):
        for node in nodes:
            resolved = coref_text(node.text)
            documents.append(
                Document(
                    text=resolved,
                    metadata={"is_table": is_table, "source": source},
                )
            )
    return documents

def make_dir(data_folder):
    """
    Creates a folder (including missing parents) if it does not already exist

    Args:
        data_folder (str): Folder path to create
    """
    os.makedirs(name=data_folder, exist_ok=True)

def parse_docs(file_location: str, data_folder: Optional[str] = None) -> List[Document]:
    """
    Parses every PDF in a folder and returns the resulting Documents as one flat list

    Each PDF is parsed with `parser`, has its table of contents stripped,
    is split into nodes with `node_parser`, and is converted to
    coreference-resolved Documents. The Document `source` metadata is the
    lower-cased file name without extension, with spaces replaced by underscores.

    Args:
        file_location (str): Folder containing the PDF files
        data_folder (Optional[str], optional): Folder in which to save one pickle
            per PDF. It must already exist. Defaults to None (nothing is saved).

    Returns:
        List[Document]: Documents from all PDFs, in file-name order
    """
    all_docs = []
    # sorted() keeps the result independent of os.listdir's arbitrary ordering.
    for file_name in sorted(os.listdir(file_location)):
        if not file_name.endswith(".pdf"):
            continue

        print("File: " + str(file_name))
        doc_path = os.path.join(file_location, file_name)
        modified_file_name = os.path.splitext(file_name)[0].lower().replace(' ', '_')

        # results in a list of Document Objects
        documents = parser.load_data(doc_path)

        # NOTE(review): the break fires after index 5, so the first six parsed
        # documents are cleaned, not five — confirm the intended cutoff.
        for idx, doc in enumerate(documents):
            doc.text = remove_table_of_contents(doc.text)
            if idx > 4:
                break

        raw_nodes = node_parser.get_nodes_from_documents(documents)
        # list of text_nodes, list of objects
        text_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)

        final_docs = convert_nodes_to_documents(text_nodes, objects, modified_file_name)
        all_docs.extend(final_docs)

        if data_folder:
            data_path = os.path.join(data_folder, modified_file_name + '.pkl')
            # Context manager guarantees the pickle is flushed and the handle closed.
            with open(data_path, "wb") as pickle_file:
                pickle.dump(final_docs, pickle_file)

    return all_docs

def read_pickles(data_folder: str) -> List[Document]:
    """
    Loads every ".pkl" file in a folder and concatenates their contents

    Each pickle is expected to hold a list of Documents; files with any other
    extension are ignored.

    NOTE: unpickling can execute arbitrary code, so only point this at
    folders whose contents you trust.

    Args:
        data_folder (str): Folder containing the pickle files

    Returns:
        List[Document]: Flat list of the Documents from all pickles
    """
    loaded_docs = []
    for entry in os.listdir(data_folder):
        if not entry.endswith(".pkl"):
            continue
        with open(os.path.join(data_folder, entry), 'rb') as handle:
            # each pickle holds a list, so extend keeps the result flat
            loaded_docs.extend(pickle.load(handle))
    return loaded_docs

def further_split_long_docs(doc_list: List[Document], max_chars: int = 1500) -> Tuple[List[Document], List[Document]]:
    """
    Partitions Documents into those that need chunking and those that do not

    A Document is "long" when it is not a table and its text is longer than
    `max_chars` characters. Tables are always kept whole, whatever their length.

    Args:
        doc_list (List[Document]): Documents whose metadata contains "is_table"
        max_chars (int, optional): Longest text (in characters) that is left
            unchunked. Defaults to 1500.

    Returns:
        Tuple[List[Document], List[Document]]: (long_docs, short_docs), each in input order
    """
    long_docs, short_docs = [], []
    for doc in doc_list:
        needs_chunking = (not doc.metadata["is_table"]) and len(doc.text) > max_chars
        (long_docs if needs_chunking else short_docs).append(doc)
    return long_docs, short_docs
                
def chunk_doc(doc: Document, text_splitter: RecursiveCharacterTextSplitter) -> List[Document]:
    """
    Splits one Document into smaller Documents

    Each chunk gets a fresh metadata dict that carries over `is_table`
    (required on the input) and `source` (empty string when absent).

    Args:
        doc (Document): Document to split
        text_splitter (RecursiveCharacterTextSplitter): Splitter applied to the Document text

    Returns:
        List[Document]: One Document per chunk, in chunk order
    """
    chunked_docs = []
    for piece in text_splitter.split_text(doc.text):
        piece_metadata = {
            'is_table': doc.metadata['is_table'],
            'source': doc.metadata.get('source', ''),
        }
        chunked_docs.append(Document(text=piece, metadata=piece_metadata))
    return chunked_docs
    
def recursive_chunk_documents(long_docs: List[Document],
                              short_docs: List[Document],
                              chunk_size: int = 512,
                              chunk_overlap: int = 64,
                              separators: Optional[List[str]] = None) -> List[Document]:
    """
    Chunks the long Documents and returns them together with the short ones

    Args:
        long_docs (List[Document]): Documents to split into chunks
        short_docs (List[Document]): Documents to keep unchanged; this list is
            not modified
        chunk_size (int, optional): Maximum chunk size passed to the splitter. Defaults to 512.
        chunk_overlap (int, optional): Overlap between consecutive chunks. Defaults to 64.
        separators (Optional[List[str]], optional): Separators tried in order by the
            splitter. Defaults to None, meaning ["\\n\\n", "\\n", " ", ""].

    Returns:
        List[Document]: A new list holding the short Documents followed by the
        chunks of every long Document
    """
    # None sentinel instead of a mutable list default shared across calls.
    if separators is None:
        separators = ["\n\n", "\n", " ", ""]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators
    )

    # Copy so the caller's short_docs list is not extended behind its back.
    chunked_docs = list(short_docs)
    for doc in long_docs:
        chunked_docs.extend(chunk_doc(doc, text_splitter))

    return chunked_docs

def get_final_docs(data_folder: Optional[str] = None, doc_list: Optional[List[Document]] = None) -> List[Document]:
    """
    Produces the final chunked Documents from a Document list or a pickle folder

    `doc_list` takes precedence; `data_folder` is only read when `doc_list` is None.

    Args:
        data_folder (Optional[str], optional): Folder of pickles to load. Defaults to None.
        doc_list (Optional[List[Document]], optional): Already-loaded Documents. Defaults to None.

    Raises:
        ValueError: If neither data_folder nor doc_list is provided

    Returns:
        List[Document]: Short Documents plus the chunks of the long ones
    """
    if doc_list is None and data_folder is None:
        raise ValueError("Either data_folder or doc_list must be provided")

    source_docs = read_pickles(data_folder) if doc_list is None else doc_list
    to_chunk, keep_as_is = further_split_long_docs(source_docs)
    return recursive_chunk_documents(to_chunk, keep_as_is)
        
def parse_and_process_docs(file_location, data_folder: Optional[str] = None) -> List[Document]:
    """
    Parses a folder of PDFs and returns the final chunked Documents

    Args:
        file_location (str): Folder containing the PDF files
        data_folder (Optional[str], optional): Folder for per-PDF pickles; created
            if missing. Defaults to None (no pickles are written).

    Returns:
        List[Document]: Chunked Documents for all PDFs
    """
    # Any falsy value means "do not save pickles".
    pickle_folder = data_folder or None
    if pickle_folder is not None:
        make_dir(pickle_folder)

    parsed_docs = parse_docs(file_location=file_location, data_folder=pickle_folder)
    return get_final_docs(doc_list=parsed_docs)

In [10]:
# Parse every PDF under "pdfs", writing per-file pickles to "test_pickles".
final_docs = parse_and_process_docs(file_location="pdfs", data_folder="test_pickles")

# Cache the combined result. The target folder is created first (nothing else
# creates "pickles"), and the context manager closes the file handle.
os.makedirs('pickles', exist_ok=True)
with open('pickles/final_docs.pkl', "wb") as pickle_file:
    pickle.dump(final_docs, pickle_file)

File: Diabetes Medications.pdf
Started parsing the file under job_id 35efced1-b9db-475a-9f83-752d30994d72


0it [00:00, ?it/s]
1it [00:00, 34100.03it/s]
0it [00:00, ?it/s]


File: managing-pre-diabetes-(updated-on-27-jul-2021)c2bfc77474154c2abf623156a4b93002.pdf
Started parsing the file under job_id 8bf55fd4-63cc-4271-b9e3-4360312e630e
.

0it [00:00, ?it/s]
1it [00:00, 21183.35it/s]
0it [00:00, ?it/s]
1it [00:00, 14873.42it/s]
2it [00:00, 41527.76it/s]
0it [00:00, ?it/s]


File: Diabetic Foot Ulcer_ Symptoms and Treatment.pdf
Started parsing the file under job_id 99916ca4-76f3-45f5-abe7-264559eb2408


0it [00:00, ?it/s]
0it [00:00, ?it/s]


File: Diabetes Treatment_ Insulin.pdf
Started parsing the file under job_id 423c4aa8-b131-432c-9c54-3c4451f08056


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 31068.92it/s]


In [11]:
# Reload the cached Documents so later cells can run without re-parsing the PDFs.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("pickles/final_docs.pkl", 'rb') as file:
    final_docs = pickle.load(file)

In [12]:
# Display the loaded Documents (this echoes the full list repr).
final_docs

[Document(id_='43c8dd4c-d18a-469b-97f8-05773f129b16', embedding=None, metadata={'is_table': False, 'source': 'diabetes_medications'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="Diabetes Medications \n\n You should go back to your regular dosage schedule . Check with your doctor or pharmacist before taking any other medicines , including over - the - counter medicines . \n\n  Rare Side Effects of Anti - diabetes Tablets \n\n Please inform your doctor if you have the following symptoms : \n\n - Dark or coloured urine \n - Tiredness \n - Itchy skin or skin rash \n - Loss of appetite \n - Light - coloured stools \n - Unexplained fever and sore throat \n - Increased sensitivity to sunlight \n - Diarrhoea \n - Yellowing of eyes or skin \n - Nausea or vomiting \n - Headache \n\n Take action : \n\n  Beat Diabetes Step by Step \n\n  Are Anti - diabetes Tablets Addictive ? \n\n You can never become addicted to anti - diabetes tablets . This is a commo

In [13]:
# Keep only the Documents parsed from "Diabetes Medications.pdf"
# (the source key is the lower-cased, underscore-joined file name).
diabetes_medications = [i for i in final_docs if i.metadata['source'] == 'diabetes_medications']

In [14]:
# Display the filtered subset (this echoes the full list repr).
diabetes_medications

[Document(id_='43c8dd4c-d18a-469b-97f8-05773f129b16', embedding=None, metadata={'is_table': False, 'source': 'diabetes_medications'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="Diabetes Medications \n\n You should go back to your regular dosage schedule . Check with your doctor or pharmacist before taking any other medicines , including over - the - counter medicines . \n\n  Rare Side Effects of Anti - diabetes Tablets \n\n Please inform your doctor if you have the following symptoms : \n\n - Dark or coloured urine \n - Tiredness \n - Itchy skin or skin rash \n - Loss of appetite \n - Light - coloured stools \n - Unexplained fever and sore throat \n - Increased sensitivity to sunlight \n - Diarrhoea \n - Yellowing of eyes or skin \n - Nausea or vomiting \n - Headache \n\n Take action : \n\n  Beat Diabetes Step by Step \n\n  Are Anti - diabetes Tablets Addictive ? \n\n You can never become addicted to anti - diabetes tablets . This is a commo

In [15]:
# Neo4j connection settings are read from the environment (populated from .env
# by load_dotenv() earlier in the notebook) so that no credential lives in the
# notebook itself. Non-secret settings fall back to the local defaults.
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE", "neo4j")

# The password has no default on purpose: fail with a clear message instead.
if not NEO4J_PASSWORD:
    raise EnvironmentError(
        "NEO4J_PASSWORD is not set; add it to your environment or .env file"
    )

# Property-graph store that the index below writes to.
graph_store = Neo4jPropertyGraphStore(
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    url=NEO4J_URI,
    refresh_schema=False,
)

# gds = GraphDataScience(NEO4J_URI, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [16]:
def delete_all_nodes(graph_store):
    """
    Deletes every node, and the relationships attached to it, from the graph

    Destructive: the whole graph behind `graph_store` is emptied.

    Args:
        graph_store: Graph store exposing `structured_query(query)`
    """
    wipe_query = """
    MATCH (n)
    DETACH DELETE n
    """
    graph_store.structured_query(wipe_query)
    print("All nodes deleted")

In [17]:
# delete_all_nodes(graph_store)  # destructive: uncomment only to wipe the whole graph before re-indexing

All nodes deleted


In [18]:
def remove_all_neo4j_restrictions(graph_store):
    """
    Calls APOC's schema assertion with empty index and constraint maps

    Presumably this drops every existing index and constraint in the
    database — verify against the installed APOC version before relying on it.

    Args:
        graph_store: Graph store exposing `structured_query(query)`
    """
    reset_schema_query = """
    CALL apoc.schema.assert({}, {});
    """
    graph_store.structured_query(reset_schema_query)

In [19]:
# Build the property graph from the diabetes-medication Documents: Relik extracts
# the triples, the OpenAI model supplies the embeddings, and the result is
# written to the Neo4j graph store. The captured progress output shows the
# triple-extraction step taking about 2m43s for 11 nodes.
index = PropertyGraphIndex.from_documents(
    diabetes_medications,
    kg_extractors=[relik],
    llm=llama_llm,
    embed_model=llama_openai_embed_model,
    property_graph_store=graph_store,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 11/11 [00:00<00:00, 1956.22it/s]
Extracting triples: 100%|██████████| 11/11 [02:43<00:00, 14.88s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Generating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]
