In [None]:
!pip install langchain
#!pip install chromadb
!pip install -U langchain-chroma
!pip install pypdf
!pip install pytest
!pip install ollama

In [1]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
import time
from sklearn.decomposition import PCA
import numpy as np
from langchain_chroma import Chroma

In [14]:
# Load all the PDF documents from the "documents" folder
document_loader = PyPDFDirectoryLoader("documents")
documents = document_loader.load()
print(documents[1])

entry 27 in Xref table invalid but object found


page_content='Neuron, Vol. 36, 585–596, November 14, 2002, Copyright 2002 by Cell Press
The Unfolded Protein Response Modulates
Disease Severity in Pelizaeus-Merzbacher Disease
an X-linked recessive pediatric disorder characterized
by three common genetic forms of disease: coding re-gion or splice site mutations, duplications of the wild-Cherie M. Southwood,
1James Garbern,1,3
Wei Jiang,1and Alexander Gow1,2,3,4
1Center for Molecular Medicine and Genetics
type PLP1 gene, and null alleles. These mutations yield2Department of Pediatrics
a broad spectrum of disease phenotypes from severe,3Department of Neurology
connatal disease to mild forms characterized by pure Wayne State University School of Medicine
spastic paraparesis (reviewed in Garbern et al., 1999; Detroit, Michigan 48201
Southwood and Gow, 2001). Mutant alleles that model
all three of these genetic forms of PMD are available inmice, including (1) myelin synthesis-deficient (msd), an Summary
A242V missense mutation causing sev

In [3]:
# Divides documents into chunks of size 1800 characters with 250 character overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1800,
    chunk_overlap=250,
    length_function=len,
    is_separator_regex=False,
)
# Split the documents into smaller chunks for better processing
chunks = text_splitter.split_documents(documents)

In [4]:
# Initialize the Ollama LLM embedding model with llama3 for generating vector embeddings
ollama_emb = OllamaEmbeddings(model="llama3")

In [5]:
# Initialize ChromaDB for storing and retrieving embeddings
CHROMA_PATH = "chroma" # Directory where Chroma will persist the embeddings
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=ollama_emb)

In [6]:
# (Optional) PCA could be applied to reduce embedding dimensionality
#def apply_pca(embeddings, n_components=50):
#    pca = PCA(n_components=n_components)
#    reduced_embeddings = pca.fit_transform(embeddings)
#    return reduced_embeddings

# Variables to track last page ID and chunk index for metadata
last_page_id = None
current_chunk_index = 0
chunk_texts = []  # Store the text of chunks
chunk_metadata = []  # Store metadata to later add to ChromaDB

# Iterate through each chunk to generate metadata (like source, page, and ID)
for chunk in chunks:
    source = chunk.metadata.get("source") # Document source (file path)
    page = chunk.metadata.get("page") # Page number in the document
    current_page_id = f"{source}:{page}" # Create a unique page ID

    # If the page is the same as the last one, increment the chunk index (for multiple chunks from one page)
    if current_page_id == last_page_id:
        current_chunk_index += 1
    else:
        current_chunk_index = 0 # Reset index if it's a new page

    # Create a unique chunk ID using page ID and chunk index
    chunk_id = f"{current_page_id}:{current_chunk_index}"
    last_page_id = current_page_id # Update last seen page ID

    # Add metadata (chunk ID) to the chunk and store for later
    chunk.metadata["id"] = chunk_id
    chunk_metadata.append(chunk.metadata)
    chunk_texts.append(chunk.page_content)

In [7]:
# Define the batch size for embedding chunks in smaller groups to optimize performance
batch_size = 1000  # This can be adjusted based on system resources

# Start measuring time for embedding process
start_time = time.time()

# Helper function to split chunk_texts into batches
def create_batches(chunk_texts, batch_size):
    for i in range(0, len(chunk_texts), batch_size):
        yield chunk_texts[i:i + batch_size] # Yield batches of size batch_size

# Initialize list to store all embeddings
all_embeddings = []

# Process each batch of text chunks to create embeddings
for batch in create_batches(chunk_texts, batch_size):
    # Use the Ollama embedding model to embed the current batch of text
    batch_embeddings = ollama_emb.embed_documents(batch)
    all_embeddings.extend(batch_embeddings) # Add the embeddings to the list

# End time measurement for embedding process
end_time = time.time()

# Calculate and print the time taken to embed all chunks
time_taken = end_time - start_time
print(f"⏱️ Tiempo tomado para embeber los chunks en lotes: {time_taken:.2f} segundos")


⏱️ Tiempo tomado para embeber los chunks en lotes: 491.43 segundos


In [8]:
# Start time measurement
#start_time = time.time()

# Convert the list of embeddings into a numpy array for easier manipulation
embeddings = np.array(all_embeddings)

# (Optional) 
#reduced_embeddings = apply_pca(embeddings, n_components=50)

# End time measurement
#end_time = time.time()

# Calculate and print the time taken
#time_taken = end_time - start_time
#print(f"⏱️ Tiempo tomado para aplicar PCA a los embeddings: {time_taken:.2f} seconds")

In [9]:
# Start measuring time for adding the embeddings to ChromaDB
start_time = time.time()

# Get all existing items in ChromaDB to avoid duplicating data
existing_items = db.get(include=[])  # IDs are always included by default
existing_ids = set(existing_items["ids"]) # Get the IDs of the existing documents
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Filter out chunks that already exist in the DB by their unique IDs
new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]
new_embeddings = [embeddings[i] for i, chunk in enumerate(chunks) if chunk.metadata["id"] not in existing_ids]

# Define a constant batch size for adding documents to ChromaDB
BATCH_SIZE = 1000

# Try adding new documents and embeddings to ChromaDB in batches
try:
    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        for start in range(0, len(new_chunks), BATCH_SIZE):
            print(f"Adding chunk batch {start/BATCH_SIZE}")
            batch = new_chunks[start:start + BATCH_SIZE] # Get a batch of chunks
            batch_embeddings = new_embeddings[start:start + BATCH_SIZE] # Get corresponding embeddings
            batch_ids = [chunk.metadata["id"] for chunk in batch] # Get chunk IDs for the batch
            # Add documents and embeddings to ChromaDB
            db.add_documents(batch, embeddings=batch_embeddings, ids=batch_ids)
        print("✅ New documents added successfully")
    else:
        print("✅ No new documents to add")
except Exception as e:
    print(f"Error occurred during ingestion: {e}")

# End time measurement
end_time = time.time()

# Calculate and print the time taken
time_taken = end_time - start_time
print(f"⏱️ Tiempo tomado para ingestar los embeddings a ChromaDB: {time_taken:.2f} seconds")

Number of existing documents in DB: 0
👉 Adding new documents: 1896
Adding chunk batch 0.0
Adding chunk batch 1.0
✅ New documents added successfully
⏱️ Tiempo tomado para ingestar los embeddings a ChromaDB: 497.14 seconds


In [13]:
# Define a prompt template to structure the question and context for LLM
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""
# Define the question for the model
#question="Can you give me some common issues from all diseases you know about?"
#question="Do you know how chromosome affects Fabry's disease?"
question="How can you relate Fabry disease with any other disease you know about?"

# Perform similarity search on ChromaDB using the question to find relevant chunks
results = db.similarity_search_with_score(question, k=7)

# Compile the search results into a context for the LLM
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

# Prepare the prompt with context and question
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=question)

# Invoke the LLM model to generate an answer based on the context
model = Ollama(model="llama3")
response_text = model.invoke(prompt)

# Extract the sources (document IDs) from the search results
sources = [doc.metadata.get("id", None) for doc, _score in results]

# Format the final response with the LLM answer and source documents
formatted_response = f"Response: {response_text}\nSources: {sources}"

# Print the final response along with the sources
print(formatted_response)

Response: Based on the provided references, Fabry disease is a genetic disorder caused by a deficiency of alpha-galactosidase A enzyme. This deficiency leads to the accumulation of globotriaosylsphingosine (lyso-Gb3) in various tissues, causing symptoms such as pain, weakness, and organ damage.

One disease that can be related to Fabry disease is Gaucher disease, which is also caused by a deficiency of an enzyme involved in lipid metabolism. In both diseases, the accumulation of abnormal lipids leads to tissue damage and organ dysfunction. Both diseases are characterized by a build-up of sphingolipids, which can cause cellular toxicity and contribute to the development of symptoms.

Another example is Tay-Sachs disease, another lysosomal storage disorder caused by a deficiency of hexosaminidase A enzyme. Like Fabry disease, Tay-Sachs disease is characterized by the accumulation of gangliosides in neurons and other tissues, leading to progressive neurological degeneration and death.

Th