In [None]:
!pip install langchain
#!pip install chromadb
!pip install -U langchain-chroma
!pip install pypdf
!pip install pytest
!pip install ollama

In [2]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
#from get_embedding_function import get_embedding_function
from langchain.vectorstores.chroma import Chroma

# Directory handed to Chroma as `persist_directory` when the vector store is opened.
CHROMA_PATH = "chroma"

In [11]:
# Read every PDF found in the "documents" directory.
document_loader = PyPDFDirectoryLoader("documents")
documents = document_loader.load()

# Sanity check: show the second loaded entry (index 1).
sample_document = documents[1]
print(sample_document)

entry 27 in Xref table invalid but object found


page_content='Neuron, Vol. 36, 585–596, November 14, 2002, Copyright 2002 by Cell Press
The Unfolded Protein Response Modulates
Disease Severity in Pelizaeus-Merzbacher Disease
an X-linked recessive pediatric disorder characterized
by three common genetic forms of disease: coding re-gion or splice site mutations, duplications of the wild-Cherie M. Southwood,
1James Garbern,1,3
Wei Jiang,1and Alexander Gow1,2,3,4
1Center for Molecular Medicine and Genetics
type PLP1 gene, and null alleles. These mutations yield2Department of Pediatrics
a broad spectrum of disease phenotypes from severe,3Department of Neurology
connatal disease to mild forms characterized by pure Wayne State University School of Medicine
spastic paraparesis (reviewed in Garbern et al., 1999; Detroit, Michigan 48201
Southwood and Gow, 2001). Mutant alleles that model
all three of these genetic forms of PMD are available inmice, including (1) myelin synthesis-deficient (msd), an Summary
A242V missense mutation causing sev

In [12]:
import time
from sklearn.decomposition import PCA
import numpy as np
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Chunking configuration: pieces of at most 1400 characters (measured with
# `len`), 100 characters of overlap between neighbours, and separators that
# are treated as plain strings rather than regular expressions.
splitter_options = {
    "chunk_size": 1400,
    "chunk_overlap": 100,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Cut the loaded documents into chunks.
chunks = text_splitter.split_documents(documents)

# Embedding model ("llama3", via Ollama) used by the vector store.
ollama_emb = OllamaEmbeddings(model="llama3")

# Open the Chroma store persisted under CHROMA_PATH, wired to the embedding
# model above.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=ollama_emb)

def apply_pca(embeddings, n_components=50):
    """Project embedding vectors onto their leading principal components.

    A new PCA model is fitted on ``embeddings`` on every call and the same
    data is returned in the reduced space; the fitted model is not kept.

    Args:
        embeddings: Data accepted by ``PCA.fit_transform`` (one row per
            embedding vector).
        n_components: Number of principal components to keep.

    Returns:
        The transformed embeddings as returned by ``PCA.fit_transform``.
    """
    return PCA(n_components=n_components).fit_transform(embeddings)

# Start time measurement
start_time = time.time()

# Give every chunk a deterministic ID of the form "<source>:<page>:<index>",
# where <index> counts the chunks that come from the same page. The ID is
# stored in the chunk metadata and is what makes re-runs incremental.
last_page_id = None
current_chunk_index = 0

for chunk in chunks:
    source = chunk.metadata.get("source")
    page = chunk.metadata.get("page")
    current_page_id = f"{source}:{page}"

    # Consecutive chunks from the same page get increasing indices; a new
    # page restarts the count at 0.
    if current_page_id == last_page_id:
        current_chunk_index += 1
    else:
        current_chunk_index = 0

    chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"
    last_page_id = current_page_id

# Find out what is already stored BEFORE doing any embedding work, so that
# re-running the cell only pays for chunks that are actually new.
existing_items = db.get(include=[])  # IDs are always included by default
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Only add documents that don't exist in the DB.
new_chunks = [chunk for chunk in chunks if chunk.metadata["id"] not in existing_ids]

# NOTE: the vectors are produced by the store's own embedding function
# (`ollama_emb`) inside `add_documents`. An earlier version of this cell
# embedded *all* chunks up front, reduced them with PCA and passed the result
# as `embeddings=`; `add_documents` does not use that keyword, so every new
# chunk was embedded twice and the PCA output was discarded. Storing the
# PCA-reduced vectors would not work either unless queries were projected
# with the same fitted PCA, because searches embed the question with the
# full-size embedding function.

# Batch processing
BATCH_SIZE = 1000
try:
    if new_chunks:
        print(f"👉 Adding new documents: {len(new_chunks)}")
        for start in range(0, len(new_chunks), BATCH_SIZE):
            print(f"Adding chunk num {start}")
            batch = new_chunks[start:start + BATCH_SIZE]
            batch_ids = [chunk.metadata["id"] for chunk in batch]
            # The store embeds the batch and saves it under the given IDs.
            db.add_documents(batch, ids=batch_ids)
        print("✅ New documents added successfully")
    else:
        print("✅ No new documents to add")
except Exception as e:
    # Report the failure but still fall through to the timing summary below.
    print(f"Error occurred during ingestion: {e}")

# End time measurement
end_time = time.time()

# Calculate and print the time taken
time_taken = end_time - start_time
print(f"⏱️ Time taken for document ingestion: {time_taken:.2f} seconds")


KeyboardInterrupt: 

In [10]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

# Prompt skeleton: the retrieved passages fill {context}, the user question
# fills {question}.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Question to ask. Alternatives kept at hand for quick switching:
#   "Do you know how chromosome affects Fabry's disease?"
#   "Is X chromosome and 22 the same?"
question = "Give me the points in common of all diseases you know about."

# `db` is the vector store opened in the ingestion cell. To run this cell on
# its own, reopen the store first:
#   db = Chroma(persist_directory=CHROMA_PATH, embedding_function=ollama_emb)

# Retrieve the 8 best-matching chunks; each result is a (document, score) pair.
results = db.similarity_search_with_score(question, k=8)
retrieved_docs = [doc for doc, _score in results]

# Join the retrieved passages with a visible separator and fill the prompt.
context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=question)

# Ask the local llama3 model for an answer.
model = Ollama(model="llama3")
response_text = model.invoke(prompt)

# Report the answer together with the IDs of the chunks it was based on
# (None for a chunk whose metadata carries no "id").
sources = [doc.metadata.get("id") for doc in retrieved_docs]
formatted_response = f"Response: {response_text}\nSources: {sources}"

print(formatted_response)

Response: Based on the provided context, I can identify some commonalities among the diseases mentioned:

1. **Genetic basis**: All the diseases mentioned have a genetic component, with mutations or deletions affecting specific genes.
2. **Neurological involvement**: Fabry disease, spastic paraplegia type 2, Pelizaeus-Merzbacher disease, and connatal Pelizaeus-Merzbacher disease all have neurological symptoms or are characterized by impaired function of the nervous system.
3. **Genetic variation in PLP1 gene**: The PLP1 gene is involved in two diseases: spastic paraplegia type 2 and Pelizaeus-Merzbacher disease. This suggests that genetic variations in the PLP1 gene can contribute to distinct neurological disorders.

These commonalities highlight the importance of genetics and neurology in understanding these complex diseases.
Sources: ['documents/Roles-of-Neurotransmitter-in-Synapse-Formation_neu.pdf:13:3', 'documents/The-Processing-of-Temporal-Pitch-and-Melody-Inform.pdf:9:3', 'docum