In [1]:
!uv pip install sentence_transformers

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 403ms[0m[0m


In [87]:
import numpy as np
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from google.colab import userdata

In [88]:
# Simple implementation of a Vector database
class VectorDatabase:
    """Minimal in-memory vector store with brute-force cosine-similarity search.

    Records are kept in insertion order in a plain Python list, and every
    search scans all of them.
    """

    def __init__(self):
        # Each record is a dict with the keys "id", "vector" and "metadata".
        self.vectors = []

    def add_vector(self, vec_id, vector, metadata=None):
        """Append one record to the database.

        Args:
            vec_id: Identifier stored with the vector (not checked for uniqueness).
            vector: Array-like of numbers; stored as a float32 NumPy array.
            metadata: Optional payload handed back with search hits.
        """
        self.vectors.append({
            "id": vec_id,
            "vector": np.array(vector, dtype=np.float32),
            "metadata": metadata,
        })

    def get_all_vectors(self):
        """Return the internal list of records (the list itself, not a copy)."""
        return self.vectors

    def _cosine_similarity(self, vec_a, vec_b):
        """Return the cosine similarity between ``vec_a`` and ``vec_b``.

        A small epsilon is added to the denominator so that a zero-length
        vector produces a similarity of 0 instead of a division by zero.
        """
        magnitudes = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        return np.dot(vec_a, vec_b) / (magnitudes + 1e-8)

    def search(self, query_vector, top_k=3):
        """Rank every stored vector against ``query_vector``.

        Args:
            query_vector: Array-like query; converted to a float32 NumPy array.
            top_k: Maximum number of hits to return. ``None`` returns all hits.

        Returns:
            A list of dicts with the keys "id", "similarity" and "metadata",
            ordered from most to least similar and truncated to ``top_k``.

        Raises:
            ValueError: If ``top_k`` is negative. A negative value used to be
                applied as a slice bound and silently dropped the
                lowest-ranked hits instead of limiting the result size.
        """
        if top_k is not None and top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")

        query_vector = np.array(query_vector, dtype=np.float32)

        results = [
            {
                "id": record["id"],
                "similarity": self._cosine_similarity(query_vector, record["vector"]),
                "metadata": record["metadata"],
            }
            for record in self.vectors
        ]

        # list.sort is stable, so equally similar records keep insertion order.
        results.sort(key=lambda hit: hit["similarity"], reverse=True)

        return results[:top_k]

In [89]:
# Helper function to generate multiple hypothetical documents
def generate_hypothetical_docs(client, query, temperature, num_documents=5,
                               model="gpt-4o-mini", max_tokens=250, verbose=True):
    """Ask the chat model for several hypothetical answers to ``query`` (HyDE).

    A single chat-completions request is made with ``n=num_documents`` so that
    all candidate paragraphs come back in one response.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (an OpenAI client).
        query: The user question to answer hypothetically.
        temperature: Sampling temperature forwarded to the API.
        num_documents: Number of completions requested.
        model: Chat model name forwarded to the API.
        max_tokens: Per-completion token limit forwarded to the API.
        verbose: When True, print every kept paragraph.

    Returns:
        List of stripped, non-empty paragraphs. It can be shorter than
        ``num_documents`` when some completions come back without text.
    """
    prompt = f"""
    Write a paragraph in 30–45 words in 3 sentences
    that directly answers the given question.
    If you do not know the correct answer,
    create a hypothetical answer to the query.

    Question: {query}

    Paragraph:
    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You write concise paragraphs based on a given query."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        n=num_documents
    )

    hypothetical_docs = []

    for choice in response.choices:
        # The message content may be None (the SDK types it as optional), and
        # an empty paragraph would only skew the averaged embedding, so both
        # cases are skipped instead of crashing on .strip().
        hypothetical_doc = (choice.message.content or "").strip()
        if not hypothetical_doc:
            continue

        hypothetical_docs.append(hypothetical_doc)

        if verbose:
            print(f"Hypothetical Document {len(hypothetical_docs)}:")
            print(f"{hypothetical_doc}\n")

    return hypothetical_docs

In [90]:
# Helper function to create embeddings for documents and average them.
def create_averaged_embedding(embedding_model, documents):
    """Embed every document and return the re-normalized mean embedding.

    Args:
        embedding_model: Object exposing ``encode(text, normalize_embeddings=True)``.
        documents: Iterable of texts to embed.

    Returns:
        The element-wise mean of the individual embeddings, divided by its own
        norm (plus a small epsilon to avoid division by zero).

    Raises:
        ValueError: If ``documents`` is empty. The mean of zero embeddings is
            NaN, which would otherwise flow silently into every similarity
            score computed from the result.
    """
    documents = list(documents)
    if not documents:
        raise ValueError("documents must contain at least one text to embed")

    embeddings = [
        embedding_model.encode(doc, normalize_embeddings=True)
        for doc in documents
    ]

    # Averaging unit vectors generally gives a shorter vector, so normalize again.
    averaged_embedding = np.mean(embeddings, axis=0)
    averaged_embedding = averaged_embedding / (np.linalg.norm(averaged_embedding) + 1e-8)

    return averaged_embedding

In [91]:
# Helper function to index documents in the vector database
def index_documents(vector_db, embedding_model, documents):
    """Embed each document and store it in ``vector_db``.

    Documents are stored under the ids ``doc_0``, ``doc_1``, ... following
    their position in ``documents``, with the original text kept in the
    record metadata under the "document" key. A summary line is printed once
    all documents have been added.
    """
    for position, text in enumerate(documents):
        vector_db.add_vector(
            vec_id=f"doc_{position}",
            vector=embedding_model.encode(text, normalize_embeddings=True),
            metadata={"document": text},
        )

    total = len(documents)
    print(f"Indexed {total} documents in vector database.\n")

In [92]:
# Helper function to retrieve relevant documents from vector database using HyDE
def retrieve_documents(vector_db, embedding_model, client, query, temperature, top_k=3, num_documents=5):
    """Retrieve the ``top_k`` most similar stored documents using HyDE.

    Rather than embedding ``query`` directly, hypothetical answers are
    generated with the chat model, embedded, averaged into one vector, and
    that vector is used to search ``vector_db``.

    Args:
        vector_db: Store exposing ``search(query_vector, top_k=...)``.
        embedding_model: Model passed on to ``create_averaged_embedding``.
        client: Chat client passed on to ``generate_hypothetical_docs``.
        query: The user question.
        temperature: Sampling temperature for the hypothetical answers.
        top_k: Number of hits to return.
        num_documents: Number of hypothetical answers to generate
            (previously fixed at 5 inside this function).

    Returns:
        Whatever ``vector_db.search`` returns for the averaged embedding.
    """
    # Generate multiple hypothetical documents
    hypothetical_docs = generate_hypothetical_docs(
        client,
        query,
        num_documents=num_documents,
        temperature=temperature,
    )

    # Collapse them into a single query vector
    query_embedding = create_averaged_embedding(embedding_model, hypothetical_docs)

    # Search vector database
    return vector_db.search(query_embedding, top_k=top_k)

In [93]:
# Helper function to generate final answer
def generate_answer(client, query, retrieved_docs, model="gpt-4o-mini", max_tokens=250, temperature=0.3):
    """Answer ``query`` with the chat model, grounded in the retrieved documents.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (an OpenAI client).
        query: The user question.
        retrieved_docs: Search hits; the text is read from
            ``hit["metadata"]["document"]``.
        model: Chat model name forwarded to the API.
        max_tokens: Token limit forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The stripped text of the first completion, or an empty string when the
        API returns no text for it.
    """
    # Hits can carry metadata=None (the default in VectorDatabase.add_vector);
    # skip those instead of failing on the subscript.
    passages = []
    for doc in retrieved_docs:
        text = (doc.get("metadata") or {}).get("document")
        if text:
            passages.append(text)

    context = "\n\n".join(passages)

    prompt = f"""
    Based on the following context, answer the question accurately.

    Context: {context}
    Question: {query}

    Answer:
    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    # The message content may be None (the SDK types it as optional).
    return (response.choices[0].message.content or "").strip()

In [94]:
# Helper function for the complete RAG pipeline with HyDE
def hyde_rag_pipeline(client, embedding_model, vector_db, query, temperature, top_k=3):
    """Run HyDE retrieval followed by answer generation.

    Returns:
        A ``(answer, retrieved_docs)`` tuple: the generated answer and the
        hits it was generated from.
    """
    # Step 1: HyDE retrieval
    hits = retrieve_documents(
        vector_db,
        embedding_model,
        client,
        query,
        temperature=temperature,
        top_k=top_k,
    )

    # Step 2: answer the question from the retrieved hits
    return generate_answer(client, query, hits), hits

In [95]:
# Documents for the vector database.
# Each string is one passage and is indexed as a single document, so
# retrieval happens at passage granularity.
sentences = [
    "Alzheimer’s disease research shows that early cognitive decline is linked to the accumulation of beta-amyloid plaques and tau tangles in the brain. These pathological changes disrupt neural communication long before symptoms become obvious. Large longitudinal studies continue to clarify which biomarkers best predict progression.",
    "In cancer biology, tumor cells often acquire mutations that enable uncontrolled growth and resistance to normal regulatory signals. Genomic sequencing allows researchers to identify these mutations across thousands of samples. Databases cataloging such genomic profiles help scientists compare patterns across cancer types.",
    "Respiratory infections such as influenza can cause significant inflammation of the upper and lower airways. This inflammation contributes to symptoms like fever, cough, and fatigue. Studies show that viral shedding typically peaks shortly before the onset of noticeable symptoms.",
    "Cardiovascular research has demonstrated that atherosclerosis develops gradually as cholesterol-rich plaques accumulate within arterial walls. These plaques may remain stable for years but can become dangerous if they rupture. Large epidemiological cohorts continue to refine understanding of risk factors.",
    "In type 2 diabetes, insulin resistance limits the ability of cells to take up glucose effectively. Over time, pancreatic beta cells may struggle to compensate, leading to chronically elevated blood sugar. Genetic, metabolic, and environmental factors all contribute to disease development.",
    "Chronic kidney disease is often detected through long-term trends in glomerular filtration rate and markers of kidney damage. Research indicates that early stages may progress slowly and remain asymptomatic. Population studies help identify patterns associated with faster progression.",
    "Parkinson’s disease is characterized by the gradual loss of dopamine-producing neurons in the substantia nigra. This loss leads to motor symptoms such as bradykinesia, rigidity, and tremor. Neuroimaging and molecular studies continue to investigate the underlying mechanisms of neuronal degeneration.",
    "Asthma involves chronic inflammation and hyperreactivity of the airways, which can narrow in response to environmental triggers. Genetic predisposition, allergen exposure, and airway remodeling all play roles in disease expression. Research efforts focus on understanding these contributing pathways.",
    "In inflammatory bowel disease, chronic inflammation affects the gastrointestinal tract through immune dysregulation. Ulcerative colitis and Crohn’s disease differ in their patterns of involvement and depth of inflammation. Ongoing research investigates interactions between the microbiome and the immune system.",
    "Migraine is a neurological condition involving recurrent episodes of moderate to severe head pain. Scientists believe that cortical spreading depression, neurovascular changes, and genetic susceptibility contribute to the disorder. Studies also examine sensory hypersensitivity as a core feature.",
    "Heart failure develops when the heart can no longer pump blood efficiently to meet the body’s needs. Structural changes such as ventricular remodeling often occur over extended periods. Imaging and biomarker data help researchers track disease progression across diverse patient groups.",
    "Osteoporosis results from reduced bone mineral density and deterioration of bone microarchitecture. This increases susceptibility to fractures, especially in older adults. Research highlights the roles of hormonal changes, calcium metabolism, and genetic factors in bone strength.",
    "In multiple sclerosis, immune cells mistakenly attack the myelin sheath that insulates nerve fibers in the central nervous system. This leads to disruptions in nerve signal conduction and varied neurological symptoms. Imaging techniques like MRI reveal characteristic lesions.",
    "Chronic obstructive pulmonary disease typically develops after long-term exposure to airway irritants, leading to persistent airflow limitation. Structural changes such as alveolar destruction and airway narrowing contribute to progressive breathing difficulty. Population studies track global disease burden.",
    "Sickle cell disease arises from a mutation in the beta-globin gene, causing hemoglobin to polymerize under low oxygen conditions. This results in rigid, sickle-shaped red blood cells that can block blood flow. Research focuses on understanding genetic modifiers and hemoglobin dynamics.",
    # Single-sentence passage that mentions Parkinson's disease without naming a company or an antibody.
    "Neurons in the brain communicate through electrical impulses and chemical neurotransmitters, and disruptions in these signaling pathways are a common feature of many neurological disorders such as Parkinson's disease.",
    # The only passage that names a company (NeuroVita Therapeutics) and an antibody target (alpha-synuclein).
    "NeuroVita Therapeutics is pioneering a monoclonal antibody designed to target alpha-synuclein aggregates, a hallmark of Parkinson’s disease. Their innovative approach aims to slow neurodegeneration and improve motor function in patients. Clinical trials are underway to evaluate safety and efficacy in early-stage Parkinson’s."
]

In [96]:
# Read the API key through google.colab's userdata helper and build the OpenAI client
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# Load the sentence-embedding model used for both documents and hypothetical answers
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Start from an empty in-memory store, then embed and insert every passage
vector_db = VectorDatabase()
index_documents(
    vector_db,
    embedding_model,
    documents=sentences,
)

Indexed 17 documents in vector database.



In [97]:
# Question to answer with the HyDE-based RAG pipeline
query = "Which company is making new antibody to treat Parkinson's disease and what is it's target?"

print(f"Query: {query}\n")
print("=" * 50)

# Retrieve with HyDE, then generate the final answer from the retrieved passages
answer, hyde_results = hyde_rag_pipeline(
    client,
    embedding_model,
    vector_db,
    query,
    temperature=1.2,
    top_k=3,
)

print("=" * 50)
print("Top 3 Retrieved Documents\n")

# One print per hit; the text emitted is the same as three separate prints
for rank, hit in enumerate(hyde_results, start=1):
    passage = hit["metadata"]["document"]
    score = hit["similarity"]
    print(f"Result {rank}:\nDocument: {passage}\nSimilarity Score: {score:.4f}\n")

print("=" * 50)

# Show the answer generated from the retrieved passages
print(f"Final generated answer: {answer}")

Query: Which company is making new antibody to treat Parkinson's disease and what is it's target?

Hypothetical Document 1:
XYZ Pharmaceuticals is developing a groundbreaking antibody aimed at treating Parkinson's disease. This therapy specifically targets alpha-synuclein, a protein implicated in the neurodegenerative processes of the disease. By reducing its accumulation, the company hopes to slow disease progression and improve patient outcomes.

Hypothetical Document 2:
A biotechnology company called NeuroPharma is currently developing a novel antibody aimed at treating Parkinson's disease. This antibody specifically targets alpha-synuclein proteins, which are implicated in the degeneration of neurons in affected patients. By neutralizing these proteins, NeuroPharma hopes to slow disease progression.

Hypothetical Document 3:
AstraZeneca is developing a new antibody aimed at treating Parkinson's disease. The therapy specifically targets alpha-synuclein, a protein implicated in the d