<a href="https://colab.research.google.com/github/abd-ur/2.-RAGwithPinecone/blob/main/2_RAGwithPinecone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers pinecone langchain langchain-community sacremoses

Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.6.4.1 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.7.77 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB

DATA INGESTION and EMBEDDING

In [2]:
import os
import json
import torch
from sentence_transformers import SentenceTransformer
import pinecone
from google.colab import files

# pinecone environment
os.environ["PINECONE_API_KEY"] = "pcsk_5ynbw1_B3ChsR7YMhGgLZKuuX36TTnGmzGLd4kREuGnmin96enqrjzWX32WukKuEVtUmZ8"
os.environ["PINECONE_ENV"] = "us-east-1"

# upload variant.json
uploaded = files.upload()
variants_file = list(uploaded.keys())[0]
print(f"Uploaded file: {variants_file}")

with open(variants_file, "r", encoding="utf-8") as f:
    variants = json.load(f)

# handles missing ids
for i, v in enumerate(variants):
    if "interpretation" not in v:
        raise ValueError(f"Record {i} missing 'interpretation'")
    if "id" not in v:
        v["id"] = f"var_{i:06d}"

print(f"Loaded {len(variants)} variant records.")

# init embedding model
embed_model = SentenceTransformer("dmis-lab/biobert-base-cased-v1.1")

from pinecone import Pinecone, ServerlessSpec

# create pinecone client
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

INDEX_NAME = "variants-index"
EMBED_DIM = 768 # used dim 768 due to poor embedding performance on 384 dim

# check and create index
existing_indexes = [idx.name for idx in pc.list_indexes()]
if INDEX_NAME not in existing_indexes:
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBED_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"))

index = pc.Index(INDEX_NAME)

# embed and upsert variant
BATCH_SIZE = 64

for i in range(0, len(variants), BATCH_SIZE):
    batch = variants[i:i+BATCH_SIZE]
    texts = [v["interpretation"] for v in batch]
    vectors = embed_model.encode(texts, show_progress_bar=False, convert_to_numpy=True).tolist()
    to_upsert = []
    for vobj, vec in zip(batch, vectors):
        metadata = {
            "variant": vobj.get("variant"),
            "gene": vobj.get("gene"),
            "source": vobj.get("source"),
            "interpretation": vobj.get("interpretation"),
            "id": vobj.get("id")}
        to_upsert.append((vobj["id"], vec, metadata))
    index.upsert(vectors=to_upsert)

print(f"Upserted {len(variants)} variants to Pinecone index '{INDEX_NAME}' successfully.")


Saving variant.json to variant.json
Uploaded file: variant.json
Loaded 70 variant records.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Upserted 70 variants to Pinecone index 'variants-index' successfully.


In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# init embedding model
query_model = SentenceTransformer("dmis-lab/biobert-base-cased-v1.1")

def query_variants(user_query, top_k=3, similarity_threshold=0.7, gene_filter=None):
    """Args:
        user_query: question or query string.
        top_k: number of vectors to retrieve.
        similarity_threshold: minimum cosine similarity required.
        gene_filter: filter results by specific gene.
    Returns:
        list of dicts containing retrieved metadata and score."""
    # query embed
    query_vec = query_model.encode(user_query, convert_to_numpy=True).tolist()

    # optional filter for gene
    filter_dict = {"gene": {"$eq": gene_filter}} if gene_filter else None

    # query pinecone
    response = index.query(
        vector=query_vec,
        top_k=top_k,
        filter=filter_dict,
        include_metadata=True,
        include_values=False)

    # similarity threshold filter
    results = []
    for match in response.matches:
        if match.score >= similarity_threshold:
            results.append({
                "id": match.id,
                "score": match.score,
                "metadata": match.metadata})

    # edge case handle for no matches
    if not results:
        return [{"message": "Insufficient data; consult a clinician."}]

    return results




RETRIEVAL and GENERATION

In [74]:
## retrieving top 3 matches based on user query with cosine similarity > 0.7 --------------------

def query_variants_biobert(user_query, top_k=3, similarity_threshold=0.8, gene_filter=None):
    query_vec = embed_model.encode(user_query, convert_to_numpy=True).tolist()
    filter_dict = {"gene": {"$eq": gene_filter}} if gene_filter else None
    response = index.query(
        vector=query_vec,
        top_k=top_k,
        filter=filter_dict,
        include_metadata=True,
        include_values=False
    )

    results = []
    for match in response.matches:
        if match.score >= similarity_threshold:
            results.append({
                "id": match.id,
                "score": match.score,
                "metadata": match.metadata
            })

    if not results:
        return [{"message": "Insufficient data; consult a clinician."}]

    return results

# query
user_query = "Best drug for BRCA1 mutation?"
gene_filter = "BRCA1"
top_k = 3
similarity_threshold = 0.8

# retrieve top_k matches
top_matches = query_variants_biobert(
    user_query,
    top_k=top_k,
    similarity_threshold=similarity_threshold,
    gene_filter=gene_filter
)

# inspect similarity
print("=== Retrieved Contexts ===")
for i, match in enumerate(top_matches, 1):
    if "message" in match:
        print(match["message"])
    else:
        interp = match['metadata']['interpretation']
        source = match['metadata']['source']
        score = match['score']
        print(f"{i}. {interp} (Source: {source}) | Cosine similarity: {score:.3f}")


=== Retrieved Contexts ===
1. Likely pathogenic; PARP inhibitors recommended. (Source: ClinVar) | Cosine similarity: 0.889
2. Truncating variant; Radiation not advised. (Source: COSMIC) | Cosine similarity: 0.862
3. Truncating variant; Radiation not advised. (Source: ClinVar) | Cosine similarity: 0.862


In [12]:
## RAG chain ---------------------------------------------------------

from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# load biogpt
model_name = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# langchain pipeline wrapper
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200
)
llm = HuggingFacePipeline(pipeline=pipe)


config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Device set to use cpu


In [54]:
def generate_rag_answer_gpt2(user_query, top_k, similarity_threshold, gene_filter):

    top_matches = query_variants_biobert(user_query, top_k, similarity_threshold, gene_filter)

    # handle no data
    if "message" in top_matches[0]:
        return top_matches[0]["message"]

    # merge interpretation that are identical
    merged_contexts = {}
    for match in top_matches:
        interp = match['metadata']['interpretation']
        source = match['metadata']['source']
        if interp not in merged_contexts:
            merged_contexts[interp] = [source]
        else:
            merged_contexts[interp].append(source)

    # build context text
    contexts_text = ""
    for i, (interp, sources) in enumerate(merged_contexts.items(), 1):
        contexts_text += f"{i}. {interp} (Sources: {', '.join(sources)})\n"

    # prompt for gpt2
    prompt = (
        "You are a biomedical assistant. Using the following contexts with cited sources, "
        "provide a clear treatment recommendation. "
        "Do not repeat the query. Summarize concisely in one sentence, citing sources.\n\n"
        f"Gene: {gene_filter if gene_filter else 'Unknown'}\n"
        f"Query: {user_query}\n\n"
        f"Contexts:\n{contexts_text}\n"
        "Answer:"
    )

    answer = pipe(prompt, max_length=200)[0]['generated_text']
    return answer


In [57]:
user_query = "Best treatment for TP53 R130* mutation?"
answer = generate_rag_answer_gpt2(user_query, top_k=2, similarity_threshold=0.8, gene_filter="TP53")
print("=== RAG Answer ===")
print(answer)


=== RAG Answer ===
You are a biomedical assistant. Using the following contexts with cited sources, provide a clear treatment recommendation. Do not repeat the query. Summarize concisely in one sentence, citing sources.

Gene: TP53
Query: Best treatment for TP53 R130* mutation?

Contexts:
1. Founder mutation; Genetic counseling required. (Sources: ClinVar, COSMIC)

Answer: Genetic counseling required to support clinical decisions and counseling resources (Sources: ClinVar, COSMIC).
