In [2]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.llms import Ollama
#from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_community.embeddings import OllamaEmbeddings

# log in to VPN or harvard wifi for endpoint

In [3]:
#create the LLM 
ccb_endpoint = 'http://compute-gc-17-255.o2.rc.hms.harvard.edu:11434'

llm = Ollama(base_url= ccb_endpoint, model="llama2", temperature=0)
#create the embedding model
oembed = OllamaEmbeddings(base_url=ccb_endpoint, model="nomic-embed-text")

# the following code blocks are to show the ingest process for the vector storage, they do not need to be run every time.

In [18]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader('/n/data1/hms/ccb/projects/bioc-chatbot/data/bioc-top-500/', glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader)
all_docs = loader.load()

  8%|▊         | 87/1030 [00:49<17:29,  1.11s/it]Multiple definitions in dictionary at byte 0x1d8b for key /Group
 77%|███████▋  | 797/1030 [13:40<05:05,  1.31s/it]  Multiple definitions in dictionary at byte 0x125d for key /Group
100%|██████████| 1030/1030 [16:15<00:00,  1.06it/s]


In [None]:
#save to persistent chroma-db
from langchain_community.vectorstores import Chroma

# split it into chunks (i think azure search default is size=1000 and chunk overlap=200)
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#after a few failures it seems that the context window, while larger than 003-ada, is 8000 tokens the vector is only 512 dim long meaning it can only take 500 tokens
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
doc2vect = text_splitter.split_documents(all_docs)

db2 = Chroma.from_documents(doc2vect, oembed, persist_directory="/n/data1/hms/ccb/projects/bioc-chatbot/data/chromadb_vignettesT500_csize500_overlap10")



#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#splits = text_splitter.split_documents(docs)
#vectorstore = Chroma.from_documents(documents=splits, embedding=oembed)

#this is done in a seperate slurm script

# load from existing chromaDB

In [4]:
#load from persist 
T500bioc_db = Chroma(persist_directory="/n/data1/hms/ccb/projects/bioc-chatbot/data/chromadb_vignettesT500_csize500_overlap10", embedding_function=oembed)

In [5]:
#test retrival

retriever = T500bioc_db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

retrieved_docs = retriever.invoke("How many classes are there in a SummarizedExperiment object?")
print(retrieved_docs[0].page_content)

22 SummarizedExperiment-class
SummarizedExperiment-class
SummarizedExperiment objects
Description
The SummarizedExperiment class is a matrix-like container where rows represent features of in-
terest (e.g. genes, transcripts, exons, etc...) and columns represent samples (with sample data
summarized as a DataFrame). A SummarizedExperiment object contains one or more assays, each
represented by a matrix-like object of numeric or other mode.


In [20]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = T500bioc_db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
from langchain_core.prompts import ChatPromptTemplate

template = """
Act as an expert in the R programming language and the Bioconductor suite of packages.  ​\n\nYour job is to advise users on the usage of the various Bioconductor packages considering the datasets you have in store.  ​\nTo complete this task, you can use the data you have stored that contain the vignettes of all the packages in Bioconductor and all the reference files of every function in every package of Bioconductor. ​\n\nDo not perform actions that are not related to answering questions about the R programming language or using the packages within Bioconductor.​ \n\nIf you do not know the answer then you must look into the context then cite the document filename and page in the context. Do not include DOI numbers or make up citations not found in the context. 
Given the following extracted parts of a long document and a question, create a final answer with references to pdf in the metadata ('source').\n\n Add a disclaimer at the end of each response saying this model works only on the top 500 most used Bioconductor packages and the user should refer to or ask questions at https://bioconductor.org.

QUESTION: {question}
=========
{context}
=========
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [21]:
for chunk in rag_chain.stream("What is SummarizedExperiment?"):
    print(chunk, end="", flush=True)

The SummarizedExperiment class in Bioconductor is a matrix-like container that stores features of interest (e.g. genes, transcripts, exons, etc.) and samples. It is similar to the historical ExpressionSet, but allows for more flexible row information, allowing both GRanges based as well as those described by arbitrary DataFrame s. This makes it ideal for a variety of experiments, particularly sequencing-based experiments such as RNA-Seq and ChIp-Seq.

The SummarizedExperiment package contains two classes: SummarizedExperiment and RangedSummarizedExperiment. The SummarizedExperiment class is the main class used to store data, while the RangedSummarizedExperiment class provides a way to construct SummarizedExperiments with multiple assays.

The constructor for SummarizedExperiment instances is documented in the ?RangedSummarizedExperiment function. The objects contain one or more assays, each represented by a matrix-like object of numeric or other mode. The rows of a SummarizedExperiment

In [25]:
#without streaming loop
query = "How many classes are there in the SummarizedExperiment package?"
response = rag_chain.invoke(query)
response

'According to the documentation provided, the SummarizedExperiment package contains two classes: `SummarizedExperiment` and `RangedSummarizedExperiment`. (Ref: [1], [2])\n\nTherefore, the answer to the question is: There are 2 classes in the SummarizedExperiment package.\n\nReferences:\n[1] Morgan, M. (2024). tidySummarizedExperiment: Brings SummarizedExperiment to the Tidyverse. Version 1.12.0. Retrieved from <https://bioconductor.org/packages/release/bioc/html/tidySummarizedExperiment.html>\n[2] SummarizedExperiment Package. (2024). SummarizedExperiment. Retrieved from <https://bioconductor.org/packages/release/bioc/html/SummarizedExperiment.html>'

In [28]:
#programmatic access to csv
import polars as pl
df = pl.read_csv('bioc_qa.csv')
questions = df['Question'].to_list()
answers_rag = []

for query in questions:
    result = rag_chain.invoke(query)
    answers_rag.append(result)

answers_rag
    

['Thank you for reaching out to me as an expert in the R programming language and the Bioconductor suite of packages. I understand that you are a bit confused about the concepts of FDR, FDR-adjusted p-value, and q-value. Let me clarify these terms for you.\n\nFDR (False Discovery Rate) is a measure of the proportion of false positives among all rejected hypotheses. In other words, it measures the number of genes that are declared differentially expressed when they are actually not. FDR is calculated as the number of false positives divided by the total number of rejected hypotheses.\n\nFDR-adjusted p-value is a modified p-value that takes into account the FDR. It is calculated by multiplying the original p-value by the FDR. For example, if the original p-value is 0.05 and the FDR is 0.1, the FDR-adjusted p-value would be 0.05 x 0.1 = 0.005.\n\nq-value, on the other hand, is a measure of the probability that a gene is differentially expressed given the observed data, under the assumptio

In [40]:
#s = pl.Series("Response_llama2_Bioc_RAG",answers_rag)
df = df.with_columns(pl.Series(name="Response_llama2_Bioc_RAG", values=answers_rag)) 
df

AID,QID,Question,Response,Response_llama2_Bioc_RAG
str,str,str,str,str
"""answer1""","""question1""","""I am a bit con…","""The thing to u…","""Thank you for …"
"""answer2""","""question2""","""I am working o…","""Just to be cle…","""Thank you for …"
"""answer3""","""question3""","""I am new in th…","""There is no go…","""As an expert i…"
"""answer4""","""question4""","""I am testing s…","""To answer your…","""Thank you for …"
"""answer5""","""question5""","""In all RNA-seq…","""The most compl…","""The dispersion…"
"""answer6""","""question6""","""I know findOve…","""From the discu…","""It seems like …"
"""answer7""","""question7""","""I have just do…","""I wrote two he…","""The question y…"
"""answer8""","""question8""","""How can I filt…","""If you want to…","""To filter out …"
"""answer9""","""question9""","""I am analysing…","""You can use th…","""As an expert i…"
"""answer10""","""question10""","""How do I merge…","""Merge is a pre…","""To merge a lis…"


In [43]:

plain_template = """
Act as an expert in the R programming language and the Bioconductor suite of packages. 
Your job is to advise users on the usage of the various Bioconductor packages.  
Do not perform actions that are not related to answering questions about the R programming language or using the packages within Bioconductor.​ 
If you do not know the answer ask the user to refer to https://bioconductor.org. 
Add a disclaimer at the end of each response saying this model works only on the top 500 most used Bioconductor packages.

QUESTION: {question}
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(plain_template)

non_rag_chain = (
    {"question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [44]:
for chunk in non_rag_chain.stream("What is SummarizedExperiment?"):
    print(chunk, end="", flush=True)

As an expert in the R programming language and the Bioconductor suite of packages, I can advise you on the usage of various Bioconductor packages. The package "SummarizedExperiment" is a popular package in Bioconductor that provides a comprehensive summary of gene expression data. It allows users to summarize and visualize large-scale gene expression data sets, including RNA-seq, microarray, and other types of gene expression data.

The SummarizedExperiment package provides several functions for summarizing and visualizing gene expression data, including:

1. `summarize()`: This function generates a summary of the gene expression data, including the mean, standard deviation, minimum, maximum, and median expression levels for each gene.
2. `plot()`: This function creates a variety of plots to visualize the gene expression data, such as heatmaps, scatter plots, and box plots.
3. `cluster()`: This function groups genes into clusters based on their expression patterns, allowing users to id

In [45]:
#run the loop
#programmatic access to csv
questions = df['Question'].to_list()
answers_non_rag = []

for query in questions:
    result = non_rag_chain.invoke(query)
    answers_non_rag.append(result)

answers_non_rag
    

['As an expert in the R programming language and the Bioconductor suite of packages, I\'d be happy to help you understand the concepts of FDR, FDR-adjusted p-value, and q-value.\n\nFDR (False Discovery Rate) is a method for controlling the family-wise error rate (FWER) in multiple testing scenarios. It is a measure of the proportion of false positives among all rejected hypotheses. In other words, it\'s a measure of how many "real" discoveries are lost due to the stringent significance threshold.\n\nFDR-adjusted p-value, on the other hand, is a way to adjust the traditional p-value to account for the multiple testing problem. It\'s calculated as the original p-value multiplied by the number of genes/rank of the gene. This means that if you have 100 genes tested and 5 are found to be differentially expressed at an FDR cutoff of 0.05, the FDR-adjusted p-value would be 0.05 x 100 = 5.\n\nSo, to answer your question, no, FDR-adjusted p-value is not the same as q-value. Q-value is a measure

In [46]:
df = df.with_columns(pl.Series(name="Response_llama2_Temp0", values=answers_non_rag)) 
df

AID,QID,Question,Response,Response_llama2_Bioc_RAG,Response_llama2_Temp0
str,str,str,str,str,str
"""answer1""","""question1""","""I am a bit con…","""The thing to u…","""Thank you for …","""As an expert i…"
"""answer2""","""question2""","""I am working o…","""Just to be cle…","""Thank you for …","""As an expert i…"
"""answer3""","""question3""","""I am new in th…","""There is no go…","""As an expert i…","""As an expert i…"
"""answer4""","""question4""","""I am testing s…","""To answer your…","""Thank you for …","""As an expert i…"
"""answer5""","""question5""","""In all RNA-seq…","""The most compl…","""The dispersion…","""As an R progra…"
"""answer6""","""question6""","""I know findOve…","""From the discu…","""It seems like …","""Thank you for …"
"""answer7""","""question7""","""I have just do…","""I wrote two he…","""The question y…","""As an expert i…"
"""answer8""","""question8""","""How can I filt…","""If you want to…","""To filter out …","""As an expert i…"
"""answer9""","""question9""","""I am analysing…","""You can use th…","""As an expert i…","""As an expert i…"
"""answer10""","""question10""","""How do I merge…","""Merge is a pre…","""To merge a lis…","""As an R expert…"


In [49]:
import pathlib

path: pathlib.Path = "llama_bioc_qa.csv"
df.write_csv(path, separator=",")

Below is an attempt to use parallel running chain for document citation retrival

In [59]:
query = "How many classes are there in a SummarizedExperiment object?"
res = T500bioc_db.similarity_search(query)
print(res[0].page_content[:100])

22 SummarizedExperiment-class
SummarizedExperiment-class
SummarizedExperiment objects
Description
Th


In [24]:
#adding sources
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke("What is SummarizedExperiment?")

{'context': [Document(page_content='22 SummarizedExperiment-class\nSummarizedExperiment-class\nSummarizedExperiment objects\nDescription\nThe SummarizedExperiment class is a matrix-like container where rows represent features of in-\nterest (e.g. genes, transcripts, exons, etc...) and columns represent samples (with sample data\nsummarized as a DataFrame). A SummarizedExperiment object contains one or more assays, each\nrepresented by a matrix-like object of numeric or other mode.', metadata={'page': 21, 'source': '/n/data1/hms/ccb/projects/bioc-chatbot/data/bioc-top-500/SummarizedExperiment_ SummarizedExperiment container.pdf'}),
  Document(page_content='SummarizedExperiment  object.', metadata={'page': 9, 'source': '/n/data1/hms/ccb/projects/bioc-chatbot/data/bioc-top-500/proDA_data-import.pdf'}),
  Document(page_content='incorrect results and retractions so this is a very desirable property.\nSummarizedExperiment  is in many ways similar to the historical ExpressionSet , the main\nd

In [50]:
for chunk in rag_chain_with_source.stream("How many classes are there in a SummarizedExperiment object?"):
    print(chunk, end="", flush=True)

{'question': 'How many classes are there in a SummarizedExperiment object?'}{'context': [Document(page_content='22 SummarizedExperiment-class\nSummarizedExperiment-class\nSummarizedExperiment objects\nDescription\nThe SummarizedExperiment class is a matrix-like container where rows represent features of in-\nterest (e.g. genes, transcripts, exons, etc...) and columns represent samples (with sample data\nsummarized as a DataFrame). A SummarizedExperiment object contains one or more assays, each\nrepresented by a matrix-like object of numeric or other mode.', metadata={'page': 21, 'source': '/n/data1/hms/ccb/projects/bioc-chatbot/data/bioc-top-500/SummarizedExperiment_ SummarizedExperiment container.pdf'}), Document(page_content='SummarizedExperiment  object.', metadata={'page': 9, 'source': '/n/data1/hms/ccb/projects/bioc-chatbot/data/bioc-top-500/proDA_data-import.pdf'}), Document(page_content='(https://bioconductor.org/packages/3.18/SummarizedExperiment) objects as input\nand/or outpu