In [1]:
import os

import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
#from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter



# Connect to the VPN or Harvard Wi-Fi before running the cells below so the endpoint can be reached.

In [2]:
# Base URL of the Ollama server that hosts the LLM.
# The default is the endpoint this notebook was written against; set the
# OLLAMA_BASE_URL environment variable to point the notebook at a different
# server without editing this cell.
ccb_endpoint = os.environ.get(
    "OLLAMA_BASE_URL",
    "http://compute-gc-17-255.o2.rc.hms.harvard.edu:11434",
)


In [3]:
# Generation model: llama2, reached through the Ollama server at ccb_endpoint.
# temperature=0 asks the backend for its least random sampling.
llm = Ollama(
    base_url=ccb_endpoint,
    model="llama2",
    temperature=0,
)

# Embedding model used to embed queries for the vector store further down.
oembed = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Now we can generate a response from the model. Try it out by putting your own question between the quotes below.

In [4]:
# Ask the bare model (no retrieval) a free-form question and print its reply.
chat_model_response = llm.invoke(
    "what is summarizedExperiment"
)
print(chat_model_response)


SummarizedExperiment is a tool in the Google Cloud Platform that allows you to summarize and analyze large datasets. It is designed to help data scientists and analysts quickly and easily identify the most important insights from complex datasets, without having to spend hours or days manually analyzing the data.

SummarizedExperiment uses machine learning algorithms to automatically generate a summary of the dataset, including key findings, trends, and patterns. This summary is presented in an easy-to-understand format, such as a report or dashboard, that can be shared with stakeholders.

Some of the key features of SummarizedExperiment include:

1. Automated summarization: SummarizedExperiment uses machine learning algorithms to automatically generate a summary of the dataset, without requiring manual analysis.
2. Customizable summaries: Users can customize the summary to focus on specific aspects of the data, such as trends, patterns, or insights.
3. Interactive visualizations: Sum

### Load from an existing Chroma DB. Your Docker container ships with a Chroma database containing the manuals (vignettes) for the top 500 most-downloaded Bioconductor packages.

In [5]:
# Open the Chroma store persisted on disk and hand it the embedding object
# (oembed) it should use for queries.
# NOTE(review): the stored vectors were presumably built with this same
# embedding model — confirm before swapping oembed.
T500bioc_db = Chroma(
    persist_directory="/tmp/T500-vignettes-vectordb-ST/",
    embedding_function=oembed,
)

#### We can now use the embedding model and vector database as a retriever to guide the generative AI (llama2). Below we define a system prompt and the RAG pipeline as a "chain" written in LangChain Expression Language (LCEL).

In [8]:
# Wrap the vector store as a retriever: a similarity search that is asked
# for the 6 closest stored chunks per query.
retriever = T500bioc_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

def format_docs(docs):
    """Join retrieved documents into one context string for the prompt.

    The prompt in this notebook tells the model to cite the document
    filename/page and the 'source' metadata, so each chunk is prefixed with
    a ``[source: ..., page: ...]`` header built from ``doc.metadata`` when
    those keys are present. A document with no usable metadata contributes
    only its ``page_content``, exactly as before.

    Args:
        docs: Iterable of objects exposing ``page_content`` (str) and,
            optionally, a ``metadata`` mapping.

    Returns:
        The formatted chunks separated by a blank line; ``""`` for no docs.
    """
    sections = []
    for doc in docs:
        # Tolerate objects without a metadata attribute or with metadata=None.
        meta = getattr(doc, "metadata", None) or {}
        labels = []
        if meta.get("source"):
            labels.append(f"source: {meta['source']}")
        # Compare against None so that page 0 is still reported.
        if meta.get("page") is not None:
            labels.append(f"page: {meta['page']}")
        header = f"[{', '.join(labels)}]\n" if labels else ""
        sections.append(header + doc.page_content)
    return "\n\n".join(sections)
from langchain_core.prompts import ChatPromptTemplate

# System prompt for the RAG chain. {question} receives the user's question
# and {context} receives the text produced from the retrieved documents.
# The literal is not a raw string, so each "\n" below becomes a real newline.
# Invisible zero-width-space characters (U+200B) that had crept into the
# original text have been removed; the wording is otherwise unchanged.
template = """
Act as an expert in the R programming language and the Bioconductor suite of packages.  \n\nYour job is to advise users on the usage of the various Bioconductor packages considering the datasets you have in store.  \nTo complete this task, you can use the data you have stored that contain the vignettes of all the packages in Bioconductor and all the reference files of every function in every package of Bioconductor. \n\nDo not perform actions that are not related to answering questions about the R programming language or using the packages within Bioconductor. \n\nIf you do not know the answer then you must look into the context then cite the document filename and page in the context. Do not include DOI numbers or make up citations not found in the context. 
Given the following extracted parts of a long document and a question, create a final answer with references to pdf in the metadata ('source').\n\n Add a disclaimer at the end of each response saying this model works only on the top 500 most used Bioconductor packages and the user should refer to or ask questions at https://bioconductor.org.

QUESTION: {question}
=========
{context}
=========
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline in LCEL. The input question is fanned out into the two
# template variables, then passed through prompt -> LLM -> plain string.
rag_chain = (
    {
        # Retrieve documents for the question and flatten them to text.
        "context": retriever | format_docs,
        # Forward the question itself untouched.
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

### Test the LLM without RAG

In [6]:
# Baseline: put the question to the model directly, with no retrieved context.
chat_model_response = llm.invoke(
    "How many classes are there in the SummarizedExperiment package?"
)
print(chat_model_response)


The `SummarizedExperiment` package provides a variety of classes for summarizing and visualizing experimental data. Here are some of the main classes provided by the package:

1. `Summary`: This is the base class for all summary objects in the package. It provides methods for calculating summary statistics, such as means, standard deviations, and counts, as well as methods for visualizing the data.
2. `Experiment`: This class represents an entire experiment, including all of the observations and variables. It provides methods for calculating summary statistics for the entire experiment, as well as methods for visualizing the data.
3. `Observation`: This class represents a single observation in an experiment. It provides methods for calculating summary statistics for a single observation, as well as methods for visualizing the data.
4. `Variable`: This class represents a variable in an experiment. It provides methods for calculating summary statistics for a single variable, as well as 

### Now invoke the LLM with the RAG chain

In [9]:
# Same question through the retrieval-augmented chain, printing the answer
# piece by piece as it is streamed back.
rag_stream = rag_chain.stream
for piece in rag_stream("How many classes are there in the SummarizedExperiment package?"):
    print(piece, end="", flush=True)

According to the documentation provided, the SummarizedExperiment package contains two classes: `SummarizedExperiment` and `RangedSummarizedExperiment`. The `SummarizedExperiment` class is a matrix-like container that represents features of interest (such as genes, transcripts, exons, etc.) and samples. It can contain one or more assays, each represented by a matrix-like object of numeric or other mode.

The `RangedSummarizedExperiment` class is an extension of the `SummarizedExperiment` class that allows for the representation of ranges of values for the features of interest.

Therefore, there are two classes in the SummarizedExperiment package: `SummarizedExperiment` and `RangedSummarizedExperiment`.

In [None]:
# Blocking variant: invoke() returns the whole answer at once instead of
# streaming it.
query = "How many classes are there in the SummarizedExperiment package?"
response = rag_chain.invoke(query)

# Bare last expression so the notebook renders the answer as the cell output.
response

In [None]:
# Batch run: answer every question listed in bioc_qa.csv with the RAG chain.
# The file is read from the current working directory and must contain a
# 'Question' column.
import polars as pl

df = pl.read_csv('bioc_qa.csv')
questions = df['Question'].to_list()

# A comprehension keeps the loop variable out of the notebook namespace, so
# the `query` string defined in an earlier cell is no longer overwritten by
# the last question in the file. Questions are processed one at a time, in
# file order, so answers_rag[i] is the answer to questions[i].
answers_rag = [rag_chain.invoke(question) for question in questions]

answers_rag
    

In [None]:

# Control prompt without retrieval: same persona and disclaimer as the RAG
# prompt, but only {question} is filled in (there is no {context} slot).
# An invisible zero-width-space character (U+200B) that had crept into the
# original text has been removed; the wording is otherwise unchanged.
plain_template = """
Act as an expert in the R programming language and the Bioconductor suite of packages. 
Your job is to advise users on the usage of the various Bioconductor packages.  
Do not perform actions that are not related to answering questions about the R programming language or using the packages within Bioconductor. 
If you do not know the answer ask the user to refer to https://bioconductor.org. 
Add a disclaimer at the end of each response saying this model works only on the top 500 most used Bioconductor packages.

QUESTION: {question}
FINAL ANSWER: """

# Bound to its own name so that the `prompt` object built for the RAG chain
# is not silently replaced when this cell runs.
plain_prompt = ChatPromptTemplate.from_template(plain_template)

# Same pipeline shape as the RAG chain minus the retrieval step:
# question -> prompt -> LLM -> plain string.
non_rag_chain = (
    {"question": RunnablePassthrough()}
    | plain_prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Stream the no-retrieval chain's answer, printing each piece as it arrives.
non_rag_stream = non_rag_chain.stream
for piece in non_rag_stream("What is SummarizedExperiment?"):
    print(piece, end="", flush=True)