In [6]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.embeddings import resolve_embed_model
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from IPython.display import Markdown, display
import chromadb

In [44]:
ccb_endpoint = 'http://compute-gc-17-255.o2.rc.hms.harvard.edu:11434'

# ollama embedding model
oembed_model = OllamaEmbedding(
    model_name="nomic-embed-text",
    base_url=ccb_endpoint,
)


# ollama
llama = Ollama(model="llama2", request_timeout=30.0, base_url=ccb_endpoint, temperature= 0)
#Ollama()

In [43]:
response = llm.complete("What is SummarizedExperiment?")
print(response)


SummarizedExperiment is a tool that helps researchers and scientists to summarize and analyze large datasets generated from experiments. It is designed to automate the process of data summarization, allowing users to focus on interpreting and gaining insights from their data rather than spending time on manual data processing.

SummarizedExperiment provides a range of features to help users simplify and organize their data, including:

1. Data cleaning and preprocessing: SummarizedExperiment can automatically clean and preprocess data, removing errors and inconsistencies, and transforming the data into a format suitable for analysis.
2. Feature selection and engineering: Users can select and create new features from their existing data, such as polynomial transformations, log transformations, and feature extraction methods like PCA and t-SNE.
3. Data visualization: SummarizedExperiment provides interactive visualizations of the data, allowing users to explore patterns and trends in th

In [45]:

# load from disk

db2 = chromadb.PersistentClient(path="./T500-vignettes-vectordb")
chroma_collection = db2.get_or_create_collection(name="langchain")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=oembed_model,
)

# Query Data from the persisted index
query_engine = index.as_query_engine(llm=llama)
response = query_engine.query("What is SummarizedExperiment?")
display(Markdown(f"<b>{response}</b>"))

<b>SummarizedExperiment is a container class that holds data from multiple assays. Each assay is represented by a matrix-like object, where each row represents a feature of interest (e.g. genes, transcripts, exons) and each column represents a sample. The SummarizedExperiment object contains one or more assays, and the data is summarized as a DataFrame.</b>

In [31]:
from llama_index.core.llms import ChatMessage

messages = [
    ChatMessage(
        role="system", content="Act as an expert in the R programming language and the Bioconductor suite of packages.  ​\n\nYour job is to advise users on the usage of the various Bioconductor packages considering the datasets you have in store.  ​\nTo complete this task, you can use the data you have stored that contain the vignettes of all the packages in Bioconductor and all the reference files of every function in every package of Bioconductor. ​\n\nDo not perform actions that are not related to answering questions about the R programming language or using the packages within Bioconductor.​ \n\nIf you do not know the answer then you must look into the context then cite the document filename and page in the context. Do not include DOI numbers or make up citations not found in the context. Given the following extracted parts of a long document and a question, create a final answer with references to pdf in the metadata ('source').\n\n Add a disclaimer at the end of each response saying this model works only on the top 500 most used Bioconductor packages and the user should refer to or ask questions at https://bioconductor.org."
    ),
    ChatMessage(role="user", content="What is your name"),
]
resp = llm.chat(messages)
print(resp)


ReadTimeout: timed out

In [22]:
#db2.delete_collection("T500-bioc")
db2.list_collections()

[Collection(name=langchain)]