In [3]:
# ollama_rag.py
import traceback
import os
import logging
from typing import List, Optional, Dict, Any
import json
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    PromptTemplate,
    Settings,
    load_index_from_storage,
    Document
)
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from dotenv import load_dotenv
import logging
from IPython.display import Markdown, display
from enum import Enum
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
# llama_index post-processors, chat-engine mode type, and chat memory buffer
from llama_index.core.postprocessor import SentenceEmbeddingOptimizer
from llama_index.core.postprocessor import KeywordNodePostprocessor
from llama_index.core.chat_engine.types import ChatMode
from llama_index.core.memory import ChatMemoryBuffer


## Embedding Model + Simple Chroma Storage

In [8]:
# Embedding model used to vectorize the documents for the index.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Load every readable file found under the sample-docs directory.
documents = SimpleDirectoryReader("../sample_docs").load_data()

# Open (or create) the on-disk Chroma database and its "demo" collection,
# then wrap the collection so LlamaIndex can use it as a vector store.
db = chromadb.PersistentClient(path="chroma_db")
chroma_collection = db.get_or_create_collection("demo")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection lives on disk and survives kernel restarts. Indexing the
# documents unconditionally would append another copy of every chunk each time
# this cell is re-run, so retrieval would start returning duplicates. Only embed
# when the collection is still empty; otherwise attach to the stored vectors.
# NOTE: if the files in ../sample_docs change, clear the "demo" collection
# (or delete the chroma_db directory) to force a rebuild.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embed_model
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store, embed_model=embed_model
    )

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [9]:
# Re-open the persisted Chroma database and rebuild the index object on top of
# the vectors already stored in the "demo" collection (nothing is re-embedded).
db2 = chromadb.PersistentClient(path="chroma_db")
chroma_collection = db2.get_or_create_collection(name="demo")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Reuses `embed_model` from the earlier cell.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

In [11]:
# Ask a question against the persisted index and render the answer in bold.
# NOTE(review): no LLM is passed to as_query_engine(), so this presumably relies
# on a globally configured model (e.g. Settings.llm) set in a cell not shown
# here — confirm it is configured before this cell on a fresh kernel.
query_engine = index.as_query_engine()
response = query_engine.query(
    "what about NVIDIA's ecosystem? And show me its location of the file"
)
display(Markdown("<b>{}</b>".format(response)))

<b>NVIDIA's ecosystem includes optimization stacks, ML and DL libraries, GPU bypassing CPU for remote storage access, GPU to GPU data transfers, software ecosystems like NGC registry and GDS, and target architectures like DGX-2 systems and GPU-heavy clusters. The file location is /mnt/d/nexusync/notebooks/../sample_docs/Nvidia ecosystem.pptx.</b>