<a href="https://colab.research.google.com/github/aiegoo/llmware/blob/tony/LLMRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RAG adventure

In [None]:
from typing import List
import nest_asyncio
import tempfile
import streamlit as st
import pytesseract
from PIL import Image
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.storage.docstore import SimpleDocumentStore


In [None]:
def init_llm():
    """Install the embedding model used by LlamaIndex through the global ``Settings``.

    Only the embedding model is configured; no LLM is assigned to
    ``Settings.llm`` by this function.
    """
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
def store_document(uploaded_file):
    """Load an uploaded file and index it in the persistent Chroma vector store.

    The upload is written to a temporary directory, parsed with
    ``SimpleDirectoryReader`` and embedded into the ``"quickstart"`` collection
    of the Chroma database stored under ``./chroma_db``.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getvalue()`` (as passed
            in from the Streamlit file uploader), or ``None``.

    Returns:
        The ``VectorStoreIndex`` built on top of the Chroma collection, or
        ``None`` when no file was supplied.
    """
    if uploaded_file is None:
        return None

    # Imported locally so the function is self-contained: the file's import
    # cell does not bring these two names into scope.
    import os

    import chromadb

    # The context manager guarantees the temporary copy is removed even when
    # loading or indexing fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        # basename() drops any directory components from the client-supplied
        # name so the file always lands inside the temporary directory.
        temp_file_path = os.path.join(temp_dir, os.path.basename(uploaded_file.name))

        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getvalue())

        documents = SimpleDirectoryReader(input_files=[temp_file_path]).load_data()

        # Persist the embeddings on disk in the "quickstart" collection.
        db = chromadb.PersistentClient(path="./chroma_db")
        chroma_collection = db.get_or_create_collection("quickstart")
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Use the globally configured embedding model (set by init_llm());
        # the bare name `embed_model` is not defined in this scope.
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=Settings.embed_model,
        )

    # The index above is already backed by the persistent collection, so it is
    # returned directly instead of reopening the database to rebuild it.
    st.info(f"PDF loaded into vector store in {len(documents)} documents")
    return index

In [None]:
def main() -> None:
    """Render the Streamlit sidebar controls and index an uploaded file.

    NOTE(review): in the visible source the function ends right after the
    index is built and the index is not used afterwards; confirm whether
    further logic is meant to follow.
    """
    # The only option offered is an empty string, so no real model can be
    # selected here yet.
    llm_model = st.sidebar.selectbox("Select LLM", options=[""])

    # Keep the current selection in the session state. Both branches perform
    # the same assignment: the stored value is (re)written whenever it is
    # missing or differs from the current selection.
    if "llm_model" not in st.session_state:
        st.session_state["llm_model"] = llm_model

    elif st.session_state["llm_model"] != llm_model:
        st.session_state["llm_model"] = llm_model

    uploaded_file = st.sidebar.file_uploader("Upload a file (PDF, Text, or Image)", type=["pdf", "txt", "png", "jpg", "jpeg"])

    # Index the file only once something has actually been uploaded.
    if uploaded_file:
        index = store_document(uploaded_file)