In [None]:
from __future__ import annotations

import dotenv


# Load "../.env" into the process environment; override=True lets values from
# the file replace variables that are already set.
dotenv.load_dotenv(dotenv_path="../.env", override=True)

# Run the basic example from the LangChain Vectorstores Chroma page

[Read more here](https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/)

# Basic Example

In [None]:
import rich
from goob_ai import debugger
import logging
import shutil
import logging
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any, List, Tuple

import faiss

from goob_ai.services import (
    answer_question_from_context,
    bm25_retrieval,
    create_question_answer_from_context_chain,
    encode_from_string,
    encode_pdf,
    get_chunk_by_index,
    read_pdf_to_string,
    replace_t_with_space,
    retrieve_context_per_question,
    retrieve_with_context_overlap,
    show_context,
    split_text_to_chunks_with_indices,
    text_wrap,
)
from langchain.vectorstores import FAISS, VectorStore
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSerializable
from langchain_core.vectorstores.base import VectorStoreRetriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from loguru import logger as LOGGER
from rank_bm25 import BM25Okapi

from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter

from goob_ai.bot_logger import get_logger, global_log_config
from goob_ai.aio_settings import aiosettings, get_rich_console


# Configure project logging at DEBUG level with JSON output disabled.
# logging.DEBUG is the same integer that logging.getLevelName("DEBUG") returns.
global_log_config(log_level=logging.DEBUG, json=False)


In [None]:
# Relative path of the sample PDF that is indexed below.

path = "example_data/Understanding_Climate_Change.pdf"


In [None]:
def get_faiss_vector_store(path_to_pdf: Path, chunk_size: int = 400, chunk_overlap: int = 200) -> Tuple[VectorStore, List[Document], str]:
    """Build a FAISS vector store from the contents of a PDF.

    The PDF is read into a single string, split with
    ``split_text_to_chunks_with_indices``, and the resulting documents are
    embedded with OpenAI's ``text-embedding-3-large`` model and added to a
    FAISS store backed by a flat L2 index and an in-memory docstore.

    Args:
        path_to_pdf: Location of the PDF. The value is interpolated into a
            string before reading, so a ``str`` path works as well as a ``Path``.
        chunk_size: Passed through to ``split_text_to_chunks_with_indices``.
        chunk_overlap: Passed through to ``split_text_to_chunks_with_indices``.

    Returns:
        A ``(vectorstore, docs, content)`` tuple: the populated FAISS store,
        the documents produced by the splitter, and the raw PDF text.
    """
    content = read_pdf_to_string(f"{path_to_pdf}")
    docs = split_text_to_chunks_with_indices(content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    # Size the flat L2 index from the length of a probe embedding so the
    # dimension always matches whatever the embedding model returns.
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

    # Construct the store explicitly and then add the documents.
    # FAISS.from_documents creates its own index and docstore and forwards
    # unknown keywords to the FAISS constructor, so handing it `index=` /
    # `docstore_cls=` fails instead of reusing the index built above.
    vectorstore = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    vectorstore.add_documents(docs)
    return vectorstore, docs, content

In [None]:
# Build the FAISS store for the sample PDF, keeping the chunk documents and
# the raw text alongside it.
vectorstore, docs, content = get_faiss_vector_store(path)

# Retriever limited to a single result per query (k=1).
chunks_query_retriever = vectorstore.as_retriever(search_kwargs=dict(k=1))
# Separate embeddings client using the text-embedding-3-large model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [None]:
# Sanity check: look up a chunk with index 0 in the store and print its text.
chunk = get_chunk_by_index(vectorstore, 0)
rich.print(chunk.page_content)

# Comparing regular retrieval and retrieval with a context window

In [None]:
# Baseline approach: plain retrieval of the best-matching chunk.
query = "Explain the role of deforestation and fossil fuels in climate change."
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated. The retriever was already created with k=1, so no per-call
# override is needed.
baseline_chunk = chunks_query_retriever.invoke(query)

# Focused context enrichment approach: same query, but the helper is asked to
# include one neighbouring chunk (num_neighbors=1). chunk_size / chunk_overlap
# repeat the values used when the store was built.
enriched_chunks = retrieve_with_context_overlap(
    vectorstore,
    chunks_query_retriever,
    query,
    num_neighbors=1,
    chunk_size=400,
    chunk_overlap=200,
)

rich.print("Baseline Chunk:")
rich.print(baseline_chunk[0].page_content)
rich.print("\nEnriched Chunks:")
rich.print(enriched_chunks[0])