In [None]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

# Put the parent of the working directory on the import path so that the
# `backend` package imported in the following cell can be resolved.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
# Guard the append: re-running this cell must not pile up duplicate entries
# in sys.path.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [None]:
from backend.rag_components.main import RAG
from langchain.indexes import SQLRecordManager, index
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
# Load settings from a .env file (presumably including the
# DATABASE_CONNECTION_STRING read from os.environ further down — confirm).
load_dotenv()

In [None]:
# Instantiate the project's RAG component and display its vector store,
# which is the target of every `index(...)` call below.
rag = RAG()
rag.vector_store

In [None]:
# Namespace under which the record manager tracks what has been indexed.
namespace = "chromadb/my_docs"

# Fail fast with an explicit message when the connection string is missing,
# instead of handing `None` to SQLRecordManager and failing less clearly.
db_url = os.environ.get("DATABASE_CONNECTION_STRING")
if not db_url:
    raise RuntimeError(
        "DATABASE_CONNECTION_STRING is not set; define it in the environment "
        "or in the .env file before running this cell."
    )

record_manager = SQLRecordManager(namespace, db_url=db_url)
# Point the record manager at a table in the SQL database.
record_manager.create_schema()

In [None]:
# Splitter configuration: break on blank lines first, then on single newlines,
# producing chunks of at most 1500 characters with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
)

# Load the first CSV file and split the resulting documents into chunks.
loader = CSVLoader(f"{parent_directory}/data/billionaires_csv.csv")
documents = loader.load()
texts = text_splitter.split_documents(documents)

# Preview the first few chunks.
texts[:5]

In [None]:
# Same splitter settings as for the first file: split on blank lines, then on
# newlines, with 1500-character chunks overlapping by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
)

# Load the second ("bis") CSV file and split it into chunks.
loader = CSVLoader(f"{parent_directory}/data/billionaires_csv_bis.csv")
documents = loader.load()
texts_bis = text_splitter.split_documents(documents)

# Preview the first few chunks.
texts_bis[:5]

In [None]:
# Reset step: run the indexer with no documents and cleanup="full".
# NOTE(review): per LangChain's indexing API, "full" cleanup removes every
# previously indexed document that is not part of the current run, so this
# should empty the vector store for this namespace — confirm before running
# against data you want to keep.
index(
    docs_source=[],
    record_manager=record_manager,
    vector_store=rag.vector_store,
    cleanup="full",
    source_id_key="source",
)

In [None]:
# Index the first 100 chunks of the first CSV file in incremental mode,
# using each chunk's "source" metadata entry as its source identifier.
index(
    docs_source=texts[:100],
    record_manager=record_manager,
    vector_store=rag.vector_store,
    cleanup="incremental",
    source_id_key="source",
)

In [None]:
# Index chunks 50..99 of the second ("bis") CSV file in incremental mode,
# again keyed on the "source" metadata entry.
index(
    docs_source=texts_bis[50:100],
    record_manager=record_manager,
    vector_store=rag.vector_store,
    cleanup="incremental",
    source_id_key="source",
)

In [None]:
import os

# print(os.environ.get("APIFY_API_TOKEN"))

from langchain.document_loaders.base import Document
from langchain.utilities import ApifyWrapper
from dotenv import load_dotenv
load_dotenv()

def _to_document(item):
    """Turn one crawled dataset item into a ``Document``.

    The item's "text" value becomes the page content (an empty string when it
    is falsy) and its "url" value is stored under the "source" metadata key.
    """
    return Document(
        page_content=item["text"] or "",
        metadata={"source": item["url"]},
    )


# Input for the crawler actor: a single start URL.
crawl_input = {
    "startUrls": [
        {"url": "https://python.langchain.com/en/latest/modules/indexes/document_loaders.html"}
    ]
}

apify = ApifyWrapper()

# Run the website content crawler; the result is kept in `loader` and is used
# by the following cells (it exposes `apify_client` and `dataset_id`).
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input=crawl_input,
    dataset_mapping_function=_to_document,
)

In [None]:
loader # display the object returned by apify.call_actor(...)

In [None]:
from apify_client import ApifyClient

# Reuse the client already attached to the loader to look at the crawl output.
apify_client = loader.apify_client

# Fetch the items of the dataset referenced by the loader and show how many
# there are.
crawled_items = apify_client.dataset(loader.dataset_id).list_items().items
len(crawled_items)

In [None]:
# Index the crawled pages in incremental mode, keyed on the "source" metadata
# entry (the page URL set by the dataset mapping function).
# `index` expects either a loader or an iterable of Documents. Wrapping the
# loader in a list would make it iterate over the loader object itself as if
# it were a Document, so the loader is passed directly.
index(
    loader,
    record_manager,
    rag.vector_store,
    cleanup="incremental",
    source_id_key="source",
)