### Add Documents the standard way


In [4]:
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_postgres import PGVector
from langchain_community.document_loaders import DirectoryLoader
import os
from dotenv import load_dotenv

# Read variables from app/.env (resolved against the current working directory)
# into the process environment. The bare call stays last so the cell still
# displays its return value.
app_dir = os.path.join(os.getcwd(), "app")
dotenv_path = os.path.join(app_dir, ".env")
load_dotenv(dotenv_path)

True

In [12]:
# Embedding client; endpoint and API key are read from the environment rather
# than being written into the notebook.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_EMBEDDING_API_KEY"),
    model="text-embedding-3-small",
    openai_api_version="2023-05-15",
)

# Database URL used by both PGVector and the record manager. It can be
# overridden through PGVECTOR_CONNECTION_STRING so real credentials never have
# to be committed; the default is the original local development database.
CONNECTION_STRING = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://admin:admin@127.0.0.1:5432/vectordb",
)
COLLECTION_NAME = "vectordb"

# Splitter configuration: chunks of at most 200 characters (measured with len)
# that overlap by 20, using the splitter's literal (non-regex) separators.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=20, length_function=len, is_separator_regex=False
)

# Load the files under ./data that match the glob **/*.txt.
loader = DirectoryLoader("./data", glob="**/*.txt")
docs = loader.load()
print(f"{len(docs)} documents loaded!")

# Break the loaded documents into chunks for embedding.
chunks = text_splitter.split_documents(docs)
print(f"{len(chunks)} chunks from {len(docs)} docs created!")

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


3 documents loaded!
56 chunks from 3 docs created!


In [13]:
# Vector store for COLLECTION_NAME in the database at CONNECTION_STRING,
# using the embedding client configured above.
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
)

In [14]:
# Add every chunk to the vector store. The return value, a list of id strings,
# is the cell output.
vectorstore.add_documents(chunks)

['5c4f89f8-8533-4f05-8578-1001a6ecc220',
 'c1607138-2753-4a70-b298-cacff23184f6',
 '661c21ba-b4d0-44f3-bde0-b5c698f91e9e',
 '2171a79b-a15e-486e-a3fe-8340dca22905',
 'd5da0475-d7a4-4e11-8b57-4a2eddf4d2f3',
 '5122584e-74d6-4e4a-ab3d-e05a7394fd0f',
 '20f9ec99-0ded-4a91-aa6f-c4e9695fec13',
 '0a794fdc-a885-4af6-832d-476d9a5d0a50',
 '459d9087-0be7-4afe-9104-d59e1ceabbbd',
 '868a59b6-a131-498c-8ab3-778d4c432a18',
 '33759c99-5be9-46d0-b658-30ae7ef13775',
 '9fd9b2b7-8104-4c1e-8c9e-1e59c473bf9c',
 '7c43f1c1-0245-42a2-b5db-18fc47a2c062',
 'd44557f7-c861-4ff5-a489-5ceced815580',
 '7681fce7-0dee-40e7-aebd-74ef5b3ca2aa',
 '1bad82c7-2116-48af-9dd5-27e55367b21f',
 'aafcd85c-1baa-40f1-bd14-588650b2365e',
 'bec9d8d0-eda1-4c2b-8934-b8a4c0b9cbcd',
 '45a9431b-ba29-4ddb-a0cc-3f30cfeeea6b',
 '49a1b733-aefe-4bfb-befe-4377ab231b3f',
 '2c92fcea-1855-4b31-9ca3-8fbd4809891a',
 '4d4af01f-d619-4b6a-a4b0-b9252c97562a',
 'c9dadd03-c6e2-45c7-b39b-e2fe93360d25',
 '1405d40f-f297-4f1f-910a-aa87289c4580',
 'c27c0b5c-de02-

In [15]:
import psycopg2

TABLE_NAME = "langchain_pg_embedding"
CONN_STRING = "dbname='vectordb' user='admin' host='127.0.0.1' password='admin'"

# TABLE_NAME is a constant defined in this cell, so interpolating it into the
# statement is safe here.
query = f"SELECT COUNT(*) FROM {TABLE_NAME};"

# Count the stored rows. The cursor is closed by its context manager and the
# connection by the finally clause, so neither leaks if the query raises.
conn = psycopg2.connect(CONN_STRING)
try:
    with conn.cursor() as cur:
        cur.execute(query)
        row_count = cur.fetchone()[0]
finally:
    conn.close()

print(f"Total rows in '{TABLE_NAME}': {row_count}")

Total rows in 'langchain_pg_embedding': 56


In [16]:
# Remove every row from the embedding table so the indexing section starts
# from an empty table.
delete_query = f"DELETE FROM {TABLE_NAME};"

conn = psycopg2.connect(CONN_STRING)
try:
    with conn.cursor() as cur:
        cur.execute(delete_query)
    conn.commit()
except Exception:
    # Leave the table untouched if the delete or commit fails, then re-raise.
    conn.rollback()
    raise
finally:
    # Always release the connection, even when the statement raised.
    conn.close()

# Only reached after a successful commit.
print(f"All rows from '{TABLE_NAME}' have been deleted.")

All rows from 'langchain_pg_embedding' have been deleted.


### Indexing API


In [17]:
from langchain.indexes import SQLRecordManager, index

In [18]:
# The record manager keeps its bookkeeping in the same database as the vector
# store, under a namespace derived from the collection name.
namespace = "pgvector/" + COLLECTION_NAME
record_manager = SQLRecordManager(namespace=namespace, db_url=CONNECTION_STRING)

In [19]:
# Create the record manager's schema in the database before the first index() call.
record_manager.create_schema()

Running `index` again on unchanged chunks skips all of them (the output below is from the 2nd run). Update the documents to see some changes.


In [21]:
# Index the chunks with no cleanup mode, keyed by each chunk's "source"
# metadata. The call is the last expression so its result dict is displayed.
index(chunks, record_manager, vectorstore, cleanup=None, source_id_key="source")

{'num_added': 0, 'num_updated': 0, 'num_skipped': 56, 'num_deleted': 0}

In [22]:
from langchain.schema import Document

# Simulate edits to the corpus, mutating `chunks` in place (re-running this cell
# applies the edits again): change one chunk, drop one, and add a new one.
chunks[1].page_content = "updated"
chunks.pop(6)
chunks += [Document(page_content="new content", metadata={"source": "important"})]

In [23]:
# Re-index the edited chunks, still without a cleanup mode; the recorded output
# below reports num_deleted as 0.
index(chunks, record_manager, vectorstore, cleanup=None, source_id_key="source")

{'num_added': 2, 'num_updated': 0, 'num_skipped': 54, 'num_deleted': 0}

In [24]:
# Second round of in-place edits to `chunks`.
chunks[1].page_content = "updated again"

# Each deletion shifts the later elements left by one, so deleting at 2, 3, 4 in
# this order removes the elements that were at positions 2, 4 and 6 beforehand.
for position in (2, 3, 4):
    del chunks[position]

chunks += [Document(page_content="more new content", metadata={"source": "important"})]

In [25]:
# Same call as before, but with cleanup="incremental"; the recorded output
# below now reports a non-zero num_deleted.
index(chunks, record_manager, vectorstore, cleanup="incremental", source_id_key="source")

{'num_added': 2, 'num_updated': 0, 'num_skipped': 52, 'num_deleted': 6}

In [26]:
# Incremental cleanup with an empty input; the recorded output below shows
# nothing added, skipped, or deleted.
index([], record_manager, vectorstore, cleanup="incremental", source_id_key="source")

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [None]:
# Full cleanup with an empty input. This cell has no recorded output.
# NOTE(review): per the LangChain indexing docs, "full" removes previously
# indexed documents that are absent from the current input. Confirm before running.
index(
    [],
    record_manager,
    vectorstore,
    cleanup="full",
    source_id_key="source",
)