In [None]:
%pip install langchain

In [9]:
import os
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import PGVector

In [2]:
# Pages to ingest.
# TODO: move to parameter
urls = [
    "https://kubernetes.io/docs/concepts/workloads/controllers/deployment/",
]

# Load the documents behind each URL.
loader = WebBaseLoader(urls)
documents = loader.load()

In [3]:
# Split the loaded pages into chunks (chunk_size=1000, with an overlap of 20
# between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=1000)
docs = text_splitter.split_documents(documents)

# Report how many chunks were produced from how many source pages.
print(f"{len(docs)} chunks in {len(documents)} pages")

79 chunks in 1 pages


In [4]:
# Sentence-transformers model used to embed each chunk.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Runs on the CPU; switch to dict(device="cuda") to run the model on a GPU instead.
model_kwargs = dict(device="cpu")

embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs, model_name=model_name)

In [10]:
# Build the Postgres connection string. Every setting is read from a PGVECTOR_*
# environment variable; the second argument is the fallback used when it is unset.
# NOTE(review): the password fallback is a hardcoded placeholder -- export
# PGVECTOR_PASSWORD instead of relying on it.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.getenv("PGVECTOR_DRIVER", "psycopg2"),
    host=os.getenv("PGVECTOR_HOST", "localhost"),
    port=int(os.getenv("PGVECTOR_PORT", "5432")),
    database=os.getenv("PGVECTOR_DATABASE", "postgres"),
    user=os.getenv("PGVECTOR_USER", "postgres"),
    password=os.getenv("PGVECTOR_PASSWORD", "secretpassword"),
)

In [11]:
# Name of the collection the chunks are stored under.
COLLECTION_NAME = "kubernetes_concepts"

# Vector store handle built from the embedding model, the connection string
# and the collection name.
db = PGVector(
    embedding_function=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
)

# Add every chunk to the store; the cell displays the value add_documents returns.
# NOTE(review): re-running this cell calls add_documents again -- presumably this
# inserts the same chunks a second time; confirm before re-executing.
db.add_documents(docs)

['ed61a314-76ce-11ee-98e6-0242ac110004',
 'ed61a594-76ce-11ee-98e6-0242ac110004',
 'ed61a62a-76ce-11ee-98e6-0242ac110004',
 'ed61a6b6-76ce-11ee-98e6-0242ac110004',
 'ed61a72e-76ce-11ee-98e6-0242ac110004',
 'ed61a7a6-76ce-11ee-98e6-0242ac110004',
 'ed61a81e-76ce-11ee-98e6-0242ac110004',
 'ed61a896-76ce-11ee-98e6-0242ac110004',
 'ed61a904-76ce-11ee-98e6-0242ac110004',
 'ed61a97c-76ce-11ee-98e6-0242ac110004',
 'ed61a9ea-76ce-11ee-98e6-0242ac110004',
 'ed61aa58-76ce-11ee-98e6-0242ac110004',
 'ed61aad0-76ce-11ee-98e6-0242ac110004',
 'ed61ab48-76ce-11ee-98e6-0242ac110004',
 'ed61abb6-76ce-11ee-98e6-0242ac110004',
 'ed61ac24-76ce-11ee-98e6-0242ac110004',
 'ed61ac92-76ce-11ee-98e6-0242ac110004',
 'ed61ad0a-76ce-11ee-98e6-0242ac110004',
 'ed61ad78-76ce-11ee-98e6-0242ac110004',
 'ed61adf0-76ce-11ee-98e6-0242ac110004',
 'ed61ae68-76ce-11ee-98e6-0242ac110004',
 'ed61aee0-76ce-11ee-98e6-0242ac110004',
 'ed61af4e-76ce-11ee-98e6-0242ac110004',
 'ed61afbc-76ce-11ee-98e6-0242ac110004',
 'ed61b034-76ce-