# pgvector quickstart


### Import libraries and environment variables

In [1]:
from __future__ import annotations

import dotenv


# Reload the variables in your '.env' file (override the existing variables)
dotenv.load_dotenv("../../.env", override=True)

True

In [None]:
import os
import sys
from dotenv import load_dotenv
import logging



sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..'))) # Add the parent directory to the path since we work with notebooks
from helper_functions import *
from evaluation.evalute_rag import *

from logging_utils import get_logger, global_log_config


global_log_config(
    log_level=logging.getLevelName("DEBUG"),
    json=False,
)


# # Load environment variables from a .env file
# load_dotenv()

# # Set the OpenAI API key environment variable
# os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

### Read Docs

In [None]:
# SOURCE: https://www.youtube.com/watch?v=Ff3tJ4pJEa4
import pinecone
from dotenv import load_dotenv
from pgvector_service import PgvectorService
import os
import time
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import PGVector, Pinecone
from langchain_text_splitters import CharacterTextSplitter

# Load the documents

> 4. Initialize a LangChain vector store
> Now that you've built your Pinecone index, you need to initialize a LangChain vector store using the index. This step uses the OpenAI API key you set as an environment variable earlier. Note that OpenAI is a paid service and so running the 
> remainder of this tutorial may incur some small cost.

In [4]:
loader = TextLoader(
    "../../data/The Project Gutenberg eBook of A Christmas Carol in Prose; Being a Ghost Story of Christmas.txt"
)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Create a unique ID for each document
# SOURCE: https://github.com/theonemule/azure-rag-sample/blob/1e37de31678ffbbe5361a8ef3acdb770194f462a/import.py#L4
for idx, doc in enumerate(docs):
    doc.metadata["id"] = str(idx)

# vectorstore.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])

# embeddings = OpenAIEmbeddings()

from langchain_openai import OpenAIEmbeddings
# get openai api key from platform.openai.com
model_name = 'text-embedding-ada-002'
embeddings = OpenAIEmbeddings(
    model=model_name,
)

query = "The Project Gutenberg eBook of A Christmas Carol in Prose; Being a Ghost Story of Christmas"


"""
First, we compare to Pinecone, a managed vector store service.

"""

'\nFirst, we compare to Pinecone, a managed vector store service.\n\n'

# Create / Load the Pinecone index

In [5]:
use_serverless = True

# pinecone.init(
#     api_key=os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENV")
# )

# SOURCE: https://docs.pinecone.io/integrations/langchain#4-initialize-a-langchain-vector-store

import os
# from pinecone.control.pinecone import Pinecone
from pinecone.grpc.pinecone import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec, PodSpec
import time
from langchain_pinecone import PineconeVectorStore
import rich

pc: Pinecone = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY")
)

if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
else:
    # if not using a starter index, you should specify a pod_type too
    spec = PodSpec()

index_name = "demo-index"


# delete all indexes
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)


rich.print(pc.list_indexes())
rich.print(pc.list_indexes().names())

if index_name not in pc.list_indexes().names():

    pc.create_index(name=index_name, metric="cosine", dimension=1536, spec=spec)

    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

    vectorstore = PineconeVectorStore(
        index_name=index_name, embedding=embeddings, text_key="text"
    )
    vectorstore.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])

else:
    print(f"Index {index_name} already exists")
    vectorstore = PineconeVectorStore(
        index_name=index_name, embedding=embeddings, text_key="text"
    )



index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 96}},
 'total_vector_count': 96}

# Initialize the LangChain vector store: 

> https://docs.pinecone.io/integrations/langchain#4-initialize-a-langchain-vector-store


The text_field parameter sets the name of the metadata field that stores the raw text when you upsert records using a LangChain operation such as vectorstore.from_documents or vectorstore.add_texts. This metadata field is used as the page_content in the Document objects retrieved from query-like LangChain operations such as vectorstore.similarity_search. If you do not specify a value for text_field, it will default to "text".

In [6]:
from langchain_pinecone import PineconeVectorStore
text_field = "text"
vectorstore = PineconeVectorStore(
    index_name=index_name, embedding=embeddings, text_key="text"
)

# Now you can query the vector store directly using vectorstore.similarity_search:

In [8]:
vectorstore.similarity_search(
    query,  # our search query
    k=4  # return 3 most relevant docs
)

[Document(metadata={'id': '0', 'source': '../../data/The Project Gutenberg eBook of A Christmas Carol in Prose; Being a Ghost Story of Christmas.txt'}, page_content="\ufeffThe Project Gutenberg eBook of A Christmas Carol in Prose; Being a Ghost Story of Christmas\n\nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: A Christmas Carol in Prose; Being a Ghost Story of Christmas\n\n\nAuthor: Charles Dickens\n\nIllustrator: John Leech\n\nRelease date: August 11, 2004 [eBook #46]\n                Most recently updated: October 17, 2021\n\nLanguage: English\n\n*** START OF THE PROJECT GUTENB

# Query the index with LanChain

In [9]:
def run_query_pinecone(docsearch: PineconeVectorStore, query: str):
    docs = docsearch.similarity_search(query, k=4)
    result = docs[0].page_content
    return result


def calculate_average_execution_time(func, *args, **kwargs):
    total_execution_time = 0
    num_runs = 10
    for _ in range(num_runs):
        start_time = time.time()
        result = func(*args, **kwargs)  # Execute the function with its arguments
        end_time = time.time()
        execution_time = end_time - start_time
        total_execution_time += execution_time
    average_execution_time = round(total_execution_time / num_runs, 2)
    print(result)
    print(
        f"\nThe function took an average of {average_execution_time} seconds to execute."
    )
    return


calculate_average_execution_time(
    run_query_pinecone, docsearch=vectorstore, query=query
)


"""
Now, we compare to PGVector, an open source vector store service.

"""

The Project Gutenberg eBook of A Christmas Carol in Prose; Being a Ghost Story of Christmas

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: A Christmas Carol in Prose; Being a Ghost Story of Christmas


Author: Charles Dickens

Illustrator: John Leech

Release date: August 11, 2004 [eBook #46]
                Most recently updated: October 17, 2021

Language: English

*** START OF THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL IN PROSE; BEING A GHOST STORY OF CHRISTMAS ***

A CHRISTMAS CAROL

IN PROSE
BEING
A Ghost Story of Christmas

by Charles Dickens

PREFACE

I HAVE endeavoured in this 

'\nNow, we compare to PGVector, an open source vector store service.\n\n'

# Create a PGVector Store

In [10]:

"""
Donwload postgresql to run locally:
https://www.postgresql.org/download/

How to install the pgvector extension:
https://github.com/pgvector/pgvector

Fix common installation issues:
https://github.com/pgvector/pgvector?tab=readme-ov-file#installation-notes
"""

COLLECTION_NAME = "The Project Gutenberg eBook of A Christmas Carol in Prose"

CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.environ.get("PGVECTOR_DRIVER", "psycopg"),
    host=os.environ.get("PGVECTOR_HOST", "localhost"),
    port=int(os.environ.get("PGVECTOR_PORT", "6432")),
    database=os.environ.get("PGVECTOR_DATABASE", "langchain"),
    user=os.environ.get("PGVECTOR_USER", "langchain"),
    password=os.environ.get("PGVECTOR_PASSWORD", "langchain"),
)

rich.print(CONNECTION_STRING)

DATABASE_URL = "postgresql+psycopg://langchain:langchain@localhost:6432/langchain"

# create the store
db = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=False,
    ids=[doc.metadata["id"] for doc in docs]
)

# load the store
pgvector_docsearch = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    embedding_function=embeddings,
)


  warn_deprecated(
  warn_deprecated(


# Query the index with PGVector

In [11]:

def run_query_pgvector(docsearch: PineconeVectorStore, query: str):
    docs = docsearch.similarity_search(query, k=4)
    result = docs[0].page_content
    return result


calculate_average_execution_time(
    run_query_pgvector, docsearch=pgvector_docsearch, query=query
)


The Project Gutenberg eBook of A Christmas Carol in Prose; Being a Ghost Story of Christmas

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: A Christmas Carol in Prose; Being a Ghost Story of Christmas


Author: Charles Dickens

Illustrator: John Leech

Release date: August 11, 2004 [eBook #46]
                Most recently updated: October 17, 2021

Language: English

*** START OF THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL IN PROSE; BEING A GHOST STORY OF CHRISTMAS ***

A CHRISTMAS CAROL

IN PROSE
BEING
A Ghost Story of Christmas

by Charles Dickens

PREFACE

I HAVE endeavoured in this 

# Add more collections to the database

In [13]:

loader = TextLoader("../../data/The Project Gutenberg eBook of Romeo and Juliet.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
new_docs = text_splitter.split_documents(documents)


# Create a unique ID for each document
# SOURCE: https://github.com/theonemule/azure-rag-sample/blob/1e37de31678ffbbe5361a8ef3acdb770194f462a/import.py#L4
for idx, doc in enumerate(new_docs):
    doc.metadata["id"] = str(idx)

COLLECTION_NAME_2 = "The Project Gutenberg eBook of Romeo and Juliet"

db = PGVector.from_documents(
    embedding=embeddings,
    documents=new_docs,
    collection_name=COLLECTION_NAME_2,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=False,
    ids=[doc.metadata["id"] for doc in new_docs]
)

  warn_deprecated(


# Query the index with multiple collections


In [None]:

pg = PgvectorService(CONNECTION_STRING)


def run_query_multi_pgvector(docsearch, query):
    docs = docsearch.custom_similarity_search_with_scores(query, k=4)
    result = docs[0][0].page_content
    print(result)


run_query_multi_pgvector(pg, query)

# --------------------------------------------------------------
# Delete the collection
# --------------------------------------------------------------
pg.delete_collection(COLLECTION_NAME)
pg.delete_collection(COLLECTION_NAME_2)

# --------------------------------------------------------------
# Update the collection
# --------------------------------------------------------------
pg.update_collection(docs=docs, collection_name=COLLECTION_NAME)


Look into https://medium.com/@towards-agi/dont-use-pinecone-or-milvus-for-vector-database-pgvector-is-70-faster-and-cheaper-and-open-66a698358415

We probably need to enable some of these items and should think about how it would look to run locally.