In [9]:
## Loading Environment Variables
from typing import List, Tuple
from dotenv import load_dotenv
load_dotenv()

True

In [10]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

In [8]:
loader = TextLoader('../textData.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()

In [13]:
## PGVector needs the connection string to the database.
## We will load it from the environment variables.
import os
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.environ.get("PGVECTOR_DRIVER", "psycopg2"),
    host=os.environ.get("PGVECTOR_HOST", "localhost"),
    port=int(os.environ.get("PGVECTOR_PORT", "5432")),
    database="andrewpassero",
    user=os.environ.get("PGVECTOR_USER", "postgres"),
    password=os.environ.get("PGVECTOR_PASSWORD", "postgres"),
)


## Example
# postgresql+psycopg2://username:password@localhost:5432/database_name

In [15]:
# The PGVector Module will try to create a table with the name of the collection. So, make sure that the collection name is unique and the user has the 
# permission to create a table.

db = PGVector(
    embedding_function=embeddings,
    collection_name="KBitem",
    connection_string=CONNECTION_STRING,
)

hello = db.add_documents(docs)

hello

['dc10c8ce-1e90-11ee-bf07-f45c89a6fd3f']

In [11]:
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.metadata)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  0.7942059236850544
{'source': '../textData.txt'}
--------------------------------------------------------------------------------


In [12]:
loader = TextLoader('test.txt')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
docs = text_splitter.split_documents(documents)


db = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name="KBitem",
    connection_string=CONNECTION_STRING,
)

In [14]:
query = "IF YOU DONT GO OVER THE TOP OF THE GOLF BALL YOU WONT SLICE IT. HERE IS A DRILL TO NOT SLICE YOUR GOLF BALL"
docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query)
    
    
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  0.0
IF YOU DONT GO OVER THE TOP OF THE GOLF BALL YOU WONT SLICE IT. HERE IS A DRILL TO NOT SLICE YOUR GOLF BALL
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.0
IF YOU DONT GO OVER THE TOP OF THE GOLF BALL YOU WONT SLICE IT. HERE IS A DRILL TO NOT SLICE YOUR GOLF BALL
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.0033272900945513976
IF YOU DONT GO OVER THE TOP OF THE GOLF BALL YOU WONT SLICE IT. HERE IS A DRILL TO NOT SLICE YOUR GOLF BALL
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.6356360337626551
Great Swing  Golf Swing on I

In [11]:
content = "item contenent fdsfdsfdsfdsfdsfdsfdsfdsdsdsfdsfdsdsfdsdsfdsfdsdsfds"
document = Document(page_content = content, metadata={"text":1})

textSplitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=0)
document = textSplitter.split_documents(documents = document)

document

AttributeError: 'tuple' object has no attribute 'page_content'