In [1]:
from dotenv import load_dotenv
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.pgvector import PGVector, DistanceStrategy
from langchain.docstore.document import Document
import os

# Load settings from a local .env file into the process environment.
load_dotenv()

# HUGGINGFACEHUB_API_TOKEN, when defined, is already available through
# os.environ at this point, so it is not copied back onto itself: assigning
# os.getenv(...) to os.environ[...] is a no-op when the variable is set and
# raises "TypeError: str expected, not NoneType" when it is not.

# NOTE(review): HuggingFaceInstructEmbeddings is the INSTRUCTOR wrapper (the cell
# output reports "load INSTRUCTOR_Transformer"); confirm that pairing it with a
# plain sentence-transformers checkpoint is intended.
embeddings = HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Connection parameter -> environment variable that supplies it.
pg_env_vars = {
    "driver": "PGVECTOR_DRIVER",
    "user": "PGVECTOR_USER",
    "password": "PGVECTOR_PASSWORD",
    "host": "PGVECTOR_HOST",
    "port": "PGVECTOR_PORT",
    "database": "PGVECTOR_DATABASE",
}
pg_params = {param: os.environ.get(env_name) for param, env_name in pg_env_vars.items()}

# Fail fast with a readable message instead of silently building a
# "None://None:None@None:None/None" URL that only breaks when the database is
# first contacted in a later cell.
missing_env_vars = [pg_env_vars[param] for param, value in pg_params.items() if value is None]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

connection_string = PGVector.connection_string_from_db_params(
    driver=pg_params["driver"],
    user=pg_params["user"],
    password=pg_params["password"],
    host=pg_params["host"],
    # Environment values are strings; the helper's port parameter is an int.
    port=int(pg_params["port"]),
    database=pg_params["database"],
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [2]:
# Read the review CSV; every row becomes one Document.
# NOTE(review): source_column="comments" makes metadata['source'] a copy of the
# full review text (see the printed metadata in the next cell's output) —
# confirm this duplication is intended.
loader = CSVLoader(
    file_path='./data/test.csv',
    source_column="comments",
)
documents = loader.load()

In [3]:
# Split the loaded rows into chunks of at most ~1000 characters with no overlap.
# NOTE(review): the two counts printed below are equal in the captured output
# (10 and 10), i.e. no row was actually split for this file.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print(len(documents))
print(len(docs))

# Show the content and metadata of each loaded document.
# print() returns None, so its result is deliberately not bound to a name
# (previously `content` / `metadata` were always None).
for document in documents:
    print(document.page_content)
    print(document.metadata)

10
10
comments: great hotel night quick business trip, loved little touches like goldfish leopard print robe, complaint wifi complimentary not internet access business center, great location library service fabulous,
{'source': 'great hotel night quick business trip, loved little touches like goldfish leopard print robe, complaint wifi complimentary not internet access business center, great location library service fabulous,  ', 'row': 0}
comments: horrible customer service hotel stay february 3rd 4th 2007my friend picked hotel monaco appealing website online package included champagne late checkout 3 free valet gift spa weekend, friend checked room hours earlier came later, pulled valet young man just stood, asked valet open said, pull bags didn__Ç_é_ offer help, got garment bag suitcase came car key room number says not valet, car park car street pull, left key working asked valet park car gets, went room fine bottle champagne oil lotion gift spa, dressed went came got bed noticed b

In [4]:
# List/Tuple are used by the type annotation in the following query cell.
from typing import List, Tuple

collection_name = 'trip_advisor_hotel_reviews'
# The PGVector module will try to create a table with the name of the collection.
# So, make sure that the collection name is unique and the user has the
# permission to create a table.
db = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=collection_name,
    connection_string=connection_string,
    # Use the same metric as the store re-opened later in this notebook so that
    # scores from both access paths are comparable.
    distance_strategy=DistanceStrategy.COSINE,
    # Replace any existing collection of this name so that re-running the cell
    # (or Restart & Run All) does not append duplicate copies of every review.
    # WARNING: this drops previously stored vectors for `collection_name`.
    pre_delete_collection=True,
)

In [5]:
# Natural-language question to match against the stored reviews.
query = "What do some of the positive reviews say?"

# Each hit is a (Document, score) pair as returned by the vector store.
docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(
    query=query,
)

In [6]:
# Print every scored hit framed by an 80-character rule.
# NOTE(review): the score is presumably a distance (lower = closer) — confirm
# against the PGVector documentation.
separator = "-" * 80
for hit, hit_score in docs_with_score:
    print(separator)
    print("Score: ", hit_score)
    print(hit.page_content)
    print(hit.metadata)
    print(separator)

--------------------------------------------------------------------------------
Score:  0.9238530395691034
comments: nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,
{'source': 'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproo

In [7]:
# Re-open the existing collection (no ingestion here) using cosine distance.
store = PGVector(
    collection_name='trip_advisor_hotel_reviews',
    connection_string=connection_string,
    embedding_function=embeddings,
    distance_strategy=DistanceStrategy.COSINE,
)

# Retriever that returns only the single closest document per query.
retriever_search_kwargs = {"k": 1}
retriever = store.as_retriever(search_kwargs=retriever_search_kwargs)

In [13]:
# Look up the closest stored review for the question and display it as the
# cell's result.
top_review_hits = retriever.get_relevant_documents(
    query='What do some of the positive reviews say?'
)
top_review_hits

[Document(page_content='comments: nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,', metadata={'source': 'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing he