In [37]:
# One-time setup -- uncomment to install into the running kernel's environment:
# %pip install langchain_community pgvector --quiet

[0m

In [27]:
import json, boto3
from urllib.parse import quote

import pandas as pd
from langchain.vectorstores import PGVector
from langchain_core.documents import Document
from langchain_community.embeddings import BedrockEmbeddings

In [28]:
# Load the cleaned hotel listings. Every column of `df` ends up in the
# document text built further down.
df = pd.read_csv("hotel-recommendations-cleaned.csv")

# The structured metadata is `df` minus the free-text columns.
# `columns=` already targets the column axis, so no `axis` argument is needed.
metadata_df = df.drop(
    columns=["room_type", "room_amenities", "roomdescription", "hotel_name"]
)

# print(metadata_df.loc[0].to_dict())
# print("-"*10)
# print(df.loc[0].to_dict())

In [26]:
# Build one Document per CSV row.
#   * page_content: the full row as pretty-printed JSON, with missing values
#     replaced by "" first.
#   * metadata: the row with the same index label in `metadata_df`.
# NOTE(review): the metadata is read from the un-filled frame, so a NaN in a
# metadata column would be passed through as-is -- confirm the cleaned CSV
# has no gaps in those columns.
docs = []
for idx, row in df.fillna("").iterrows():
    row_text = json.dumps(row.to_dict(), indent=2)
    row_metadata = metadata_df.loc[idx].to_dict()
    doc = Document(page_content=row_text, metadata=row_metadata)
    docs.append(doc)

# Spot-check the metadata attached to the first document.
print(docs[0].metadata)

{'onsite_rate': 636.09, 'max_occupancy': 4, 'city': 'Beddgelert', 'country': 'United Kingdom', 'star_rating': 3, 'meals_included': False}


In [35]:
# Embedding model: Amazon Titan text embeddings, called through a Bedrock
# runtime client. No region or credentials are passed explicitly here.
bedrock_runtime_client = boto3.client("bedrock-runtime")
bedrock_embeddings = BedrockEmbeddings(
    client=bedrock_runtime_client,
    model_id="amazon.titan-embed-text-v1",
)

# Name of the PGVector collection used for the hotel documents.
collection_name = "hotel-self-query-search"

# Read the database settings from the local JSON config. The "rds_connect"
# entry must provide: user, password, host, port, database.
with open("/home/ubuntu/config.json", encoding="utf-8") as file:
    config = json.load(file)

# Build the SQLAlchemy URL for PGVector. The user name and password are
# percent-encoded before interpolation: a raw '@', ':', '/', '%' or '#' in
# either one would otherwise be parsed as URL structure and break (or
# silently misroute) the connection. `safe=""` also encodes '/', and spaces
# become %20 rather than '+', which decodes unambiguously.
# NOTE(review): assumes the config stores the credentials in raw form, not
# already percent-encoded -- confirm.
pgvector_connection_string = "postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}".format(
    **{
        **config["rds_connect"],
        "user": quote(str(config["rds_connect"]["user"]), safe=""),
        "password": quote(str(config["rds_connect"]["password"]), safe=""),
    }
)

# from langchain.sql_database import SQLDatabase
# database_uri = "postgresql://{user}:{password}@{host}:{port}/{database}".format(**config["rds_connect"])
# rds_db = SQLDatabase.from_uri(database_uri=database_uri)
# print(rds_db.dialect)
# print(rds_db.get_usable_table_names())
# print(rds_db.get_table_info())

In [43]:
%%time

# Embed every document with the Bedrock model and write the vectors into the
# named collection. The recorded run of this cell took about 18.5 minutes of
# wall time, so avoid re-running it casually.
# `pre_delete_collection=True` is passed, which per the LangChain PGVector
# docs removes an existing collection of the same name before inserting --
# TODO confirm that a full rebuild is intended on every run.
pgvector_vectorstore = PGVector.from_documents(
    docs,
    embedding=bedrock_embeddings,
    connection_string=pgvector_connection_string,
    collection_name=collection_name,
    pre_delete_collection=True,
)

CPU times: user 50.2 s, sys: 2.34 s, total: 52.6 s
Wall time: 18min 32s


In [44]:
%%time
# Open a handle on the same collection by name. No documents are passed, so
# nothing is embedded here; the recorded run completed in ~66 ms.
store = PGVector(
    embedding_function=bedrock_embeddings,
    connection_string=pgvector_connection_string,
    collection_name=collection_name,
)

# Bare expression so the notebook displays the object's repr.
store

CPU times: user 21.3 ms, sys: 0 ns, total: 21.3 ms
Wall time: 65.7 ms


<langchain_community.vectorstores.pgvector.PGVector at 0x7f41948c4ca0>

In [45]:
# Smoke-test retrieval with a free-text query. The result is a list of
# (Document, score) pairs, shown as the cell output.
pgvector_vectorstore.similarity_search_with_score("Suggest some hotels near United Kingdom")

[(Document(page_content='{\n  "room_type": "Family",\n  "onsite_rate": 89.75,\n  "room_amenities": "Air conditioning: ;Coffee/tea maker: ;Free Wi-Fi in all rooms!: ;Hair dryer: ;In-room safe box: ;Laptop safe box: ;Linens: ;Satellite/cable channels: ;Telephone: ;Towels: ;",\n  "max_occupancy": 2,\n  "roomdescription": "Room size: 21 m\\u00b2/226 ft\\u00b2, Garden view, Non-smoking, Shower and bathtub, 1 double bed",\n  "hotel_name": "Britannia Russ Hill",\n  "city": "London",\n  "country": "United Kingdom",\n  "star_rating": 3,\n  "meals_included": true\n}', metadata={'onsite_rate': 89.75, 'max_occupancy': 2, 'city': 'London', 'country': 'United Kingdom', 'star_rating': 3, 'meals_included': True}),
  0.311207046162955),
 (Document(page_content='{\n  "room_type": "1 Queen Bed Non-Smoking",\n  "onsite_rate": 130.93,\n  "room_amenities": "Air conditioning: ;Coffee/tea maker: ;Free Wi-Fi in all rooms!: ;Hair dryer: ;In-room safe box: ;Refrigerator: ;Satellite/cable channels: ;Shower: ;",\n