# Imports

In [None]:
# System
import os 
import sys
import re
import json
from pathlib import Path
from dotenv import load_dotenv

# Data Type
import json
from textwrap import dedent
from pprint import pprint

# Configs

In [None]:
# Make the project root (the parent of the current working directory)
# importable. The membership check keeps sys.path from growing every time the
# cell is re-run.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
print("Project root added to path.")

# Load environment variables from the project-root .env file exactly once and
# report the outcome of that very call. (Calling load_dotenv() a second time
# without a path would search for a .env on its own and could report on a
# different file than the one loaded here.)
env_path = project_root / '.env'
env_loaded = load_dotenv(dotenv_path=env_path)
print(f"Environment variables loaded: {env_loaded}.")

# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning; it is set here so it is in place before any model is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Embedding Model

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the smoke test below and by the vector store.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Smoke test: embed a single short sentence.
text = "This is a test document."
query_result = embeddings.embed_query(text)

# Preview only the first 100 characters of the stringified vector.
print(f"{str(query_result)[:100]}...")


# Alternative (kept for reference): Google Gemini embeddings.
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# embeddings = GoogleGenerativeAIEmbeddings(
#     model="models/gemini-embedding-exp-03-07",
#     google_api_key=os.environ.get("GEMINI_API_KEY")
# )


# PGVector

In [None]:
from urllib.parse import quote_plus

from langchain_postgres.vectorstores import PGVector

# Connection settings are read from the environment. A running Postgres
# instance with the pgvector extension enabled is required.
pg_user = os.environ.get('DB_USER')
pg_password = os.environ.get('DB_PASSWORD')
pg_db = os.environ.get('DB_NAME')
pg_host = os.environ.get('DB_HOST')
pg_port = os.environ.get('DB_PORT')

# Fail early with a readable message instead of silently building a URL that
# contains the literal text "None".
pg_settings = {
    'DB_USER': pg_user,
    'DB_PASSWORD': pg_password,
    'DB_NAME': pg_db,
    'DB_HOST': pg_host,
    'DB_PORT': pg_port,
}
missing_settings = [name for name, value in pg_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        f"Missing database environment variables: {', '.join(missing_settings)}"
    )

# search_path is passed through the libpq "options" parameter; %3D is the
# URL-encoded "=".
schema = 'vector_store,public'

# User and password are URL-escaped so that reserved characters such as
# "@", ":" or "/" in the credentials cannot corrupt the connection URL.
connection = (
    f"postgresql+psycopg://{quote_plus(pg_user)}:{quote_plus(pg_password)}"
    f"@{pg_host}:{pg_port}/{pg_db}"
    f"?options=-csearch_path%3D{schema}"
)
collection_name = "game_embeddings"
distance_strategy = 'cosine'

vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
    distance_strategy=distance_strategy,
    # pre_delete_collection=True      # Set to True to delete the collection before adding documents
)

# Read Document Data

In [None]:
import psycopg2
from psycopg2 import sql
import pandas as pd
import polars as pl

# Connection parameters for the source database, read from the environment.
db_params = {
    'host': os.getenv('DB_HOST'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME'),
    'port': os.getenv('DB_PORT')
}

# psycopg2's connection context manager only ends the transaction (commit on
# success, rollback on error); it does NOT close the connection. The
# try/finally closes it explicitly, and the cursor context manager closes the
# cursor, so neither is leaked when the cell is re-run.
conn = psycopg2.connect(**db_params)
try:
    with conn, conn.cursor() as cur:
        # The table name is injected as a quoted identifier; the JSON key is
        # passed as a regular query parameter.
        query = sql.SQL(
            """
            SELECT 
                * 
            FROM {table}
            WHERE metadata ->> %s IS NOT NULL;
            """
        ).format(table=sql.Identifier('vector_store', 'documents'))

        # Keep only rows whose metadata has a non-null 'document_hash' entry.
        cur.execute(query, ('document_hash',))
        rows = cur.fetchall()

        # Column names come from the cursor description of the executed query.
        columns = [desc[0] for desc in cur.description]
finally:
    conn.close()

# Build the DataFrame from the fetched row tuples.
df = pl.DataFrame(rows, schema=columns, orient='row')

In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.notebook import tqdm
from more_itertools import chunked


# Step 1: Convert the DataFrame rows to Documents
raw_docs = [
    Document(page_content=row["document"], metadata=row["metadata"])
    for row in df.rows(named=True)
]

# Step 2: Build the text splitter once; its settings do not change per batch
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,       # Adjust based on your embedding model's context length
    chunk_overlap=50,     # Helps maintain context across chunks
    add_start_index=True  # Provides metadata["start_index"], used in the ids below
)

# Process the documents in batches of 1000; the list is materialized so the
# progress bar knows the total number of batches.
chunks = list(chunked(raw_docs, 1000))
for chunk in tqdm(chunks, desc="Batch Processing Documents to Vector Store"):

    # Step 3: Split the raw documents into smaller chunks
    split_docs = text_splitter.split_documents(chunk)

    # Step 4: Add to vector store with ids of the form "<appid>-<start_index>".
    # Single quotes are used inside the f-string because reusing the outer
    # quote character is a SyntaxError on Python versions before 3.12.
    ids = [
        f"{doc.metadata['appid']}-{doc.metadata['start_index']}"
        for doc in split_docs
    ]
    vector_store.add_documents(split_docs, ids=ids)

In [None]:
# Retrieve the five stored chunks closest to a free-text query, with scores.
query = "Recommend me a game that has cowboys in it"
results = vector_store.similarity_search_with_score(
    query,
    k=5
)

# NOTE(review): with the 'cosine' strategy the returned score looks like a
# distance (lower = closer) rather than a similarity; confirm before relying
# on the "SIM" label.
for doc, score in results:
    game_name = doc.metadata['name']
    # ".3f" prints three decimals; the previous "3f" only requested a minimum
    # width of 3 and therefore had no effect on the output.
    print(f"* [SIM={score:.3f}] {game_name}")
    # Show at most the first 300 characters of the matched chunk.
    print(doc.page_content[:300], "\n")

In [None]:
# Wrap the vector store in a retriever that uses the "mmr" search type and
# returns a single document per query.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1},
)

# Left as a bare expression so the notebook displays the result.
retriever.invoke("kitty")