# Hybrid Search

In [4]:
from openai import OpenAI
import os
import psycopg2
from pgvector.psycopg2 import register_vector
import instructor
from pydantic import BaseModel
import openai
import sqlvalidator
from langchain_community.utilities import SQLDatabase
from langchain.chains import create_sql_query_chain

In [5]:
# Database credentials come from the environment; each is None when its variable is unset.
postgres_username = os.environ.get("POSTGRES_USERNAME")
postgres_pwd = os.environ.get("POSTGRES_PASSWORD")

In [6]:
# Wrap the OpenAI client with instructor; later calls pass `response_model`
# (SQLQuery / RAGGenerationResponse) to chat.completions.create_with_completion.
client = instructor.from_openai(OpenAI())

In [7]:
# Structured-output schema passed as `response_model` in generate_sql_query.
# Documented with comments rather than a docstring so the model definition
# itself stays unchanged.
class SQLQuery(BaseModel):
    # SQL statement text; later handed to execute_sql_query.
    sql_command: str
    # Column names reported alongside the statement (printed for inspection);
    # the element type is not constrained here.
    selected_columns: list


# Structured-output schema passed as `response_model` for the final
# answer-generation call over the retrieved rows.
class RAGGenerationResponse(BaseModel):
    # Natural-language answer text; printed at the end of the notebook.
    answer: str

In [8]:
# Open the PostgreSQL connection used for every query in this notebook.
conn = psycopg2.connect(
    dbname="postgresdb",
    user=postgres_username,
    password=postgres_pwd,
    host="host.docker.internal",  # or e.g. "localhost", depending on where the DB runs
    port="5433",  # NOTE: not the PostgreSQL default (5432)
)
# With autocommit on, each statement takes effect immediately, so no explicit
# conn.commit() is needed after the CREATE EXTENSION below.
conn.autocommit = True
with conn.cursor() as cur:
    # Make sure the pgvector extension exists before registering its type.
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
register_vector(conn)  # Register pgvector type with psycopg2
# Long-lived cursor shared by the query cells further down.
cursor = conn.cursor()

In [9]:
def is_valid_sql(query):
    """Return the result of ``sqlvalidator``'s validity check for ``query``."""
    return sqlvalidator.parse(query).is_valid()


def get_embedding(text):
    """Embed ``text`` with OpenAI's ``text-embedding-3-small`` model.

    The text is sent as a single-element batch and the embedding of that
    one item is returned.
    """
    api_response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=[text],
    )
    first_item = api_response.data[0]
    return first_item.embedding


def generate_sql_query(question, llm_client):
    """Ask the LLM for a ``SQLQuery`` answering ``question``.

    ``question`` is sent as the only message, with the ``user`` role, at
    temperature 0. ``llm_client`` must expose
    ``chat.completions.create_with_completion``; only the parsed model is
    returned and the raw completion is discarded.
    """
    chat_messages = [{"role": "user", "content": question}]
    parsed_query, _raw_completion = llm_client.chat.completions.create_with_completion(
        messages=chat_messages,
        response_model=SQLQuery,
        model="gpt-4.1-mini",
        temperature=0,
    )
    return parsed_query


def execute_sql_query(cursor, sql_query, params=None):
    """Run ``sql_query`` on ``cursor`` and return every fetched row.

    The statement is sent to the database only when ``is_valid_sql`` accepts
    it; otherwise nothing is executed and an empty list is returned.

    Args:
        cursor: Open database cursor providing ``execute`` and ``fetchall``.
        sql_query: SQL text, optionally containing driver placeholders such
            as ``%(embedding)s``.
        params: Values for the placeholders. Optional; when omitted or empty
            the statement is executed without a parameter argument.

    Returns:
        The result of ``cursor.fetchall()``, or ``[]`` if the statement was
        rejected by the validator.
    """
    # NOTE(review): in this notebook sql_query is written by an LLM and the
    # check below is a validity check only, not an authorization check --
    # confirm the database role in use cannot modify data.
    if not is_valid_sql(sql_query):
        return []

    if params:
        cursor.execute(sql_query, params)
    else:
        # Nothing to bind: pass the statement alone, as the original did.
        cursor.execute(sql_query)
    return cursor.fetchall()




# def execute_sql_similarity_match(cursor, input_embedding, top_k=5):
#     vector_search_sql = "SELECT * FROM us_attractions ORDER BY embedding <-> %s::vector LIMIT %s"
#     cursor.execute(vector_search_sql, (input_embedding, top_k))
#     rows = cursor.fetchall()
#     return rows

In [24]:
user_question = "Which Nashville attractions are related to music and musicians?"
user_question_embedding = get_embedding(user_question)
# Serialize the embedding as "[v1,v2,...]" text so it can be bound to the
# %(embedding)s placeholder of the generated query.
user_question_embedding_str = f"[{','.join(str(component) for component in user_question_embedding)}]"



In [25]:
# user_question = "Are there any amusement or water parks in Charlotte, NC?"

# Prompt for SQL generation. Despite the name, generate_sql_query sends this
# text as a single user-role message.
# Only {user_question} is interpolated by the f-string. %(embedding)s is left
# in the text on purpose: it is the named placeholder that is bound to the
# question embedding when the generated query is executed.
SYSTEM_PROMPT = f"""
You are a PostgreSQL expert and you perform vector similarity search.

You only respond with PostgreSQL commands for the question asked by the user.

You are given a database schema:
    Schema: public
    Table: us_attractions
    Columns:
    - id INTEGER PRIMARY KEY
    - name VARCHAR(250)
    - main_category VARCHAR(250)
    - rating REAL
    - reviews REAL
    - categories VARCHAR(250)
    - address VARCHAR(250)
    - city VARCHAR(250)
    - country VARCHAR(250)
    - state VARCHAR(250)
    - zipcode INTEGER
    - broader_category VARCHAR(250)
    - weighted_score REAL
    - weighted_average REAL
    - all_cities VARCHAR(250)
    - embedding VECTOR

The us_attractions table has information for the USA only. The values under the country column are all 'USA'.

Always include the 'id' column in the SQL command.

Select a few relevant columns dynamically based on the question, such as id, name, rating, main_category, etc.

Always include the following vector similarity comparison in your query to rank results by similarity:

embedding::vector <=> %(embedding)s AS distance

where %(embedding)s is a placeholder. You are to leave the placeholder as instructed.

Translate the following user question into PostgreSQL query statement:

'{user_question}'

Instructions:
- Write PostgreSQL query using the "public" schema for all tables (e.g., public.us_attractions).
- Order results by this distance (ascending, with closest matches first)
- Limit results to 5 rows unless another limit is specified by the user
- Do NOT include any WHERE clauses, filters, or other conditions because the vector similarity ranking fully determines relevance
- Always return the vector distance column named "distance"
- If you cannot respond with a PostgreSQL command, respond with 'Sorry, no relevant data was found in the database for your query.'. Don't respond with anything else.
"""

In [26]:
# Have the LLM turn the filled-in prompt into a structured SQLQuery.
response = generate_sql_query(question=SYSTEM_PROMPT, llm_client=client)

In [27]:
sql_query = response.sql_command
# Show the generated statement and the reported columns, one per line.
print(sql_query, response.selected_columns, sep="\n")

SELECT id, name, main_category, rating, city, embedding::vector <=> %(embedding)s AS distance FROM public.us_attractions ORDER BY distance ASC LIMIT 5
['id', 'name', 'main_category', 'rating', 'city', 'distance']


In [28]:
# Bind the serialized question embedding to the query's %(embedding)s placeholder.
params = {"embedding": user_question_embedding_str}
sql_query_results = execute_sql_query(cursor=cursor, sql_query=sql_query, params=params)

In [1]:
# sql_query_results

In [29]:
# Render each result row as one comma-separated line of text.
formatted_rows = "\n".join(
    ", ".join(str(value) for value in row) for row in sql_query_results
)

# Build the answer-generation prompt: the retrieved rows, the SQL that
# produced them and, when a question is set, the question itself.
prompt = f'''Here are the query results:\n{formatted_rows}

Generated by this SQL query: {sql_query}\n
'''
prompt += (
    f"Based on these results, answer the question: {user_question}"
    if user_question
    else ""
)

# Request a structured RAGGenerationResponse; `response` is rebound here from
# the earlier SQLQuery result to the final answer object.
response, raw_response = client.chat.completions.create_with_completion(
    messages=[{"role": "user", "content": prompt}],
    response_model=RAGGenerationResponse,
    model="gpt-4.1-mini",
    temperature=0,
)


In [30]:
# Print the answer text from the RAGGenerationResponse produced by the previous cell.
print(response.answer)

The Nashville attractions related to music and musicians are:
1. Musicians Hall of Fame and Museum
2. National Museum of African American Music
3. Country Music Hall of Fame and Museum
