# Smart Query
This notebook lets a user look up an arbitrary piece of information in SumDB first. If a matching summary is found in SumDB, the notebook then queries LogosCluster to retrieve the full-length article; otherwise, it returns nothing.

In [1]:
%pip install psycopg2-binary rich scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Downloading numpy-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.

In [2]:
import os
from datetime import datetime
from typing import List, Tuple

import psycopg2
from psycopg2 import sql
from rich import print
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# HELPER FUNCTIONS

In [3]:
def log(message: str) -> None:
    """
    Print a message and append it to the ``sumdb_log.txt`` log file.

    Args:
        message: Text to emit. A newline is appended when writing to the file.
    """
    print(message)
    # Explicit UTF-8 so that non-ASCII text does not raise UnicodeEncodeError
    # on platforms whose default file encoding is not UTF-8.
    with open("sumdb_log.txt", "a", encoding="utf-8") as log_file:
        log_file.write(message + "\n")

In [4]:
def get_column_names(conn, table):
    """
    Fetch the column names of a table, in their declared order.

    The lookup matches on table name only (no schema filter), so tables that
    share a name across schemas contribute their columns to the same result.

    Args:
        conn: Open database connection exposing ``cursor()``.
        table: Name of the table to look up in ``information_schema.columns``.

    Returns:
        List of column names ordered by ``ordinal_position``; empty when no
        table with that name exists.
    """
    # No identifier is interpolated here, so a plain parameterized string is
    # enough; the table name travels as a bound parameter.
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = %s
        ORDER BY ordinal_position;
    """
    # The context manager closes the cursor even if execute/fetch raises.
    with conn.cursor() as cur:
        cur.execute(query, (table,))
        columns = cur.fetchall()
    return [col[0] for col in columns]

# Perform SumDB Query

In [5]:
# SumDB connection settings.
# Every value can be overridden through an environment variable so that real
# credentials never have to be saved inside the notebook; the fallbacks are
# the development defaults.
sumdb_topic = os.environ.get("SUMDB_HOST", "logosdb-sumdb")  # host name of the SumDB server
port = os.environ.get("SUMDB_PORT", "5432")
dbname = os.environ.get("SUMDB_DBNAME", "db")  # internal database name
username = os.environ.get("SUMDB_USER", "user")
password = os.environ.get("SUMDB_PASSWORD", "password")

table = os.environ.get("SUMDB_TABLE", "test")  # Name of table to query

In [6]:
# Open the SumDB connection using the settings from the config cell
connection_settings = {
    "dbname": dbname,
    "user": username,
    "password": password,
    "host": sumdb_topic,
    "port": port,
}
conn = psycopg2.connect(**connection_settings)

# Timestamp the connection message in a human-readable form
formatted_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log(f"[{formatted_datetime}] Connected to database '{dbname}' on {sumdb_topic}:{port} as '{username}'")

# Look up the table's columns and report them
column_names = get_column_names(conn, table)
joined_column_names = ", ".join(column_names)
log("Column names in '{}' table: {}".format(table, joined_column_names))


## HELPER FUNCTIONS

In [7]:
def similarity_search(user_query: str, chunk_summary: str) -> float:
    """
    Score how similar two texts are using TF-IDF cosine similarity.

    A fresh TF-IDF model is fitted on just the two input texts, so the score
    depends only on this pair and not on any wider corpus.

    Args:
        user_query: The text being searched for.
        chunk_summary: The candidate text to compare against the query.

    Returns:
        The cosine similarity as a built-in ``float``. Returns 0.0 when
        neither text contains a token the vectorizer can use.
    """
    vectorizer = TfidfVectorizer()

    # The two texts form the whole corpus: index 0 is the query, 1 the candidate
    documents = [user_query, chunk_summary]

    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary" when no document yields a
        # token (e.g. punctuation only). Such a pair shares nothing, so score
        # it 0.0 instead of aborting the whole search; re-raise anything else.
        if "empty vocabulary" in str(exc):
            return 0.0
        raise

    # Compare row 0 (query) against row 1 (candidate); the result is a 1x1 matrix
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Unwrap to a plain float so the value matches the annotated return type
    return float(similarity[0][0])

In [8]:
def get_all_sumdb_data(conn: psycopg2.extensions.connection, table: str) -> List[Tuple[str]]:
    """
    Fetch every row of the given SumDB table.

    Args:
        conn: Open psycopg2 connection to SumDB.
        table: Table name; it is quoted through ``sql.Identifier`` rather than
            formatted into the statement as raw text.

    Returns:
        One tuple per row, with fields in the order
        ``(chunkstart, chunkend, topic, summary, updatedat)``.
    """
    query = sql.SQL(
        "SELECT chunkstart, chunkend, topic, summary, updatedat FROM {}"
    ).format(sql.Identifier(table))
    # The context manager closes the cursor even if the query fails.
    with conn.cursor() as cur:
        cur.execute(query)
        return cur.fetchall()

In [9]:
def get_relevant_vectors_from_summary(chunk_summary: str, user_query: str, threshold: float=0.5) -> List[Tuple[str, float]]:
    """
    Find the rows of a chunk summary that are similar to the user query.

    The summary is split into rows on newline characters. Each row is
    lower-cased and stripped, rows of 5 characters or fewer are ignored, and
    the remaining rows are scored against the query with ``similarity_search``.

    Args:
        chunk_summary: Newline-separated summary text of one chunk. ``None``
            or an empty string yields no rows.
        user_query: The text being searched for.
        threshold: A row is kept only if its score is strictly greater than
            this value.

    Returns:
        ``(row, score)`` pairs in the order the rows appear in the summary,
        where ``row`` is the normalized (lower-cased, stripped) text.
    """
    # A missing summary (e.g. a NULL database column) has nothing to score
    if not chunk_summary:
        return []

    relevant_rows = []

    # Rows are separated by "\n" only; str.splitlines() would also break on
    # other separators such as "\r" or form feeds.
    for raw_row in chunk_summary.split("\n"):
        row = raw_row.lower().strip()

        # Skip rows that are too short
        if len(row) <= 5:
            continue

        score = similarity_search(user_query, row)

        # Only keep rows with a similarity score above the threshold
        if score > threshold:
            relevant_rows.append((row, score))

    return relevant_rows

In [10]:
def get_relevant_vectors_of_all_chunk(raw_data: List[Tuple[str]], user_query: str, threshold: float=0.5) -> List[Tuple[float, str, int, int, str]]:
    """
    Collect the relevant summary rows ("vectors") from every chunk.

    Each element of ``raw_data`` is unpacked as
    ``(chunk_start, chunk_end, topic, chunk_summary, ...)``; only the first
    four fields are used. Every row of a chunk's summary whose similarity to
    the query exceeds ``threshold`` contributes one result.

    Args:
        raw_data: Chunk rows as fetched from SumDB. May be empty.
        user_query: The text being searched for.
        threshold: Minimum similarity (exclusive) for a row to be kept.

    Returns:
        ``(score, row, chunk_start, chunk_end, topic)`` tuples, in the order
        the chunks and their rows were encountered (not sorted by score).
    """
    relevant_vectors = []
    total_chunks = len(raw_data)

    # Next percentage (a multiple of 10) at which progress gets logged
    next_report = 10

    for i, chunk_row in enumerate(raw_data):
        # Derive progress from the index itself so that small inputs
        # (fewer than 10 chunks) do not divide by zero and the reported
        # percentage can never exceed 100.
        percent_done = i * 100 // total_chunks
        if percent_done >= next_report:
            log(f"Progress: {percent_done}%, Sum Chunk: {i}/{total_chunks}")
            next_report = (percent_done // 10 + 1) * 10

        chunk_start, chunk_end, topic, chunk_summary = chunk_row[:4]
        valid_rows = get_relevant_vectors_from_summary(chunk_summary, user_query, threshold)

        # Tag every relevant row with the chunk it came from
        for row, score in valid_rows:
            relevant_vectors.append((score, row, chunk_start, chunk_end, topic))

    return relevant_vectors

In [25]:
def find_detail_articles(relevant_vectors: List[Tuple[float, str, int, int, str]], k_docs: int=5) -> List[Tuple[float, str, str, int, int, str]]:
    """
    Fetch the detailed articles for the top-k most relevant vectors.

    Each vector's row text is expected to start with the article id followed
    by a dot (``"<id>. ..."``). The id is looked up in the database hosted on
    the vector's topic node, and the ``answer`` column of the first matching
    record is returned as the article text.

    Vectors whose row has no leading numeric id, or whose id has no record on
    the node, are logged and skipped, so fewer than ``k_docs`` results may be
    returned.

    Args:
        relevant_vectors: ``(score, row, chunk_start, chunk_end, topic)``
            tuples. The list is not modified.
        k_docs: Maximum number of highest-scoring vectors to resolve.

    Returns:
        ``(score, row, article, chunk_start, chunk_end, topic)`` tuples in
        descending score order.
    """
    # NOTE(review): connection settings are hard-coded; consider sourcing them
    # from configuration.
    logos_dbname = "db"  # internal database name
    logos_username = "user"
    logos_password = "password"

    logos_table = "test"

    # The statement is identical for every vector, so build it once
    query = sql.SQL("""
        SELECT question, answer, keywords FROM {} WHERE id = %s
    """).format(sql.Identifier(logos_table))

    output = []

    # Rank on a copy so the caller's list keeps its original order
    top_vectors = sorted(relevant_vectors, key=lambda x: x[0], reverse=True)[:k_docs]
    for score, row, chunk_start, chunk_end, node_topic in top_vectors:
        # Extract the row id from the "<id>. ..." prefix of the row string
        try:
            row_id = int(row.split(".")[0])
        except ValueError:
            log(f"Skipping vector without a leading row id: {row!r}")
            continue

        # The topic doubles as the host name of the node holding the article
        logos_conn = psycopg2.connect(
            dbname=logos_dbname,
            user=logos_username,
            password=logos_password,
            host=node_topic,
        )
        try:
            with logos_conn.cursor() as cur:
                cur.execute(query, (row_id,))
                article = cur.fetchone()
        finally:
            # Always release the connection, even when the query fails
            logos_conn.close()

        if article is None:
            log(f"No article with id {row_id} found on node '{node_topic}'")
            continue

        # Selected columns are (question, answer, keywords); index 1 is the answer
        output.append((score, row, article[1], chunk_start, chunk_end, node_topic))

    return output

## MAIN FUNCTION

In [12]:
# Load every summary row from SumDB before running the search
log("Fetching all data from the SumDB")
raw_data = get_all_sumdb_data(conn=conn, table=table)
log("Data fetched successfully.")

In [13]:
# Get Relevant Chunks
user_query = r"datamaker is a canadian developer and marketer of test data management software datamaker was founded by mathieu pelletier in 2020"
threshold = 0.5
relevant_vectors = get_relevant_vectors_of_all_chunk(raw_data, user_query, threshold)

In [14]:
# Summarize the search outcome and preview up to five matches
if relevant_vectors:
    log(f"Found {len(relevant_vectors)} relevant vectors for query '{user_query}'")
    log("Print first 5 relevant vectors:")
    for i, (score, row, chunk_start, chunk_end, topic) in enumerate(relevant_vectors[:5]):
        log(f"Relevant Vector {i+1}:")
        log(f"Topic: {topic}, Score: {score}")
        log(f"Vector Content: {row}")
else:
    log(f"No relevant vectors found for query '{user_query}'")

In [26]:
# Get top k articles
k = 5
detail_articles = find_detail_articles(relevant_vectors, k)

print(detail_articles)

In [None]:
# Release the SumDB connection that was opened earlier in the notebook
conn.close()