In [4]:
%pip install psycopg2 rich

import os
import sys
from collections import defaultdict
from datetime import datetime

import psycopg2
from psycopg2 import sql
from rich import print

Note: you may need to restart the kernel to use updated packages.


# HELPER FUNCTIONS

In [5]:
def log(message: str) -> None:
    """
    Prints a message to the console and appends it to ``sumdb_log.txt``.

    Args:
        message: Text to emit; a newline is added after it in the log file.

    The log file is opened in append mode in the current working directory
    and is always written as UTF-8, so the result does not depend on the
    platform's default encoding.
    """
    print(message)
    # Explicit encoding: the locale default may be unable to encode
    # non-ASCII text that ends up in a message.
    with open("sumdb_log.txt", "a", encoding="utf-8") as log_file:
        log_file.write(message + "\n")

In [6]:
def get_column_names(conn, table: str) -> list:
    """
    Fetches the column names of a specified table.

    Args:
        conn: Open database connection; must provide ``cursor()`` returning a
            cursor usable as a context manager (as psycopg2 cursors are).
        table: Name of the table to look up in ``information_schema.columns``.

    Returns:
        Column names in ordinal-position order; empty if no row matches.

    Note:
        The lookup matches on ``table_name`` only, so same-named tables in
        different schemas would have their columns returned together.
    """
    # The table name is passed as a bound parameter, never interpolated.
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = %s
        ORDER BY ordinal_position;
    """
    # The context manager closes the cursor even if execute/fetch raises.
    with conn.cursor() as cur:
        cur.execute(query, (table,))
        return [col[0] for col in cur.fetchall()]

# OVERVIEW OF SUMDB

In [7]:
# Input file locations, expressed relative to the current working directory.
dirs = '../inputs'
filename, topicFileName = "input.csv", "topics.txt"

# Join directory and file name with a forward slash.
inputFilePath = "/".join((dirs, filename))
topicFilePath = "/".join((dirs, topicFileName))

In [8]:
# Load the expected ("true") topic names, one per line, from the topics file.
# Blank or whitespace-only lines are skipped so that a trailing newline or a
# spacer line is not counted as an (empty-named) topic.
with open(topicFilePath, 'r') as file:
    true_topics = [topic.strip() for topic in file if topic.strip()]

print(f"Number of true topics: {len(true_topics)}")
print(f"True topics: {true_topics}")

In [9]:
# Database connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never need to be written into the notebook; the fallbacks are
# the local development values.
db_topic = os.environ.get("SUMDB_HOST", "localhost")  # database host name
port = os.environ.get("SUMDB_PORT", "5432")
dbname = os.environ.get("SUMDB_NAME", "db")  # internal database name
username = os.environ.get("SUMDB_USER", "user")
password = os.environ.get("SUMDB_PASSWORD", "password")

table = "test"  # Name of table to query

In [10]:
# Open the connection that the overview cells share.
conn = psycopg2.connect(
    host=db_topic,
    port=port,
    dbname=dbname,
    user=username,
    password=password,
)

# Timestamp the connection message in a human-readable form.
formatted_datetime = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
log(f"[{formatted_datetime}] Connected to database '{dbname}' on {db_topic}:{port} as '{username}'")

# Record which columns the target table exposes.
column_names = get_column_names(conn, table)
log(f"Column names in '{table}' table: {', '.join(column_names)}")


In [11]:
# Fetch the boundaries and topic of every stored chunk.
# sql.Identifier quotes the table name so it is composed into the statement
# safely rather than via string formatting.
table_identifier = sql.Identifier(table)
query = sql.SQL("SELECT chunkstart, chunkend, topic FROM {}").format(table_identifier)

with conn.cursor() as cur:
    cur.execute(query)
    rows = cur.fetchall()

# The connection is intentionally left open here; it is closed in a later cell.

# Preview the first few (chunkstart, chunkend, topic) tuples.
print(rows[:5])

In [12]:
# Per-topic accumulators:
#   topic_chunk_count - number of chunks stored for the topic
#   row_topic_pairs   - list of (chunk_start, chunk_end) tuples for the topic
#   row_topic_count   - total of the inclusive chunk sizes for the topic
topic_chunk_count = defaultdict(int)
row_topic_pairs = defaultdict(list)
row_topic_count = defaultdict(int)

for chunk_start, chunk_end, topic in rows:
    # Inclusive size of the chunk; abs() tolerates start/end given in either order.
    chunk_size = abs(chunk_end - chunk_start) + 1

    row_topic_pairs[topic].append((chunk_start, chunk_end))
    topic_chunk_count[topic] += 1
    row_topic_count[topic] += chunk_size


In [13]:
# Report how many chunks each topic has and compare the topics found in the
# database against the expected topic list.
log('\nTOPIC NODE ANALYSIS')
for topic, count in topic_chunk_count.items():
    log(f"{topic}: {count} chunks")

# Every aggregated row contributed exactly one to topic_chunk_count, so the
# sum is the number of chunks analysed. Using it (rather than len(rows))
# keeps this cell correct even if `rows` is rebound by another cell.
saved_chunk_total = sum(topic_chunk_count.values())
log('Chunk Count: ' + str(saved_chunk_total) + ' saved chunks')
log(f'Actual topic node: {len(topic_chunk_count)} nodes')
log(f'Expected topic node: {len(true_topics)} nodes')

# Derive the count from the same set that is printed, so the number always
# matches the listed topics (a plain difference of lengths is wrong when the
# database holds topics that are not in the expected list).
missing_topics = set(true_topics) - set(topic_chunk_count)
log(f'Missing {len(missing_topics)} nodes: {missing_topics}')


In [14]:

# Report the total row count covered by each topic, largest first.
log('\nDATA COUNT ANALYSIS')

# Order topics by their row count (descending). The key is looked up in
# row_topic_count because that is the figure being reported; sorting on the
# chunk-pair lists themselves would order by chunk boundaries instead.
row_topic_pairs = dict(
    sorted(
        row_topic_pairs.items(),
        key=lambda item: row_topic_count[item[0]],
        reverse=True,
    )
)

# Iterate over keys only, so the module-level `rows` (the query result used
# by other cells) is not overwritten by a loop variable.
for topic in row_topic_pairs:
    log(f"{topic}: {row_topic_count[topic]:,}")

total_rows = sum(row_topic_count.values())
log(f'\nTotal rows: {total_rows:,}')


In [15]:
# Verify that, within each topic, every chunk starts exactly one past the end
# of the previous chunk. Any other relationship between neighbours (including
# an overlap) is reported by the same "Gap found" message.
log('\nCONTINUITY CHECK')
no_gap = True
for topic, chunks in row_topic_pairs.items():
    # Order the chunks by their start value (sorted in place).
    chunks.sort(key=lambda pair: pair[0])
    # Walk over neighbouring chunk pairs.
    for (_, current_chunk_end), (next_chunk_start, _) in zip(chunks, chunks[1:]):
        if next_chunk_start == current_chunk_end + 1:
            continue
        log(f"Gap found in topic '{topic}' within chunks [{current_chunk_end}, {next_chunk_start}]")
        no_gap = False

if no_gap:
    log('No gap found!')


In [16]:
# Release the database connection opened earlier for the overview queries.
conn.close()

# CUSTOM ANALYSIS

In [24]:
# Select the 3 most recently updated rows (ordered by `updatedat`, newest first).
query = sql.SQL("""
    SELECT chunkstart, chunkend, topic, summary 
    FROM {} 
    ORDER BY updatedat DESC
    LIMIT 3;
""").format(sql.Identifier(table))

conn = psycopg2.connect(
    dbname=dbname,
    user=username,
    password=password,
    host=db_topic,
    port=port
)

# try/finally guarantees the connection is closed even if the query fails,
# so a failed run does not leave a connection open on the server.
try:
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
finally:
    conn.close()

# Each row is (chunkstart, chunkend, topic, summary); the summary text is
# not logged here.
for i, row in enumerate(rows):
    log(f"CURRENT CHUNKS {i + 1}")
    log(f"Chunk range: ({row[0]}, {row[1]})")
    log(f'Topic: {row[2]}')

In [33]:
# Preview the summary text (4th column) of the first fetched row.
first_chunk = rows[0][3]

# Split into lines and keep only those whose stripped text is longer than
# 10 characters; this also drops empty and whitespace-only lines.
formatted_chunk = [
    line
    for line in first_chunk.split('\n')
    if len(line.strip()) > 10
]

# Show the first five remaining lines.
print(formatted_chunk[:5])