In [None]:
%pip install psycopg2-binary pandas pyarrow

In [None]:
# Node/topic names, one whitespace-separated token per entry in the input file.
# The empty default keeps `topics` defined even if the file cannot be opened.
topics = []
with open("../inputs/topics.txt", "r") as topics_file:
    topics = topics_file.read().split()
topics

In [None]:
import os
from collections import defaultdict

import pandas as pd
import psycopg2
from psycopg2 import sql

In [None]:
def get_column_names(conn, table, schema=None):
    """
    Return the column names of a table in their declared order.

    Args:
        conn: An open DB-API connection whose cursors can be used as
            context managers (e.g. a psycopg2 connection).
        table: Name of the table to look up in information_schema.columns.
        schema: Optional schema name. When omitted, columns of every table
            with this name are returned, whatever schema they live in
            (the original behaviour).

    Returns:
        A list of column-name strings ordered by ordinal_position; empty
        when no matching table exists.
    """
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = %s
    """
    params = [table]
    if schema is not None:
        # Narrow the lookup so same-named tables in other schemas do not
        # contribute extra columns to the result.
        query += " AND table_schema = %s"
        params.append(schema)
    query += " ORDER BY ordinal_position;"

    # The context manager closes the cursor even when execute/fetchall raises.
    with conn.cursor() as cur:
        cur.execute(query, tuple(params))
        return [row[0] for row in cur.fetchall()]

In [None]:
# Connection settings. Each value can be overridden with the standard libpq
# environment variables (PGDATABASE / PGUSER / PGPASSWORD) so real credentials
# never have to be written into the notebook; the literals below are only
# fallback placeholders used when the variable is not set.
dbname = os.environ.get("PGDATABASE", "db")  # internal database name
username = os.environ.get("PGUSER", "user")
password = os.environ.get("PGPASSWORD", "password")

table = "test"  # Name of table to query

# QUERY ON ALL TOPIC NODES

In [None]:
# Row count of `table` on every node, keyed by topic name.
count_topic = defaultdict(int)

# The statement is the same for every node, so build it once outside the loop.
# COUNT(*) lets the server do the counting instead of transferring every row
# to the notebook only to call len() on the result.
query = sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table))

for topic in topics:
    # The topic name is passed as the host of the node to connect to.
    conn = psycopg2.connect(
        dbname=dbname,
        user=username,
        password=password,
        host=topic,
    )

    try:
        with conn.cursor() as cur:
            cur.execute(query)
            count_topic[topic] = cur.fetchone()[0]
    except psycopg2.errors.UndefinedTable:  # Handle table not found error
        print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
        count_topic[topic] = 0  # A node without the table contributes no rows
    finally:
        # Always release the connection, whether or not the query succeeded.
        conn.close()


In [None]:
# Order the nodes by descending row count and rebuild the mapping in that order.
ranked_counts = sorted(count_topic.items(), key=lambda item: item[1], reverse=True)
count_topic = dict(ranked_counts)

print('Num Topic:', len(count_topic))
total = sum(count_topic.values())

# Per-node counts first, then the grand total across all nodes.
print(count_topic)
print(total)

# CUSTOM QUERY ON SINGLE NODE


In [None]:
# Node to inspect; the topic name is passed as the host to connect to.
topic = 'Biology'

# Start from a known-empty state so that a failed query cannot leave `header`
# or `rows` undefined, or silently holding results fetched from another node.
header, rows = [], []

# Build the statement before connecting so nothing can fail between opening
# the connection and entering the try/finally that closes it.
query = sql.SQL("SELECT id, question, answer, keywords FROM {}").format(sql.Identifier(table))

conn = psycopg2.connect(
    dbname=dbname,
    user=username,
    password=password,
    host=topic,
)

try:
    # Column names of the table as reported by information_schema.
    header = get_column_names(conn, table)
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
except psycopg2.errors.UndefinedTable:  # Handle table not found error
    print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
finally:
    conn.close()

print(header)

In [None]:
def get_chunk(node_df, start, end):
    """
    Select the rows of node_df whose 'id' lies in the closed range [start, end].

    Both bounds are inclusive. The original index labels are kept.
    """
    in_range = node_df['id'].between(start, end)
    return node_df[in_range]

In [None]:
# Label the frame with the columns named in the SELECT that produced `rows`
# (id, question, answer, keywords). Deriving the labels from header[:-1] only
# works while the table consists of exactly these four columns, in this order,
# followed by a single extra one; any other layout mislabels the data or fails.
node_df = pd.DataFrame(rows, columns=['id', 'question', 'answer', 'keywords'])

# Two consecutive id windows; get_chunk treats both bounds as inclusive.
# NOTE(review): the first window covers 500 ids but the second covers 501
# (733001-733501) - confirm whether 733500 was intended.
cur_chunk = get_chunk(node_df, 732501, 733000)
next_chunk = get_chunk(node_df, 733001, 733501)

In [None]:
# Label the output, then display the first 10 rows of cur_chunk.
print('Cur Chunk Start')
cur_chunk.head(10)

In [None]:
# Label the output, then display the last 10 rows of cur_chunk.
print('Cur Chunk End')
cur_chunk.tail(10)

In [None]:
# Label the output, then display the first 10 rows of next_chunk.
print('Next Chunk Start')
next_chunk.head(10)

In [None]:
# Label the output, then display the last 10 rows of next_chunk.
print('Next Chunk End')
next_chunk.tail(10)

In [None]:
# Write next_chunk to 'biology.csv' in the current working directory.
# NOTE(review): the file name is hard-coded and not derived from `topic`;
# update it by hand when inspecting a different node.
next_chunk.to_csv('biology.csv')

In [None]:
# Load 'Automotive.csv' from the current working directory into input_df.
input_df = pd.read_csv('Automotive.csv')
# Optional clean-up steps, currently disabled:
# input_df.drop('index', axis=1, inplace=True)
# input_df.sort_values('question', inplace=True)

# Preview the first 10 rows (this is a visual check only; it does not test for null ids)
input_df.head(10)

In [None]:
# A cell renders only its final expression, so two bare `.dtypes` lines showed
# input_df's dtypes alone. Put both in one table, aligned by column name, so
# the dtypes of the two frames can be compared side by side.
pd.DataFrame({'node_df': node_df.dtypes, 'input_df': input_df.dtypes})

In [None]:
def escape(s):
    """
    Return a copy of s with quote and backslash characters escaped.

    A single quote is doubled (''), a double quote becomes backslash +
    double quote, and a backslash is doubled. Every other character is
    copied through unchanged.

    NOTE(review): for values that end up in SQL, query parameters are the
    safer choice than manual escaping.
    """
    replacements = {
        "'": "''",      # single quote -> two single quotes
        '"': '\\"',     # double quote -> backslash + double quote
        '\\': '\\\\',   # backslash -> two backslashes
    }
    return "".join(replacements.get(c, c) for c in s)

In [None]:
# Compare the 'question' text of node_df and input_df row by row (by position)
# and collect the node_df index label of every row where they differ.
#
# The previous version hit `break` before `diff.append(...)`, so the append
# was unreachable and len(diff) was always 0.
# zip() stops at the shorter frame, so surplus rows in the longer one are
# not compared.
diff = []
for (index1, node_question), input_question in zip(node_df['question'].items(), input_df['question']):
    r1 = node_question.strip()
    r2 = input_question.strip()
    if r1 != r2:
        if not diff:
            # Show only the first mismatch to keep the cell output readable.
            print(r1, '\n\n', r2)
        diff.append(index1)
len(diff)