In [1]:
!pip install psycopg2-binary



In [2]:
# Load the topic names (whitespace-separated, so the names themselves contain
# no spaces). `topics` is deliberately not pre-initialised to []: if the read
# fails, the name should stay undefined instead of letting later cells
# silently iterate over an empty list.
with open("./inputs/topics.txt", "r", encoding="utf-8") as topics_file:
    topics = topics_file.read().split()
topics

['Software',
 'Grocery-and-Gourmet-Food',
 'Baby',
 'Electronics',
 'Musical-Instruments',
 'Cell-Phones-and-Accessories',
 'Tools-and-Home-Improvement',
 'Toys-and-Games',
 'Patio-Lawn-and-Garden',
 'Sports-and-Outdoors',
 'Pet-Supplies',
 'Health-and-Personal-Care',
 'Arts-Crafts-and-Sewing',
 'Beauty',
 'Home-and-Kitchen',
 'Clothing-Shoes-and-Jewelry',
 'Appliances',
 'Office-Products',
 'Automotive',
 'Video-Games',
 'Industrial-and-Scientific']

In [3]:
import os
from collections import defaultdict

import psycopg2
from psycopg2 import sql

In [4]:
def get_column_names(conn, table):
    """
    Return the column names of a table, in their declared order.

    Args:
        conn: An open database connection exposing ``cursor()``.
        table: Name of the table to look up in ``information_schema.columns``.

    Returns:
        A list of column-name strings ordered by ``ordinal_position``;
        empty when no table with that name is visible.

    Note:
        The lookup filters on the table name only, not on the schema, so
        same-named tables in several schemas all contribute rows.
    """
    # The table name is passed as a bound parameter, never interpolated.
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_name = %s
        ORDER BY ordinal_position;
    """
    # The context manager closes the cursor even if execute/fetchall raises.
    with conn.cursor() as cur:
        cur.execute(query, (table,))
        return [row[0] for row in cur.fetchall()]

In [8]:
# Connection settings. The standard libpq environment variables take
# precedence so real credentials never have to be written into the notebook;
# the fallbacks are the previous placeholder values.
dbname = os.environ.get("PGDATABASE", "db")  # internal database name
username = os.environ.get("PGUSER", "user")
password = os.environ.get("PGPASSWORD", "password")

table = "test"  # Name of table to query

# Let the server count the rows instead of transferring every row to the
# client only to call len() on the result.
query = sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table))

count_topic = {}
for topic in topics:
    # Each topic name is also used as the host name of the node to query.
    conn = psycopg2.connect(
        dbname=dbname,
        user=username,
        password=password,
        host=topic,
    )
    try:
        with conn.cursor() as cur:
            cur.execute(query)
            count_topic[topic] = cur.fetchone()[0]
    except psycopg2.errors.UndefinedTable:  # Handle table not found error
        print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
        count_topic[topic] = 0  # A node without the table is reported as empty
    finally:
        conn.close()

# Sort by descending frequency
count_topic = dict(sorted(count_topic.items(), key=lambda item: item[1], reverse=True))

count_topic

Table 'test' does not exist in database 'db' on node 'Toys-and-Games'
Table 'test' does not exist in database 'db' on node 'Video-Games'


{'Electronics': 314263,
 'Home-and-Kitchen': 184439,
 'Sports-and-Outdoors': 146891,
 'Automotive': 89923,
 'Cell-Phones-and-Accessories': 85865,
 'Health-and-Personal-Care': 80496,
 'Tools-and-Home-Improvement': 68985,
 'Patio-Lawn-and-Garden': 59595,
 'Office-Products': 43608,
 'Beauty': 42422,
 'Pet-Supplies': 36607,
 'Baby': 28933,
 'Musical-Instruments': 23322,
 'Clothing-Shoes-and-Jewelry': 22068,
 'Arts-Crafts-and-Sewing': 21262,
 'Grocery-and-Gourmet-Food': 19538,
 'Industrial-and-Scientific': 12136,
 'Software': 10636,
 'Appliances': 9011,
 'Toys-and-Games': 0,
 'Video-Games': 0}

In [None]:
# TODO: Fix these bugs
# - The row count reported for Tools-and-Home-Improvement is wrong.
# - Data was never inserted on 2 nodes (table 'test' does not exist there): Toys-and-Games, Video-Games