In [1]:
!pip install psycopg2-binary pandas pyarrow

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pyarrow
  Downloading pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownlo

In [2]:
topics = []
with open("./inputs/topics.txt", "r") as file:
    topics = file.read().split()
topics

['Sports-and-Outdoors',
 'Home-and-Kitchen',
 'Grocery-and-Gourmet-Food',
 'Automotive',
 'Software',
 'Office-Products',
 'Clothing-Shoes-and-Jewelry',
 'Baby',
 'Cell-Phones-and-Accessories',
 'Patio-Lawn-and-Garden',
 'Beauty',
 'Musical-Instruments',
 'Health-and-Personal-Care',
 'Pet-Supplies',
 'Electronics',
 'Tools-and-Home-Improvement',
 'Toys-and-Games',
 'Industrial-and-Scientific',
 'Video-Games',
 'Arts-Crafts-and-Sewing',
 'Appliances']

In [3]:
from collections import defaultdict
from psycopg2 import sql
import psycopg2
import pandas as pd

In [7]:
def get_column_names(conn, table):
    """
    Fetches and prints the column names of a specified table.
    """
    query = sql.SQL("""
        SELECT column_name 
        FROM information_schema.columns 
        WHERE table_name = %s
        ORDER BY ordinal_position;
    """)
    cur = conn.cursor()
    cur.execute(query, (table,))
    columns = cur.fetchall()
    cur.close()
    return [col[0] for col in columns]

In [4]:
# QUERY ON ALL TOPIC NODES
dbname = "db"  # internal database name
username = "user"
password = "password"

table = "test"  # Name of table to query
count_topic = defaultdict(int)

for topic in topics:
    conn = psycopg2.connect(
                dbname=dbname,
                user=username,
                password=password,
                host=topic,
            )

    # Prepare and execute the query
    query = sql.SQL("SELECT id, question, answer, keywords FROM {}").format(sql.Identifier(table))
    try:
        with conn.cursor() as cur:
            cur.execute(query)
            rows = cur.fetchall()
            count_topic[topic] = len(rows)
    except psycopg2.errors.UndefinedTable:  # Handle table not found error
        print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
        count_topic[topic] = 0  # Or handle the case as you see fit
    finally:
        conn.close()

# sort by desending frequency
count_topic = dict(sorted(count_topic.items(), key=lambda x: x[1], reverse=True))

print('Num Topic:', len(count_topic))
count_topic

Num Topic: 21


{'Electronics': 314263,
 'Home-and-Kitchen': 184439,
 'Sports-and-Outdoors': 146891,
 'Tools-and-Home-Improvement': 101088,
 'Automotive': 89923,
 'Cell-Phones-and-Accessories': 85865,
 'Health-and-Personal-Care': 80496,
 'Patio-Lawn-and-Garden': 59595,
 'Toys-and-Games': 51486,
 'Office-Products': 43608,
 'Beauty': 42422,
 'Pet-Supplies': 36607,
 'Baby': 28933,
 'Musical-Instruments': 23322,
 'Clothing-Shoes-and-Jewelry': 22068,
 'Arts-Crafts-and-Sewing': 21262,
 'Grocery-and-Gourmet-Food': 19538,
 'Video-Games': 13307,
 'Industrial-and-Scientific': 12136,
 'Software': 10636,
 'Appliances': 9011}

In [8]:
# CUSTOM QUERY ON SINGLE NODE
dbname = "db"  # internal database name
username = "user"
password = "password"

table = "test"  # Name of table to query
count_topic = defaultdict(int)

# test for topic[0]
topic = 'Automotive'
conn = psycopg2.connect(
        dbname=dbname,
        user=username,
        password=password,
        host=topic,
    )

query = sql.SQL("SELECT id, question, answer, keywords FROM {}").format(sql.Identifier(table))

try:
    with conn.cursor() as cur:
        header = get_column_names(conn, table)
        cur.execute(query)
        rows = cur.fetchall()
except psycopg2.errors.UndefinedTable:  # Handle table not found error
    print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
finally:
    conn.close()

# for row in rows[:100]:
#     print(row)

node_df = pd.DataFrame(rows, columns=header[:-1])
# node_df.sort_values('question', inplace=True)
node_df.head(10)

Unnamed: 0,id,question,answer,keywords
0,1,What is the most useful length to get?,at least 20 feet.......heres why....say you ha...,"[kw1, kw2, kw3, kw4, kw5]"
1,2,Are these cables made of copper or aluminum?,Coleman's website does indeed say copper equiv...,"[kw1, kw2, kw3, kw4, kw5]"
2,3,I bought the Red Extra Heavy Duty. Is that too...,"For jumper cables, you can never have too much...","[kw1, kw2, kw3, kw4, kw5]"
3,4,"Hi, Being 20ft 4gauge how heavy is this?",Not nearly heavy enough. I keep them under my ...,"[kw1, kw2, kw3, kw4, kw5]"
4,5,Do these cables come with a bag?,No,"[kw1, kw2, kw3, kw4, kw5]"
5,6,Are the wires paired together? Am surprised bo...,"Yes, it's a twined cable. And why does it surp...","[kw1, kw2, kw3, kw4, kw5]"
6,7,How many amps can this handle?,Per Coleman Cable specifications 4 gauge 20 fo...,"[kw1, kw2, kw3, kw4, kw5]"
7,8,Can I use this cables to boost a school bus ?,I would think so. I bought them to charge a pu...,"[kw1, kw2, kw3, kw4, kw5]"
8,9,What is the most useful length to get?,at least 20 feet.......heres why....say you ha...,"[kw1, kw2, kw3, kw4, kw5]"
9,10,Are these cables made of copper or aluminum?,Coleman's website does indeed say copper equiv...,"[kw1, kw2, kw3, kw4, kw5]"


In [9]:
input_df = pd.read_csv('Automotive.csv')
# input_df.drop('index', axis=1, inplace=True)
# input_df.sort_values('question', inplace=True)
input_df.head(10)

Unnamed: 0,question,answer,topic,index
0,What is the most useful length to get?,at least 20 feet.......heres why....say you ha...,Automotive,30273
1,Are these cables made of copper or aluminum?,Coleman's website does indeed say copper equiv...,Automotive,30274
2,I bought the Red Extra Heavy Duty. Is that too...,"For jumper cables, you can never have ""too muc...",Automotive,30275
3,"Hi, Being 20ft 4gauge how heavy is this?",Not nearly heavy enough. I keep them under my ...,Automotive,30276
4,Do these cables come with a bag?,No,Automotive,30277
5,Are the wires paired together? Am surprised bo...,"Yes, it's a twined cable. And why does it surp...",Automotive,30278
6,How many amps can this handle?,Per Coleman Cable specifications 4 gauge 20 fo...,Automotive,30279
7,Can I use this cables to boost a school bus ?,I would think so. I bought them to charge a pu...,Automotive,30280
8,What is the most useful length to get?,at least 20 feet.......heres why....say you ha...,Automotive,30281
9,Are these cables made of copper or aluminum?,Coleman's website does indeed say copper equiv...,Automotive,30282


In [9]:
node_df.dtypes
input_df.dtypes

id           int64
question    object
answer      object
keywords    object
dtype: object

In [23]:
def escape(s):
    result = ""
    for c in s:
        if c == "'":
            result += "''"  # Escape single quote with another single quote
        elif c == '"':
            result += '\\"'  # Escape double quote
        elif c == '\\':
            result += '\\\\'  # Escape backslash
        else:
            result += c
    return result

In [10]:
diff = []
for (index1, row1), (index2, row2) in zip(node_df.iterrows(), input_df.iterrows()):
    r1 = row1['question'].strip()
    r2 = row2['question'].strip()
    if r1 != r2:
        print(r1, '\n\n', r2)
        break
        diff.append(index1)
len(diff)

How is this motor different than the one Amazon sells that is called Lomanco Power Vent Attic Fan Motor 1/10hp 1100 RPM 115 Volts # F0510B2497 ? 

 How is this motor different than the one Amazon sells that is called "Lomanco Power Vent Attic Fan Motor 1/10hp 1100 RPM 115 Volts # F0510B2497 ?


0