In [1]:
%pip install psycopg2-binary pandas pyarrow

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load the topic names used later as database host names.
# The file is split on any run of whitespace, so each whitespace-separated
# token becomes one topic.
topics = []
with open("../inputs/topics.txt", "r") as topics_file:
    topics.extend(topics_file.read().split())
topics

['Literature',
 'Astronomy',
 'Biology',
 'Chemistry',
 'Psychology',
 'Art',
 'Computer-Science',
 'Economics',
 'Philosophy',
 'Geology',
 'Politics',
 'History',
 'Education',
 'Physics',
 'Mathematics']

In [3]:
import os
from collections import defaultdict

import pandas as pd
import psycopg2
from psycopg2 import sql

In [4]:
def get_column_names(conn, table):
    """
    Return the column names of ``table`` in their declared order.

    Args:
        conn: An open DB-API connection (psycopg2 in this notebook).
        table: Name of the table to look up in ``information_schema.columns``.

    Returns:
        A list of column-name strings ordered by ``ordinal_position``.
        The list is empty when no table with that name is found.

    Note:
        The lookup filters on ``table_name`` only, so tables with the same
        name in different schemas contribute their columns to one result.
    """
    # A plain string is sufficient: the only variable part is passed as a
    # bound parameter, never interpolated into the SQL text.
    query = """
        SELECT column_name 
        FROM information_schema.columns 
        WHERE table_name = %s
        ORDER BY ordinal_position;
    """
    # The context manager closes the cursor even if execute/fetchall raises.
    with conn.cursor() as cur:
        cur.execute(query, (table,))
        return [record[0] for record in cur.fetchall()]

In [5]:
# Connection settings shared by every topic node.
# Credentials are read from the environment (standard libpq variable names) so
# real secrets never need to be committed with the notebook; the literals are
# only fallbacks that keep the previous behaviour when nothing is set.
dbname = os.environ.get("PGDATABASE", "db")  # internal database name
username = os.environ.get("PGUSER", "user")
password = os.environ.get("PGPASSWORD", "password")

table = "test"  # Name of table to query

# QUERY ON ALL TOPIC NODES

In [6]:
# Number of rows stored in `table` on each topic node, keyed by topic name.
count_topic = defaultdict(int)

# Let the server count instead of shipping every row to the client just to
# call len() on it. The query does not depend on the topic, so build it once.
query = sql.SQL("SELECT COUNT(*) FROM {}").format(sql.Identifier(table))

for topic in topics:
    # Each topic name doubles as the host name of the node holding its data.
    conn = psycopg2.connect(
        dbname=dbname,
        user=username,
        password=password,
        host=topic,
    )

    try:
        with conn.cursor() as cur:
            cur.execute(query)
            # COUNT(*) always yields exactly one row with one column.
            count_topic[topic] = cur.fetchone()[0]
    except psycopg2.errors.UndefinedTable:  # Handle table not found error
        print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
        count_topic[topic] = 0  # A node without the table contributes no rows
    finally:
        # Always release the connection, whether or not the query succeeded.
        conn.close()


In [7]:
# Order the per-topic counts from largest to smallest, then report how many
# topics were seen, the ordered mapping itself, and the grand total.
ranked_counts = sorted(count_topic.items(), key=lambda pair: pair[1], reverse=True)
count_topic = dict(ranked_counts)
total = sum(count_topic.values())

print('Num Topic:', len(count_topic))
print(count_topic)
print(total)

Num Topic: 15
{'Biology': 2370426, 'Politics': 1181574, 'History': 816872, 'Geology': 568769, 'Art': 554301, 'Literature': 552794, 'Education': 406907, 'Economics': 405529, 'Astronomy': 233765, 'Computer-Science': 224853, 'Chemistry': 190785, 'Philosophy': 94673, 'Mathematics': 71554, 'Physics': 44644, 'Psychology': 32561}
7750007


# CUSTOM QUERY ON SINGLE NODE


In [21]:
# Pull the full contents of one topic node for closer inspection.
topic = 'Biology'

query = sql.SQL("SELECT id, question, answer, keywords FROM {}").format(sql.Identifier(table))

# Start from empty results so that a missing table leaves well-defined (empty)
# values behind instead of a NameError or stale data from an earlier cell.
header = []
rows = []

conn = psycopg2.connect(
    dbname=dbname,
    user=username,
    password=password,
    host=topic,
)

try:
    # `header` lists every column of the table, which can be more than the
    # columns named in the SELECT above.
    header = get_column_names(conn, table)
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()
except psycopg2.errors.UndefinedTable:  # Handle table not found error
    print(f"Table '{table}' does not exist in database '{dbname}' on node '{topic}'")
finally:
    # Always release the connection, whether or not the query succeeded.
    conn.close()

print(header)

['id', 'question', 'answer', 'keywords', 'updatedat']


In [41]:
def get_chunk(node_df, start, end):
    """Return the rows of ``node_df`` whose ``id`` is within ``[start, end]``.

    Both bounds are inclusive. Rows keep their original order and index
    labels; nothing is sorted.
    """
    in_range = node_df['id'].between(start, end)
    return node_df.loc[in_range]

In [42]:
# Name the columns after the SELECT list of the single-node query
# (id, question, answer, keywords) rather than slicing the table's full column
# list, which only lines up while the table has exactly one extra trailing
# column.
node_columns = ['id', 'question', 'answer', 'keywords']
node_df = pd.DataFrame(rows, columns=node_columns)

# Two consecutive id ranges to inspect; get_chunk treats both bounds as
# inclusive.
# NOTE(review): the first range covers 500 ids (732501-733000) but the second
# covers 501 (733001-733501) - confirm the upper bound is intended.
cur_chunk = get_chunk(node_df, 732501, 733000)
next_chunk = get_chunk(node_df, 733001, 733501)

# Display rows with null 'id'
# null_id_rows = node_df[pd.isnull(node_df['id'])]
# print(null_id_rows)

In [43]:
# Label the output, then show the first ten rows of the current chunk.
# Rows are shown in the chunk's stored order; nothing here sorts by id.
print("Cur Chunk Start")
cur_chunk.iloc[:10]

Cur Chunk Start


Unnamed: 0,id,question,answer,keywords
499319,732546,,asian culture asian culture categoryeastern cu...,"[kw1, kw2, kw3, kw4, kw5]"
499369,732547,,a dame to kill forjpg the sin city graphic nov...,"[kw1, kw2, kw3, kw4, kw5]"
499387,732548,,ailments of unknown cause illnesses of unknown...,"[kw1, kw2, kw3, kw4, kw5]"
752983,732745,,arch bridges categorybridges by structural typ...,"[kw1, kw2, kw3, kw4, kw5]"
753065,732827,,bosnia and herzegovina geography stubs bosnia ...,"[kw1, kw2, kw3, kw4, kw5]"
753107,732853,,indian film producers categoryasian film produ...,"[kw1, kw2, kw3, kw4, kw5]"
753122,732854,,vertigo marvel comics vertigo is a fictional c...,"[kw1, kw2, kw3, kw4, kw5]"
753290,732855,,caribbean geography stubs caribbean geography...,"[kw1, kw2, kw3, kw4, kw5]"
753712,732824,,2000s albums this category contains articles a...,"[kw1, kw2, kw3, kw4, kw5]"
753765,732501,,nazi concentration camp survivors survivors of...,"[kw1, kw2, kw3, kw4, kw5]"


In [44]:
# Label the output, then show the last ten rows of the current chunk
# (in the chunk's stored order).
print("Cur Chunk End")
cur_chunk.iloc[-10:]

Cur Chunk End


Unnamed: 0,id,question,answer,keywords
754278,732991,,compositions by johann strauss ii categorycomp...,"[kw1, kw2, kw3, kw4, kw5]"
754279,732992,,james berges james berges was president of eme...,"[kw1, kw2, kw3, kw4, kw5]"
754280,732993,,suite no2 for cello all by its lonesomejpg cov...,"[kw1, kw2, kw3, kw4, kw5]"
754281,732994,,asturian monarchs categoryspanish monarchs mon...,"[kw1, kw2, kw3, kw4, kw5]"
754282,732995,,team golf tournaments categorygolf tournaments...,"[kw1, kw2, kw3, kw4, kw5]"
754283,732996,,busch stadium exteriorjpg exterior view of bus...,"[kw1, kw2, kw3, kw4, kw5]"
754284,732997,,castilian monarchs castilian monarchs of the m...,"[kw1, kw2, kw3, kw4, kw5]"
754285,732998,,czech explorers explorers categoryexplorers by...,"[kw1, kw2, kw3, kw4, kw5]"
754286,732999,,australian rugby league players categoryrugby ...,"[kw1, kw2, kw3, kw4, kw5]"
754287,733000,,english rugby league players dshelton dube rug...,"[kw1, kw2, kw3, kw4, kw5]"


In [45]:
# Label the output, then show the first ten rows of the following chunk
# (in the chunk's stored order).
print("Next Chunk Start")
next_chunk.iloc[:10]

Next Chunk Start


Unnamed: 0,id,question,answer,keywords
499651,733004,,schools in devon imageenglanddevonsvg schools ...,"[kw1, kw2, kw3, kw4, kw5]"
499744,733098,,1770s in rail transport categoryrail transport...,"[kw1, kw2, kw3, kw4, kw5]"
499761,733099,,1760s in rail transport categoryrail transport...,"[kw1, kw2, kw3, kw4, kw5]"
499850,733100,,1730s in rail transport categoryrail transport...,"[kw1, kw2, kw3, kw4, kw5]"
499873,733101,,companies of venezuela venezuela categoryecono...,"[kw1, kw2, kw3, kw4, kw5]"
499886,733102,,australian rules football clubs clubs category...,"[kw1, kw2, kw3, kw4, kw5]"
499940,733103,,transport in tyne and wear categorytyne and we...,"[kw1, kw2, kw3, kw4, kw5]"
499987,733104,,bridges in tyne and wear tyne and wear categor...,"[kw1, kw2, kw3, kw4, kw5]"
500004,733208,,magazines published in australia australia mag...,"[kw1, kw2, kw3, kw4, kw5]"
500022,733209,,high schools in wisconsin wisconsin categorysc...,"[kw1, kw2, kw3, kw4, kw5]"


In [46]:
# Label the output, then show the last ten rows of the following chunk
# (in the chunk's stored order).
print("Next Chunk End")
next_chunk.iloc[-10:]

Next Chunk End


Unnamed: 0,id,question,answer,keywords
754769,733492,,washington bullets players this is a list of b...,"[kw1, kw2, kw3, kw4, kw5]"
754770,733493,,syracuse nationals players this is a list of b...,"[kw1, kw2, kw3, kw4, kw5]"
754771,733494,,house of hanover categorybritish families of g...,"[kw1, kw2, kw3, kw4, kw5]"
754772,733495,,english psychiatrists categorypsychiatrists by...,"[kw1, kw2, kw3, kw4, kw5]"
754773,733496,,british sailors sailors categorysailors by nat...,"[kw1, kw2, kw3, kw4, kw5]"
754774,733497,,british philanthropists categoryphilanthropist...,"[kw1, kw2, kw3, kw4, kw5]"
754775,733498,,covenant biblical the hebrew bible makes refer...,"[kw1, kw2, kw3, kw4, kw5]"
754776,733499,,portuguese scientists list of scientists who w...,"[kw1, kw2, kw3, kw4, kw5]"
754777,733500,,colnejpg colne lancashire england photograph b...,"[kw1, kw2, kw3, kw4, kw5]"
754778,733501,,carcinisation carcinisation or carcinization i...,"[kw1, kw2, kw3, kw4, kw5]"


In [47]:
# Write the next chunk to a CSV file in the working directory. With pandas'
# defaults the DataFrame index is written too, as the first (unnamed) column.
chunk_csv_path = 'biology.csv'
next_chunk.to_csv(chunk_csv_path)

In [None]:
# Load the CSV that is compared against the node data further down.
input_csv_path = 'Automotive.csv'
input_df = pd.read_csv(input_csv_path)

# Preview the first ten rows of the loaded file.
input_df.iloc[:10]

In [None]:
# Show the column dtypes of both frames side by side. A cell only displays its
# last expression, so two bare `.dtypes` lines would hide the first one.
# Columns present in only one frame show NaN in the other frame's column.
pd.DataFrame({'node_df': node_df.dtypes, 'input_df': input_df.dtypes})

In [None]:
def escape(s):
    """Return ``s`` with quote and backslash characters escaped.

    Each single quote is doubled, each double quote is prefixed with a
    backslash, and each backslash is doubled. Every other character is copied
    through unchanged.
    """
    replacements = {
        "'": "''",      # single quote -> two single quotes
        '"': '\\"',     # double quote -> backslash + double quote
        '\\': '\\\\',   # backslash -> two backslashes
    }
    return "".join(replacements.get(ch, ch) for ch in s)

In [None]:
def _normalized_question(value):
    """Return ``value`` as stripped text, treating a missing value as ''."""
    # Questions can be missing (None from the database, NaN after a CSV
    # round-trip); calling .strip() on those directly would raise.
    return "" if pd.isna(value) else str(value).strip()


# Compare the two frames position by position (zip stops at the shorter one)
# and record the node_df index label of every row whose question differs.
diff = []
for node_index, node_question, input_question in zip(
    node_df.index, node_df['question'], input_df['question']
):
    r1 = _normalized_question(node_question)
    r2 = _normalized_question(input_question)
    if r1 != r2:
        if not diff:
            # Show only the first mismatching pair to keep the output short.
            print(r1, '\n\n', r2)
        diff.append(node_index)
len(diff)