## PostgreSQL

This is a three-step process:
- Generate dataframes of 20M rows and write them to CSV files
- Populate a PSQL instance with each of those CSVs five times
- Use setvis to read each of the PSQL tables and record the results

## Prep and parameters

In [None]:
import os
import sys
import time
import psycopg2
from setvis.membership import *
from IPython.display import clear_output
from utils import (generate_data)

In [None]:
def get_connection():
    """Open a psycopg2 connection to the local PostgreSQL server.

    Connects to the database named "public" on localhost:5432 as user
    "postgres". The connection is returned open; this function never
    closes it.
    """
    settings = {
        "host": "localhost",
        "port": "5432",
        "user": "postgres",
        "password": "postgres",
        "dbname": "public",
    }
    return psycopg2.connect(**settings)

In [None]:
# Run the setvis load that is being benchmarked
def postgresql_intersections(conn, table="setvis"):
    """Load `table` through `Membership.from_postgres` and return the result.

    `conn` is passed through unchanged, and the string "key" is always
    supplied as the third positional argument.
    """
    return Membership.from_postgres(conn, table, "key")

In [None]:
# generate csvs for 100M row trial
# 1 hour process
# names of the data patterns, in the order they are processed
SET, GM, PM = 'sets', 'general missing', 'planned missing'
PATTERNS = [SET, GM, PM]
# redefine factors_of func
def factors_of(x):
    """Return `x` scaled by 0.001, 0.005 and 0.01, each truncated by `int`."""
    scaled = []
    for fraction in (0.001, 0.005, 0.01):
        scaled.append(int(fraction * x))
    return scaled

# constants
# 20M rows, 20 columns
GM_ROW, GM_COL = 20_000_000, 20
# the row/column lists hold exactly one value each
COLS = [GM_COL]
ROWS = [GM_ROW]
# combination counts derived from the row count via factors_of
GM_INTS = factors_of(GM_ROW)

## Step 1

Generate dataframes of 20M rows and write them to CSV files

In [None]:
# this can take up to 7 hours on a 12-core 3.6 GHz CPU
# requires a machine with at least 24 GB of RAM
#
# generate 20M rows & write CSVs to a directory named "csvs"

def _generate_and_save(pattern, n_rows, n_cols, num_int, csv_path):
    """Generate one dataframe with `generate_data`, report on it, and save it.

    Prints the generation time and the size reported by `sys.getsizeof`,
    then writes the dataframe to `csv_path` without the index column.
    """
    t = time.time()
    df = generate_data(pattern, n_rows, n_cols, num_int)
    print(f"Time to generate data: {time.time()-t:.2f} secs")
    print(f"Size of df: {sys.getsizeof(df)/1024/1024:.3f}MB")
    df.to_csv(csv_path, index=False)


# to_csv does not create missing parent directories, so make sure the
# output directory exists before the long-running generation starts
os.makedirs("csvs", exist_ok=True)

start_time = time.time()
for c in COLS:
    for r in ROWS:
        for p in PATTERNS:
            if p in (GM, SET):
                for i in GM_INTS:
                    # SET receives the pair [i, i/10]; GM receives i alone
                    num_int = [i, int(i/10)] if p == SET else i
                    print(p, "\t", "\t combs: ", num_int)
                    # SET files are labelled with len(num_int) in place of c
                    n_label = len(num_int) if p == SET else c
                    # NOTE(review): the data is generated from GM_ROW/GM_COL
                    # while the file name uses r/c; the two agree only while
                    # ROWS and COLS hold a single entry each
                    _generate_and_save(
                        p, GM_ROW, GM_COL, num_int,
                        f"csvs/{p}-{n_label}x{r}-{i}.csv",
                    )
            else:
                print(p, "\t:", r, "\t", c)
                _generate_and_save(p, r, c, None, f"csvs/{p}-{c}x{r}.csv")
# clear_output()
print(f"Done. Total time ({time.time() - start_time:.2f}s)")

## Step 2
Populate a PSQL instance with each of those CSVs five times. The script `populate-sql.sh` is written with both Steps 1 and 3 in mind. It also uses PSQL `COPY` to insert the CSVs into PSQL relations in two steps:
- copy without a key column
- create a key column for setvis to use in Step 3

In [None]:
# copy written csv files in "csvs" directory
# to PSQL using script named populate-sql.sh
# requires appropriate PSQL user & db privilege on machine
# this takes about 3 hours (see the measured time below)
# time sh populate-sql.sh csvs
# real	177m33.442s ~ about 3 hrs
# requires some 65GB in /var/lib/postgresql/12

## Step 3

Use setvis to read each of the PSQL tables and record the results

In [None]:
# result accumulators: evaluate() appends one tuple per table to each list
times = []
rams = []

def evaluate(table_name, row, col, pattern, num_int=0):
    """Time and size one `Membership.from_postgres` load of `table_name`.

    Loads the table through `postgresql_intersections` using the global
    `conn`, prints the elapsed seconds, and appends one tuple each to the
    global `times` and `rams` lists. `row` is recorded multiplied by 5;
    `col`, `pattern` and `num_int` are recorded unchanged.
    """
    # 1. wall-clock time of the load
    started = time.time()
    data = postgresql_intersections(conn, table_name)
    elapsed = time.time() - started
    print(f'{elapsed:.2f} secs')

    # 2. size of the two lookup structures, as reported by sys.getsizeof
    columns_lookup = data._intersection_id_to_columns
    mem_columns = sys.getsizeof(columns_lookup)
    mem_records = sys.getsizeof(data._intersection_id_to_records)
    shape = columns_lookup.shape

    # fields shared by both result tuples
    total_rows = row * 5
    out_dims = (shape[0], shape[1])

    times.append((total_rows, col, elapsed, pattern, num_int) + out_dims)
    rams.append(
        (total_rows, col, mem_columns + mem_records, pattern, num_int)
        + out_dims
        + (mem_columns, mem_records)
    )
    

In [None]:
# read and evaluate each 100M row table in PSQL
# a previous run printed: Done. Total time (4895.91s).
conn = get_connection()
start_time = time.time()
# psycopg2's `with conn:` only ends the transaction and leaves the
# connection open, so use try/finally to close it even if an evaluation
# raises part-way through the run
try:
    for c in COLS:
        for r in ROWS:
            for p in PATTERNS:
                table_prefix = p.replace(" ", "_")
                if p in (GM, SET):
                    for i in GM_INTS:
                        # SET tables are named with the length of the
                        # [i, i/10] pair in place of the column count c
                        n_label = len([i, int(i/10)]) if p == SET else c
                        table_name = f"{table_prefix}_{n_label}x{r}_{i}"
                        print(f"Evaluating table: {table_name} ...")
                        evaluate(table_name, r, c, p, i)
                else:
                    table_name = f"{table_prefix}_{c}x{r}"
                    print(f"Evaluating table: {table_name} ...")
                    evaluate(table_name, r, c, p)
finally:
    # Close the connection
    conn.close()
# clear_output()
print(f"Done. Total time ({time.time() - start_time:.2f}s).")

In [None]:
def _time_record(entry):
    """Turn one tuple from `times` into a dict keyed by output column name."""
    n_rows, n_cols, seconds, pattern, combos, out_rows, out_cols = entry
    return {
        "rows": n_rows,
        "columns": n_cols,
        "colxrow": n_rows * n_cols,
        "seconds": seconds,
        "pattern": pattern,
        "combinations": combos,
        "output_rows": out_rows,
        "output_cols": out_cols,
    }


# one dataframe row per timed table load
times_df = pd.DataFrame([_time_record(entry) for entry in times])
def _memory_record(entry):
    """Turn one tuple from `rams` into a dict keyed by output column name.

    The three byte counts are divided by 1024 twice and rounded to
    5 decimal places.
    """
    (n_rows, n_cols, total_bytes, pattern, combos,
     out_rows, out_cols, col_bytes, row_bytes) = entry
    return {
        "rows": n_rows,
        "columns": n_cols,
        "colxrow": n_rows * n_cols,
        "memory": round(total_bytes/1024/1024, 5),
        "pattern": pattern,
        "combinations": combos,
        "output_rows": out_rows,
        "output_cols": out_cols,
        "memory_col": round(col_bytes/1024/1024, 5),
        "memory_row": round(row_bytes/1024/1024, 5),
    }


# one dataframe row per measured table load
mem_df = pd.DataFrame([_memory_record(entry) for entry in rams])

In [None]:
# write the 100M-row timing and memory results to CSV files
for frame, csv_name in (
    (times_df, "100M-psql-times.csv"),
    (mem_df, "100M-psql-mems.csv"),
):
    frame.to_csv(csv_name, index=False)