In [None]:
# This notebook builds ground-truth pairs of the form (A, B) -> Measures.
# A: a column from DB_x
# B: a column from DB_x
# Measures: results of SQL queries (aggregations and set operations) run against A and B
# Note: A and B always come from the same DB (no cross-DB pairs, because a SQL query engine only works against a single DB at a time right now)
# This is "2ary" (binary) because (A, B) -> Measures is a relationship over two columns
# This is ground truth because the measures come from actual SQL queries run against the DB

In [None]:
from databases import Database
from sketch.api import data
import os

basepath = "/Users/jawaugh/labs"

database = Database(
    f"sqlite+aiosqlite:///{os.path.join(basepath, 'sketch/sketch')}/test.db",
)

pf = await data.get_portfolio(database, 'justin')

In [None]:
# organize sketchpads based on db.
from collections import defaultdict

# Maps the 'path' stored in a sketchpad's reference data to every sketchpad
# in the portfolio that shares that path.
db_sketchpads = defaultdict(list)
for sketchpad in pf.sketchpads.values():
    db_path = sketchpad.reference.data['path']
    db_sketchpads[db_path].append(sketchpad)

In [None]:
import pandas as pd
import sqlite3
from itertools import chain

def distance_metric_queries(ref1, ref2):
    """Yield ``(measure_name, sql)`` pairs comparing the results of two column queries.

    Args:
        ref1: Reference whose ``data['query']`` is the SQL producing the left column.
        ref2: Reference whose ``data['query']`` is the SQL producing the right column.

    Yields:
        Tuples of a measure name ("union", "intersect", "difference_left",
        "difference_right", "jaccard") and the SQL statement that computes it.
        UNION / INTERSECT / EXCEPT de-duplicate rows, so every count is over
        distinct values.
    """
    left_sql = ref1.data['query']
    right_sql = ref2.data['query']

    yield "union", f"WITH union_table as ({left_sql} UNION {right_sql}) SELECT count(*) FROM union_table;"
    yield "intersect", f"WITH union_table as ({left_sql} INTERSECT {right_sql}) SELECT count(*) FROM union_table;"
    yield "difference_left", f"WITH union_table as ({left_sql} EXCEPT {right_sql}) SELECT count(*) FROM union_table;"
    yield "difference_right", f"WITH union_table as ({right_sql} EXCEPT {left_sql}) SELECT count(*) FROM union_table;"
    # Cast the numerator to REAL: dividing two integer counts in SQLite is an
    # integer division, which would truncate every partial overlap to 0.
    # An empty union divides by zero, which SQLite evaluates to NULL.
    yield "jaccard", f"WITH union_table as ({left_sql} UNION {right_sql}), intersect_table as ({left_sql} INTERSECT {right_sql}) SELECT CAST(count(*) AS REAL) / (SELECT count(*) FROM union_table) FROM intersect_table;"

def single_column_queries(ref, prepend='left'):
    """Yield ``(measure_name, sql)`` pairs describing one column on its own.

    Args:
        ref: Reference exposing ``data['query']`` (SQL used as a subquery) and
            ``data['column']`` (column name, placed inside double quotes).
        prepend: Prefix for the measure names, e.g. 'left' or 'right'.

    Yields:
        ``<prepend>_count`` with a row-count query, then
        ``<prepend>_unique_count`` with a distinct-value-count query.
    """
    row_count_sql = "SELECT count(*) FROM ({})".format(ref.data['query'])
    yield "{}_count".format(prepend), row_count_sql

    distinct_count_sql = 'SELECT count(distinct "{}") FROM ({})'.format(
        ref.data['column'], ref.data['query']
    )
    yield "{}_unique_count".format(prepend), distinct_count_sql

def execute_queries(ref1, ref2):
    """Run every single-column and pairwise measure query for two references.

    Args:
        ref1: Left reference; must expose ``data['path']``, ``to_json()`` and
            ``to_searchable_string()``, plus whatever the query builders read.
        ref2: Right reference with the same interface, pointing at the same
            ``data['path']`` as ``ref1``.

    Returns:
        dict with the keys 'path', 'left', 'left_string', 'right',
        'right_string', followed by one key per measure. A measure whose query
        fails is stored as ``None`` so one bad query does not abort the pair.

    Raises:
        AssertionError: if the two references point at different paths.
    """
    assert ref1.data['path'] == ref2.data['path'], "both references must share the same database path"
    path = ref1.data['path']

    result = {}
    result['path'] = path
    result['left'] = ref1.to_json()
    result['left_string'] = ref1.to_searchable_string()
    result['right'] = ref2.to_json()
    result['right_string'] = ref2.to_searchable_string()

    queries = chain(
        single_column_queries(ref1, prepend='left'),
        single_column_queries(ref2, prepend='right'),
        distance_metric_queries(ref1, ref2),
    )

    conn = sqlite3.connect(path)
    try:
        # Drop undecodable bytes instead of raising on non-UTF-8 text values.
        conn.text_factory = lambda b: b.decode(errors="ignore")
        for name, query in queries:
            try:
                res, = conn.execute(query).fetchone()
                result[name] = res
            except Exception as exc:
                # Best effort per query. `Exception` (not a bare except) keeps
                # KeyboardInterrupt working so a long run can still be stopped.
                print(f"Failed to execute query: {query} ({exc!r})")
                result[name] = None
    finally:
        # This function is called once per pair, so an unclosed connection
        # would leak one handle per call.
        conn.close()
    return result


In [None]:
import random

def get_n_pairs(n, sketchpads):
    """Draw ``n`` random pairs from ``sketchpads``.

    Each pair is a 2-element list sampled without replacement, so its two
    members are at different positions in ``sketchpads``; the same pair may be
    drawn more than once across the ``n`` draws.
    """
    pairs = []
    for _ in range(n):
        pairs.append(random.sample(sketchpads, 2))
    return pairs

def get_measures_from_pairs(pairs):
    """Return one ``execute_queries`` result per pair of sketchpads.

    The ``.reference`` of each sketchpad in a pair is passed positionally to
    ``execute_queries``.
    """
    measures = []
    for pair in pairs:
        references = [pad.reference for pad in pair]
        measures.append(execute_queries(*references))
    return measures

def get_n_measures(n, sketchpads):
    """Sample ``n`` random pairs from ``sketchpads`` and compute their measures."""
    sampled_pairs = get_n_pairs(n, sketchpads)
    return get_measures_from_pairs(sampled_pairs)

def get_training_dataset(db_organized, n1, n2):
    """Build a DataFrame of pairwise measures sampled across databases.

    Args:
        db_organized: Mapping of database key -> list of sketchpads in that database.
        n1: Number of database draws (with replacement).
        n2: Number of sketchpad pairs measured per draw.

    Returns:
        pandas.DataFrame with one row per measured pair (``n1 * n2`` rows).
        Empty when no database has at least two sketchpads.
    """
    # Not sure if n1 x n2 is right, or .. different sampling mechanism

    # A pair needs two sketchpads, so databases with fewer are skipped up front
    # rather than failing in the middle of a long run. The list is built once
    # instead of on every iteration.
    eligible_groups = [pads for pads in db_organized.values() if len(pads) >= 2]

    results = []
    if eligible_groups:
        for _ in range(n1):
            options = random.choice(eligible_groups)
            results.extend(get_n_measures(n2, options))
    return pd.DataFrame(results)

In [None]:
# Build the ground-truth table: 500 database draws x 500 column pairs per draw = 250,000 rows.
big_answer = get_training_dataset(db_sketchpads, 500, 500)

In [None]:
# Persist the table as Parquet; the relative path resolves against the kernel's current working directory.
big_answer.to_parquet('220905_250k_train_groundtruth.parquet')

In [None]:
# 100 x 100 -> 18.9s
# 500 x 500 -> 4m 55s

In [None]:
# ! ls -lh .

In [None]:
# Show the generated DataFrame as the cell output.
big_answer