In [None]:
# This notebook computes two sets of metrics:
#   set 1: direct (unary) metrics measured on each sketchpad
#   set 2: distance (binary) metrics measured between pairs of sketchpads

# Given n_chunks*k sketchpads -> get a vectorized representation V in R^d for each object, as well as a
#   cross-distance representation W in R^(d x d). (The first d/2 tokens represent BERT on the reference; the
#   second d/2 represent content-derived sketch measures.) Note: for a given set of chunks, V and W are
#   "freezable" (unchanging during training if the DB is unchanged), so they can be cached for faster lookup.


# TODO: these operations should likely be moved into the actual sketch library, as "metrics" and
# "cross-metrics" (or something like that).

In [None]:
import os
import json
import pandas as pd
from sketch.metrics import unary_metrics, binary_metrics
from sketch.core import SketchPad

# Run configuration. Every artifact of a run is a parquet file in the same
# examples directory, named after the run plus an optional suffix.
run_name = "220910_2ary_groundtruth"
base_path = "/home/jawaugh"


def _artifact_path(suffix=""):
    """Return the parquet path of this run's artifact carrying the given name suffix."""
    return os.path.join(
        base_path,
        f'sketch/sketch/examples/Text2SQL_Iterations/{run_name}{suffix}.parquet',
    )


groundtruth_path = _artifact_path()
knn_path = _artifact_path("_knn")
sketchpad_path = _artifact_path("_sketchpad")
metrics_path = _artifact_path("_metrics")

In [None]:
# Read the stored sketchpads and index the deserialized objects by their id.
# Each `sketchpad` cell holds a JSON document that SketchPad.from_dict accepts.
sketchpads_df = pd.read_parquet(sketchpad_path)
sketchpads = {
    sketchpad_id: SketchPad.from_dict(json.loads(payload))
    for sketchpad_id, payload in zip(
        sketchpads_df["sketchpad_id"], sketchpads_df["sketchpad"]
    )
}

In [None]:
# Table read from knn_path. The metric-extraction cell iterates its rows and
# uses the columns whose names contain 'sketchpad' (ids looked up in
# `sketchpads`) and '_score' (values copied through).
training_df: pd.DataFrame = pd.read_parquet(knn_path)

In [None]:
import time
import os
import shutil
# Build one wide dict of metrics per row of `training_df` and write the rows
# out as numbered parquet chunk files under ./metrics.

# Start from an empty scratch directory so chunk files left over from an
# earlier run cannot be picked up when the chunks are merged later.
shutil.rmtree("metrics", ignore_errors=True)
os.makedirs("metrics", exist_ok=True)

CHUNK_ROWS = 100  # rows buffered in memory before a chunk file is written


def _flush_metrics(rows, rows_done, started_at):
    """Write buffered rows to ``metrics/temp_<rows_done>.parquet`` and print progress.

    Args:
        rows: list of per-row metric dicts to persist as one chunk.
        rows_done: number of training rows processed so far; used as the
            zero-padded chunk number in the file name.
        started_at: ``time.time()`` value taken when processing began, used
            to print the elapsed seconds.
    """
    pd.DataFrame(rows).to_parquet(f"metrics/temp_{rows_done:07}.parquet")
    print(rows_done, time.time() - started_at)


wide_metrics = []
st = time.time()
# Columns whose values are keys into `sketchpads`, and columns copied as-is.
sketchpad_id_cols = [x for x in training_df.columns if 'sketchpad' in x]
scores_cols = [x for x in training_df.columns if '_score' in x]

# Count rows positionally rather than reusing the index label that iterrows
# yields: the label is only a valid counter for a 0..N-1 index. Initialising
# the counter also keeps the final flush safe when the frame is empty.
rows_done = 0
for rows_done, (_, row) in enumerate(training_df.iterrows(), start=1):
    local_metrics = {}
    # sketchpads referenced by this row, in column order
    local_sp = [sketchpads[row[k]] for k in sketchpad_id_cols]
    # score columns are copied through unchanged
    for score in scores_cols:
        local_metrics[score] = row[score]
    # unary metrics: one group of columns per sketchpad position
    for i, sp in enumerate(local_sp):
        for k, v in sp.get_metrics().items():
            local_metrics[f"sketchpad_{i}_{k}"] = v
    # binary metrics: every unordered pair exactly once (i1 < i2)
    for i1, sp1 in enumerate(local_sp):
        for i2 in range(i1 + 1, len(local_sp)):
            for k, v in sp1.get_cross_metrics(local_sp[i2]).items():
                local_metrics[f"sketchpad_{i1}_{i2}_{k}"] = v
    wide_metrics.append(local_metrics)
    if rows_done % CHUNK_ROWS == 0:
        _flush_metrics(wide_metrics, rows_done, st)
        wide_metrics = []

# Write the trailing partial chunk. The buffer is non-empty only when the row
# count is not a multiple of CHUNK_ROWS; testing the buffer (instead of
# repeating the modulo test) avoids both dropping those rows and overwriting
# the last full chunk with an empty frame.
if wide_metrics:
    _flush_metrics(wide_metrics, rows_done, st)
    wide_metrics = []

In [None]:
# TODO: for every sketchpad in each row, also add its reference encoded as a BERT sentence embedding

In [None]:
import glob

# Merge the per-chunk parquet files into a single frame and persist it.
# Chunk files end in ``_<n>.parquet``; sort numerically on <n> so rows come
# back in the order they were written.
chunk_files = sorted(
    glob.glob("metrics/*.parquet"),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)
dfs = []
for f in chunk_files:
    print(f)
    dfs.append(pd.read_parquet(f))
# Each chunk was written from a freshly built DataFrame and so carries its own
# 0-based index; without ignore_index the merged frame would repeat the same
# labels once per chunk. Renumber to a single 0..N-1 index instead.
metrics_df = pd.concat(dfs, ignore_index=True)
metrics_df.to_parquet(metrics_path)

In [None]:
# Number of rows in the merged metrics table.
metrics_df.shape[0]

In [None]:
# Lift the row-display cap so the transposed preview below (one printed line
# per metric column) is shown in full.
pd.options.display.max_rows = None

print(metrics_df.head().transpose())