In [1]:
import numpy as np
import os
import pandas as pd

from valentwin.embedder.text_embedder import HFTextEmbedder
from valentwin.embedder.visualizer import EmbeddingVisualizer


In [2]:
# Root directory of the dataset collections (relative path); prepare_dataset joins
# "<collection>/<dataset>/..." onto it.
data_root_dir = "../../data"


def prepare_dataset(dataset_name, dataset_collection):
    """Build a labelled table of cell values for the columns covered by the ground truth.

    Reads ``ground-truth-mapping/ground-truth.csv`` and every CSV table under
    ``sample/100-test`` inside ``<data_root_dir>/<dataset_collection>/<dataset_name>``.
    Columns linked by ground-truth rows, directly or transitively, share one label.
    The label is the ``"<table>-<column>"`` identifier of the source column of the
    ground-truth row that first opened the group.

    Args:
        dataset_name: Name of the dataset directory inside the collection.
        dataset_collection: Name of the collection directory inside ``data_root_dir``.

    Returns:
        pandas.DataFrame with the columns ``text``, ``label``, ``dataset_name`` and
        ``column_name``; one row per distinct (stringified) value of every column
        that appears in the ground truth. Columns without a ground-truth entry are
        skipped. Tables are processed in sorted file-name order.
    """
    dataset_dir = os.path.join(data_root_dir, dataset_collection, dataset_name)
    sample_dir = os.path.join(dataset_dir, "sample", "100-test")

    gt_df = pd.read_csv(
        os.path.join(dataset_dir, "ground-truth-mapping", "ground-truth.csv"),
        index_col=0)

    # Group key -> set of "<table>-<column>" identifiers that match each other.
    gt_labels = {}
    for _, row in gt_df.iterrows():
        col_1 = str(row["source_table"]) + "-" + str(row["source_column"])
        col_2 = str(row["target_table"]) + "-" + str(row["target_column"])

        matching_keys = [key for key, members in gt_labels.items()
                         if col_1 in members or col_2 in members]
        if matching_keys:
            # Keep the oldest group. If this row bridges two existing groups, fold
            # the later one into it so that matches stay transitive.
            found_key = matching_keys[0]
            for other_key in matching_keys[1:]:
                gt_labels[found_key].update(gt_labels.pop(other_key))
        else:
            found_key = col_1
            gt_labels[found_key] = set()

        gt_labels[found_key].update((col_1, col_2))

    # Groups are disjoint at this point, so each column maps to exactly one label.
    label_by_column = {member: key
                       for key, members in gt_labels.items()
                       for member in members}

    dataset_text = []
    dataset_labels = []
    dataset_fnames = []
    dataset_column_names = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for fname in sorted(os.listdir(sample_dir)):
        if not fname.endswith(".csv"):
            # Skip stray entries such as ".ipynb_checkpoints" that are not tables.
            continue
        table_name = fname[:-len(".csv")]
        df = pd.read_csv(os.path.join(sample_dir, fname))
        for col in df.columns:
            label = label_by_column.get(table_name + "-" + col)
            if label is None:
                continue
            unique_texts = df[col].astype(str).unique().tolist()
            dataset_text.extend(unique_texts)
            dataset_labels.extend([label] * len(unique_texts))
            dataset_fnames.extend([table_name] * len(unique_texts))
            dataset_column_names.extend([col] * len(unique_texts))

    dataset = pd.DataFrame({"text": dataset_text, "label": dataset_labels, "dataset_name": dataset_fnames,
                            "column_name": dataset_column_names})
    return dataset

In [3]:
# Dataset to visualise; the same name is interpolated into the fine-tuned
# model identifiers in the model_names cell.
dataset_name = "1009ipopayments"
# Labelled cell values of this dataset, read from the "alite" collection.
dataset = prepare_dataset(dataset_name, "alite")

In [4]:
# Index 0 is the off-the-shelf SimCSE baseline.
model_names = ["princeton-nlp/sup-simcse-roberta-base"]
# Indices 1-3 are the checkpoints fine-tuned on `dataset_name`, in the order the
# sections below use them: no in-batch negatives, intra-table, inter-table.
model_names.extend((
    f"albertus-andito/valentwin-{dataset_name}-n-100-hn-10-selective-noneg-lr-3e5-bs-512",
    f"albertus-andito/valentwin-{dataset_name}-n-100-hn-10-selective-neg-lr-3e5-bs-512",
    f"albertus-andito/valentwin-{dataset_name}-n-100-hn-10-selective-neginter-lr-3e5-bs-512",
))

In [5]:
# The strings are column names of the DataFrame returned by prepare_dataset.
# NOTE(review): presumably text column, label column, and extra (hover?) columns,
# in that order — confirm against EmbeddingVisualizer's signature.
viz = EmbeddingVisualizer("text", "label", ["dataset_name", "column_name"])

# SimCSE

In [None]:
# Embed the dataset with the SimCSE baseline (model_names[0]) and plot the result.
# NOTE(review): the trailing 2 is presumably the number of projection dimensions —
# confirm against EmbeddingVisualizer.visualize_data.
model = HFTextEmbedder(model_names[0], use_cache=True, device="cuda:0")
fig = viz.visualize_data(model, dataset, 2)

# Compact 400x400 figure with no legend and no margins.
fig.update_layout(
    # Original family first, then fallbacks for systems where it does not resolve.
    font_family="Times Roman, Times New Roman, Times, serif",
    font_color="black",
    font_size=16,
    plot_bgcolor='white',
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    showlegend=False,
    width=400,
    height=400,
)

# Both axes share the same boxed styling.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# NOTE(review): the score and the x/y position are hard-coded, presumably from an
# earlier run — re-check both if the model, data, or projection changes.
fig.add_annotation(x=-12, y=16,
                   text="Silhouette score: 0.07",
                   showarrow=False,
                   yshift=0)
fig.show()

# No in-batch negatives

In [None]:
# Embed the dataset with the checkpoint trained without in-batch negatives
# (model_names[1]) and plot the result.
# NOTE(review): the trailing 2 is presumably the number of projection dimensions —
# confirm against EmbeddingVisualizer.visualize_data.
model = HFTextEmbedder(model_names[1], use_cache=True, device="cuda:0")
fig = viz.visualize_data(model, dataset, 2)

# Compact 400x400 figure with no legend and no margins.
fig.update_layout(
    # Original family first, then fallbacks for systems where it does not resolve.
    font_family="Times Roman, Times New Roman, Times, serif",
    font_color="black",
    font_size=16,
    plot_bgcolor='white',
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    showlegend=False,
    width=400,
    height=400,
)

# Both axes share the same boxed styling.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# NOTE(review): the score and the x/y position are hard-coded, presumably from an
# earlier run — re-check both if the model, data, or projection changes.
fig.add_annotation(x=-3, y=26,
                   text="Silhouette score: 0.87",
                   showarrow=False,
                   yshift=0)
fig.show()

# Intra-table in-batch negatives

In [None]:
# Embed the dataset with the checkpoint trained with intra-table in-batch negatives
# (model_names[2]) and plot the result.
# NOTE(review): the trailing 2 is presumably the number of projection dimensions —
# confirm against EmbeddingVisualizer.visualize_data.
model = HFTextEmbedder(model_names[2], use_cache=True, device="cuda:0")
fig = viz.visualize_data(model, dataset, 2)

# Compact 400x400 figure with no legend and no margins.
fig.update_layout(
    # Original family first, then fallbacks for systems where it does not resolve.
    font_family="Times Roman, Times New Roman, Times, serif",
    font_color="black",
    font_size=16,
    plot_bgcolor='white',
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    showlegend=False,
    width=400,
    height=400,
)

# Both axes share the same boxed styling.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# NOTE(review): the score and the x/y position are hard-coded, presumably from an
# earlier run — re-check both if the model, data, or projection changes.
fig.add_annotation(x=-3, y=26,
                   text="Silhouette score: 0.91",
                   showarrow=False,
                   yshift=0)
fig.show()

# Inter-table in-batch negatives

In [None]:
# Embed the dataset with the checkpoint trained with inter-table in-batch negatives
# (model_names[3]) and plot the result.
# NOTE(review): the trailing 2 is presumably the number of projection dimensions —
# confirm against EmbeddingVisualizer.visualize_data.
model = HFTextEmbedder(model_names[3], use_cache=True, device="cuda:0")
fig = viz.visualize_data(model, dataset, 2)

# Compact 400x400 figure with no legend and no margins.
fig.update_layout(
    # Original family first, then fallbacks for systems where it does not resolve.
    font_family="Times Roman, Times New Roman, Times, serif",
    font_color="black",
    font_size=16,
    plot_bgcolor='white',
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    showlegend=False,
    width=400,
    height=400,
)

# Both axes share the same boxed styling.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# NOTE(review): the score and the x/y position are hard-coded, presumably from an
# earlier run — re-check both if the model, data, or projection changes.
fig.add_annotation(x=-3, y=26,
                   text="Silhouette score: 0.74",
                   showarrow=False,
                   yshift=0)
fig.show()