In [1]:
from engine.dataset2vec.data import get_preprocessing_pipeline

from engine.dataset2vec.train import LightningWrapper as D2vWrapper
from liltab.train.utils import LightningWrapper as LiltabWrapper
from pathlib import Path
from torch import Tensor
from tqdm import tqdm

import numpy as np
import pandas as pd
import torch

import warnings
from sklearn.isotonic import IsotonicRegression

warnings.simplefilter("ignore")

In [2]:
data_path = Path("data/uci/raw")
liltab_encoder_path = "models/liltab.ckpt"
d2v_encoder_path = "models/d2v.ckpt"

In [3]:
dataframes = [pd.read_csv(data_path) for data_path in data_path.iterdir()]
datasets = [
    (
        (
            Tensor(
                get_preprocessing_pipeline()
                .fit_transform(df.iloc[:, :-1])
                .values
            ),
            Tensor(df.iloc[:, -1].values).reshape(-1, 1),
        )
    )
    for df in dataframes
]
n_datasets = len(datasets)

In [4]:
def get_sample(datasets):
    if np.random.uniform() <= 0.5:
        return sample_from_same_datasets(datasets)
    else:
        return sample_from_two_datasets(datasets)

def sample_from_same_datasets(datasets):
    idx = np.random.choice(n_datasets)
    X, y = datasets[idx]

    all_rows_idx = np.arange(X.shape[0])

    first_rows_idx = sample_with_random_size(all_rows_idx).tolist()
    first_cols_idx = sample_with_random_size(X.shape[1]).tolist()

    second_rows_idx = sample_with_random_size(
        np.setdiff1d(all_rows_idx, first_rows_idx)
    ).tolist()
    second_cols_idx = sample_with_random_size(X.shape[1]).tolist()
    return (
        (index_tensor(X, first_rows_idx, first_cols_idx), y[first_rows_idx]),
        (index_tensor(X, second_rows_idx, second_cols_idx), y[second_rows_idx]),
        1
    )


def sample_from_two_datasets(datasets):
    idx1, idx2 = np.random.choice(n_datasets, size=2)
    X1, y1 = datasets[idx1]
    X2, y2 = datasets[idx2]

    first_rows_idx = sample_with_random_size(X1.shape[0]).tolist()
    first_cols_idx = sample_with_random_size(X1.shape[1]).tolist()

    second_rows_idx = sample_with_random_size(X2.shape[0]).tolist()
    second_cols_idx = sample_with_random_size(X2.shape[1]).tolist()

    return (
        (index_tensor(X1, first_rows_idx, first_cols_idx), y1[first_rows_idx]),
        (
            index_tensor(X2, second_rows_idx, second_cols_idx),
            y2[second_rows_idx],
        ),
        0
    )


def sample_with_random_size(arr):
    if isinstance(arr, int):
        arr = np.arange(arr)
    size = np.random.choice(np.arange(1, len(arr)))
    return np.random.choice(arr, size=size)

def index_tensor(tensor, row_idx, col_idx):
    return tensor[row_idx].T[col_idx].T

In [5]:
liltab_encoder = LiltabWrapper.load_from_checkpoint(liltab_encoder_path).model
d2v_encoder = D2vWrapper.load_from_checkpoint(d2v_encoder_path).encoder

In [6]:
encoders = {
    "liltab": lambda X, y: liltab_encoder.encode_support_set(X, y).mean(dim=0),
    "d2v": d2v_encoder,
}

In [7]:
encoder = encoders["d2v"]
correctness = []

probas = []
labels = []
for i in tqdm(range(1000)):
    (X1, y1), (X2, y2), label = get_sample(datasets)
    with torch.no_grad():
        encoding_1 = encoder(X1, y1)
        encoding_2 = encoder(X2, y2)
    probas.append(
        torch.exp(-torch.sqrt(((encoding_1 - encoding_2) ** 2).sum())).item()
    )
    labels.append(label)
calib = IsotonicRegression().fit(np.array(probas).reshape(-1, 1), labels)

for i in tqdm(range(10000)):
    (X1, y1), (X2, y2), label = get_sample(datasets)
    with torch.no_grad():
        encoding_1 = encoder(X1, y1)
        encoding_2 = encoder(X2, y2)
    prediction = int(calib.predict([torch.exp(-torch.sqrt(((encoding_1 - encoding_2) ** 2).sum())).item()]) >= 0.5)
    correctness.append(prediction == label)
np.mean(correctness)

100%|██████████| 1000/1000 [00:05<00:00, 199.58it/s]
100%|██████████| 10000/10000 [01:02<00:00, 160.41it/s]


0.7185

In [8]:
encoder = encoders["liltab"]

probas = []
labels = []
for i in tqdm(range(1000)):
    (X1, y1), (X2, y2), label = get_sample(datasets)
    with torch.no_grad():
        encoding_1 = encoder(X1, y1)
        encoding_2 = encoder(X2, y2)
    probas.append(
        torch.exp(-torch.sqrt(((encoding_1 - encoding_2) ** 2).sum())).item()
    )
    labels.append(label)
    correctness.append(prediction == label)
calib = IsotonicRegression().fit(np.array(probas).reshape(-1, 1), labels)

correctness = []
for i in tqdm(range(10000)):
    (X1, y1), (X2, y2), label = get_sample(datasets)
    with torch.no_grad():
        encoding_1 = encoder(X1, y1)
        encoding_2 = encoder(X2, y2)
    prediction = int(
        calib.predict([torch.exp(-torch.sqrt(((encoding_1 - encoding_2) ** 2).sum())).item()])
        >= 0.5
    )
    correctness.append(prediction == label)
np.mean(correctness)

100%|██████████| 1000/1000 [00:15<00:00, 64.41it/s]
100%|██████████| 10000/10000 [01:50<00:00, 90.88it/s]


0.6563