In [None]:
import pandas as pd
import random
import numpy as np
from random import randint

import pickle
import time
import memory_profiler

%load_ext memory_profiler

from pathlib import Path
import distro

%load_ext watermark

In [None]:
# No earlier cell imports torch, so import it here; otherwise this cell raises
# NameError on a fresh kernel.
import torch

torch.__version__

'2.1.1+cu121'

In [None]:
%load_ext autoreload
%autoreload 2

# NOTE(review): later cells call `knn_accuracy_whitening_scores`, which is not
# imported or defined in any cell visible here, while `knn_accuracy` is imported
# but not used in the visible cells. Confirm which name this import should
# provide so the notebook survives Restart & Run All.
from text_embeddings_src.legacy.metrics import knn_accuracy


In [None]:
import black
import jupyter_black

# Activate jupyter_black for this notebook with a 79-character line length.
jupyter_black.load(line_length=79)

In [None]:
# Relative locations of the result and data directories.
variables_path = Path("..", "..", "results", "variables")
figures_path = Path("..", "..", "results", "figures", "updated_dataset")
data_path = Path("..", "..", "data")

In [None]:
# Manual workaround for a path issue when running from VS Code: anchor the
# relative paths at the notebook directory, located via the installed package.
import text_embeddings_src

nb_path = Path(text_embeddings_src.__path__[0]).parent.joinpath(
    "scripts", "updated_dataset"
)
assert nb_path.exists(), "The path does not exist"

# strict=True makes resolve() raise if a target does not exist.
variables_path = nb_path.joinpath(variables_path).resolve(strict=True)
figures_path = nb_path.joinpath(figures_path).resolve(strict=True)
data_path = nb_path.joinpath(data_path).resolve(strict=True)

In [None]:
# plt.style.use((nb_path / Path("../matplotlib_style.txt")).resolve(strict=True))

In [None]:
# Record author, timestamp, interpreter, machine and package versions.
# NOTE(review): the -h flag prints the machine hostname into the saved output;
# confirm that is intended before sharing the notebook.
%watermark -a 'Rita González-Márquez' -t -d -tz -u -v -iv -w -m -h -p transformers -p openTSNE
# Also print the OS distribution name.
print(distro.name(pretty=True))

Author: Rita González-Márquez

Last updated: 2024-03-12 03:38:05CET

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.18.1

openTSNE: 1.0.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit

Hostname: rgonzalesmarquez_GPU0-llm_gber7

matplotlib     : 3.8.2
scipy          : 1.11.4
distro         : 1.8.0
sklearn        : 1.3.2
memory_profiler: 0.61.0
jupyter_black  : 0.3.4
numpy          : 1.26.2
black          : 23.11.0
openTSNE       : 1.0.0
pandas         : 2.1.3
torch          : 2.1.1
adapters       : 0.1.0

Watermark: 2.4.3

Ubuntu 22.04.3 LTS


# Import

In [None]:
%%time
iclr2024 = pd.read_parquet(
    data_path / "iclr2024.parquet.gzip",
    engine="pyarrow",
)

CPU times: user 264 ms, sys: 90.5 ms, total: 354 ms
Wall time: 359 ms


In [None]:
# Turn every entry of the `keywords` and `scores` columns into a Python list.
iclr2024["keywords"] = iclr2024["keywords"].map(list)
iclr2024["scores"] = iclr2024["scores"].map(list)

In [None]:
# Show the table as this cell's output.
iclr2024

In [None]:
# Labels and colors saved alongside the updated dataset.
labels_iclr = np.load(variables_path / "updated_dataset" / "labels_iclr.npy")
colors_iclr = np.load(variables_path / "updated_dataset" / "colors_iclr.npy")

# Open the pickle through a context manager so the file handle is closed as
# soon as loading finishes (previously it was left open).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(
    variables_path / "updated_dataset" / "dict_label_to_color.pkl", "rb"
) as pickle_in:
    dict_label_to_color = pickle.load(pickle_in)

# kNN accuracy for centered and whitened representations
kNN accuracy is computed with both Euclidean and cosine distances for comparison.
The numbers here and in the paper were obtained with an older, smaller version of the ICLR dataset (see the shapes printed in a cell below).

In [None]:
def print_table(knn_accs):
    """Print kNN accuracies as percentages, one row per preprocessing variant.

    Parameters
    ----------
    knn_accs : array supporting ``knn_accs[i, j]`` indexing
        Rows 0, 1 and 2 are printed under the labels "Raw", "Centered" and
        "Whitened"; columns 0 and 1 under the header entries "euclidean" and
        "cosine". Values are multiplied by 100 for display, so they are
        assumed to be fractions (e.g. 0.374 is printed as 37.4).

    Returns
    -------
    None
        The table is written to standard output.
    """
    print(["euclidean", "cosine"])
    row_labels = ("Raw:      ", "Centered: ", "Whitened: ")
    for row, label in enumerate(row_labels):
        # Scale to percent first and round afterwards: multiplying an already
        # rounded float by 100 reintroduces binary floating-point noise
        # (e.g. 17.599999999999998 instead of 17.6).
        print(
            label,
            round(float(knn_accs[row, 0]) * 100, 1),
            round(float(knn_accs[row, 1]) * 100, 1),
        )
    print("---------------")

## MPNet

In [None]:
# load
saving_path = Path("embeddings_" + "mpnet") / Path("updated_dataset")
embedding_av = np.load(
    variables_path / saving_path / "embedding_abstracts_only_av.npy"
)

embedding_asbtracts_only_av_after_training_av_1_epoch = np.load(
    variables_path
    / saving_path
    / "embedding_asbtracts_only_av_after_training_av_1_epoch.npy"
)

In [None]:
# The numbers from the paper were obtained using the old version of the ICLR
# dataset (see the shapes printed here).
# Shape of the label array restricted to entries that are not "unlabeled".
print(labels_iclr[labels_iclr != "unlabeled"].shape)
# One tenth of the number of labeled entries, rounded to an integer.
round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0])

(12997,)


1300

### Baseline

In [None]:
%%time
knn_accuracy_before_training_centered_and_whitened = (
    knn_accuracy_whitening_scores(
        embedding_av[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        rs=42,
    )
)
knn_accuracy_before_training_centered_and_whitened_rs23 = (
    knn_accuracy_whitening_scores(
        embedding_av[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        rs=23,
    )
)

CPU times: user 1min 5s, sys: 1min 12s, total: 2min 18s
Wall time: 5.11 s


In [None]:
# First table: rs=42. Second table: rs=23.
print_table(knn_accuracy_before_training_centered_and_whitened)
print_table(knn_accuracy_before_training_centered_and_whitened_rs23)

['euclidean', 'cosine']
Raw:       37.4 39.6
Centered:  37.4 37.0
Whitened:  17.599999999999998 46.9
---------------
['euclidean', 'cosine']
Raw:       39.800000000000004 41.6
Centered:  39.800000000000004 37.9
Whitened:  20.0 49.4
---------------


### After fine-tuning

In [None]:
%%time
knn_accuracy_after_training_centered_and_whitened = (
    knn_accuracy_whitening_scores(
        embedding_asbtracts_only_av_after_training_av_1_epoch[
            labels_iclr != "unlabeled"
        ],
        labels_iclr[labels_iclr != "unlabeled"],
        rs=42,
    )
)
knn_accuracy_after_training_centered_and_whitened_rs23 = (
    knn_accuracy_whitening_scores(
        embedding_asbtracts_only_av_after_training_av_1_epoch[
            labels_iclr != "unlabeled"
        ],
        labels_iclr[labels_iclr != "unlabeled"],
        rs=23,
    )
)

CPU times: user 1min 7s, sys: 1min 22s, total: 2min 30s
Wall time: 5.18 s


In [None]:
# First table: rs=42. Second table: rs=23.
print_table(knn_accuracy_after_training_centered_and_whitened)
print_table(knn_accuracy_after_training_centered_and_whitened_rs23)

['euclidean', 'cosine']
Raw:       58.699999999999996 59.3
Centered:  58.699999999999996 58.9
Whitened:  36.199999999999996 56.49999999999999
---------------
['euclidean', 'cosine']
Raw:       60.6 60.199999999999996
Centered:  60.6 59.5
Whitened:  38.2 57.099999999999994
---------------


## SBERT

### Baseline

In [None]:
# Load the SBERT abstract embeddings.
saving_path = Path("embeddings_sbert", "updated_dataset")
embedding_av_sbert = np.load(
    variables_path.joinpath(saving_path, "embedding_abstracts_only_av.npy")
)

In [None]:
%%time
knn_accuracy_sbert_before_training_centered_and_whitened = (
    knn_accuracy_whitening_scores(
        embedding_av_sbert[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        rs=42,
    )
)
knn_accuracy_sbert_before_training_centered_and_whitened_rs23 = (
    knn_accuracy_whitening_scores(
        embedding_av_sbert[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        rs=23,
    )
)

CPU times: user 1min 10s, sys: 1min 29s, total: 2min 39s
Wall time: 5.22 s


In [None]:
# First table: rs=42. Second table: rs=23.
print_table(knn_accuracy_sbert_before_training_centered_and_whitened)
print_table(knn_accuracy_sbert_before_training_centered_and_whitened_rs23)

['euclidean', 'cosine']
Raw:       63.3 62.4
Centered:  63.3 62.7
Whitened:  46.7 57.099999999999994
---------------
['euclidean', 'cosine']
Raw:       63.2 63.2
Centered:  63.2 61.8
Whitened:  50.0 58.199999999999996
---------------


In [None]:
# Not normalized: print the norm of the first entry of each embedding array
# (SBERT, MPNet baseline, MPNet after training). The recorded values differ
# from 1, i.e. the stored embeddings are not unit-length.
print(np.linalg.norm(embedding_av_sbert[0]))
print(np.linalg.norm(embedding_av[0]))
print(np.linalg.norm(embedding_asbtracts_only_av_after_training_av_1_epoch[0]))

2.1254666
4.802613
2.1115158
