In [1]:
import pandas as pd
import random
import numpy as np
from random import randint

import scipy as sp
from scipy import sparse

import time
import pickle
import memory_profiler

%load_ext memory_profiler

from pathlib import Path
import distro

%load_ext watermark

In [2]:
%load_ext autoreload
%autoreload 2

from metrics import knn_accuracy_cv

In [3]:
import black
import jupyter_black

# Enable jupyter_black for this notebook with a 79-column line limit.
jupyter_black.load(line_length=79)

In [4]:
# Project directories, all relative to the notebook's working directory.
# `figures_path` is not used by any cell shown in this notebook.
variables_path = Path("..", "results", "variables", "iclr24v2")
figures_path = Path("..", "results", "figures")
data_path = Path("..", "data")

In [5]:
# NOTE(review): `plt` is not imported in any cell shown above, so this line
# raises NameError on a fresh kernel unless `import matplotlib.pyplot as plt`
# is added to the import cell. The style file is resolved relative to the
# current working directory.
plt.style.use("matplotlib_style.txt")

In [6]:
%watermark -a 'Rita González-Márquez' -t -d -tz -u -v -iv -w -m -h -p 
print(distro.name(pretty=True))

Author: Rita González-Márquez

Last updated: 2024-04-02 04:14:32CEST

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.18.1

transformers: 4.35.2
openTSNE    : 1.0.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit

Hostname: rgonzalesmarquez_GPU0-llm_gber7

scipy          : 1.11.4
numpy          : 1.26.2
jupyter_black  : 0.3.4
tiktoken       : 0.6.0
sklearn        : 1.3.2
distro         : 1.8.0
black          : 23.11.0
torch          : 2.1.1
memory_profiler: 0.61.0
pandas         : 2.1.3
cohere         : 5.1.1
adapters       : 0.1.0
openTSNE       : 1.0.0
matplotlib     : 3.8.2

Watermark: 2.4.3

Ubuntu 22.04.3 LTS


# Import

In [7]:
%%time
iclr2024 = pd.read_parquet(
    data_path / "iclr24v2.parquet",
    engine="pyarrow",
)

CPU times: user 185 ms, sys: 80.3 ms, total: 265 ms
Wall time: 206 ms


In [8]:
# Turn every cell of the list-valued columns into a plain Python list
# (presumably they come back from parquet as array objects — confirm).
for list_column in ("keywords", "scores"):
    iclr2024[list_column] = iclr2024[list_column].map(list)

In [9]:
# Show the frame; the rendering is truncated to the first and last rows.
iclr2024

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,labels
0,2017,B1-Hhnslg,Prototypical Networks for Few-shot Learning,A recent approach to few-shot classification c...,"Jake Snell, Kevin Swersky, Richard Zemel",Reject,"[6, 4, 5]","[deep learning, transfer learning]",transfer learning
1,2017,B1-q5Pqxl,Machine Comprehension Using Match-LSTM and Ans...,Machine comprehension of text is an important ...,"Shuohang Wang, Jing Jiang",Accept (Poster),"[6, 6, 7]","[natural language processing, deep learning]",language models
2,2017,B16Jem9xe,Learning in Implicit Generative Models,Generative adversarial networks (GANs) provide...,"Shakir Mohamed, Balaji Lakshminarayanan",Invite to Workshop Track,"[8, 7, 6]",[unsupervised learning],unlabeled
3,2017,B16dGcqlx,Third Person Imitation Learning,Reinforcement learning (RL) makes it possible ...,"Bradly C Stadie, Pieter Abbeel, Ilya Sutskever",Accept (Poster),"[6, 5, 6]",[],unlabeled
4,2017,B184E5qee,Improving Neural Language Models with a Contin...,We propose an extension to neural network lang...,"Edouard Grave, Armand Joulin, Nicolas Usunier",Accept (Poster),"[7, 9, 5]",[natural language processing],language models
...,...,...,...,...,...,...,...,...,...
24440,2024,zxPDdw8koz,CLIP meets Model Zoo Experts: Pseudo-Supervisi...,Contrastive language image pretraining (CLIP) ...,"Mohammadreza Salehi, Mehrdad Farajtabar, Maxwe...",Withdrawn,"[8, 3, 3, 3]","[contrastive learning, clip, distillation, den...",vision-language models
24441,2024,zyBJodMrn5,On the generalization capacity of neural netwo...,The advent of the Transformer has led to the d...,"Takuya Ito, Soham Dan, Mattia Rigotti, James K...",Accept (poster),"[8, 3, 6]","[compositional generalization, compositionalit...",out-of-distribution
24442,2024,zz61V8bIab,Stochastic Adversarial Networks for Multi-Doma...,Adversarial training has played a pivotal role...,"Xu Wang, Yuan Wu",Withdrawn,"[5, 1, 5]","[multi-domain text classification, adversarial...",adversarial
24443,2024,zzqn5G9fjn,Breaking Physical and Linguistic Borders: Mult...,Pretrained large language models (LLMs) have e...,"Wanru Zhao, Royson Lee, Yihong Chen, Xinchi Qi...",Accept (poster),"[5, 3, 1, 8]","[multilingual federated learning, natural lang...",language models


In [10]:
# Per-paper class labels as a numpy array; the string "unlabeled" marks papers
# that are excluded from every kNN evaluation below.
labels_iclr = iclr2024["labels"].to_numpy()

In [11]:
# Colour assets saved by an earlier step of the pipeline.
colors_iclr = np.load(variables_path / "colors_iclr.npy")

# Use a context manager so the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code — only load files produced by
# this project's own pipeline.
with open(variables_path / "dict_label_to_color.pkl", "rb") as pickle_in:
    dict_label_to_color = pickle.load(pickle_in)

# kNN accuracy

## BERT-based models

In [12]:
# Each display name is paired with its model identifier so the two lists
# cannot drift out of sync. `model_paths` is not read by any cell shown in
# this notebook.
model_names, model_paths = map(
    list,
    zip(
        ("SimCSE", "princeton-nlp/unsup-simcse-bert-base-uncased"),
        ("DeCLUTR-sci", "johngiorgi/declutr-sci-base"),
        ("SciNCL", "malteos/scincl"),
        ("SPECTER2", "allenai/specter2_base"),
        ("ST5", "sentence-transformers/sentence-t5-base"),
        ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
    ),
)
print(model_names)

['SimCSE', 'DeCLUTR-sci', 'SciNCL', 'SPECTER2', 'ST5', 'SBERT']


### Euclidean

In [13]:
%%time
%%memit
print("kNN accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # import
    saving_path = Path("embeddings_" + model_name.lower())

    if model_name != "ST5":
        embedding_cls = np.load(
            variables_path / saving_path / "embedding_abstracts_only_cls.npy"
        )
        embedding_sep = np.load(
            variables_path / saving_path / "embedding_abstracts_only_sep.npy"
        )

    embedding_av = np.load(
        variables_path / saving_path / "embedding_abstracts_only_av.npy"
    )

    # knn acc
    if model_name == "ST5":
        knn_acc = knn_accuracy_cv(
            embedding_av[labels_iclr != "unlabeled"],
            labels_iclr[labels_iclr != "unlabeled"],
            metric="euclidean",
        )
    else:
        knn_acc = knn_accuracy_cv(
            [
                embedding_av[labels_iclr != "unlabeled"],
                embedding_cls[labels_iclr != "unlabeled"],
                embedding_sep[labels_iclr != "unlabeled"],
            ],
            labels_iclr[labels_iclr != "unlabeled"],
            metric="euclidean",
        )

    print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

    # save embeddings
    saving_name = Path("knn_accuracy_cv_" + model_name.lower())
    np.save(variables_path / saving_name, knn_acc)

    # print("----------------------------")

kNN accuracy     [AVG]    [CLS]   [SEP]
SimCSE: [46.5 45.1 44.3]
DeCLUTR-sci: [52.7 40.  30.5]
SciNCL: [56.3 58.8 58.9]
SPECTER2: [57.1 58.8 59.2]
ST5: 57.0
SBERT: [61.6 59.9 60.2]
peak memory: 1323.14 MiB, increment: 509.14 MiB
CPU times: user 6min 48s, sys: 5.09 s, total: 6min 53s
Wall time: 31.9 s


In [58]:
# The six models evaluated in the loop above, followed by the two whose
# results are computed in the dedicated Cohere / OpenAI sections.
model_names = ["SimCSE", "DeCLUTR-sci", "SciNCL", "SPECTER2", "ST5", "SBERT"]
model_names += ["Cohere", "OpenAI"]

print(model_names)

['SimCSE', 'DeCLUTR-sci', 'SciNCL', 'SPECTER2', 'ST5', 'SBERT', 'Cohere', 'OpenAI']


In [59]:
%%time
%%memit
print("kNN accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # print("Model: ", model_name)

    # save embeddings
    saving_name = Path("knn_accuracy_cv_" + model_name.lower() + ".npy")
    knn_acc = np.load(variables_path / saving_name)
    print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

    # print("----------------------------")

kNN accuracy     [AVG]    [CLS]   [SEP]
SimCSE: [46.5 45.1 44.3]
DeCLUTR-sci: [52.7 40.  30.5]
SciNCL: [56.3 58.8 58.9]
SPECTER2: [57.1 58.8 59.2]
ST5: 57.0
SBERT: [61.6 59.9 60.2]
Cohere: 61.1
OpenAI: 62.3
peak memory: 4582.23 MiB, increment: 0.00 MiB
CPU times: user 157 ms, sys: 47.7 ms, total: 205 ms
Wall time: 330 ms


### Cosine

In [15]:
# Reset the model list to the six models that have saved embeddings with
# AVG / CLS / SEP variants (the previous cell extended it to eight).
# `model_paths` is not read by any cell shown in this notebook.
model_names, model_paths = map(
    list,
    zip(
        ("SimCSE", "princeton-nlp/unsup-simcse-bert-base-uncased"),
        ("DeCLUTR-sci", "johngiorgi/declutr-sci-base"),
        ("SciNCL", "malteos/scincl"),
        ("SPECTER2", "allenai/specter2_base"),
        ("ST5", "sentence-transformers/sentence-t5-base"),
        ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
    ),
)
print(model_names)

['SimCSE', 'DeCLUTR-sci', 'SciNCL', 'SPECTER2', 'ST5', 'SBERT']


In [16]:
%%time
%%memit
print("kNN accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # import
    saving_path = Path("embeddings_" + model_name.lower())

    if model_name != "ST5":
        embedding_cls = np.load(
            variables_path / saving_path / "embedding_abstracts_only_cls.npy"
        )
        embedding_sep = np.load(
            variables_path / saving_path / "embedding_abstracts_only_sep.npy"
        )

    embedding_av = np.load(
        variables_path / saving_path / "embedding_abstracts_only_av.npy"
    )

    # knn acc
    if model_name == "ST5":
        knn_acc = knn_accuracy_cv(
            embedding_av[labels_iclr != "unlabeled"],
            labels_iclr[labels_iclr != "unlabeled"],
            metric="cosine",
        )
    else:
        knn_acc = knn_accuracy_cv(
            [
                embedding_av[labels_iclr != "unlabeled"],
                embedding_cls[labels_iclr != "unlabeled"],
                embedding_sep[labels_iclr != "unlabeled"],
            ],
            labels_iclr[labels_iclr != "unlabeled"],
            metric="cosine",
        )

    print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

    # save embeddings
    saving_name = Path("knn_accuracy_cv_cosine_" + model_name.lower())
    np.save(variables_path / saving_name, knn_acc)

    # print("----------------------------")

kNN accuracy     [AVG]    [CLS]   [SEP]
SimCSE: [46.2 44.6 44.5]
DeCLUTR-sci: [52.7 40.  30.7]
SciNCL: [56.6 58.9 58.9]
SPECTER2: [56.8 58.8 59. ]
ST5: 57.0
SBERT: [61.6 60.  60.3]
peak memory: 1890.86 MiB, increment: 568.18 MiB
CPU times: user 29min 40s, sys: 1h 2min 44s, total: 1h 32min 25s
Wall time: 2min 12s


In [56]:
# The six models evaluated in the loop above, followed by the two whose
# results are computed in the dedicated Cohere / OpenAI sections.
model_names = ["SimCSE", "DeCLUTR-sci", "SciNCL", "SPECTER2", "ST5", "SBERT"]
model_names += ["Cohere", "OpenAI"]

print(model_names)

['SimCSE', 'DeCLUTR-sci', 'SciNCL', 'SPECTER2', 'ST5', 'SBERT', 'Cohere', 'OpenAI']


In [57]:
%%time
%%memit
print("kNN accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # print("Model: ", model_name)

    # save embeddings
    saving_name = Path("knn_accuracy_cv_cosine_" + model_name.lower() + ".npy")
    knn_acc = np.load(variables_path / saving_name)
    print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

    # print("----------------------------")

kNN accuracy     [AVG]    [CLS]   [SEP]
SimCSE: [46.2 44.6 44.5]
DeCLUTR-sci: [52.7 40.  30.7]
SciNCL: [56.6 58.9 58.9]
SPECTER2: [56.8 58.8 59. ]
ST5: 57.0
SBERT: [61.6 60.  60.3]
Cohere: 61.1
OpenAI: 62.3
peak memory: 4583.19 MiB, increment: 0.00 MiB
CPU times: user 159 ms, sys: 64.4 ms, total: 224 ms
Wall time: 343 ms


### Cohere

In [18]:
# Load the Cohere abstract embeddings. `embedding_av` is the same variable the
# OpenAI section assigns later, so the two Cohere cells below must run before
# that section rebinds it.
saving_path = Path("embeddings_cohere")
cohere_dir = variables_path / saving_path
embedding_av = np.load(cohere_dir / "embedding_abstracts_only_av.npy")

#### Euclidean

In [19]:
%%time
%%memit
model_name = "Cohere"

knn_acc = knn_accuracy_cv(
    embedding_av[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
    metric="euclidean",
)

print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

# save embeddings
saving_name = Path("knn_accuracy_cv_" + model_name.lower())
np.save(variables_path / saving_name, knn_acc)

Cohere: 61.1
peak memory: 1888.59 MiB, increment: 1.03 MiB
CPU times: user 26.7 s, sys: 3.42 s, total: 30.1 s
Wall time: 2.31 s


#### Cosine

In [20]:
%%time
%%memit
model_name = "Cohere"

knn_acc = knn_accuracy_cv(
    embedding_av[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
    metric="cosine",
)

print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

# save embeddings
saving_name = Path("knn_accuracy_cv_cosine_" + model_name.lower())
np.save(variables_path / saving_name, knn_acc)

Cohere: 61.1
peak memory: 2436.50 MiB, increment: 547.92 MiB
CPU times: user 2min 15s, sys: 4min 25s, total: 6min 41s
Wall time: 9.98 s


### OpenAI

In [21]:
# Load the OpenAI abstract embeddings. This rebinds `embedding_av`, which held
# the Cohere embeddings until now.
saving_path = Path("embeddings_openai")
openai_dir = variables_path / saving_path
embedding_av = np.load(openai_dir / "embedding_abstracts_only_av.npy")

#### Euclidean

In [22]:
%%time
%%memit
model_name = "OpenAI"

knn_acc = knn_accuracy_cv(
    embedding_av[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
    metric="euclidean",
)

print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

# save embeddings
saving_name = Path("knn_accuracy_cv_" + model_name.lower())
np.save(variables_path / saving_name, knn_acc)

OpenAI: 62.3
peak memory: 2491.11 MiB, increment: 56.08 MiB
CPU times: user 47.5 s, sys: 4.49 s, total: 51.9 s
Wall time: 3.99 s


#### Cosine

In [23]:
%%time
%%memit
model_name = "OpenAI"

knn_acc = knn_accuracy_cv(
    embedding_av[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
    metric="cosine",
)

print(f"{model_name}: {np.round(np.array(knn_acc)*100,1)}")

# save embeddings
saving_name = Path("knn_accuracy_cv_cosine_" + model_name.lower())
np.save(variables_path / saving_name, knn_acc)

OpenAI: 62.3
peak memory: 4584.72 MiB, increment: 2093.61 MiB
CPU times: user 3min 51s, sys: 5min 31s, total: 9min 23s
Wall time: 14 s


## TF-IDF

In [24]:
# Load the precomputed sparse TF-IDF feature matrix.
tfidf_features = sp.sparse.load_npz(variables_path / "tfidf_features.npz")

### Euclidean

In [25]:
%%time
%%memit

knn_accuracy_tfidf = knn_accuracy_cv(
    tfidf_features[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)

np.save(
    variables_path / "knn_accuracy_cv_tfidf",
    knn_accuracy_tfidf,
)

peak memory: 4585.21 MiB, increment: 0.58 MiB
CPU times: user 32min 15s, sys: 3.46 s, total: 32min 19s
Wall time: 1min 26s


In [26]:
# Accuracy as a percentage (result of the default-metric TF-IDF run above).
print(knn_accuracy_tfidf * 100)

59.181660828595405


### Cosine

In [27]:
%%time
%%memit

knn_accuracy_tfidf = knn_accuracy_cv(
    tfidf_features[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
    metric="cosine",
)
np.save(
    variables_path / "knn_accuracy_cv_cosine_tfidf",
    knn_accuracy_tfidf,
)

peak memory: 4585.15 MiB, increment: 0.20 MiB
CPU times: user 26.7 s, sys: 1.58 s, total: 28.3 s
Wall time: 5.77 s


In [28]:
# Accuracy as a percentage; at this point the variable holds the cosine result.
print(knn_accuracy_tfidf * 100)

59.181660828595405


## SVD

In [29]:
# Load the saved SVD representation; the cells below use only its first 100
# columns.
svd_data = np.load(variables_path / "svd_data.npy")

### Euclidean

In [30]:
%%time
knn_accuracy_svd = knn_accuracy_cv(
    svd_data[:, :100][labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)

np.save(variables_path / "knn_accuracy_cv_svd", knn_accuracy_svd)

CPU times: user 11.1 s, sys: 3.65 s, total: 14.8 s
Wall time: 902 ms


In [31]:
# Accuracy as a percentage (result of the default-metric SVD run above).
print(knn_accuracy_svd * 100)

58.85986281999378


### Cosine

In [32]:
%%time
knn_accuracy_svd = knn_accuracy_cv(
    svd_data[:, :100][labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
    metric="cosine",
)

np.save(
    variables_path / "knn_accuracy_cv_cosine_svd",
    knn_accuracy_svd,
)

CPU times: user 1min 24s, sys: 3min 23s, total: 4min 48s
Wall time: 7.27 s


In [33]:
# Accuracy as a percentage; at this point the variable holds the cosine result.
print(knn_accuracy_svd * 100)

60.72144479062154


## L2(SVD)

In [34]:
# `svd_data` is reused from the "## SVD" section above. Uncomment the line
# below to reload it when running this section on its own:
# svd_data = np.load(variables_path / "svd_data.npy")

In [35]:
%%time
knn_accuracy_L2_svd = knn_accuracy_cv(
    normalize(svd_data[labels_iclr != "unlabeled"][:, :100], axis=1),
    labels_iclr[labels_iclr != "unlabeled"],
)

np.save(variables_path / "knn_accuracy_cv_L2_svd", knn_accuracy_L2_svd)

CPU times: user 9.88 s, sys: 138 ms, total: 10 s
Wall time: 892 ms


In [36]:
# Accuracy as a percentage for the L2-normalised SVD features.
print(knn_accuracy_L2_svd * 100)

60.72144479062154


# t-SNE

## BERT-based models

In [37]:
# The six models that have saved t-SNE layouts under embeddings_<name>/.
# `model_paths` is not read by any cell shown in this notebook.
model_names, model_paths = map(
    list,
    zip(
        ("SimCSE", "princeton-nlp/unsup-simcse-bert-base-uncased"),
        ("DeCLUTR-sci", "johngiorgi/declutr-sci-base"),
        ("SciNCL", "malteos/scincl"),
        ("SPECTER2", "allenai/specter2_base"),
        ("ST5", "sentence-transformers/sentence-t5-base"),
        ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
    ),
)
print(model_names)

['SimCSE', 'DeCLUTR-sci', 'SciNCL', 'SPECTER2', 'ST5', 'SBERT']


In [38]:
%%time
%%memit
print("kNN accuracy          [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # import
    saving_path = Path("embeddings_" + model_name.lower())

    # load
    if model_name != "ST5":
        tsne_cls = np.load(variables_path / saving_path / "tsne_cls.npy")
        tsne_sep = np.load(variables_path / saving_path / "tsne_sep.npy")
    tsne_av = np.load(variables_path / saving_path / "tsne_av.npy")

    # knn acc
    if model_name == "ST5":
        knn_acc = knn_accuracy_cv(
            [
                tsne_av[labels_iclr != "unlabeled"],
            ],
            labels_iclr[labels_iclr != "unlabeled"],
        )
    else:
        knn_acc = knn_accuracy_cv(
            [
                tsne_av[labels_iclr != "unlabeled"],
                tsne_cls[labels_iclr != "unlabeled"],
                tsne_sep[labels_iclr != "unlabeled"],
            ],
            labels_iclr[labels_iclr != "unlabeled"],
        )

    print(f"t-SNE {model_name}: {np.array(knn_acc)*100}")

    # save embeddings
    saving_name = Path("knn_accuracy_cv_tsne_" + model_name.lower())
    np.save(variables_path / saving_name, knn_acc)

kNN accuracy          [AVG]    [CLS]   [SEP]
t-SNE SimCSE: [39.56905059 36.32078881 37.13289093]
t-SNE DeCLUTR-sci: [47.09998064 32.74359426 22.23239631]
t-SNE SciNCL: [52.11031314 54.85283366 54.62303075]
t-SNE SPECTER2: [52.92240939 54.1176122  54.60031215]
t-SNE ST5: [52.57760528]
t-SNE SBERT: [56.84466623 55.65740203 55.97119689]
peak memory: 4585.41 MiB, increment: 0.48 MiB
CPU times: user 1min 57s, sys: 6.3 s, total: 2min 4s
Wall time: 13.2 s


In [39]:
%%time
%%memit
print("kNN accuracy     [AVG] [CLS] [SEP]")
for i, model_name in enumerate(model_names):
    # print("Model: ", model_name)

    # save embeddings
    saving_name = Path("knn_accuracy_cv_" + model_name.lower() + ".npy")
    knn_acc = np.load(variables_path / saving_name)
    print(f"{model_name}:          {np.round(np.array(knn_acc)*100,1)}")

    saving_name = Path("knn_accuracy_cv_tsne_" + model_name.lower() + ".npy")
    knn_acc = np.load(variables_path / saving_name)
    print(f"t-SNE {model_name}:    {np.round(np.array(knn_acc)*100,1)}")

    print("----------------------------------")

kNN accuracy     [AVG] [CLS] [SEP]
SimCSE:          [46.5 45.1 44.3]
t-SNE SimCSE:    [39.6 36.3 37.1]
----------------------------------
DeCLUTR-sci:          [52.7 40.  30.5]
t-SNE DeCLUTR-sci:    [47.1 32.7 22.2]
----------------------------------
SciNCL:          [56.3 58.8 58.9]
t-SNE SciNCL:    [52.1 54.9 54.6]
----------------------------------
SPECTER2:          [57.1 58.8 59.2]
t-SNE SPECTER2:    [52.9 54.1 54.6]
----------------------------------
ST5:          57.0
t-SNE ST5:    [52.6]
----------------------------------
SBERT:          [61.6 59.9 60.2]
t-SNE SBERT:    [56.8 55.7 56. ]
----------------------------------
peak memory: 4583.90 MiB, increment: 0.00 MiB
CPU times: user 183 ms, sys: 72 ms, total: 255 ms
Wall time: 360 ms


In [60]:
# The six models evaluated in the t-SNE loop above, followed by the two whose
# t-SNE results are computed in the dedicated Cohere / OpenAI sections.
model_names = ["SimCSE", "DeCLUTR-sci", "SciNCL", "SPECTER2", "ST5", "SBERT"]
model_names += ["Cohere", "OpenAI"]

print(model_names)

['SimCSE', 'DeCLUTR-sci', 'SciNCL', 'SPECTER2', 'ST5', 'SBERT', 'Cohere', 'OpenAI']


In [62]:
# Summary table: reload the saved t-SNE kNN accuracies of every model.
# The Cohere and OpenAI files are only written by the "### Cohere" and
# "### OpenAI" sections further down, so on a top-to-bottom run with an empty
# results directory they do not exist yet. Report that per row instead of
# aborting the whole run with FileNotFoundError.
print("kNN accuracy          [AVG]    [CLS]   [SEP]")
for model_name in model_names:
    saving_name = Path("knn_accuracy_cv_tsne_" + model_name.lower() + ".npy")
    accuracy_file = variables_path / saving_name
    if not accuracy_file.exists():
        print(f"t-SNE {model_name}: not computed yet ({saving_name} is missing)")
        continue

    knn_acc = np.load(accuracy_file)
    print(f"t-SNE {model_name}: {np.round(np.array(knn_acc)*100,1)}")

kNN accuracy          [AVG]    [CLS]   [SEP]
t-SNE SimCSE: [39.6 36.3 37.1]
t-SNE DeCLUTR-sci: [47.1 32.7 22.2]
t-SNE SciNCL: [52.1 54.9 54.6]
t-SNE SPECTER2: [52.9 54.1 54.6]
t-SNE ST5: [52.6]
t-SNE SBERT: [56.8 55.7 56. ]
t-SNE Cohere: 56.4
t-SNE OpenAI: 57.1


### Cohere

In [41]:
# Load the saved t-SNE layout for the Cohere embeddings. `tsne_av` is the same
# variable the OpenAI section assigns later, so run the evaluation cell below
# before that section rebinds it.
saving_path = Path("embeddings_cohere")
tsne_av = np.load(variables_path / saving_path / "tsne_av.npy")

In [42]:
%%time
%%memit

knn_accuracy_tsne_cohere = knn_accuracy_cv(
    tsne_av[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)
np.save(
    variables_path / "knn_accuracy_cv_tsne_cohere",
    knn_accuracy_tsne_cohere,
)

peak memory: 4584.21 MiB, increment: 1.25 MiB
CPU times: user 8.72 s, sys: 4.16 s, total: 12.9 s
Wall time: 1.16 s


In [43]:
# Accuracy as a percentage for the Cohere t-SNE layout.
print(knn_accuracy_tsne_cohere * 100)

56.40061490439058


### OpenAI

In [44]:
# Load the saved t-SNE layout for the OpenAI embeddings. This rebinds
# `tsne_av`, which held the Cohere layout until now.
saving_path = Path("embeddings_openai")
tsne_av = np.load(variables_path / saving_path / "tsne_av.npy")

In [45]:
%%time
%%memit

knn_accuracy_tsne_openai = knn_accuracy_cv(
    tsne_av[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)
np.save(
    variables_path / "knn_accuracy_cv_tsne_openai",
    knn_accuracy_tsne_openai,
)

peak memory: 4583.88 MiB, increment: 0.00 MiB
CPU times: user 8.99 s, sys: 4.77 s, total: 13.8 s
Wall time: 1.26 s


In [46]:
# Accuracy as a percentage for the OpenAI t-SNE layout.
print(knn_accuracy_tsne_openai * 100)

57.1435989509074


## TF-IDF

In [47]:
# Load the saved t-SNE layout of the TF-IDF representation.
tsne_tfidf = np.load(variables_path / "tsne_tfidf.npy")

In [48]:
%%time

knn_accuracy_tsne_tfidf = knn_accuracy_cv(
    tsne_tfidf[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)
np.save(
    variables_path / "knn_accuracy_cv_tsne_tfidf",
    knn_accuracy_tsne_tfidf,
)

CPU times: user 8.83 s, sys: 113 ms, total: 8.94 s
Wall time: 799 ms


In [49]:
# Accuracy as a percentage for the TF-IDF t-SNE layout.
print(knn_accuracy_tsne_tfidf * 100)

51.97980438060705


## SVD

In [50]:
# Load the saved t-SNE layout of the SVD representation.
tsne_svd = np.load(variables_path / "tsne_svd.npy")

In [51]:
%%time
%%memit

knn_accuracy_tsne_svd = knn_accuracy_cv(
    tsne_svd[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)
np.save(
    variables_path / "knn_accuracy_cv_tsne_svd",
    knn_accuracy_tsne_svd,
)

peak memory: 4583.54 MiB, increment: 0.00 MiB
CPU times: user 9.95 s, sys: 3.98 s, total: 13.9 s
Wall time: 1.28 s


In [52]:
# Accuracy as a percentage for the SVD t-SNE layout.
print(knn_accuracy_tsne_svd * 100)

55.88755111979488


## L2(SVD)

In [53]:
# Load the saved t-SNE layout of the L2-normalised SVD representation.
tsne_L2_svd = np.load(variables_path / "tsne_L2_svd.npy")

In [54]:
%%time
%%memit

knn_accuracy_tsne_L2_svd = knn_accuracy_cv(
    tsne_L2_svd[labels_iclr != "unlabeled"],
    labels_iclr[labels_iclr != "unlabeled"],
)
np.save(
    variables_path / "knn_accuracy_cv_tsne_L2_svd",
    knn_accuracy_tsne_L2_svd,
)

peak memory: 4583.54 MiB, increment: 0.00 MiB
CPU times: user 9.23 s, sys: 4.07 s, total: 13.3 s
Wall time: 1.2 s


In [55]:
# Accuracy as a percentage for the L2(SVD) t-SNE layout.
print(knn_accuracy_tsne_L2_svd * 100)

56.73785006424812
