In [1]:
import pandas as pd
import random
import numpy as np
from random import randint

import scipy as sp
from scipy import sparse

from openTSNE import TSNE, affinity

import matplotlib.pyplot as plt
import matplotlib

import time
import pickle
import memory_profiler

%load_ext memory_profiler

from pathlib import Path
import distro

%load_ext watermark

In [2]:
%load_ext autoreload
%autoreload 2

from plotting import plot_tsne_colors

In [3]:
import black
import jupyter_black

# Load jupyter_black for this notebook, configured with a 79-character
# line length.
jupyter_black.load(line_length=79)

In [4]:
# Project directories, all given relative to the current working directory.
results_root = Path("../results")
variables_path = results_root / "variables" / "iclr24v2"
figures_path = results_root / "figures"
data_path = Path("..") / "data"

In [5]:
# Apply the matplotlib style sheet "matplotlib_style.txt"; the path is
# relative, so it is resolved against the current working directory.
plt.style.use("matplotlib_style.txt")

In [6]:
# Record provenance for this run: author, timestamp, Python/IPython
# versions, machine and hostname, and the versions of imported packages.
%watermark -a 'Rita González-Márquez' -t -d -tz -u -v -iv -w -m -h -p openTSNE
# Print the pretty-formatted distribution name reported by `distro`.
print(distro.name(pretty=True))

Author: Rita González-Márquez

Last updated: 2024-04-12 09:05:29CEST

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.18.1

openTSNE: 1.0.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit

Hostname: rgonzalesmarquez_GPU0-llm_gber7

numpy          : 1.26.2
jupyter_black  : 0.3.4
pandas         : 2.1.3
scipy          : 1.11.4
black          : 23.11.0
matplotlib     : 3.8.2
distro         : 1.8.0
openTSNE       : 1.0.0
memory_profiler: 0.61.0

Watermark: 2.4.3

Ubuntu 22.04.3 LTS


# Import

In [56]:
%%time
# Load the ICLR dataframe from its parquet file using the pyarrow engine.
iclr2024_file = data_path / "iclr24v2.parquet"
iclr2024 = pd.read_parquet(iclr2024_file, engine="pyarrow")

CPU times: user 176 ms, sys: 70.9 ms, total: 247 ms
Wall time: 182 ms


In [57]:
# Turn every entry of the `keywords` and `scores` columns into a plain
# Python list (the displayed frame shows them as lists afterwards).
iclr2024["keywords"] = iclr2024["keywords"].transform(list)
iclr2024["scores"] = iclr2024["scores"].transform(list)

In [58]:
# Display the dataframe; pandas elides the middle rows in the rendered output.
iclr2024

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,labels
0,2017,B1-Hhnslg,Prototypical Networks for Few-shot Learning,A recent approach to few-shot classification c...,"Jake Snell, Kevin Swersky, Richard Zemel",Reject,"[6, 4, 5]","[deep learning, transfer learning]",transfer learning
1,2017,B1-q5Pqxl,Machine Comprehension Using Match-LSTM and Ans...,Machine comprehension of text is an important ...,"Shuohang Wang, Jing Jiang",Accept (Poster),"[6, 6, 7]","[natural language processing, deep learning]",language models
2,2017,B16Jem9xe,Learning in Implicit Generative Models,Generative adversarial networks (GANs) provide...,"Shakir Mohamed, Balaji Lakshminarayanan",Invite to Workshop Track,"[8, 7, 6]",[unsupervised learning],unlabeled
3,2017,B16dGcqlx,Third Person Imitation Learning,Reinforcement learning (RL) makes it possible ...,"Bradly C Stadie, Pieter Abbeel, Ilya Sutskever",Accept (Poster),"[6, 5, 6]",[],unlabeled
4,2017,B184E5qee,Improving Neural Language Models with a Contin...,We propose an extension to neural network lang...,"Edouard Grave, Armand Joulin, Nicolas Usunier",Accept (Poster),"[7, 9, 5]",[natural language processing],language models
...,...,...,...,...,...,...,...,...,...
24440,2024,zxPDdw8koz,CLIP meets Model Zoo Experts: Pseudo-Supervisi...,Contrastive language image pretraining (CLIP) ...,"Mohammadreza Salehi, Mehrdad Farajtabar, Maxwe...",Withdrawn,"[8, 3, 3, 3]","[contrastive learning, clip, distillation, den...",vision-language models
24441,2024,zyBJodMrn5,On the generalization capacity of neural netwo...,The advent of the Transformer has led to the d...,"Takuya Ito, Soham Dan, Mattia Rigotti, James K...",Accept (poster),"[8, 3, 6]","[compositional generalization, compositionalit...",out-of-distribution
24442,2024,zz61V8bIab,Stochastic Adversarial Networks for Multi-Doma...,Adversarial training has played a pivotal role...,"Xu Wang, Yuan Wu",Withdrawn,"[5, 1, 5]","[multi-domain text classification, adversarial...",adversarial
24443,2024,zzqn5G9fjn,Breaking Physical and Linguistic Borders: Mult...,Pretrained large language models (LLMs) have e...,"Wanru Zhao, Royson Lee, Yihong Chen, Xinchi Qi...",Accept (poster),"[5, 3, 1, 8]","[multilingual federated learning, natural lang...",language models


In [59]:
# Per-row label strings as a NumPy array.
labels_iclr = iclr2024["labels"].to_numpy()

In [60]:
# Colors loaded from disk; passed to `plot_tsne_colors` in the plot cell.
colors_iclr = np.load(variables_path / "colors_iclr.npy")

# Label -> color lookup table. The context manager guarantees the file
# handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; this is only safe
# because the file is presumably produced by this project — confirm.
with open(variables_path / "dict_label_to_color.pkl", "rb") as pickle_in:
    dict_label_to_color = pickle.load(pickle_in)

# t-SNE

## BERT-based models

In [None]:
# Display name of each model mapped to its checkpoint path
# (presumably Hugging Face hub identifiers — confirm). Insertion order
# defines the order of `model_names` / `model_paths`.
_model_checkpoints = {
    "SimCSE": "princeton-nlp/unsup-simcse-bert-base-uncased",
    "DeCLUTR-sci": "johngiorgi/declutr-sci-base",
    "SciNCL": "malteos/scincl",
    "SPECTER2": "allenai/specter2_base",
    "ST5": "sentence-transformers/sentence-t5-base",
    "SBERT": "sentence-transformers/all-mpnet-base-v2",
}

model_names = list(_model_checkpoints)
model_paths = list(_model_checkpoints.values())
print(model_names)

In [None]:
%%time
%%memit

for model_name in model_names:
    model_dir = variables_path / ("embeddings_" + model_name.lower())

    # For ST5 only the "av" representation is processed; every other model
    # also gets its "cls" and "sep" representations embedded.
    if model_name != "ST5":
        rep_names = ["cls", "sep", "av"]
    else:
        rep_names = ["av"]

    # import
    embeddings = {
        rep_name: np.load(
            model_dir / f"embedding_abstracts_only_{rep_name}.npy"
        )
        for rep_name in rep_names
    }

    # t-SNE (fitted in the order cls, sep, av)
    tsnes = {
        rep_name: TSNE(verbose=True, random_state=42).fit(embedding)
        for rep_name, embedding in embeddings.items()
    }

    # save, only after all fits for this model are done
    for rep_name, tsne in tsnes.items():
        np.save(model_dir / f"tsne_{rep_name}.npy", tsne)

### Cohere

In [None]:
%%time

# Cohere: load the "av" abstract embeddings, run t-SNE on them and store
# the result next to the embeddings.
cohere_dir = variables_path / "embeddings_cohere"
cohere_dir.mkdir(parents=True, exist_ok=True)

embeddings_av = np.load(cohere_dir / "embedding_abstracts_only_av.npy")
tsne_av = TSNE(verbose=True, random_state=42).fit(embeddings_av)

# np.save appends the ".npy" extension to the file name.
np.save(cohere_dir / "tsne_av", tsne_av)

### OpenAI

In [None]:
%%time

# OpenAI: load the "av" abstract embeddings, run t-SNE on them and store
# the result next to the embeddings.
openai_dir = variables_path / "embeddings_openai"
openai_dir.mkdir(parents=True, exist_ok=True)

embeddings_av = np.load(openai_dir / "embedding_abstracts_only_av.npy")
tsne_av = TSNE(verbose=True, random_state=42).fit(embeddings_av)

# np.save appends the ".npy" extension to the file name.
np.save(openai_dir / "tsne_av", tsne_av)

## TF-IDF

In [None]:
# import
# Both arrays are required by the t-SNE cell below; loading them here keeps
# the notebook runnable from a fresh kernel (Restart & Run All).
tfidf_features = sp.sparse.load_npz(variables_path / "tfidf_features.npz")

svd_data = np.load(variables_path / "svd_data.npy")

In [None]:
%%time
%%memit

# Perplexity-based affinities computed on the TF-IDF features with the
# "exact" neighbour search method.
tfidf_affinities = affinity.PerplexityBasedNN(
    tfidf_features, verbose=True, method="exact", random_state=42
)

# The first two columns of the SVD data serve as the initial 2D layout.
svd_init = svd_data[:, :2]
tsne_tfidf = TSNE(verbose=True, random_state=42).fit(
    tfidf_features, affinities=tfidf_affinities, initialization=svd_init
)

np.save(variables_path / "tsne_tfidf", tsne_tfidf)

## SVD

In [None]:
# Load the SVD data so this section runs on its own from a fresh kernel.
svd_data = np.load(variables_path / "svd_data.npy")

In [None]:
%%time
%%memit

# t-SNE on the first 100 columns of the SVD data.
svd_top100 = svd_data[:, :100]
tsne_svd = TSNE(verbose=True, random_state=42).fit(svd_top100)

np.save(variables_path / "tsne_svd", tsne_svd)

## L2(SVD)

In [None]:
# Load the SVD data so this section runs on its own from a fresh kernel.
svd_data = np.load(variables_path / "svd_data.npy")

In [None]:
%%time
%%memit

# L2-normalise every row of the first 100 SVD columns with NumPy.
# (`normalize` is never imported in this notebook, so calling it raises a
# NameError on a fresh kernel.)
# NOTE(review): assumes the intended `normalize` was scikit-learn's
# row-wise L2 normalisation — confirm; results may differ from it at
# floating-point precision.
svd_top100 = svd_data[:, :100]
row_norms = np.linalg.norm(svd_top100, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1  # keep all-zero rows as zeros, not NaN

tsne_L2_svd = TSNE(verbose=True, random_state=42).fit(svd_top100 / row_norms)

np.save(variables_path / "tsne_L2_svd", tsne_L2_svd)

# Plot

In [None]:
def _annotate_knn_accuracies(ax, acc_highd, acc_lowd, fontsize):
    """Write the two kNN accuracies (as percentages) into `ax`.

    The "high-d" line is placed above the "low-d" line in the lower-left
    corner of the axes, using axes coordinates.
    """
    for y, text in (
        (0.06, f"high-d: {acc_highd*100:.1f}"),
        (0.01, f"low-d:  {acc_lowd*100:.1f}"),
    ):
        ax.text(
            0,
            y,
            text,
            transform=ax.transAxes,
            va="bottom",
            ha="left",
            size=fontsize,
        )


rep = ["av", "cls", "sep"]
acc_fontsize = 8
n_rows = len(model_names) + 1

# squeeze=False keeps `axs` two-dimensional for any number of rows.
fig, axs = plt.subplots(
    n_rows, 3, figsize=(9, 3 * n_rows), dpi=200, squeeze=False
)

# Row 0: TF-IDF / SVD / L2(SVD) baselines. Each panel is drawn exactly once,
# outside any column loop, so points and labels are not overplotted.
baseline_panels = [
    ("tfidf", "TF-IDF (d=44434)"),
    ("svd", "SVD (d=100)"),
    ("L2_svd", "L2(SVD) (d=100)"),
]
for ax, (key, title) in zip(axs[0], baseline_panels):
    tsne = np.load(variables_path / f"tsne_{key}.npy")
    knn_acc_highd = np.load(variables_path / f"knn_accuracy_{key}.npy")
    knn_acc_lowd = np.load(variables_path / f"knn_accuracy_tsne_{key}.npy")

    plot_tsne_colors(tsne, colors_iclr, ax=ax, plot_type="subplot_3")
    ax.set_title(title, fontsize=15)
    _annotate_knn_accuracies(ax, knn_acc_highd, knn_acc_lowd, acc_fontsize)

# Rows 1..n: one row per model, one column per representation in `rep`.
for i, model_name in enumerate(model_names, start=1):
    model_key = model_name.lower()
    model_dir = variables_path / ("embeddings_" + model_key)
    is_st5 = model_name == "ST5"

    # The accuracy files are per model, so load them once per row.
    knn_acc_highd = np.load(variables_path / f"knn_accuracy_{model_key}.npy")
    knn_acc_lowd = np.load(
        variables_path / f"knn_accuracy_tsne_{model_key}.npy"
    )

    for j, rep_name in enumerate(rep):
        ax = axs[i, j]

        # ST5 only has an "av" panel; blank out its "cls"/"sep" columns.
        if is_st5 and j > 0:
            ax.axis("off")
            continue

        tsne = np.load(model_dir / f"tsne_{rep_name}.npy")
        plot_tsne_colors(tsne, colors_iclr, ax=ax, plot_type="subplot_3")
        ax.set_title(f"{model_name} [{rep_name.upper()}]", fontsize=15)

        if is_st5:
            # For ST5 the high-d accuracy is used unindexed and the low-d
            # accuracy is taken from position 0.
            acc_highd, acc_lowd = knn_acc_highd, knn_acc_lowd[0]
        else:
            # Accuracies are indexed in the same order as `rep`.
            acc_highd, acc_lowd = knn_acc_highd[j], knn_acc_lowd[j]
        _annotate_knn_accuracies(ax, acc_highd, acc_lowd, acc_fontsize)

# Make sure the output directory exists before saving.
figure_dir = figures_path / "tmp"
figure_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_dir / "tsne_embeddings_iclr_with_knn_accs_v2.png")