In [1]:
import pandas as pd
import random
import numpy as np
from random import randint
import torch
from transformers import AutoTokenizer, AutoModel
import adapters
from adapters import AutoAdapterModel
import gc

import scipy as sp
from scipy import sparse
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

from openTSNE import TSNE, affinity

import matplotlib.pyplot as plt
import matplotlib

import pickle
import time
import memory_profiler

%load_ext memory_profiler

from pathlib import Path
import distro

%load_ext watermark

In [2]:
# old one '1.8.1+cu111'
torch.__version__

'2.1.1+cu121'

In [3]:
%load_ext autoreload
%autoreload 2

from text_embeddings_src.model_stuff import train_loop, train_loop_batches_eval
from text_embeddings_src.data_stuff import (
    SentencePairDataset,
    MultOverlappingSentencesPairDataset,
)
from text_embeddings_src.metrics import knn_accuracy
from text_embeddings_src.embeddings import generate_embeddings
from text_embeddings_src.dim_red import run_tsne_simple
from text_embeddings_src.plotting import plot_tsne_colors

In [4]:
import black
import jupyter_black

jupyter_black.load(line_length=79)

In [5]:
# Project directories, given relative to this notebook's working directory.
# Cached arrays (labels, colors, embeddings) are loaded from `variables_path`.
variables_path = Path("../../results/variables")
# variables_pubmed_path = Path("../../pubmed-landscape/results/variables")
figures_path = Path("../../results/figures/updated_dataset")
# Directory containing the ICLR parquet file.
data_path = Path("../../data")
# NOTE(review): absolute, machine-specific path that is not referenced in the
# cells visible here -- confirm it is still needed before sharing the notebook.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

In [6]:
plt.style.use("../matplotlib_style.txt")

In [7]:
# Drop the reference to any model left over in the kernel, run the garbage
# collector and release the GPU memory cached by PyTorch.
model = None
gc.collect()
torch.cuda.empty_cache()

In [8]:
%watermark -a 'Rita González-Márquez' -t -d -tz -u -v -iv -w -m -h -p transformers -p openTSNE
print(distro.name(pretty=True))

Author: Rita González-Márquez

Last updated: 2024-03-12 03:38:05CET

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.18.1

openTSNE: 1.0.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit

Hostname: rgonzalesmarquez_GPU0-llm_gber7

matplotlib     : 3.8.2
scipy          : 1.11.4
distro         : 1.8.0
sklearn        : 1.3.2
memory_profiler: 0.61.0
jupyter_black  : 0.3.4
numpy          : 1.26.2
black          : 23.11.0
openTSNE       : 1.0.0
pandas         : 2.1.3
torch          : 2.1.1
adapters       : 0.1.0

Watermark: 2.4.3

Ubuntu 22.04.3 LTS


# Import

In [9]:
%%time
# Load the ICLR dataset from its parquet file.
# NOTE(review): the commented-out `index` / `compression` arguments look like
# leftovers from the call that wrote the file -- confirm and remove.
iclr2024 = pd.read_parquet(
    data_path / "iclr2024.parquet.gzip",
    # index=False,
    engine="pyarrow",
    # compression="gzip",
)

CPU times: user 289 ms, sys: 78.5 ms, total: 368 ms
Wall time: 275 ms


In [10]:
# Materialise every cell of the list-like columns as a plain Python list.
for list_column in ("keywords", "scores"):
    iclr2024[list_column] = iclr2024[list_column].transform(
        lambda values: list(values)
    )

In [11]:
iclr2024

Unnamed: 0,index,year,id,title,abstract,authors,decision,scores,keywords,gender-first,gender-last,t-SNE x,t-SNE y
0,0,2017,S1VaB4cex,FractalNet: Ultra-Deep Neural Networks without...,We introduce a design strategy for neural netw...,"Gustav Larsson, Michael Maire, Gregory Shakhna...",Accept (Poster),"[5, 7, 6, 6]",[],male,male,-28.117955,-20.418127
1,1,2017,H1W1UN9gg,Deep Information Propagation,We study the behavior of untrained neural netw...,"Samuel S. Schoenholz, Justin Gilmer, Surya Gan...",Accept (Poster),"[8, 9, 8]","[theory, deep learning]",male,,-32.466820,-10.791123
2,2,2017,r1GKzP5xx,Recurrent Normalization Propagation,We propose a LSTM parametrization that preser...,"César Laurent, Nicolas Ballas, Pascal Vincent",Invite to Workshop Track,"[4, 6, 6]","[deep learning, optimization]",,male,3.504240,19.946053
3,3,2017,S1J0E-71l,Surprisal-Driven Feedback in Recurrent Networks,Recurrent neural nets are widely used for pred...,"K, a, m, i, l, , R, o, c, k, i",Reject,"[3, 4, 3]","[unsupervised learning, applications, deep lea...",,,4.553473,16.037763
4,4,2017,SJGCiw5gl,Pruning Convolutional Neural Networks for Reso...,We propose a new formulation for pruning convo...,"Pavlo Molchanov, Stephen Tyree, Tero Karras, T...",Accept (Poster),"[6, 7, 9]","[deep learning, transfer learning]",,male,-25.827705,-37.891772
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24342,7299,2024,1bbPQShCT2,I-PHYRE: Interactive Physical Reasoning,Current evaluation protocols predominantly ass...,,,[],"[intuitive physics, physical reasoning]",,,43.137120,44.316133
24343,7300,2024,Ny150AblPu,EXPOSING TEXT-IMAGE INCONSISTENCY USING DIFFUS...,In the battle against widespread online misinf...,,,[],"[mis-contextualization, media forensic]",,,59.742172,-22.673627
24344,7301,2024,ZGBOfAQrMl,Video Super-Resolution Transformer with Masked...,"Recently, Vision Transformer has achieved grea...",,,[],"[video super-resolution, adaptive, memory and ...",,,57.933273,-3.932825
24345,7302,2024,J2kRjUAOLh,Contrastive Predict-and-Search for Mixed Integ...,Mixed integer linear programs (MILP) are flex...,,,[],[mixed integer programs; contrastive learning],,,-11.437999,21.289523


In [12]:
# Precomputed label and color arrays for the dataset.
labels_iclr = np.load(variables_path / "updated_dataset" / "labels_iclr.npy")
colors_iclr = np.load(variables_path / "updated_dataset" / "colors_iclr.npy")

# Mapping stored next to the arrays. The context manager guarantees that the
# file handle is closed, even if unpickling fails.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(
    variables_path / "updated_dataset" / "dict_label_to_color.pkl", "rb"
) as pickle_in:
    dict_label_to_color = pickle.load(pickle_in)

In [13]:
# sanity check
print(len(np.unique(labels_iclr)))
labels_iclr

46


array(['unlabeled', 'unlabeled', 'optimization', ..., 'unlabeled',
       'unlabeled', 'federated learning'], dtype='<U34')

In [14]:
# titles_abstracts_together = [
#     iclr2024.title[i] + ". " + iclr2024.abstract[i]
#     for i in range(len(iclr2024))
# ]
# print(len(titles_abstracts_together))

In [15]:
# pd.Series(titles_abstracts_together).iloc[0]

In [16]:
# Load the precomputed MPNet embeddings of the abstracts. Judging by the file
# names, the first array is the original representation and the second one
# the representation after 1 epoch of training.
# (The "asbtracts" spelling is part of the file name on disk.)
saving_path = Path("embeddings_" + "mpnet") / Path("updated_dataset")
embedding_av = np.load(
    variables_path / saving_path / "embedding_abstracts_only_av.npy"
)

embedding_asbtracts_only_av_after_training_av_1_epoch = np.load(
    variables_path
    / saving_path
    / "embedding_asbtracts_only_av_after_training_av_1_epoch.npy"
)

# Experiment

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA


def knn_accuracy_whitening_scores(X, y, ntest=500, rs=42):
    """Calculate the kNN accuracy of raw, centered and whitened data.

    The accuracy is calculated for two distance metrics: euclidean and
    cosine. A random subset of ``ntest`` samples is held out as test set and
    a 10-nearest-neighbor classifier is fitted on the remaining samples.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix for which to calculate the kNN accuracy.
    y : array-like of shape (n_samples,)
        Array with labels (colors).
    ntest : int, default=500
        Number of samples held out for testing. Must satisfy
        ``0 < ntest < n_samples``.
    rs : int, default=42
        Random seed used to draw the test subset.

    Returns
    -------
    scores : ndarray of floats of shape (3, 2)
        kNN accuracies. Rows correspond to the raw, centered and whitened
        data (in that order), columns to the euclidean and cosine metric.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have a different number of samples, or if
        ``ntest`` does not leave at least one test and one training sample.

    Notes
    -----
    The centering and the PCA whitening are fitted on all samples, i.e.
    including the samples that are later held out for testing.
    """

    # Accept lists / DataFrames as well: the integer-array indexing below
    # requires NumPy arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    n = X.shape[0]
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n} and {len(y)}."
        )
    # Fail fast, before the (expensive) PCA, instead of deep inside
    # np.random / sklearn.
    if not 0 < ntest < n:
        raise ValueError(
            f"ntest must satisfy 0 < ntest < n_samples={n}, got {ntest}."
        )

    Xcentered = X - np.mean(X, axis=0)
    Xwhitened = PCA(whiten=True).fit_transform(X)

    scores = np.zeros((3, 2))
    # A local RandomState draws the same subset as the legacy
    # `np.random.seed(rs)` + `np.random.choice(...)` combination, but does
    # not reset the global NumPy random state as a side effect.
    rng = np.random.RandomState(rs)
    test = rng.choice(n, size=ntest, replace=False)
    train = np.setdiff1d(np.arange(n), test)

    for i, X_to_use in enumerate([X, Xcentered, Xwhitened]):
        for j, metric in enumerate(["euclidean", "cosine"]):
            knn = KNeighborsClassifier(
                n_neighbors=10, algorithm="brute", metric=metric
            ).fit(X_to_use[train], y[train])

            scores[i, j] = knn.score(X_to_use[test], y[test])

    return scores

In [None]:
def print_table(knn_accs):
    """Print kNN accuracies, in percent, as a small text table.

    Parameters
    ----------
    knn_accs : ndarray of floats of shape (3, 2)
        Accuracies as fractions, indexed as ``knn_accs[row, column]``. Rows
        are printed as "Raw", "Centered" and "Whitened"; columns are labeled
        "euclidean" and "cosine".

    Returns
    -------
    None
        The table is written to standard output.
    """
    print(["euclidean", "cosine"])
    for row, row_label in enumerate(
        ("Raw:      ", "Centered: ", "Whitened: ")
    ):
        # Scale to percent *before* rounding: rounding first and multiplying
        # afterwards re-introduces float noise (e.g. 17.599999999999998).
        print(
            row_label,
            round(float(knn_accs[row, 0]) * 100, 1),
            round(float(knn_accs[row, 1]) * 100, 1),
        )
    print("---------------")

In [23]:
print(labels_iclr[labels_iclr != "unlabeled"].shape)
round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0])

(12997,)


1300

In [24]:
%%time
knn_accuracy_before_training_centered_and_whitened = (
    knn_accuracy_whitening_scores(
        embedding_av[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        ntest=round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0]),
        rs=42,
    )
)
knn_accuracy_before_training_centered_and_whitened_rs23 = (
    knn_accuracy_whitening_scores(
        embedding_av[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        ntest=round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0]),
        rs=23,
    )
)

CPU times: user 1min 5s, sys: 1min 12s, total: 2min 18s
Wall time: 5.11 s


In [42]:
# First table: rs=42, second table: rs=23.
print_table(knn_accuracy_before_training_centered_and_whitened)
print_table(knn_accuracy_before_training_centered_and_whitened_rs23)

['euclidean', 'cosine']
Raw:       37.4 39.6
Centered:  37.4 37.0
Whitened:  17.599999999999998 46.9
---------------
['euclidean', 'cosine']
Raw:       39.800000000000004 41.6
Centered:  39.800000000000004 37.9
Whitened:  20.0 49.4
---------------


In [43]:
%%time
knn_accuracy_after_training_centered_and_whitened = (
    knn_accuracy_whitening_scores(
        embedding_asbtracts_only_av_after_training_av_1_epoch[
            labels_iclr != "unlabeled"
        ],
        labels_iclr[labels_iclr != "unlabeled"],
        ntest=round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0]),
        rs=42,
    )
)
knn_accuracy_after_training_centered_and_whitened_rs23 = (
    knn_accuracy_whitening_scores(
        embedding_asbtracts_only_av_after_training_av_1_epoch[
            labels_iclr != "unlabeled"
        ],
        labels_iclr[labels_iclr != "unlabeled"],
        ntest=round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0]),
        rs=23,
    )
)

CPU times: user 1min 7s, sys: 1min 22s, total: 2min 30s
Wall time: 5.18 s


In [44]:
# First table: rs=42, second table: rs=23.
print_table(knn_accuracy_after_training_centered_and_whitened)
print_table(knn_accuracy_after_training_centered_and_whitened_rs23)

['euclidean', 'cosine']
Raw:       58.699999999999996 59.3
Centered:  58.699999999999996 58.9
Whitened:  36.199999999999996 56.49999999999999
---------------
['euclidean', 'cosine']
Raw:       60.6 60.199999999999996
Centered:  60.6 59.5
Whitened:  38.2 57.099999999999994
---------------


## Sanity check

### SBERT

In [46]:
# Load the precomputed SBERT embeddings of the abstracts.
# Note: this rebinds `saving_path`, which pointed to the MPNet directory.
saving_path = Path("embeddings_" + "sbert") / Path("updated_dataset")
embedding_av_sbert = np.load(
    variables_path / saving_path / "embedding_abstracts_only_av.npy"
)

In [48]:
%%time
knn_accuracy_sbert_before_training_centered_and_whitened = (
    knn_accuracy_whitening_scores(
        embedding_av_sbert[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        ntest=round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0]),
        rs=42,
    )
)
knn_accuracy_sbert_before_training_centered_and_whitened_rs23 = (
    knn_accuracy_whitening_scores(
        embedding_av_sbert[labels_iclr != "unlabeled"],
        labels_iclr[labels_iclr != "unlabeled"],
        ntest=round(0.1 * labels_iclr[labels_iclr != "unlabeled"].shape[0]),
        rs=23,
    )
)

CPU times: user 1min 10s, sys: 1min 29s, total: 2min 39s
Wall time: 5.22 s


In [49]:
# First table: rs=42, second table: rs=23.
print_table(knn_accuracy_sbert_before_training_centered_and_whitened)
print_table(knn_accuracy_sbert_before_training_centered_and_whitened_rs23)

['euclidean', 'cosine']
Raw:       63.3 62.4
Centered:  63.3 62.7
Whitened:  46.7 57.099999999999994
---------------
['euclidean', 'cosine']
Raw:       63.2 63.2
Centered:  63.2 61.8
Whitened:  50.0 58.199999999999996
---------------


In [52]:
# L2 norm of the first row of each embedding matrix. The printed values are
# not 1, i.e. these rows are not unit-normalized (only row 0 is checked).
print(np.linalg.norm(embedding_av_sbert[0]))
print(np.linalg.norm(embedding_av[0]))
print(np.linalg.norm(embedding_asbtracts_only_av_after_training_av_1_epoch[0]))

2.1254666
4.802613
2.1115158
