In [21]:
import pandas as pd
import random
import numpy as np
from random import randint
import torch
from transformers import AutoTokenizer, AutoModel
import adapters
from adapters import AutoAdapterModel
import gc

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegressionCV

import matplotlib.pyplot as plt
import matplotlib

import time
import pickle
import memory_profiler

%load_ext memory_profiler

from pathlib import Path
import distro

%load_ext watermark

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [22]:
# Display the installed PyTorch version string as the cell output.
torch.__version__

'2.1.1+cu121'

In [23]:
%load_ext autoreload
%autoreload 2

from text_embeddings_src.model_stuff import train_loop
from text_embeddings_src.data_stuff import (
    MultOverlappingSentencesPairDataset,
)
from text_embeddings_src.metrics import logistic_accuracy
from text_embeddings_src.embeddings import generate_embeddings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import black
import jupyter_black

# Activate jupyter_black for this notebook with a 79-character line length.
jupyter_black.load(line_length=79)

In [25]:
# Locations used throughout the notebook, all relative to the notebook's
# working directory.
results_root = Path("../../results")
variables_path = results_root / "variables"
figures_path = results_root / "figures" / "updated_dataset"
data_path = Path("../../data")
# berenslab_data_path = Path("/gpfs01/berens/data/data/GPT_wiki_intro")

In [26]:
# Apply the matplotlib style sheet stored one directory above the notebook.
plt.style.use("../matplotlib_style.txt")

In [27]:
# Record author, timestamp, interpreter, machine and package versions in the
# cell output so the run environment is documented alongside the results.
%watermark -a 'Rita González-Márquez' -t -d -tz -u -v -iv -w -m -h -p transformers,openTSNE
# Also print the human-readable name of the OS distribution.
print(distro.name(pretty=True))

Author: Rita González-Márquez

Last updated: 2023-12-08 10:06:07CET

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.18.1

transformers: 4.35.2
openTSNE    : 1.0.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 64
Architecture: 64bit

Hostname: rgonzalesmarquez_GPU0-llm_gber7

matplotlib     : 3.8.2
numpy          : 1.26.2
adapters       : 0.1.0
jupyter_black  : 0.3.4
pandas         : 2.1.3
distro         : 1.8.0
black          : 23.11.0
memory_profiler: 0.61.0
torch          : 2.1.1

Watermark: 2.4.3

Ubuntu 22.04.3 LTS


# Import

In [28]:
%%time
iclr2024 = pd.read_parquet(
    data_path / "iclr2024.parquet.gzip",
    # index=False,
    engine="pyarrow",
    # compression="gzip",
)

CPU times: user 276 ms, sys: 56.3 ms, total: 332 ms
Wall time: 364 ms


In [29]:
# Turn every entry of the `keywords` and `scores` columns into a plain
# Python list (the displayed frame below shows them as lists).
for list_column in ("keywords", "scores"):
    iclr2024[list_column] = iclr2024[list_column].apply(list)

In [30]:
# Show the loaded frame as the cell output (pandas elides the middle rows).
iclr2024

Unnamed: 0,index,year,id,title,abstract,authors,decision,scores,keywords,gender-first,gender-last,t-SNE x,t-SNE y
0,0,2017,S1VaB4cex,FractalNet: Ultra-Deep Neural Networks without...,We introduce a design strategy for neural netw...,"Gustav Larsson, Michael Maire, Gregory Shakhna...",Accept (Poster),"[5, 7, 6, 6]",[],male,male,-28.117955,-20.418127
1,1,2017,H1W1UN9gg,Deep Information Propagation,We study the behavior of untrained neural netw...,"Samuel S. Schoenholz, Justin Gilmer, Surya Gan...",Accept (Poster),"[8, 9, 8]","[theory, deep learning]",male,,-32.466820,-10.791123
2,2,2017,r1GKzP5xx,Recurrent Normalization Propagation,We propose a LSTM parametrization that preser...,"César Laurent, Nicolas Ballas, Pascal Vincent",Invite to Workshop Track,"[4, 6, 6]","[deep learning, optimization]",,male,3.504240,19.946053
3,3,2017,S1J0E-71l,Surprisal-Driven Feedback in Recurrent Networks,Recurrent neural nets are widely used for pred...,"K, a, m, i, l, , R, o, c, k, i",Reject,"[3, 4, 3]","[unsupervised learning, applications, deep lea...",,,4.553473,16.037763
4,4,2017,SJGCiw5gl,Pruning Convolutional Neural Networks for Reso...,We propose a new formulation for pruning convo...,"Pavlo Molchanov, Stephen Tyree, Tero Karras, T...",Accept (Poster),"[6, 7, 9]","[deep learning, transfer learning]",,male,-25.827705,-37.891772
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24342,7299,2024,1bbPQShCT2,I-PHYRE: Interactive Physical Reasoning,Current evaluation protocols predominantly ass...,,,[],"[intuitive physics, physical reasoning]",,,43.137120,44.316133
24343,7300,2024,Ny150AblPu,EXPOSING TEXT-IMAGE INCONSISTENCY USING DIFFUS...,In the battle against widespread online misinf...,,,[],"[mis-contextualization, media forensic]",,,59.742172,-22.673627
24344,7301,2024,ZGBOfAQrMl,Video Super-Resolution Transformer with Masked...,"Recently, Vision Transformer has achieved grea...",,,[],"[video super-resolution, adaptive, memory and ...",,,57.933273,-3.932825
24345,7302,2024,J2kRjUAOLh,Contrastive Predict-and-Search for Mixed Integ...,Mixed integer linear programs (MILP) are flex...,,,[],[mixed integer programs; contrastive learning],,,-11.437999,21.289523


In [31]:
# Per-paper labels and colors stored as .npy arrays.
labels_iclr = np.load(variables_path / "updated_dataset" / "labels_iclr.npy")
colors_iclr = np.load(variables_path / "updated_dataset" / "colors_iclr.npy")

# Load the label -> color mapping. The file is opened in a context manager so
# the handle is closed as soon as the pickle has been read (previously it was
# left open for the lifetime of the kernel).
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(
    variables_path / "updated_dataset" / "dict_label_to_color.pkl", "rb"
) as pickle_in:
    dict_label_to_color = pickle.load(pickle_in)

# Logistic regression classifier

## Baseline

In [16]:
# Baseline models: display name -> Hugging Face model identifier.
# Both lists below are derived from this one mapping so that names and paths
# always stay aligned.
baseline_models = {
    "BERT": "bert-base-uncased",
    "MPNet": "microsoft/mpnet-base",
    "SBERT": "sentence-transformers/all-mpnet-base-v2",
    "SciBERT": "allenai/scibert_scivocab_uncased",
    "SPECTER": "allenai/specter",
    "SciNCL": "malteos/scincl",
}

model_names = list(baseline_models.keys())
model_paths = list(baseline_models.values())

In [20]:
%%time
print("Linear accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # load
    saving_path = Path("embeddings_" + model_name.lower()) / Path(
        "updated_dataset"
    )

    embedding_av = np.load(
        variables_path / saving_path / "embedding_abstracts_only_cls.npy"
    )
    embedding_cls = np.load(
        variables_path / saving_path / "embedding_abstracts_only_sep.npy"
    )
    embedding_sep = np.load(
        variables_path / saving_path / "embedding_abstracts_only_av.npy"
    )

    # metric
    accuracy = logistic_accuracy(
        [
            embedding_av[labels_iclr != "unlabeled"],
            embedding_cls[labels_iclr != "unlabeled"],
            embedding_sep[labels_iclr != "unlabeled"],
        ],
        labels_iclr[labels_iclr != "unlabeled"],
    )
    print(f"{model_name}: {np.array(accuracy)*100}")

    # save
    np.save(variables_path / saving_path / "linear_accuracy", accuracy)

Linear accuracy     [AVG]    [CLS]   [SEP]
BERT: [48.92307692 52.15384615 55.        ]
MPNet: [53.76923077 51.         54.69230769]
SBERT: [57.69230769 57.         60.07692308]
SciBERT: [47.84615385 47.38461538 56.61538462]
SPECTER: [60.         55.84615385 57.61538462]
SciNCL: [61.92307692 58.38461538 58.38461538]
CPU times: user 35min 29s, sys: 1min 10s, total: 36min 39s
Wall time: 34min 25s


### Add new models -- READY TO RUN

In [32]:
# Additional models to evaluate. This rebinds `model_names` / `model_paths`
# from the baseline section.
model_names = [
    "SimCSE",
    "DeCLUTR",
    "DeCLUTR-sci",
    "SPECTER2",
]


# NOTE(review): `model_paths` has only three active entries while
# `model_names` has four, because the SPECTER2 path below is commented out.
# Any code that pairs the two lists by position (e.g. zip or indexing) would
# skip or mis-handle SPECTER2 -- confirm whether this is intentional.
model_paths = [
    "princeton-nlp/unsup-simcse-bert-base-uncased",
    "johngiorgi/declutr-base",
    "johngiorgi/declutr-sci-base",
    # "allenai/specter2_base",
]

In [33]:
%%time
print("Linear accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # load
    saving_path = Path("embeddings_" + model_name.lower()) / Path(
        "updated_dataset"
    )

    embedding_av = np.load(
        variables_path / saving_path / "embedding_abstracts_only_cls.npy"
    )
    embedding_cls = np.load(
        variables_path / saving_path / "embedding_abstracts_only_sep.npy"
    )
    embedding_sep = np.load(
        variables_path / saving_path / "embedding_abstracts_only_av.npy"
    )

    # metric
    accuracy = logistic_accuracy(
        [
            embedding_av[labels_iclr != "unlabeled"],
            embedding_cls[labels_iclr != "unlabeled"],
            embedding_sep[labels_iclr != "unlabeled"],
        ],
        labels_iclr[labels_iclr != "unlabeled"],
    )
    print(f"{model_name}: {np.array(accuracy)*100}")

    # save
    np.save(variables_path / saving_path / "linear_accuracy", accuracy)

Linear accuracy     [AVG]    [CLS]   [SEP]
SimCSE: [53.         53.38461538 53.84615385]
DeCLUTR: [53.46153846 48.30769231 51.15384615]
DeCLUTR-sci: [53.92307692 48.69230769 55.76923077]
SPECTER2: [61.30769231 58.84615385 58.84615385]
CPU times: user 24min 15s, sys: 47.8 s, total: 25min 3s
Wall time: 24min 5s
