In [None]:
import pandas as pd
import random
import numpy as np
from random import randint
import torch
from transformers import AutoTokenizer, AutoModel
import gc

import scipy as sp
from scipy import sparse
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

from openTSNE import TSNE, affinity

import matplotlib.pyplot as plt
import matplotlib

import time
import memory_profiler

%load_ext memory_profiler

from pathlib import Path

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [None]:
# Record the installed PyTorch version alongside the results of this notebook.
torch.__version__

'1.8.1+cu111'

In [None]:
%load_ext autoreload
%autoreload 2

from text_embeddings_src.model_stuff import train_loop
from text_embeddings_src.data_stuff import SentencePairDataset
from text_embeddings_src.metrics import knn_accuracy
from text_embeddings_src.embeddings import generate_embeddings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import black
import jupyter_black

# Enable jupyter_black code formatting with a 79-character line length.
jupyter_black.load(line_length=79)

In [None]:
# Output locations live under a shared results folder; the input data sits
# next to it. All paths are relative to the notebook's working directory.
results_root = Path("../results")
variables_path = results_root / "variables"
figures_path = results_root / "figures"
data_path = Path("../data")

In [None]:
# Apply the style sheet "matplotlib_style.txt"; the relative path means the
# file is looked up from the notebook's working directory.
plt.style.use("matplotlib_style.txt")

# Import

## Data

In [None]:
%%time
# Compression settings for the zipped pickle that stores the ICLR DataFrame.
# NOTE(review): "archive_name" ends in ".csv" although the payload is read
# with read_pickle; presumably it only matters when writing — confirm.
compression_opts = dict(method="zip", archive_name="iclr.pickle.csv")

# SECURITY: unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
iclr = pd.read_pickle(
    data_path / "iclr.pickle.zip",
    # index_col=False,
    compression=compression_opts,
)

CPU times: user 183 ms, sys: 1.9 ms, total: 185 ms
Wall time: 247 ms


In [None]:
# Display the loaded DataFrame (the rendering is truncated to head and tail).
iclr

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,gender-first,gender-last,t-SNE x,t-SNE y
0,2018,ryBnUWb0b,Predicting Floor-Level for 911 Calls with Neur...,"In cities with tall buildings, emergency respo...","William Falcon, Henning Schulzrinne",Accept (Poster),"[7, 6, 6]","[recurrent neural networks, rnn, lstm, mobile ...",male,,2.536470,0.739367
1,2018,Skk3Jm96W,Some Considerations on Learning to Explore via...,We consider the problem of exploration in meta...,"Bradly Stadie, Ge Yang, Rein Houthooft, Xi Che...",Invite to Workshop Track,"[7, 4, 6]","[reinforcement learning, rl, exploration, meta...",male,male,49.831927,-29.813831
2,2018,r1RQdCg0W,MACH: Embarrassingly parallel $K$-class classi...,We present Merged-Averaged Classifiers via Has...,"Qixuan Huang, Anshumali Shrivastava, Yiqiu Wang",Reject,"[6, 6, 6]","[extreme classification, large-scale learning,...",,,-22.502752,9.577367
3,2018,rJ3fy0k0Z,Deterministic Policy Imitation Gradient Algorithm,The goal of imitation learning (IL) is to enab...,"Fumihiro Sasaki, Atsuo Kawaguchi",Reject,"[6, 5, 5]",[imitation learning],,,40.437523,-47.690889
4,2018,SkBYYyZRZ,Searching for Activation Functions,The choice of activation functions in deep net...,"Prajit Ramachandran, Barret Zoph, Quoc V. Le",Invite to Workshop Track,"[5, 4, 7]","[meta learning, activation functions]",,,-33.260086,-4.038115
...,...,...,...,...,...,...,...,...,...,...,...,...
16531,2023,w4eQcMZsJa,Text-Driven Generative Domain Adaptation with ...,Combined with the generative prior of pre-trai...,"Zhenhuan Liu, Liang Li, Jiayu Xiao, Zhengjun Z...",Desk rejected,[],"[gan, stylegan, clip, domain adaptation, style...",,,59.296526,5.206691
16532,2023,SDHSQuBpf2,"Laziness, Barren Plateau, and Noises in Machin...",We define \emph{laziness} to describe a large ...,"Zexi Lin, Liang Jiang",Desk rejected,[],"[theoretical issues in deep learning, learning...",,male,-29.178083,-21.810583
16533,2023,HyIY8u5LVDr,Discovering the Representation Bottleneck of G...,Most graph neural networks (GNNs) rely on the ...,"Fang Wu, Siyuan Li, Lirong Wu, Dragomir Radev,...",Desk rejected,[],"[gnn bottleneck, graph rewiring, representatio...",,male,-7.573978,68.386671
16534,2023,470wZ5Qk4ur,Results for Perfect Classification for Graph A...,We study the ability of one layer Graph Attent...,"Kimon Fountoulakis, Amit Levi",Desk rejected,[],[],,male,-7.753593,60.764583


In [None]:
# One text per paper: "<title> <abstract>".
# The columns are walked positionally with zip() instead of `iclr.title[i]`,
# which is a label lookup and only works while the index is exactly 0..n-1.
titles_abstracts_together = [
    title + " " + abstract
    for title, abstract in zip(iclr.title, iclr.abstract)
]

In [None]:
# Sanity check: number of combined title+abstract texts.
print(len(titles_abstracts_together))

16536


## Labels

In [None]:
# Keyword-based labels derived from the paper titles.
# A title containing exactly one keyword gets that keyword's position in
# `keywords` as its class; a title containing several keywords is marked
# ambiguous (-1); a title containing none stays NaN.
keywords = np.array(
    [
        "network",
        "graph",
        "reinforcement",
        "language",
        "adversarial",
        "federated",
        "contrastive",
        "domain",
        "diffusion",
        "out-of-dis",
        "continual",
        "distillation",
        "architecture",
        "privacy",
        "protein",
        "fair",
        "attention",
        "video",
        "meta-learning",
        "generative adv",
        "autoencoder",
        "game",
        "semi-sup",
        "pruning",
        "physics",
        "3d",
        "translation",
        "optimization",
        "recurrent",
        "word",
        "bayesian",
    ]
)

# Lower-case every title once instead of once per keyword.
titles_lower = [title.lower() for title in iclr.title]

# NaN = no keyword matched so far, -1 = more than one keyword matched.
y = np.full(iclr.shape[0], np.nan)

for num, keyword in enumerate(keywords):
    needle = keyword.lower()
    mask = np.array([needle in title for title in titles_lower], dtype=bool)
    already_labeled = ~np.isnan(y)
    # A repeated match makes the title ambiguous; a first match labels it.
    y[mask & already_labeled] = -1
    y[mask & ~already_labeled] = num

print(y.size)  # all papers
print(np.sum(~np.isnan(y)))  # papers whose title matched >= 1 keyword
print(np.sum(y >= 0))  # papers whose title matched exactly one keyword

# Comparisons with NaN are False, so this keeps unambiguous papers only.
labeled = y >= 0

y_labeled = y[labeled].astype(int)
iclr_labeled = (
    iclr[labeled]
    .reset_index(drop=True)
    .assign(y=y_labeled, label=keywords[y_labeled])
)

16536
8964
6849


# Reproducibility
Are the batches always the same if I fix the random seed?

## On GPU

In [None]:
# rep = ["av", "sep", "cls"]

# Every model is written once as a (display name, checkpoint id) pair and
# then split into the two parallel lists `model_names` and `model_paths`.
model_names, model_paths = map(
    list,
    zip(
        ("BERT", "bert-base-uncased"),
        ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
        ("SPECTER", "allenai/specter"),
        ("SciNCL", "malteos/scincl"),
    ),
)

In [None]:
# Position in `model_paths` of the checkpoint to load.
i = 0

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Running on device: {}".format(device))

# Load the pretrained tokenizer and model of the selected checkpoint.
checkpoint = model_paths[i]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
print(checkpoint)

Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased


In [None]:
?tokenizer

[0;31mSignature:[0m     
[0mtokenizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtext[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_pair[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_target[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m


In [None]:
seed = 42

# Seed every global RNG in use (Python `random`, NumPy, PyTorch); see
# https://pytorch.org/docs/stable/notes/randomness.html
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# cuDNN: disable the auto-tuner and restrict it to deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# Raise an error when an operation has no deterministic implementation.
# NOTE(review): the PyTorch reproducibility notes additionally require the
# CUBLAS_WORKSPACE_CONFIG environment variable for deterministic cuBLAS on
# CUDA >= 10.2; it is not set in this notebook — confirm before GPU training.
torch.use_deterministic_algorithms(True)

In [None]:
# Build the training dataset from the abstracts.
# NOTE(review): SentencePairDataset is project code not shown here; presumably
# `seed` controls how sentence pairs are sampled — confirm in data_stuff.
# NOTE(review): the saved output of this cell is an AttributeError
# ('set_seed'); re-run the cell to refresh the stale output.
training_dataset = SentencePairDataset(
    iclr.abstract, tokenizer, device, seed=seed
)

AttributeError: 'BertTokenizerFast' object has no attribute 'set_seed'

In [None]:
# Shuffled loader with 128 items per batch. No `generator` is passed, so the
# shuffle order is drawn from PyTorch's global RNG state.
training_loader = torch.utils.data.DataLoader(
    training_dataset, batch_size=128, shuffle=True
)

In [None]:
from tqdm.notebook import tqdm

# Progress-bar wrapper around the loader (the bar stays visible when done).
# NOTE(review): the following cell iterates `training_loader` directly, so
# `loop` is not consumed there.
loop = tqdm(training_loader, leave=True)

  0%|          | 0/129 [00:00<?, ?it/s]

In [None]:
# Inspect the first batch only: print the length of batch[0] and its first
# tensor entry, batch[0][0][0] (a row of token ids), to compare the shuffle
# order between loaders. The statements that used to follow `break` could
# never run and were removed.
for i_batch, batch in enumerate(training_loader):
    print(len(batch[0]))
    print(batch[0][0][0])
    break

2
tensor([  101,  2007,  2023,  6614,  1010,  1037,  2028,  1011,  2915,  3565,
         7159,  2003,  2788, 21155,  2094,  2004,  1037,  2836,  9345,  7630,
         8844,  2000,  4635,  1996,  2836,  1032, 23277,  2102,  1066,  2367,
         9381,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       device='cuda:0')


In [None]:
# Second dataset built with the same arguments and the same seed, used to
# check whether batches are reproduced.
# NOTE(review): the global RNGs are not re-seeded before this cell.
training_dataset_2 = SentencePairDataset(
    iclr.abstract,
    tokenizer,
    device,
    seed=seed,
)

In [None]:
# Shuffled loader over the second dataset; again no `generator` is passed,
# so the shuffle order depends on PyTorch's global RNG state.
training_loader_2 = torch.utils.data.DataLoader(
    training_dataset_2, batch_size=128, shuffle=True
)

In [None]:
# NOTE(review): this import repeats the one from an earlier cell.
from tqdm.notebook import tqdm

# Progress-bar wrapper around the second loader.
loop_2 = tqdm(training_loader_2, leave=True)

  0%|          | 0/129 [00:00<?, ?it/s]

In [None]:
# Inspect the first batch of the second loader (through its progress bar):
# print the length of batch[0] and its first tensor entry, batch[0][0][0].
# The statements that used to follow `break` could never run and were removed.
for i_batch, batch in enumerate(loop_2):
    print(len(batch[0]))
    print(batch[0][0][0])
    break

2
tensor([  101,  4866,  7885,  2006,  3746,  7159,  1010, 25022, 14971,  1011,
         2184,  1010,  8292,  2571,  3676,  1998,  5796, 25033,  2951, 13462,
         2031, 20119,  2256, 19113, 13599,  2000,  2060,  2110,  1011,  1997,
         1011,  1996,  1011,  2396, 26163,  2015,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       device='cuda:0')


## On CPU

In [None]:
# rep = ["av", "sep", "cls"]

# Every model is written once as a (display name, checkpoint id) pair and
# then split into the two parallel lists `model_names` and `model_paths`.
model_names, model_paths = map(
    list,
    zip(
        ("BERT", "bert-base-uncased"),
        ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
        ("SPECTER", "allenai/specter"),
        ("SciNCL", "malteos/scincl"),
    ),
)

In [None]:
# Position in `model_paths` of the checkpoint to load.
i = 0

# random.seed() returns None, so the seed value is stored first and then
# used for seeding; previously `random_state` always ended up as None.
random_state = 42
random.seed(random_state)

# This section deliberately runs everything on the CPU.
device = torch.device("cpu")
print("Running on device: {}".format(device))

# Load the pretrained tokenizer and model of the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_paths[i])
model = AutoModel.from_pretrained(model_paths[i])
print(model_paths[i])

Running on device: cpu


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased


In [None]:
seed = 42

# Seed every global RNG in use (Python `random`, NumPy, PyTorch) so that
# re-running this cell alone restores the same random state.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# cuDNN: restrict to deterministic kernels and disable the auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Build the training dataset from the abstracts, this time on the CPU.
# NOTE(review): SentencePairDataset is project code not shown here; presumably
# `seed` controls how sentence pairs are sampled — confirm in data_stuff.
training_dataset = SentencePairDataset(
    iclr.abstract, tokenizer, device, seed=seed
)

In [None]:
# Shuffled loader with 128 items per batch. No `generator` is passed, so the
# shuffle order is drawn from PyTorch's global RNG state.
training_loader = torch.utils.data.DataLoader(
    training_dataset, batch_size=128, shuffle=True
)

In [None]:
# Inspect the first batch only: print the length of batch[0] and its first
# tensor entry, batch[0][0][0] (a row of token ids). The statements that used
# to follow `break` could never run and were removed.
for i_batch, batch in enumerate(training_loader):
    print(len(batch[0]))
    print(batch[0][0][0])
    break

2
tensor([  101,  2007,  2023,  6614,  1010,  1037,  2028,  1011,  2915,  3565,
         7159,  2003,  2788, 21155,  2094,  2004,  1037,  2836,  9345,  7630,
         8844,  2000,  4635,  1996,  2836,  1032, 23277,  2102,  1066,  2367,
         9381,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])


In [None]:
# Second dataset with identical arguments and seed.
# NOTE(review): the following cell re-seeds the RNGs and assigns
# `training_dataset_2` again, replacing the object created here.
training_dataset_2 = SentencePairDataset(
    iclr.abstract,
    tokenizer,
    device,
    seed=seed,
)

In [None]:
seed = 42

# Reset every global RNG (Python `random`, NumPy, PyTorch) to the same state
# before the dataset is rebuilt.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# cuDNN: restrict to deterministic kernels and disable the auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Rebuild the second dataset from the freshly seeded state.
training_dataset_2 = SentencePairDataset(
    iclr.abstract,
    tokenizer,
    device,
    seed=seed,
)

In [None]:
# Give the loader its own seeded generator so the shuffle order no longer
# depends on the global RNG state. The generator is always created on the
# CPU: `torch.Generator(device)` only worked because `device` is the CPU in
# this section, and a CUDA generator is rejected by the CPU-side shuffling.
gen = torch.Generator()
gen.manual_seed(seed)
training_loader_2 = torch.utils.data.DataLoader(
    training_dataset_2, batch_size=128, shuffle=True, generator=gen
)

In [None]:
# Inspect the first batch of the generator-seeded loader: print the length of
# batch[0] and its first tensor entry, batch[0][0][0]. The statements that
# used to follow `break` could never run and were removed.
for i_batch, batch in enumerate(training_loader_2):
    print(len(batch[0]))
    print(batch[0][0][0])
    break

2
tensor([  101,  1999,  2023,  3259,  1010,  2057,  2817,  1996,  3291,  1997,
         4531,  3816, 10594, 14442,  2005,  2812,  1011,  2492,  2048,  1011,
         2447,  5717,  1011,  7680,  2399,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])


In [None]:
# Third dataset with the same arguments; display its first item directly,
# bypassing the DataLoader and therefore any shuffling.
training_dataset_3 = SentencePairDataset(
    iclr.abstract,
    tokenizer,
    device,
    seed=seed,
)
training_dataset_3[0]

((tensor([  101,  1999,  3655,  2007,  4206,  3121,  1010,  5057,  6869,  2545,
           2342,  2019,  8321,  2723,  2504,  3295,  2000,  2424, 19989, 20587,
           2015,  2855,  1012,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Seed NumPy's global RNG, then draw two distinct abstract positions.
seed = 42
np.random.seed(seed)
n_abstracts = len(iclr.abstract)
np.random.choice(n_abstracts, size=2, replace=False)

array([9923,  566])

In [None]:
# Same draw without re-seeding: the global RNG has advanced, so the result
# differs from the previous cell.
np.random.choice(len(iclr.abstract), size=2, replace=False)

array([5585, 6675])

## New: seeding all RNGs and giving the DataLoader its own seeded generator ensures reproducibility

In [None]:
# rep = ["av", "sep", "cls"]

# Every model is written once as a (display name, checkpoint id) pair and
# then split into the two parallel lists `model_names` and `model_paths`.
model_names, model_paths = map(
    list,
    zip(
        ("BERT", "bert-base-uncased"),
        ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
        ("SPECTER", "allenai/specter"),
        ("SciNCL", "malteos/scincl"),
    ),
)

In [None]:
seed = 42

# Seed the three global RNGs in the original order: PyTorch, NumPy, Python.
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(seed)

# cuDNN: no auto-tuning, deterministic kernels only.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Position in `model_paths` of the checkpoint to load.
i = 0

# Prefer the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Running on device: {}".format(device))

# Load the pretrained tokenizer and model of the selected checkpoint.
checkpoint = model_paths[i]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
print(checkpoint)

Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased


In [None]:
# Dataset for the reproducible setup, built after all RNGs were seeded.
# NOTE(review): SentencePairDataset is project code not shown here; presumably
# `seed` controls how sentence pairs are sampled — confirm in data_stuff.
training_dataset_4 = SentencePairDataset(
    iclr.abstract,
    tokenizer,
    device,
    seed=seed,
)

In [None]:
# A dedicated CPU generator, seeded explicitly, drives the shuffling so the
# batch order does not depend on the global RNG state.
# manual_seed() returns the generator itself, so it can be chained.
gen = torch.Generator().manual_seed(seed)
training_loader_4 = torch.utils.data.DataLoader(
    training_dataset_4,
    batch_size=128,
    shuffle=True,
    generator=gen,
)

In [None]:
# First pass over the seeded loader: print the length of batch[0] and its
# first tensor entry, batch[0][0][0], for the first batch only. The
# statements that used to follow `break` could never run and were removed.
for i_batch, batch in enumerate(training_loader_4):
    print(len(batch[0]))
    print(batch[0][0][0])
    break

2
tensor([  101,  2004,  7976,  4454,  2004,  1037,  2326, 12154,  6217,  1010,
         8650,  2092,  1011,  4738,  4275,  2004,  7789,  3200,  2003,  3352,
         6233,  2590,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       device='cuda:0')


In [None]:
# Second pass over the same loader: each new iteration advances the loader's
# generator, so the first batch differs from the previous pass. The
# statements that used to follow `break` could never run and were removed.
for i_batch, batch in enumerate(training_loader_4):
    print(len(batch[0]))
    print(batch[0][0][0])
    break

2
tensor([  101,  7885,  2006,  3115, 10629,  5579,  2951, 13462,  2015, 10580,
         1996, 12353,  1997,  1996,  3818,  3921,  1999,  7831,  2007,  2110,
         1011,  1997,  1011,  1996,  1011,  2396,  4725,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       device='cuda:0')
