In [None]:
import pandas as pd
import random
import numpy as np
from random import randint
import torch
from transformers import AutoTokenizer, AutoModel
import gc

import scipy as sp
from scipy import sparse
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

from openTSNE import TSNE, affinity

import matplotlib.pyplot as plt
import matplotlib

import time
import memory_profiler

%load_ext memory_profiler

from pathlib import Path

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [None]:
torch.__version__

'1.8.1+cu111'

In [None]:
%load_ext autoreload
%autoreload 2

from text_embeddings_src.model_stuff import train_loop
from text_embeddings_src.data_stuff import (
    SentencePairDataset,
    MultSentencesPairDataset,
    MultOverlappingSentencesPairDataset,
    MultOverlappingSentencesLabelPairDataset,
)
from text_embeddings_src.metrics import knn_accuracy
from text_embeddings_src.embeddings import generate_embeddings

In [None]:
import black
import jupyter_black

jupyter_black.load(line_length=79)

In [None]:
variables_path = Path("../results/variables")
figures_path = Path("../results/figures")
data_path = Path("../data")

In [None]:
plt.style.use("matplotlib_style.txt")

In [None]:
model = None
gc.collect()
torch.cuda.empty_cache()

# Import

## Data

In [None]:
%%time
compression_opts = dict(method="zip", archive_name="iclr.pickle.csv")

iclr = pd.read_pickle(
    data_path / "iclr.pickle.zip",
    # index_col=False,
    compression=compression_opts,
)

CPU times: user 186 ms, sys: 27.7 ms, total: 214 ms
Wall time: 295 ms


In [None]:
iclr

Unnamed: 0,year,id,title,abstract,authors,decision,scores,keywords,gender-first,gender-last,t-SNE x,t-SNE y
0,2018,ryBnUWb0b,Predicting Floor-Level for 911 Calls with Neur...,"In cities with tall buildings, emergency respo...","William Falcon, Henning Schulzrinne",Accept (Poster),"[7, 6, 6]","[recurrent neural networks, rnn, lstm, mobile ...",male,,2.536470,0.739367
1,2018,Skk3Jm96W,Some Considerations on Learning to Explore via...,We consider the problem of exploration in meta...,"Bradly Stadie, Ge Yang, Rein Houthooft, Xi Che...",Invite to Workshop Track,"[7, 4, 6]","[reinforcement learning, rl, exploration, meta...",male,male,49.831927,-29.813831
2,2018,r1RQdCg0W,MACH: Embarrassingly parallel $K$-class classi...,We present Merged-Averaged Classifiers via Has...,"Qixuan Huang, Anshumali Shrivastava, Yiqiu Wang",Reject,"[6, 6, 6]","[extreme classification, large-scale learning,...",,,-22.502752,9.577367
3,2018,rJ3fy0k0Z,Deterministic Policy Imitation Gradient Algorithm,The goal of imitation learning (IL) is to enab...,"Fumihiro Sasaki, Atsuo Kawaguchi",Reject,"[6, 5, 5]",[imitation learning],,,40.437523,-47.690889
4,2018,SkBYYyZRZ,Searching for Activation Functions,The choice of activation functions in deep net...,"Prajit Ramachandran, Barret Zoph, Quoc V. Le",Invite to Workshop Track,"[5, 4, 7]","[meta learning, activation functions]",,,-33.260086,-4.038115
...,...,...,...,...,...,...,...,...,...,...,...,...
16531,2023,w4eQcMZsJa,Text-Driven Generative Domain Adaptation with ...,Combined with the generative prior of pre-trai...,"Zhenhuan Liu, Liang Li, Jiayu Xiao, Zhengjun Z...",Desk rejected,[],"[gan, stylegan, clip, domain adaptation, style...",,,59.296526,5.206691
16532,2023,SDHSQuBpf2,"Laziness, Barren Plateau, and Noises in Machin...",We define \emph{laziness} to describe a large ...,"Zexi Lin, Liang Jiang",Desk rejected,[],"[theoretical issues in deep learning, learning...",,male,-29.178083,-21.810583
16533,2023,HyIY8u5LVDr,Discovering the Representation Bottleneck of G...,Most graph neural networks (GNNs) rely on the ...,"Fang Wu, Siyuan Li, Lirong Wu, Dragomir Radev,...",Desk rejected,[],"[gnn bottleneck, graph rewiring, representatio...",,male,-7.573978,68.386671
16534,2023,470wZ5Qk4ur,Results for Perfect Classification for Graph A...,We study the ability of one layer Graph Attent...,"Kimon Fountoulakis, Amit Levi",Desk rejected,[],[],,male,-7.753593,60.764583


In [None]:
# One string per paper: "<title> <abstract>".
# The two columns are walked in parallel (positionally), so this does not
# depend on `iclr` having a default 0..n-1 index, unlike the label lookup
# `iclr.title[i]` driven by `range(len(iclr))`.
titles_abstracts_together = [
    title + " " + abstract
    for title, abstract in zip(iclr.title, iclr.abstract)
]

In [None]:
print(len(titles_abstracts_together))
print(type(titles_abstracts_together))

16536
<class 'list'>


## Labels

In [None]:
# iclr = pd.read_pickle("iclr.pickle.zip")

# Title keywords that define the classes. The class id of a paper is the
# position of its (single) matching keyword in this array.
keywords = np.array(
    [
        "network",
        "graph",
        "reinforcement",
        "language",
        "adversarial",
        "federated",
        "contrastive",
        "domain",
        "diffusion",
        "out-of-dis",
        "continual",
        "distillation",
        "architecture",
        "privacy",
        "protein",
        "fair",
        "attention",
        "video",
        "meta-learning",
        "generative adv",
        "autoencoder",
        "game",
        "semi-sup",
        "pruning",
        "physics",
        "3d",
        "translation",
        "optimization",
        "recurrent",
        "word",
        "bayesian",
    ]
)

# Encoding of `y`:
#   NaN -> no keyword found in the title (so far)
#   -1  -> more than one keyword found (ambiguous, discarded below)
#   k   -> index of the only keyword found
y = np.full(iclr.shape[0], np.nan)

# Lowercase the titles once instead of once per keyword.
titles_lower = [t.lower() for t in iclr.title]

for num, keyword in enumerate(keywords):
    needle = keyword.lower()
    mask = np.array([needle in t for t in titles_lower])
    unlabeled = np.isnan(y)
    # Titles that already carry a label (or are already ambiguous) and match
    # again become ambiguous; untouched titles receive this keyword's id.
    y[mask & ~unlabeled] = -1
    y[mask & unlabeled] = num

print(y.size)  # all papers
print(np.sum(~np.isnan(y)))  # papers matching at least one keyword
print(np.sum(y >= 0))  # papers matching exactly one keyword

# Keep only the unambiguously labelled papers.
labeled = y >= 0
y_labeled = y[labeled].astype(int)

iclr_labeled = iclr[labeled].reset_index(drop=True)
iclr_labeled["y"] = y_labeled
iclr_labeled["label"] = keywords[y_labeled]

16536
8964
6849


## Training

In [None]:
# (display name, checkpoint identifier) pairs. Keeping them side by side makes
# it impossible for the two index-aligned lists below to drift apart:
# model_paths[i] is what gets passed to `from_pretrained` for model_names[i].
_MODEL_TABLE = (
    ("BERT", "bert-base-uncased"),
    ("MPNet", "microsoft/mpnet-base"),
    ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
    ("SciBERT", "allenai/scibert_scivocab_uncased"),
    ("SPECTER", "allenai/specter"),
    ("SciNCL", "malteos/scincl"),
)

model_names = [name for name, _path in _MODEL_TABLE]
model_paths = [path for _name, path in _MODEL_TABLE]

# Ideas

- do t-SNE using the representations after 1 epoch and after 10 epochs
- kNN recall in which the "original high-dim" representation is the space trained after 10 epochs and the "low-dim" one is the space after one epoch
- "abstract accuracy": instead of whole abstracts, use the sentences of the abstracts as the dataset, with the abstract PMID as the label. Train + test sets!
- Train the model only on the train set and, after each epoch, evaluate it on the test set by computing the loss. Train + test sets!


## First two points together

In [None]:
def run_tsne_simple(Z, k=10, rs=42, verbose=False):
    """Run openTSNE on `Z` using uniform k-nearest-neighbour affinities.

    Parameters
    ----------
    Z : array-like
        High-dimensional representation to embed
        (assumed one row per sample -- TODO confirm against callers).
    k : int, default=10
        Number of neighbours used to build the uniform affinity matrix.
    rs : int, default=42
        Random seed. It is now used for BOTH the affinity computation and the
        t-SNE optimisation; previously the t-SNE step hard-coded 42 and
        silently ignored this argument.
    verbose : bool, default=False
        Verbosity flag forwarded to both the affinity computation and t-SNE;
        previously t-SNE was always run with ``verbose=True``.

    Returns
    -------
    The object returned by ``TSNE(...).fit(affinities=...)``.
    """
    A = affinity.Uniform(
        Z,
        verbose=verbose,
        method="exact",
        random_state=rs,
        k_neighbors=k,
    )

    X = TSNE(
        verbose=verbose, initialization="spectral", random_state=rs
    ).fit(affinities=A)

    return X

In [None]:
def train_loop_tsne_and_knn_rec(
    model,
    loader,
    device,
    titles_abstracts_together,
    tokenizer,
    optimized_rep="av",
    n_epochs=1,
    lr=2e-5,
):
    """Fine-tune `model` contrastively and snapshot its embeddings.

    The model is trained with a multiple-negatives-ranking loss on
    (anchor, positive) batches. After the first and after the last epoch the
    texts in `titles_abstracts_together` are embedded with
    `generate_embeddings`, the representation selected by `optimized_rep` is
    stored and a t-SNE embedding of it is computed with `run_tsne_simple`.
    Finally `knn_recall` is called with the last snapshot as reference and the
    first snapshot as the representation to compare against it.

    NOTE(review): `tqdm`, `get_linear_schedule_with_warmup`, the pooling
    helpers (`mean_pool`, `cls_pool`, `sep_pool`) and `knn_recall` are not
    imported in the visible part of this notebook; they must be in scope
    before this function is called.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(ids, attention_mask=mask)``; element ``[0]`` of the
        result is used as the token embeddings.
    loader : iterable with ``len``
        Yields batches where ``batch[0]`` is the anchor and ``batch[1]`` the
        positive, each indexable as ``[0]`` (token ids) and ``[1]``
        (attention mask).
    device : str or torch.device
        Device the model and the batches are moved to.
    titles_abstracts_together : list of str
        Texts embedded for the snapshots.
    tokenizer
        Passed through to `generate_embeddings`.
    optimized_rep : {"av", "cls", "sep"}, default="av"
        Which pooled representation is optimised and snapshotted. ``"7th"``
        passes the validity check but is rejected right away, because the
        snapshot step only provides av/cls/sep embeddings (previously this
        combination trained to completion and then crashed).
    n_epochs : int, default=1
    lr : float, default=2e-5
        Learning rate of the Adam optimiser.

    Returns
    -------
    losses : np.ndarray, shape (n_epochs, len(loader))
        Training loss of every batch.
    tsne_embeddings : list
        t-SNE result of each snapshot (one entry if ``n_epochs == 1``,
        otherwise first and last epoch).
    knn_recall_result
        Result of ``knn_recall(last_snapshot, [first_snapshot])``, or ``None``
        when fewer than two snapshots exist (e.g. ``n_epochs == 1``), since
        there is nothing to compare. Previously this case raised IndexError
        after training had finished.
    """
    assert optimized_rep in ["av", "cls", "sep", "7th"], "Not valid `optimized_rep`. Choose from ['av', 'cls', 'sep', '7th']."

    # Fail before spending time on training: no snapshot can be produced for
    # the "7th" representation (generate_embeddings is called with
    # return_seventh=False and only av/cls/sep are handled below).
    if optimized_rep == "7th":
        raise NotImplementedError(
            "`optimized_rep='7th'` is not supported by "
            "train_loop_tsne_and_knn_rec: only 'av', 'cls' and 'sep' "
            "representations are snapshotted."
        )

    # Pick the pooling function once. An if/elif chain (rather than a dict of
    # all helpers) keeps name resolution lazy: only the helper that is actually
    # used has to exist.
    if optimized_rep == "av":
        pool = mean_pool
    elif optimized_rep == "cls":
        pool = cls_pool
    else:
        pool = sep_pool

    model.to(device)

    # layers used by the multiple-negatives-ranking loss
    cos_sim = torch.nn.CosineSimilarity()
    loss_func = torch.nn.CrossEntropyLoss()
    # Cosine similarities are multiplied by this factor before the softmax
    # inside the cross-entropy, which sharpens the distribution.
    scale = 20.0
    cos_sim.to(device)
    loss_func.to(device)

    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # Linear warmup over the first 10% of ONE epoch's steps, then linear decay.
    # NOTE(review): per the transformers docs `num_training_steps` is the total
    # number of steps, so passing `total_steps - warmup_steps` makes the
    # learning rate reach 0 `warmup_steps` optimiser steps before training
    # ends. Kept unchanged to preserve previously obtained results -- confirm
    # whether this is intended.
    total_steps = len(loader) * n_epochs
    warmup_steps = int(0.1 * len(loader))
    scheduler = get_linear_schedule_with_warmup(
        optim,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps - warmup_steps,
    )

    losses = np.empty((n_epochs, len(loader)))
    high_d_reps = []
    tsne_embeddings = []

    for epoch in range(n_epochs):
        model.train()  # make sure model is in training mode
        loop = tqdm(loader, leave=True)  # progress bar over the batches
        for i_batch, batch in enumerate(loop):
            # zero all gradients on each new step
            optim.zero_grad()

            # move the batch to the active device
            anchor_ids = batch[0][0].to(device)
            anchor_mask = batch[0][1].to(device)
            pos_ids = batch[1][0].to(device)
            pos_mask = batch[1][1].to(device)

            # token embeddings of anchors and positives
            a = model(anchor_ids, attention_mask=anchor_mask)[0]
            p = model(pos_ids, attention_mask=pos_mask)[0]

            # one pooled vector per text
            a = pool(a, anchor_mask)
            p = pool(p, pos_mask)

            # scores[i, j] = cosine similarity of anchor i and positive j
            scores = torch.stack(
                [cos_sim(a_i.reshape(1, a_i.shape[0]), p) for a_i in a]
            )
            # In-batch negatives: for anchor i the correct "class" is its own
            # positive, i.e. column i, so the targets are 0..batch_size-1.
            labels = torch.arange(
                len(scores), dtype=torch.long, device=scores.device
            )
            loss = loss_func(scores * scale, labels)
            loss_value = loss.item()
            losses[epoch, i_batch] = loss_value

            # backpropagate, update the weights and the learning rate
            loss.backward()
            optim.step()
            scheduler.step()

            # update the tqdm progress bar
            loop.set_description(f"Epoch {epoch}")
            loop.set_postfix(loss=loss_value)

        # snapshot the representation after the first and after the last epoch
        if epoch in (0, n_epochs - 1):
            embedding_cls, embedding_sep, embedding_av = generate_embeddings(
                titles_abstracts_together,
                tokenizer,
                model,
                device,
                batch_size=256,
                return_seventh=False,
            )
            snapshot = {
                "av": embedding_av,
                "cls": embedding_cls,
                "sep": embedding_sep,
            }[optimized_rep]

            high_d_reps.append(snapshot)
            tsne_embeddings.append(run_tsne_simple(snapshot))

    # kNN recall of the first snapshot with respect to the last one; needs two
    # distinct snapshots.
    if len(high_d_reps) < 2:
        knn_recall_result = None
    else:
        knn_recall_result = knn_recall(high_d_reps[-1], [high_d_reps[0]])

    return losses, tsne_embeddings, knn_recall_result

## Second two points together
- Train the model only on the train set and, after each epoch, evaluate it on the test set by computing the loss.
- "abstract accuracy": instead of whole abstracts, use the sentences of the abstracts as the dataset, with the abstract PMID as the label. Train + test sets!

In [None]:
def train_loop_eval_test_loss(
    model,
    train_loader,
    test_loader,
    device,
    tokenizer,
    train_dataset,
    test_dataset,
    optimized_rep="av",
    n_epochs=1,
    lr=2e-5,
):
    """Train on `train_loader`; after every epoch evaluate loss and kNN accuracy.

    Per epoch the function
      1. trains the model with a multiple-negatives-ranking loss,
      2. computes the same loss on `test_loader` WITHOUT updating the model
         (model in eval mode, autograd disabled),
      3. embeds the sentences of `train_dataset` and `test_dataset` with
         `generate_embeddings` and scores them with `knn_accuracy`, using the
         second column of ``abs_sentences`` as labels.

    Compared with the previous version, step 2 no longer runs in training
    mode (dropout active) with gradient tracking on, and the model stays in
    eval mode for step 3 until the next epoch switches it back to train mode.

    NOTE(review): `tqdm`, `get_linear_schedule_with_warmup` and the pooling
    helpers (`mean_pool`, `cls_pool`, `sep_pool`, `seventh_pool`) are not
    imported in the visible part of this notebook; they must be in scope
    before this function is called.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(ids, attention_mask=mask)``; element ``[0]`` of the
        result is used as the token embeddings.
    train_loader, test_loader : iterables with ``len``
        Yield batches where ``batch[0]`` is the anchor and ``batch[1]`` the
        positive, each indexable as ``[0]`` (token ids) and ``[1]``
        (attention mask).
    device : str or torch.device
    tokenizer
        Passed through to `generate_embeddings`.
    train_dataset, test_dataset
        Objects exposing ``abs_sentences``; ``np.vstack`` of it must give a
        two-column array (text, label).
    optimized_rep : {"av", "cls", "sep", "7th"}, default="av"
        Pooled representation used in the loss.
    n_epochs : int, default=1
    lr : float, default=2e-5
        Learning rate of the Adam optimiser.

    Returns
    -------
    losses_train : np.ndarray, shape (n_epochs, len(train_loader))
    losses_test : np.ndarray, shape (n_epochs, len(test_loader))
    knn_accuracies_train, knn_accuracies_test : np.ndarray, shape (n_epochs, 3)
        Output of `knn_accuracy` for the embeddings passed in the order
        (av, cls, sep) -- presumably one accuracy per embedding in that order;
        verify against `knn_accuracy`.
    """
    assert optimized_rep in ["av", "cls", "sep", "7th"], "Not valid `optimized_rep`. Choose from ['av', 'cls', 'sep', '7th']."

    # Pick the pooling function once. The if/elif chain keeps name resolution
    # lazy: only the helper that is actually used has to exist.
    if optimized_rep == "av":
        pool = mean_pool
    elif optimized_rep == "cls":
        pool = cls_pool
    elif optimized_rep == "sep":
        pool = sep_pool
    else:
        pool = seventh_pool

    model.to(device)

    # layers used by the multiple-negatives-ranking loss
    cos_sim = torch.nn.CosineSimilarity()
    loss_func = torch.nn.CrossEntropyLoss()
    # Cosine similarities are multiplied by this factor before the softmax
    # inside the cross-entropy.
    scale = 20.0
    cos_sim.to(device)
    loss_func.to(device)

    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # Linear warmup over the first 10% of ONE epoch's steps, then linear decay.
    # NOTE(review): per the transformers docs `num_training_steps` is the total
    # number of steps, so passing `total_steps - warmup_steps` makes the
    # learning rate reach 0 `warmup_steps` optimiser steps before training
    # ends. Kept unchanged to preserve previously obtained results -- confirm
    # whether this is intended.
    total_steps = len(train_loader) * n_epochs
    warmup_steps = int(0.1 * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(
        optim,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps - warmup_steps,
    )

    def batch_loss(batch):
        """Multiple-negatives-ranking loss of one (anchor, positive) batch."""
        anchor_ids = batch[0][0].to(device)
        anchor_mask = batch[0][1].to(device)
        pos_ids = batch[1][0].to(device)
        pos_mask = batch[1][1].to(device)

        # token embeddings, then one pooled vector per text
        a = model(anchor_ids, attention_mask=anchor_mask)[0]
        p = model(pos_ids, attention_mask=pos_mask)[0]
        a = pool(a, anchor_mask)
        p = pool(p, pos_mask)

        # scores[i, j] = cosine similarity of anchor i and positive j
        scores = torch.stack(
            [cos_sim(a_i.reshape(1, a_i.shape[0]), p) for a_i in a]
        )
        # In-batch negatives: the correct "class" of anchor i is column i.
        labels = torch.arange(
            len(scores), dtype=torch.long, device=scores.device
        )
        return loss_func(scores * scale, labels)

    losses_train = np.empty((n_epochs, len(train_loader)))
    losses_test = np.empty((n_epochs, len(test_loader)))
    knn_accuracies_train = np.empty((n_epochs, 3))
    knn_accuracies_test = np.empty((n_epochs, 3))

    for epoch in range(n_epochs):
        # ---- training pass ------------------------------------------------
        model.train()
        train_bar = tqdm(train_loader, leave=True)
        for i_batch, batch in enumerate(train_bar):
            optim.zero_grad()

            loss = batch_loss(batch)
            loss_value = loss.item()
            losses_train[epoch, i_batch] = loss_value

            loss.backward()
            optim.step()
            scheduler.step()

            train_bar.set_description(f"Epoch {epoch}")
            train_bar.set_postfix(loss=loss_value)

        # ---- test pass: loss only, no parameter updates ---------------------
        # eval() disables dropout so the reported loss is deterministic for a
        # given batch; no_grad() avoids building autograd graphs that would
        # never be used.
        model.eval()
        test_bar = tqdm(test_loader, leave=True)
        with torch.no_grad():
            for i_batch, batch in enumerate(test_bar):
                loss_value = batch_loss(batch).item()
                losses_test[epoch, i_batch] = loss_value

                test_bar.set_description(f"Epoch {epoch}")
                test_bar.set_postfix(loss=loss_value)

        # ---- kNN accuracy on the train sentences ----------------------------
        # column 0: text, column 1: label
        train_pairs = np.vstack(train_dataset.abs_sentences)
        X_train = list(train_pairs[:, 0])
        y_train = list(train_pairs[:, 1])

        embd_cls_train, embd_sep_train, embd_av_train = generate_embeddings(
            X_train, tokenizer, model, device, batch_size=256
        )
        knn_accuracies_train[epoch, :] = knn_accuracy(
            [embd_av_train, embd_cls_train, embd_sep_train], y_train
        )

        # ---- kNN accuracy on the test sentences -----------------------------
        test_pairs = np.vstack(test_dataset.abs_sentences)
        X_test = list(test_pairs[:, 0])
        y_test = list(test_pairs[:, 1])

        embd_cls_test, embd_sep_test, embd_av_test = generate_embeddings(
            X_test, tokenizer, model, device, batch_size=256
        )
        knn_accuracies_test[epoch, :] = knn_accuracy(
            [embd_av_test, embd_cls_test, embd_sep_test], y_test
        )

    return losses_train, losses_test, knn_accuracies_train, knn_accuracies_test

In [None]:
%%time
from sklearn.model_selection import train_test_split

# model
i = 0
model_name = model_names[i]

## fix random seeds
seed = 42
# Set the random seed for PyTorch
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# torch.use_deterministic_algorithms(True)
# Set the random seed for NumPy
np.random.seed(seed)
# Set the random seed
random.seed(seed)

## set up model
print("Model: ", model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))

tokenizer = AutoTokenizer.from_pretrained(model_paths[i])
model = AutoModel.from_pretrained(model_paths[i])
print(model_paths[i])

## data
# split into train and test
X_train, X_test = train_test_split(
    iclr.abstract, test_size=0.1, random_state=np.random.seed(seed)
)

# do the train and test datasets with the class
n_cons_sntcs = 2

train_dataset = MultOverlappingSentencesPairDataset(
    X_train, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
)
test_dataset = MultOverlappingSentencesPairDataset(
    X_test, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
)

gen = torch.Generator()
gen.manual_seed(seed)

# do the loader with the train and test datasets
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=64, shuffle=True, generator=gen
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=64, shuffle=True, generator=gen
)

Model:  BERT
Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased
CPU times: user 48.1 s, sys: 20.2 s, total: 1min 8s
Wall time: 8.26 s


In [None]:
print(len(train_dataset))
print(len(test_dataset))

14345
1586


### exploration batches class

In [None]:
model_names = [
    "BERT",
    "MPNet",
    "SBERT",
    # "SciBERT",
    # "SPECTER",
    # "SciNCL",
]


model_paths = [
    "bert-base-uncased",
    "microsoft/mpnet-base",
    "sentence-transformers/all-mpnet-base-v2",
    # "allenai/scibert_scivocab_uncased",
    # "allenai/specter",
    # "malteos/scincl",
]

In [None]:
# Set-up used to explore what the batches of the dataset class look like:
# load the first model of the list and wrap ALL abstracts (no train/test
# split) in the sentence-pair dataset.

# model
i = 0
model_name = model_names[i]

## fix random seeds
seed = 42
# Set the random seed for PyTorch
torch.manual_seed(seed)
# Ask cuDNN for deterministic behaviour (no autotuned kernels)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# torch.use_deterministic_algorithms(True)
# Set the random seed for NumPy
np.random.seed(seed)
# Set the random seed of Python's `random` module
random.seed(seed)

# set up model
print("Model: ", model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))

# tokenizer and model are loaded from the same checkpoint identifier
tokenizer = AutoTokenizer.from_pretrained(model_paths[i])
model = AutoModel.from_pretrained(model_paths[i])
print(model_paths[i])

# data
training_dataset = MultOverlappingSentencesPairDataset(
    iclr.abstract, tokenizer, device, n_cons_sntcs=2, seed=42
)

# Dedicated generator for the DataLoader created in the next cell.
# `manual_seed` returns the generator; as the last expression of the cell its
# repr is what shows up as the cell output.
gen = torch.Generator()
gen.manual_seed(seed)

Model:  BERT
Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased


<torch._C.Generator>

In [None]:
training_loader = torch.utils.data.DataLoader(
    training_dataset, batch_size=64, shuffle=True, generator=gen
)

# training

In [None]:
print(len(training_loader))

249


In [None]:
for i, batch in enumerate(training_loader):
    if i == 0:
        # print(batch)
        # print(len(batch))
        # print(batch[0])
        # print(len(batch[0]))
        print(batch[0][0][0])
        print(len(batch[0][0][0]))
        print(batch[0][1])
        print(len(batch[0][1]))

tensor([  101,  2004,  3698,  4083,  4275,  2024,  6233,  2108,  4846,  2000,
         2191,  9530,  3366, 15417,  4818,  6567,  1999,  2613,  1011,  2088,
        10906,  1010,  2009,  4150,  4187,  2000,  5676,  2008,  3633,  2040,
         2024, 15316,  2135, 19209,  1006,  1041,  1012,  2096,  2195,  8107,
         2031,  2042,  3818,  2000,  9570, 28667, 22957,  2229,  2005,  5360,
         3633,  1010,  1996, 28667, 22957,  2229,  6434,  2011,  2122,  4725,
         2593,  6162,  2659,  5366,  1006,  1045,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
for elem in training_dataset.abs_sentences[0]:
    print(elem)
print(len(training_dataset.abs_sentences[0]))

("In cities with tall buildings, emergency responders need an accurate floor level location to find 911 callers quickly. We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. ", 0)
("We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. ", 0)
("First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. ", 0)
("Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. Unlike impractical previous approaches, our system is the first that do

In [None]:
subset_abs_sentences = training_dataset.abs_sentences[0:2]
print(len(subset_abs_sentences))
print(len(subset_abs_sentences[0]))
print(len(subset_abs_sentences[1]))
print(len(np.vstack(subset_abs_sentences)))
print(np.vstack(subset_abs_sentences).shape)
print(np.vstack(subset_abs_sentences)[:, 1])
print(list(np.vstack(subset_abs_sentences)[:, 0]))
# print(np.vstack(training_dataset.abs_sentences)[:, 1])

2
5
6
11
(11, 2)
['0' '0' '0' '0' '0' '2' '2' '2' '2' '2' '2']
["In cities with tall buildings, emergency responders need an accurate floor level location to find 911 callers quickly. We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. ", "We introduce a system to estimate a victim's floor level via their mobile device's sensor data in a two-step process. First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. ", "First, we train a neural network to determine when a smartphone enters or exits a building via GPS signal changes. Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. ", "Second, we use a barometer equipped smartphone to measure the change in barometric pressure from the entrance of the building to the victim's indoor location. Unlike impractical p

### Baseline abstract accuracy
As a sanity check, compute the kNN abstract accuracy of the pretrained model (before any fine-tuning) on this new sentence-level dataset.

In [None]:
model_names = [
    "BERT",
    "MPNet",
    "SBERT",
    # "SciBERT",
    # "SPECTER",
    # "SciNCL",
]


model_paths = [
    "bert-base-uncased",
    "microsoft/mpnet-base",
    "sentence-transformers/all-mpnet-base-v2",
    # "allenai/scibert_scivocab_uncased",
    # "allenai/specter",
    # "malteos/scincl",
]

In [None]:
%%time
i = 0
model_name = model_names[i]

## fix random seeds
seed = 42
# Set the random seed for PyTorch
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# torch.use_deterministic_algorithms(True)
# Set the random seed for NumPy
np.random.seed(seed)
# Set the random seed
random.seed(seed)

## set up model
print("Model: ", model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))

tokenizer = AutoTokenizer.from_pretrained(model_paths[i])
model = AutoModel.from_pretrained(model_paths[i])
model.to(device)
print(model_paths[i])

## data
# split into train and test
X_train, X_test = train_test_split(
    iclr.abstract, test_size=0.1, random_state=np.random.seed(seed)
)
print("Train set: ", X_train.shape)
print("Test set: ", X_test.shape)

# do the train and test datasets with the class
n_cons_sntcs = 2

train_dataset = MultOverlappingSentencesPairDataset(
    X_train, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
)
test_dataset = MultOverlappingSentencesPairDataset(
    X_test, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
)

print("Train dataset: ", len(train_dataset))
print("Test dataset: ", len(test_dataset))

Model:  BERT
Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased
Train set:  (14882,)
Test set:  (1654,)
Train dataset:  14345
Test dataset:  1586
CPU times: user 53.4 s, sys: 21.3 s, total: 1min 14s
Wall time: 10.3 s


In [None]:
%%time
X_train = list(np.vstack(train_dataset.abs_sentences)[:, 0])
y_train = list(np.vstack(train_dataset.abs_sentences)[:, 1])
print("Number of sentences:", len(X_train))
print("Number of classes:", len(np.unique(y_train)))
print("Ratio:", len(np.unique(y_train)) / len(X_train))

# knn accuracy
embd_cls_train, embd_sep_train, embd_av_train = generate_embeddings(
    X_train, tokenizer, model, device, batch_size=256
)
knn_acc_train = knn_accuracy(
    [embd_av_train, embd_cls_train, embd_sep_train], y_train
)
print(knn_acc_train)

# save
saving_path = Path("embeddings_" + model_name.lower())
(variables_path / saving_path).mkdir(exist_ok=True)
np.save(
    variables_path / saving_path / "knn_accuracies_abstract_train_baseline",
    knn_acc_train,
)

Number of sentences: 70175
Number of classes: 14345
Ratio: 0.20441752760954757


  0%|          | 0/275 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
print(X_train[0])
print(X_train[1])

Recurrent Neural Networks (RNNs) are powerful tools for solving sequence-based problems, but their efficacy and execution time are dependent on the size of the network. Following recent work in simplifying these networks with model pruning and a novel mapping of work onto GPUs, we design an efficient implementation for sparse RNNs. 
Following recent work in simplifying these networks with model pruning and a novel mapping of work onto GPUs, we design an efficient implementation for sparse RNNs. We investigate several optimizations and tradeoffs: Lamport timestamps, wide memory loads, and a bank-aware weight layout. 


In [None]:
%%time
# TEST DATASET -- evaluation of abstract accuracy
X_test = list(np.vstack(test_dataset.abs_sentences)[:, 0])
y_test = list(np.vstack(test_dataset.abs_sentences)[:, 1])
print("Number of sentences:", len(X_test))
print("Number of classes:", len(np.unique(y_test)))
print("Ratio:", len(np.unique(y_test)) / len(X_test))

# knn accuracy
embd_cls_test, embd_sep_test, embd_av_test = generate_embeddings(
    X_test, tokenizer, model, device, batch_size=256
)
knn_acc_test = knn_accuracy(
    [embd_av_test, embd_cls_test, embd_sep_test], y_test
)
print(knn_acc_test)

saving_path = Path("embeddings_" + model_name.lower())
(variables_path / saving_path).mkdir(exist_ok=True)
np.save(
    variables_path / saving_path / "knn_accuracies_abstract_test_baseline",
    knn_acc_test,
)

Number of sentences: 7820
Number of classes: 1586
Ratio: 0.20281329923273658


  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
print(np.round(knn_acc_train, 3))
print(np.round(knn_acc_test, 3))

[0.297 0.1   0.112]
[0.409 0.176 0.16 ]


In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.stats import mode


def knn_acc_loocv(X, y, n_neighbors=10):
    """Leave-one-out kNN classification accuracy.

    Every sample is classified by a majority vote over the labels of its
    ``n_neighbors`` nearest neighbours in ``X``. ``kneighbors`` is called
    without a query set, in which case scikit-learn does not count a point
    as its own neighbour.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Embeddings to evaluate.
    y : array-like of length n_samples
        Class labels. Any type ``np.unique`` can sort is accepted
        (integers, strings, ...); lists are converted to arrays.
    n_neighbors : int, default=10
        Number of neighbours taking part in the vote.

    Returns
    -------
    float
        Fraction of samples whose voted label equals their true label.
    """
    # Flatten so a column vector cannot silently broadcast in the final
    # comparison, and so plain Python lists can be fancy-indexed.
    labels = np.asarray(y).ravel()
    # Vote on integer codes instead of raw labels: recent SciPy versions of
    # `scipy.stats.mode` reject non-numeric arrays such as string labels.
    # Codes follow the sorted label order, so ordering between labels is
    # preserved for whatever tie-breaking `mode` applies.
    _, codes = np.unique(labels, return_inverse=True)

    neigh = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
    knn = neigh.kneighbors(return_distance=False)
    # `ravel` copes with both the (n, 1) and the (n,) result shapes that
    # different SciPy versions return for `mode(...).mode`.
    yhat = np.ravel(mode(codes[knn], axis=1).mode)
    return np.mean(yhat == codes)

In [None]:
%%time

# Leave-one-out kNN accuracy on the `embd_av_test` embeddings.
y_test_labels = np.array(y_test)
knn_acc_loo_test = knn_acc_loocv(embd_av_test, y_test_labels)
print(knn_acc_loo_test)

0.42762148337595907
CPU times: user 13.6 s, sys: 38.4 s, total: 52.1 s
Wall time: 2.41 s


##### Subsets

In [None]:
%%time
np.random.seed(12)
# Draw 10% of the abstracts without replacement; `subset` holds row
# positions in `iclr.abstract`.
subset = np.random.choice(
    len(iclr.abstract), size=int(len(iclr.abstract) * 0.1), replace=False
)

# `subset` contains positions, so select with `.iloc`. Plain `[...]` is
# label-based for an integer index and only coincides with positional
# selection while the frame keeps its default 0..n-1 index.
train_subset_dataset = MultOverlappingSentencesPairDataset(
    iclr.abstract.iloc[subset],
    tokenizer,
    device,
    n_cons_sntcs=n_cons_sntcs,
    seed=42,
)

# Stack `abs_sentences` once: column 0 is used as the sentence text and
# column 1 as the class label.
train_subset_table = np.vstack(train_subset_dataset.abs_sentences)
X_train_subset = list(train_subset_table[:, 0])
y_train_subset = list(train_subset_table[:, 1])
n_train_subset_classes = len(np.unique(y_train_subset))
print("Number of sentences: ", len(X_train_subset))
print("Number of classes: ", n_train_subset_classes)
print("Ratio: ", n_train_subset_classes / len(X_train_subset))


# knn accuracy
(
    embd_cls_train_subset,
    embd_sep_train_subset,
    embd_av_train_subset,
) = generate_embeddings(
    X_train_subset, tokenizer, model, device, batch_size=256
)
knn_acc_train_subset_3 = knn_accuracy(
    [embd_av_train_subset, embd_cls_train_subset, embd_sep_train_subset],
    y_train_subset,
)
print(knn_acc_train_subset_3)

# # save
# saving_path = Path("embeddings_" + model_name.lower())
# (variables_path / saving_path).mkdir(exist_ok=True)
# np.save(
#     variables_path / saving_path / "knn_accuracies_abstract_train_baseline",
#     knn_acc_train,
# )

Number of sentences:  7839
Number of classes:  1593
Ratio:  0.20321469575200918


  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
%%time
# Random subset of the train sentences with as many points as the test set.
# for rs_i in range(5):
np.random.seed(42)
subset = np.random.choice(len(X_train), size=len(X_test), replace=False)
print(len(X_test))

# Labels of the sampled sentences, indexed once and reused below.
y_subset = np.array(y_train)[subset]
print(len(np.unique(y_subset)))
print(len(np.unique(y_subset)) / len(X_test))

subset_embeddings = [
    embd_av_train[subset],
    embd_cls_train[subset],
    embd_sep_train[subset],
]
knn_acc_train_subset = knn_accuracy(subset_embeddings, y_subset)
print(knn_acc_train_subset)
# # save
# saving_path = Path("embeddings_" + model_name.lower())
# (variables_path / saving_path).mkdir(exist_ok=True)
# np.save(
#     variables_path
#     / saving_path
#     / "knn_accuracies_abstract_train_subset_baseline",
#     knn_acc_train_subset,
# )

7820
6107
0.7809462915601023
[0.01918159 0.01662404 0.01150895]
CPU times: user 52.6 s, sys: 2min 11s, total: 3min 3s
Wall time: 3.95 s


In [None]:
%%time
# subset of train set of size=test set
# Uses the FIRST n train sentences (a contiguous slice, not a random
# sample), with n tied to the test-set size instead of a hard-coded 7820.
n = len(X_test)
y_train_head = np.array(y_train)[:n]

# knn accuracy
knn_acc_train_subset_2 = knn_accuracy(
    [
        embd_av_train[:n],
        embd_cls_train[:n],
        embd_sep_train[:n],
    ],
    y_train_head,
)
n_head_classes = len(np.unique(y_train_head))
print(n_head_classes)
print(n_head_classes / n)
print(knn_acc_train_subset_2)
# # save
# saving_path = Path("embeddings_" + model_name.lower())
# (variables_path / saving_path).mkdir(exist_ok=True)
# np.save(
#     variables_path
#     / saving_path
#     / "knn_accuracies_abstract_train_subset_baseline",
#     knn_acc_train_subset,
# )

1605
0.20524296675191817
[0.34015345 0.16368286 0.15984655]
CPU times: user 1min 1s, sys: 3min 1s, total: 4min 3s
Wall time: 5.5 s


In [None]:
%%time
# Leave-one-out kNN accuracy on random train subsets of increasing size.
print("Subset size      number of classes       ratio           accuracy")
subset_sizes = np.hstack((np.logspace(2, 4, 3), np.array([50000])))
for subset_size in subset_sizes:
    # Re-seed on every pass so each subset size is drawn reproducibly.
    np.random.seed(12)
    subset = np.random.choice(
        len(X_train), size=int(subset_size), replace=False
    )
    subset_labels = np.array(y_train)[subset]

    # knn accuracy
    knn_acc_loo_train_subset = knn_acc_loocv(
        embd_av_train[subset], subset_labels
    )
    n_subset_classes = len(np.unique(subset_labels))
    print(
        f"{int(subset_size)}",
        f"                  {n_subset_classes}",
        f"              {n_subset_classes / subset_size:.3f}",
        f"          {knn_acc_loo_train_subset:.3f}",
    )

Subset size      number of classes       ratio           accuracy
100                   100               1.000           0.000
1000                   972               0.972           0.000
10000                   7308               0.731           0.032
50000                   14165               0.283           0.220
CPU times: user 9min 28s, sys: 19min 19s, total: 28min 48s
Wall time: 1min 3s


In [None]:
# Ratio of distinct labels to sentences, rounded to four decimals.
train_class_ratio = len(np.unique(y_train)) / len(X_train)
print("Train set ratio: ", np.round(train_class_ratio, 4))
test_class_ratio = len(np.unique(y_test)) / len(X_test)
print("Test set ratio: ", np.round(test_class_ratio, 4))

Train set ratio:  0.2044
Test set ratio:  0.2028


In [None]:
# Sentence and label counts per split (the two numbers should match).
for split_name, sentences, labels in (
    ("Train set: ", X_train, y_train),
    ("Test set: ", X_test, y_test),
):
    print(split_name, len(sentences), len(labels))

Train set:  70175 70175
Test set:  7820 7820


#### Sanity check for another random seed

In [None]:
%%time
i = 0
model_name = model_names[i]

## fix random seeds
seed = 42
# Set the random seed for PyTorch
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# torch.use_deterministic_algorithms(True)
# Set the random seed for NumPy
np.random.seed(seed)
# Set the random seed
random.seed(seed)

## set up model
print("Model: ", model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))

tokenizer = AutoTokenizer.from_pretrained(model_paths[i])
model = AutoModel.from_pretrained(model_paths[i])
model.to(device)
print(model_paths[i])

## data
# Number of consecutive sentences per chunk; the same for every split.
n_cons_sntcs = 2

# Repeat the baseline evaluation for five different train/test splits.
for rs_i in range(5):
    # split into train and test
    # Pass the seed itself. The previous `random_state=np.random.seed(rs_i)`
    # handed `None` to scikit-learn (np.random.seed returns None) and only
    # worked because the call re-seeded the global NumPy RNG as a side
    # effect. NOTE(review): the global RNG is therefore no longer re-seeded
    # here -- confirm nothing below relies on that.
    X_train, X_test = train_test_split(
        iclr.abstract, test_size=0.1, random_state=rs_i
    )

    # do the train and test datasets with the class
    train_dataset = MultOverlappingSentencesPairDataset(
        X_train, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
    )
    test_dataset = MultOverlappingSentencesPairDataset(
        X_test, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
    )

    print(f"TRAIN SET, random seed {rs_i}")
    # From here on X_train / y_train hold sentences and their labels
    # (columns 0 and 1 of the stacked `abs_sentences`), not raw abstracts.
    train_sentence_table = np.vstack(train_dataset.abs_sentences)
    X_train = list(train_sentence_table[:, 0])
    y_train = list(train_sentence_table[:, 1])
    n_train_classes = len(np.unique(y_train))
    print("Number of sentences:", len(X_train))
    print("Number of classes:", n_train_classes)
    print("Ratio:", n_train_classes / len(X_train))

    # knn accuracy
    embd_cls_train, embd_sep_train, embd_av_train = generate_embeddings(
        X_train, tokenizer, model, device, batch_size=256
    )
    knn_acc_train = knn_accuracy(
        [embd_av_train, embd_cls_train, embd_sep_train], y_train
    )
    print(knn_acc_train)

    print(f"TEST SET, random seed {rs_i}")
    test_sentence_table = np.vstack(test_dataset.abs_sentences)
    X_test = list(test_sentence_table[:, 0])
    y_test = list(test_sentence_table[:, 1])
    n_test_classes = len(np.unique(y_test))
    print("Number of sentences:", len(X_test))
    print("Number of classes:", n_test_classes)
    print("Ratio:", n_test_classes / len(X_test))

    # knn accuracy
    embd_cls_test, embd_sep_test, embd_av_test = generate_embeddings(
        X_test, tokenizer, model, device, batch_size=256
    )
    knn_acc_test = knn_accuracy(
        [embd_av_test, embd_cls_test, embd_sep_test], y_test
    )
    print(knn_acc_test)
    print("--------------------------------")

Model:  BERT
Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased
TRAIN SET, random seed 0
Number of sentences: 70246
Number of classes: 14335
Ratio: 0.2040685590638613


  0%|          | 0/275 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/275 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/274 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/275 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/30 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/275 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
%%time
i = 0
model_name = model_names[i]

## fix random seeds
seed = 42
# Set the random seed for PyTorch
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# torch.use_deterministic_algorithms(True)
# Set the random seed for NumPy
np.random.seed(seed)
# Set the random seed
random.seed(seed)

## set up model
print("Model: ", model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device: {}".format(device))

tokenizer = AutoTokenizer.from_pretrained(model_paths[i])
model = AutoModel.from_pretrained(model_paths[i])
model.to(device)
print(model_paths[i])

## data
# Number of consecutive sentences per chunk; the same for every split.
n_cons_sntcs = 2

# Leave-one-out kNN accuracy of the test split for five different splits.
for rs_i in range(5):
    # split into train and test
    # Pass the seed itself. The previous `random_state=np.random.seed(rs_i)`
    # handed `None` to scikit-learn (np.random.seed returns None) and only
    # worked because the call re-seeded the global NumPy RNG as a side
    # effect. NOTE(review): the global RNG is therefore no longer re-seeded
    # here -- confirm nothing below relies on that.
    # NOTE: X_train is rebound to the raw-abstract train split but is not
    # used again in this cell.
    X_train, X_test = train_test_split(
        iclr.abstract, test_size=0.1, random_state=rs_i
    )

    # do the test dataset with the class
    test_dataset = MultOverlappingSentencesPairDataset(
        X_test, tokenizer, device, n_cons_sntcs=n_cons_sntcs, seed=42
    )

    print(f"TEST SET, random seed {rs_i}")
    # From here on X_test / y_test hold sentences and their labels
    # (columns 0 and 1 of the stacked `abs_sentences`), not raw abstracts.
    test_sentence_table = np.vstack(test_dataset.abs_sentences)
    X_test = list(test_sentence_table[:, 0])
    y_test = list(test_sentence_table[:, 1])
    n_test_classes = len(np.unique(y_test))
    print("Number of sentences:", len(X_test))
    print("Number of classes:", n_test_classes)
    print("Ratio:", n_test_classes / len(X_test))

    # knn accuracy
    embd_cls_test, embd_sep_test, embd_av_test = generate_embeddings(
        X_test, tokenizer, model, device, batch_size=256
    )
    knn_acc_loo_test = knn_acc_loocv(embd_av_test, np.array(y_test))
    print("knn acc loo: ", knn_acc_loo_test)
    print("--------------------------------")

Model:  BERT
Running on device: cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased
TEST SET, random seed 0
Number of sentences: 7749
Number of classes: 1596
Ratio: 0.20596205962059622


  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/30 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/31 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

| | Size | number of classes | AV | CLS | SEP | |
|---|---|---|---|---|---|---|
| Train set | 70175 | 14345 | 0.297 | 0.1 | 0.112 | (evaluated on 7018 points) |
| Test set | 7820 | 1586 | 0.409 | 0.176 | 0.16 | (evaluated on 782 points) |

   accuracy 
                         0.297
                             0.409