In [None]:
import pandas as pd
import random
import numpy as np
from random import randint
import torch
from transformers import AutoTokenizer, AutoModel
import gc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

import matplotlib.pyplot as plt
import matplotlib

import time
import memory_profiler

%load_ext memory_profiler

from pathlib import Path

In [None]:
# Display the installed PyTorch version so it is recorded with the results.
torch.__version__

'2.1.1+cu121'

In [None]:
%load_ext autoreload
%autoreload 2

from text_embeddings_src.model_stuff import train_loop
from text_embeddings_src.data_stuff import (
    MultOverlappingSentencesPairDataset,
)
from text_embeddings_src.metrics import knn_accuracy
from text_embeddings_src.embeddings import generate_embeddings

In [None]:
# Load the jupyter_black extension, configured with a 79-character line
# length, for the cells of this notebook.
import black
import jupyter_black

jupyter_black.load(line_length=79)

In [None]:
# Locations used throughout the notebook, all relative to the notebook's
# working directory: input data, saved arrays, and exported figures.
data_path = Path("..") / "data"
figures_path = Path("..") / "results" / "figures"
variables_path = Path("..") / "results" / "variables"

In [None]:
# Apply the project's matplotlib style sheet. The path is relative, so it is
# resolved against the directory the notebook is run from.
plt.style.use("matplotlib_style.txt")

In [None]:
# Free memory before the analysis starts.
# Rebinding `model` to None drops this notebook's reference to whatever it
# pointed to before (presumably a model left over from an earlier run — no
# such assignment is visible in this part of the notebook).
model = None
# Run a garbage-collection pass so unreferenced objects are reclaimed now.
gc.collect()
# Ask PyTorch to release cached, currently unused CUDA memory.
torch.cuda.empty_cache()

# Import

## Data

In [None]:
%%time
# Load the ICLR dataset into `iclr` from a zipped pickle under `data_path`.
#
# `compression_opts` selects zip compression; `archive_name` presumably names
# the member inside the archive — verify that it is honoured when reading.
#
# NOTE(review): the output saved with this cell is
# "TypeError: issubclass() arg 1 must be a class", while later cells still
# show results, so those were presumably produced from an earlier kernel
# state. The cause is not visible here (possibly a pandas/pickle version
# mismatch) — confirm the load succeeds on a fresh kernel.
#
# SECURITY: unpickling can execute arbitrary code; only load trusted files.
compression_opts = dict(method="zip", archive_name="iclr.pickle.csv")

iclr = pd.read_pickle(
    data_path / "iclr.pickle.zip",
    # index_col=False,
    compression=compression_opts,
)

TypeError: issubclass() arg 1 must be a class

In [None]:
# Display the loaded frame as the cell output for a quick visual check.
iclr

NameError: name 'iclr' is not defined

In [None]:
# One string per paper: the title followed by the abstract, separated by a
# single space. The two columns are paired positionally with `zip`, so the
# result does not depend on the frame's index being exactly 0..n-1 (the
# previous `iclr.title[i]` lookup was label-based and would raise or misalign
# after any filtering or re-indexing of `iclr`).
titles_abstracts_together = [
    title + " " + abstract
    for title, abstract in zip(iclr.title, iclr.abstract)
]

NameError: name 'iclr' is not defined

In [None]:
# Sanity check: number of combined texts and the container type, one per line.
print(
    len(titles_abstracts_together),
    type(titles_abstracts_together),
    sep="\n",
)

16536
<class 'list'>


## Labels

In [None]:
# Keyword-based class labels derived from paper titles.
# A keyword's position in this array is the integer class id it produces.
keywords = np.array(
    [
        "network",
        "graph",
        "reinforcement",
        "language",
        "adversarial",
        "federated",
        "contrastive",
        "domain",
        "diffusion",
        "out-of-dis",
        "continual",
        "distillation",
        "architecture",
        "privacy",
        "protein",
        "fair",
        "attention",
        "video",
        "meta-learning",
        "generative adv",
        "autoencoder",
        "game",
        "semi-sup",
        "pruning",
        "physics",
        "3d",
        "translation",
        "optimization",
        "recurrent",
        "word",
        "bayesian",
    ]
)

# Lower-case every title once; matching below is case-insensitive.
titles_lower = [title.lower() for title in iclr.title]

# y[i] stays NaN while title i matches no keyword, becomes the keyword's
# position on the first match, and is set to -1 (ambiguous) as soon as a
# second keyword matches the same title.
y = np.full(iclr.shape[0], np.nan)

for num, keyword in enumerate(keywords):
    needle = keyword.lower()
    mask = np.array([needle in title for title in titles_lower], dtype=bool)
    assigned = ~np.isnan(y)
    y[mask & assigned] = -1
    y[mask & ~assigned] = num

# Papers with exactly one matching keyword.
labeled = y >= 0

# Totals: all papers, papers matching at least one keyword, papers labeled.
print(y.size)
print(np.sum(~np.isnan(y)))
print(np.sum(labeled))

# Labeled subset with a fresh 0..n-1 index, the integer class in "y" and the
# corresponding keyword in "label".
y_labeled = y[labeled].astype(int)
iclr_labeled = (
    iclr[labeled]
    .reset_index(drop=True)
    .assign(y=y_labeled, label=keywords[y_labeled])
)

16536
8964
6849


In [None]:
# Display name -> model identifier string, kept side by side so the two
# lists derived below cannot drift out of alignment.
model_registry = (
    ("BERT", "bert-base-uncased"),
    ("MPNet", "microsoft/mpnet-base"),
    ("SBERT", "sentence-transformers/all-mpnet-base-v2"),
    ("SciBERT", "allenai/scibert_scivocab_uncased"),
    ("SPECTER", "allenai/specter"),
    ("SciNCL", "malteos/scincl"),
)

# Parallel lists: model_names[k] is the display name for model_paths[k].
model_names = [name for name, _ in model_registry]
model_paths = [path for _, path in model_registry]

# Logistic regression classifier

The embeddings produced by a given model are split into a train and a test set (90/10 by default). The train embeddings are standardized and used to fit an unpenalized logistic regression classifier (SAGA solver, at most 1000 iterations), which is then scored on the test set.

The main metric is:
- accuracy

Additionally provided:
- average precision
- F1 score


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


def _fit_and_score_logistic(features, labels, test_size, seed):
    """Split one embedding, fit a scaled logistic model, return test accuracy."""
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=seed,
    )
    classifier = make_pipeline(
        StandardScaler(),
        LogisticRegression(
            # NOTE(review): newer scikit-learn releases deprecate the string
            # "none" in favour of `penalty=None`; switch once the project's
            # scikit-learn version supports it.
            penalty="none",
            solver="saga",
            tol=1e-2,
            random_state=seed,
            n_jobs=-1,
            max_iter=1000,
        ),
    )
    classifier.fit(X_train, y_train)
    return classifier.score(X_test, y_test)


def logistic_accuracy(
    embeddings, true_labels, test_size=0.1, rs=42, set_numpy=True
):
    """Calculates the test accuracy of a logistic regression classifier.

    Each embedding is split into a train and a test part, standardized, and
    used to fit an unpenalized logistic regression (SAGA solver). The score
    is the mean accuracy on the held-out test part.

    Parameters
    ----------
    embeddings : list or array-like
        Either a single embedding matrix, or a list of embedding matrices
        for which to calculate the logistic accuracy one by one.
    true_labels : array-like
        Class labels, one per row of each embedding.
    test_size : float, default=0.1
        Fraction of the data to take as test set.
    rs : int, default=42
        Random seed, used for both the train/test split and the solver.
    set_numpy : bool, default=True
        Only used when `embeddings` is a list: if true, the accuracies are
        returned as a numpy array instead of a Python list.

    Returns
    -------
    accuracy : float, list of float, or numpy.ndarray
        Accuracy of the logistic classifier on the test set. A single float
        for a single embedding; one value per embedding (in input order) when
        a list is passed.

    """
    # Kept so the global NumPy RNG is still seeded as a side effect, as
    # before. Note that `np.random.seed` returns None, so its result must not
    # be used as `random_state`: doing so left the split and the solver
    # unseeded and gave every embedding in a list a *different* train/test
    # split. The integer seed is now passed explicitly, so all embeddings
    # are evaluated on the same split.
    np.random.seed(rs)

    if isinstance(embeddings, list):
        accuracy = [
            _fit_and_score_logistic(embed, true_labels, test_size, rs)
            for embed in embeddings
        ]
        if set_numpy:
            accuracy = np.array(accuracy)
        return accuracy

    return _fit_and_score_logistic(embeddings, true_labels, test_size, rs)

## Baseline

In [None]:
%%time
print("kNN accuracy     [AVG]    [CLS]   [SEP]")
for i, model_name in enumerate(model_names):
    # load
    saving_path = Path("embeddings_" + model_name.lower())

    embedding_av = np.load(variables_path / saving_path / "embedding_av.npy")
    embedding_cls = np.load(variables_path / saving_path / "embedding_cls.npy")
    embedding_sep = np.load(variables_path / saving_path / "embedding_sep.npy")

    # metric
    accuracy = logistic_accuracy(
        [
            embedding_av[labeled],
            embedding_cls[labeled],
            embedding_sep[labeled],
        ],
        iclr_labeled["y"].to_numpy(),
    )
    print(f"{model_name}: {np.array(accuracy)*100}")

    # save
    np.save(variables_path / saving_path / "logistic_accuracy", accuracy)

kNN accuracy     [AVG]    [CLS]   [SEP]
BERT: [79.12408759 73.28467153 77.22627737]
MPNet: [79.8540146  80.72992701 72.99270073]
SBERT: [88.02919708 90.94890511 81.16788321]
SciBERT: [78.97810219 69.19708029 68.32116788]
SPECTER: [83.35766423 85.25547445 82.33576642]
SciNCL: [84.96350365 88.46715328 85.69343066]
CPU times: user 8min 33s, sys: 1min 5s, total: 9min 38s
Wall time: 8min 18s
