In [2]:
import pandas as pd
import random
import numpy as np
from random import randint

import torch
from transformers import AutoTokenizer, AutoModel

import time

import memory_profiler

%load_ext memory_profiler

from pathlib import Path

In [3]:
import black
import jupyter_black

# Auto-format executed cells with black, using a 79-character line length
jupyter_black.load(line_length=79)

In [4]:
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.metrics import knn_accuracy_ls
from pubmed_landscape_src.data import generate_embeddings

In [5]:
# Output locations for saved variables and figures (relative paths)
variables_path = Path("../../results/variables")
figures_path = Path("../../results/figures")
# NOTE(review): absolute, machine-specific data location — consider making
# this configurable so the notebook can run elsewhere.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Import

In [5]:
# Read the pickled frame of labelled papers, renumber its rows from zero and
# pull the abstract texts out as a plain Python list.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)
abstracts = df["AbstractText"].tolist()

# Obtain embeddings

In [6]:
# random seed
# `random.seed()` returns None, so keep the seed itself in `random_state`
# and use it to seed every RNG imported by this notebook.
random_state = 42
torch.manual_seed(random_state)
np.random.seed(random_state)
random.seed(random_state)

In [7]:
# specify & check gpu usage
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"  # put cuda:0 if else not working
else:
    device = "cpu"
print("running on device: {}".format(device))

running on device: cuda


In [8]:
# load model and tokenizer
# Both are loaded from the same pretrained checkpoint name.
sbert_checkpoint = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(sbert_checkpoint)
model = AutoModel.from_pretrained(sbert_checkpoint)

print("model: SBERT")

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

model: SBERT


In [9]:
# set device
# Move the model to the device selected above ("cuda" or "cpu").
model = model.to(device)

In [None]:
%%capture cap
%%time
%%memit

embeddings_av = np.empty([len(abstracts), 768])
embeddings_sep = np.empty([len(abstracts), 768])
embeddings_cls = np.empty([len(abstracts), 768])

for i, abstr in enumerate(abstracts):
    np.save(variables_path / "experiment_iter", i)

    embd_cls, embd_sep, embd_av = generate_embeddings(
        abstr, tokenizer, model, device
    )

    embeddings_cls[i] = embd_cls
    embeddings_sep[i] = embd_sep
    embeddings_av[i] = embd_av

    if (i % 50000) == 0:
        np.save(
            berenslab_data_path
            / "embeddings/embeddings_SBERT/embeddings_cls_interm",
            embeddings_cls,
        )
        np.save(
            berenslab_data_path
            / "embeddings/embeddings_SBERT/embeddings_sep_interm",
            embeddings_sep,
        )
        np.save(
            berenslab_data_path
            / "embeddings/embeddings_SBERT/embeddings_av_interm",
            embeddings_av,
        )

np.save(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_cls",
    embeddings_cls,
)
np.save(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_sep",
    embeddings_sep,
)
np.save(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_av",
    embeddings_av,
)

tcmalloc: large alloc 6144000000 bytes == 0x7fc1a5ca0000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x7fc037940000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x7fbec95e0000 @ 


In [None]:
# Write the stdout held by `cap` (set by a `%%capture cap` cell) to a log file
log_path = variables_path / "verbose_batches_SBERT.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

# kNN accuracies (RERUN)

In [27]:
# Read the pickled frame of labelled papers again, renumber its rows from
# zero and take the "Colors" column as the list of labels.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)
labels = df["Colors"].tolist()

## CLS

In [None]:
# Load the CLS embeddings. The file is a plain numeric array written with
# np.save, so pickle support is not needed; leaving allow_pickle at its
# default (False) avoids running code from a tampered file.
embeddings_cls = np.load(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_cls.npy"
)

In [15]:
# Sanity check: display the shape of the loaded array
embeddings_cls.shape

(1000000, 768)

In [None]:
%%capture cap
%%time
# Run knn_accuracy_ls on the CLS embeddings; printed output goes into `cap`.
knn_accuracy_SBERT_cls = knn_accuracy_ls(embeddings_cls, labels)

In [None]:
# Write the stdout held by `cap` (set by a `%%capture cap` cell) to a log file
log_path = variables_path / "verbose_knn_accuracy_SBERT_cls.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [25]:
# Show the value returned by knn_accuracy_ls for the CLS embeddings
print(knn_accuracy_SBERT_cls)

0.6074


In [None]:
# Persist the CLS result in the variables directory (np.save adds ".npy")
np.save(variables_path / "knn_accuracy_SBERT_cls", knn_accuracy_SBERT_cls)

## SEP

In [None]:
# Load the SEP embeddings. The file is a plain numeric array written with
# np.save, so pickle support is not needed; leaving allow_pickle at its
# default (False) avoids running code from a tampered file.
embeddings_sep = np.load(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_sep.npy"
)

In [35]:
# Sanity check: display the shape of the loaded array
embeddings_sep.shape

(1000000, 768)

In [None]:
%%capture cap
%%time
# Run knn_accuracy_ls on the SEP embeddings; printed output goes into `cap`.
knn_accuracy_SBERT_sep = knn_accuracy_ls(embeddings_sep, labels)

In [None]:
# Write the stdout held by `cap` (set by a `%%capture cap` cell) to a log file
log_path = variables_path / "verbose_knn_accuracy_SBERT_sep.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [26]:
# Show the value returned by knn_accuracy_ls for the SEP embeddings
print(knn_accuracy_SBERT_sep)

0.6215


In [None]:
# Persist the SEP result in the variables directory (np.save adds ".npy")
np.save(variables_path / "knn_accuracy_SBERT_sep", knn_accuracy_SBERT_sep)

## Average

In [None]:
# Load the averaged embeddings. The file is a plain numeric array written
# with np.save, so pickle support is not needed; leaving allow_pickle at its
# default (False) avoids running code from a tampered file.
embeddings_av = np.load(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_av.npy"
)

In [39]:
# Sanity check: display the shape of the loaded array
embeddings_av.shape

(1000000, 768)

In [None]:
%%capture cap
%%time
# Run knn_accuracy_ls on the averaged embeddings; printed output goes into
# `cap`.
knn_accuracy_SBERT_av = knn_accuracy_ls(embeddings_av, labels)

In [None]:
# Write the stdout held by `cap` (set by a `%%capture cap` cell) to a log file
log_path = variables_path / "verbose_knn_accuracy_SBERT_av.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [34]:
# Show the value returned by knn_accuracy_ls for the averaged embeddings
print(knn_accuracy_SBERT_av)

0.6447


In [None]:
# Persist the averaged-embedding result in the variables directory
# (np.save adds ".npy")
np.save(variables_path / "knn_accuracy_SBERT_av", knn_accuracy_SBERT_av)

## Normalized average

In [7]:
# Load the averaged embeddings for the normalisation experiment. The file is
# a plain numeric array written with np.save, so pickle support is not
# needed; leaving allow_pickle at its default (False) avoids running code
# from a tampered file.
embeddings_av = np.load(
    berenslab_data_path / "embeddings/embeddings_SBERT/embeddings_av.npy"
)

tcmalloc: large alloc 6144000000 bytes == 0xaeae000 @ 


In [28]:
# Sanity check: display the shape of the loaded array
embeddings_av.shape

(1000000, 768)

In [29]:
%%time
knn_accuracy_SBERT_av_norm = knn_accuracy_ls(
    embeddings_av / np.linalg.norm(embeddings_av), labels
)

tcmalloc: large alloc 6144000000 bytes == 0x7fbd1bb8c000 @ 
tcmalloc: large alloc 6082560000 bytes == 0x7fbaeceba000 @ 


In [30]:
# Write the stdout held by `cap` to the av_norm log file.
# NOTE(review): `cap` holds whatever the most recent `%%capture cap` cell
# captured — confirm the av_norm kNN cell ran under `%%capture cap`,
# otherwise this file receives output from an earlier run.
log_path = variables_path / "verbose_knn_accuracy_SBERT_av_norm.txt"
with log_path.open("w") as log_file:
    log_file.write(cap.stdout)

In [33]:
# Show the value stored in knn_accuracy_SBERT_av_norm
print(knn_accuracy_SBERT_av_norm)

0.6447


In [32]:
# Persist the av_norm result in the variables directory (np.save adds ".npy")
np.save(
    variables_path / "knn_accuracy_SBERT_av_norm", knn_accuracy_SBERT_av_norm
)

In [16]:
# Row-wise L2 normalisation: keepdims=True leaves the per-row norms as an
# (n, 1) column so they broadcast across each embedding's features.
embeddings_av_norm = embeddings_av / np.linalg.norm(
    embeddings_av, axis=1, keepdims=True
)

tcmalloc: large alloc 6144000000 bytes == 0x2e7f9e000 @ 
tcmalloc: large alloc 6144000000 bytes == 0x2e7f9e000 @ 


In [17]:
# Sanity check: normalisation must not change the array's shape
embeddings_av_norm.shape

(1000000, 768)

In [10]:
# Compare the first five values of row 0 after and before normalisation.
# NOTE(review): this cell's saved output has a lower execution count than
# the row-wise normalisation cell, so it may be stale — re-run to refresh.
print(embeddings_av_norm[0, :5])
print(embeddings_av[0, :5])

[ 3.55270926e-05 -1.01416438e-04 -7.74966664e-07 -8.23686478e-06
 -7.06524465e-05]
[ 0.08264855 -0.2359304  -0.00180285 -0.01916185 -0.16436251]


In [28]:
%%time
# Run knn_accuracy_ls on the row-normalised averaged embeddings
knn_accuracy_SBERT_av_norm_2 = knn_accuracy_ls(embeddings_av_norm, labels)

tcmalloc: large alloc 6082560000 bytes == 0x4589e4000 @ 


CPU times: user 1h 12min 21s, sys: 54min 34s, total: 2h 6min 55s
Wall time: 7min 24s


In [30]:
# Display the result for the row-normalised embeddings
knn_accuracy_SBERT_av_norm_2

0.6442

In [23]:
# Check row 0: its shape, its norm after normalisation, and its norm before
print(embeddings_av_norm[0].shape)
print(np.linalg.norm(embeddings_av_norm[0]))
print(np.linalg.norm(embeddings_av[0]))

(768,)
1.0
2.083858197235886


In [20]:
# Shape of the per-row norms once reshaped into a single column
np.linalg.norm(embeddings_av, axis=1).reshape(-1, 1).shape

tcmalloc: large alloc 6144000000 bytes == 0x17943e000 @ 


(1000000, 1)

In [12]:
# NOTE(review): leftover interactive help lookup — remove this cell (and its
# long output) before sharing the notebook.
?np.linalg.norm

[0;31mSignature:[0m [0mnp[0m[0;34m.[0m[0mlinalg[0m[0;34m.[0m[0mnorm[0m[0;34m([0m[0mx[0m[0;34m,[0m [0mord[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mkeepdims[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Matrix or vector norm.

This function is able to return one of eight different matrix norms,
or one of an infinite number of vector norms (described below), depending
on the value of the ``ord`` parameter.

Parameters
----------
x : array_like
    Input array.  If `axis` is None, `x` must be 1-D or 2-D, unless `ord`
    is None. If both `axis` and `ord` are None, the 2-norm of
    ``x.ravel`` will be returned.
ord : {non-zero int, inf, -inf, 'fro', 'nuc'}, optional
    Order of the norm (see table under ``Notes``). inf means numpy's
    `inf` object. The default is None.
axis : {None, int, 2-tuple of ints}, optional.
    If `axis` is an integer, it specifies the axis 