In [1]:
import pandas as pd
import numpy as np
import random
from random import randint

import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm

import time

import memory_profiler
%load_ext memory_profiler

from pathlib import Path

In [2]:
import black
import jupyter_black

# Enable jupyter_black for this notebook with a 79-character line length.
jupyter_black.load(line_length=79)

In [3]:
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.data import generate_embeddings_batches

In [4]:
# Relative results folders; `variables_path` is where the cleaned dataframe
# pickle is read from.
variables_path = Path("../../results/variables/2024_baseline")
figures_path = Path("../../results/figures/2024_baseline")
# NOTE(review): absolute, machine-specific path -- presumably only resolves on
# the lab cluster file system; confirm before running elsewhere.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Relative sub-folder; later cells join it onto `berenslab_data_path` when
# saving and loading the embedding arrays.
saving_path = Path("embeddings/2024_baseline")

# Import dataframe

In [5]:
%%time
%%memit
# import clean_2024_df
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source. The recorded run peaked at ~67 GB of RAM.
clean_2024_df = pd.read_pickle(variables_path / "clean_2024_df")

peak memory: 67054.64 MiB, increment: 66472.96 MiB
CPU times: user 57.5 s, sys: 42.7 s, total: 1min 40s
Wall time: 1min 39s


In [6]:
# Pull the abstract column out of the dataframe as a plain Python list so it
# can be handed to the DataLoader further down.
abstract_column = clean_2024_df["AbstractText"]
abstracts = abstract_column.tolist()

In [7]:
# Preview the first five rows of the cleaned dataframe.
clean_2024_df.head(n=5)

Unnamed: 0,PMID,Title,AbstractText,Language,Journal,Date,NameFirstAuthor,NameLastAuthor,ISSN,AffiliationFirstAuthor,AffiliationLastAuthor,filename
21,24,Influence of a new virostatic compound on the ...,"The virostatic compound N,N-diethyl-4-[2-(2-ox...",eng,Arzneimittel-Forschung,1975 Sep,H,G,0004-4172,,,pubmed24n0001.xml
22,23,Effect of etafenone on total and regional myoc...,The distribution of blood flow to the subendoc...,eng,Arzneimittel-Forschung,1975 Sep,H,W,0004-4172,,,pubmed24n0001.xml
24,25,Pharmacological properties of new neuroleptic ...,"RMI 61 140, RMI 61 144 and RMI 61 280 are newl...",eng,Arzneimittel-Forschung,1975 Sep,L,A,0004-4172,,,pubmed24n0001.xml
29,30,Lysosomal hydrolases of the epidermis. I. Glyc...,Seven distinct glycosidases (EC 3.2) have been...,eng,The British journal of dermatology,1975 Jul,P D,J J,0007-0963,,,pubmed24n0001.xml
31,32,A serum haemagglutinating property dependent u...,A serum agglutinin reactive with red cells in ...,eng,British journal of haematology,1975 Jan,M L,W L,0007-1048,,,pubmed24n0001.xml


# Obtaining the PubMedBERT embeddings of the abstracts

## Obtain embeddings

In [9]:
%%time

# specifying model
model_name = "PubMedBERT"
model_path = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

# set up model
print("Model: ", model_name)

random_state = random.seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device: {}".format(device))

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
print(model_path)

model.to(device)

Model:  PubMedBERT
Running on device: cuda
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
CPU times: user 1.79 s, sys: 495 ms, total: 2.29 s
Wall time: 2.83 s


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [11]:
# Total number of abstracts that will be embedded.
print(len(abstracts))

23389083


Starting time: 14:08 08.02.2024

In [None]:
%%time

start = time.time()
saving_path = Path("embeddings/2024_baseline")

loader = torch.utils.data.DataLoader(abstracts, batch_size=256, num_workers=0)

embedding_av = []
embedding_sep = []
embedding_cls = []

for i_batch, batch in enumerate(tqdm(loader)):
    embd_cls, embd_sep, embd_av = generate_embeddings_batches(
        batch, tokenizer, model, device
    )
    embedding_av.append(embd_av)
    embedding_cls.append(embd_cls)
    embedding_sep.append(embd_sep)

    if (i_batch % 200) == 0:
        np.save(berenslab_data_path / saving_path / "last_i_batch", i_batch)
        np.save(
            berenslab_data_path / saving_path / "embedding_av_interm",
            np.vstack(embedding_av),
        )
        np.save(
            berenslab_data_path / saving_path / "embedding_cls_interm",
            np.vstack(embedding_cls),
        )
        np.save(
            berenslab_data_path / saving_path / "embedding_sep_interm",
            np.vstack(embedding_sep),
        )


# save all
np.save(
    berenslab_data_path / saving_path / "embedding_av_all",
    np.vstack(embedding_av),
)
np.save(
    berenslab_data_path / saving_path / "embedding_cls_all",
    np.vstack(embedding_cls),
)
np.save(
    berenslab_data_path / saving_path / "embedding_sep_all",
    np.vstack(embedding_sep),
)

end = time.time()
runtime_total = end - start
np.save(berenslab_data_path / saving_path / "runtime_total", runtime_total)

  0%|          | 0/91364 [00:00<?, ?it/s]

In [9]:
saving_path = Path("embeddings/2024_baseline")
# Only the shape is needed: memory-map the checkpoint instead of reading the
# whole array into RAM.
print(
    np.load(
        berenslab_data_path / saving_path / "embedding_av_interm.npy",
        mmap_mode="r",
    ).shape
)

(17613056, 768)


## After crashing

In [17]:
%%time

# specifying model
model_name = "PubMedBERT"
model_path = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

# set up model
print("Model: ", model_name)

random_state = random.seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device: {}".format(device))

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
print(model_path)

model.to(device)

Model:  PubMedBERT
Running on device: cuda
microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
CPU times: user 1.94 s, sys: 574 ms, total: 2.51 s
Wall time: 9.82 s


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [18]:
# Abstracts still to embed after the crash. The saved checkpoint holds
# 17613056 rows (indices 0..17613055), so starting at 17613055 overlaps the
# first run by exactly one abstract.
print(len(abstracts[17613055:]))

5776028


Starting time: 18:26 11.02.2024
total batches: 22563

In [22]:
%%time

start = time.time()
saving_path = Path("embeddings/2024_baseline")

loader = torch.utils.data.DataLoader(
    abstracts[17613055:], batch_size=256, num_workers=0
)

embedding_av = []
embedding_sep = []
embedding_cls = []

for i_batch, batch in enumerate(tqdm(loader)):
    embd_cls, embd_sep, embd_av = generate_embeddings_batches(
        batch, tokenizer, model, device
    )
    embedding_av.append(embd_av)
    embedding_cls.append(embd_cls)
    embedding_sep.append(embd_sep)

    if (i_batch % 200) == 0:
        np.save(berenslab_data_path / saving_path / "last_i_batch", i_batch)
        np.save(
            berenslab_data_path / saving_path / "embedding_av_interm_2",
            np.vstack(embedding_av),
        )
        np.save(
            berenslab_data_path / saving_path / "embedding_cls_interm_2",
            np.vstack(embedding_cls),
        )
        np.save(
            berenslab_data_path / saving_path / "embedding_sep_interm_2",
            np.vstack(embedding_sep),
        )


# save all
np.save(
    berenslab_data_path / saving_path / "embedding_av_interm_2_final",
    np.vstack(embedding_av),
)
np.save(
    berenslab_data_path / saving_path / "embedding_cls_interm_2_final",
    np.vstack(embedding_cls),
)
np.save(
    berenslab_data_path / saving_path / "embedding_sep_interm_2_final",
    np.vstack(embedding_sep),
)

end = time.time()
runtime_total = end - start
np.save(berenslab_data_path / saving_path / "runtime_total", runtime_total)

  0%|          | 0/22563 [00:00<?, ?it/s]

CPU times: user 11h 4min 16s, sys: 12h 11min 40s, total: 23h 15min 57s
Wall time: 14h 7min 31s


In [26]:
# Only the shape is needed: memory-map the file instead of reading the whole
# array into RAM.
print(
    np.load(
        berenslab_data_path / saving_path / "embedding_sep_interm_2_final.npy",
        mmap_mode="r",
    ).shape
)

(5776028, 768)


In [18]:
# Cross-check: the resumed run's row count must match the number of abstracts
# in the slice it was given (which starts one abstract before the crash point).
print(len(abstracts[17613055:]))

5776028


# Concatenate both

## AV

In [23]:
# Target row count for each concatenated embedding matrix.
print(len(abstracts))

23389083


In [27]:
# Load both halves of the AV embeddings: the last checkpoint of the crashed
# run and the final output of the resumed run.
embeddings_dir = berenslab_data_path / saving_path
embedding_av_interm = np.load(embeddings_dir / "embedding_av_interm.npy")
embedding_av_interm_2_final = np.load(
    embeddings_dir / "embedding_av_interm_2_final.npy"
)

In [35]:
# Shapes of the two pieces to be stacked. The resumed run's first row is the
# one-abstract overlap with the first run and is excluded.
for av_part in (embedding_av_interm, embedding_av_interm_2_final[1:]):
    print(av_part.shape)

(17613056, 768)
(5776027, 768)


In [37]:
%%time
embedding_av_all = np.vstack(
    (embedding_av_interm, embedding_av_interm_2_final[1:, :])
)
embedding_av_all.shape

CPU times: user 22.5 s, sys: 40.8 s, total: 1min 3s
Wall time: 1min 3s


(23389083, 768)

In [38]:
# Persist the complete AV embedding matrix.
av_all_target = berenslab_data_path / saving_path / "embedding_av_all"
np.save(av_all_target, embedding_av_all)

## CLS

In [5]:
# Load both halves of the CLS embeddings: the last checkpoint of the crashed
# run and the final output of the resumed run.
embeddings_dir = berenslab_data_path / saving_path
embedding_cls_interm = np.load(embeddings_dir / "embedding_cls_interm.npy")
embedding_cls_interm_2_final = np.load(
    embeddings_dir / "embedding_cls_interm_2_final.npy"
)

In [6]:
%%time
embedding_cls_all = np.vstack(
    (embedding_cls_interm, embedding_cls_interm_2_final[1:, :])
)
embedding_cls_all.shape

CPU times: user 11.9 s, sys: 14.1 s, total: 25.9 s
Wall time: 26 s


(23389083, 768)

In [7]:
# Persist the complete CLS embedding matrix.
cls_all_target = berenslab_data_path / saving_path / "embedding_cls_all"
np.save(cls_all_target, embedding_cls_all)

## SEP

In [8]:
# Load both halves of the SEP embeddings: the last checkpoint of the crashed
# run and the final output of the resumed run.
embeddings_dir = berenslab_data_path / saving_path
embedding_sep_interm = np.load(embeddings_dir / "embedding_sep_interm.npy")
embedding_sep_interm_2_final = np.load(
    embeddings_dir / "embedding_sep_interm_2_final.npy"
)

In [9]:
%%time
embedding_sep_all = np.vstack(
    (embedding_sep_interm, embedding_sep_interm_2_final[1:, :])
)
embedding_sep_all.shape

CPU times: user 24.9 s, sys: 42.5 s, total: 1min 7s
Wall time: 1min 7s


(23389083, 768)

In [10]:
# Persist the complete SEP embedding matrix.
sep_all_target = berenslab_data_path / saving_path / "embedding_sep_all"
np.save(sep_all_target, embedding_sep_all)

# Save PubMedBERT SEP embeddings with float16 precision

In [5]:
# Reload the full SEP embedding matrix that was written by the concatenation
# step.
saving_path = Path("embeddings/2024_baseline")
sep_all_file = berenslab_data_path / saving_path / "embedding_sep_all.npy"
embedding_sep_all = np.load(sep_all_file)

In [6]:
# Sanity check: one 768-dimensional row per abstract is expected.
embedding_sep_all.shape

(23389083, 768)

In [7]:
# Half-precision copy of the SEP embeddings; `astype` allocates a new array,
# so both versions are held in memory at once.
# NOTE(review): float16 has a reduced range and precision -- presumably
# acceptable for the downstream use; confirm.
b = embedding_sep_all.astype(np.float16)

In [8]:
# Compare the in-memory size (in bytes) of the original array and the float16
# copy.
print(embedding_sep_all.nbytes)
print(b.nbytes)

71851262976
35925631488


In [9]:
# Persist the float16 copy of the SEP embeddings.
float16_target = (
    berenslab_data_path / saving_path / "PubMedBERT_embeddings_float16_2024"
)
np.save(float16_target, b)

In [10]:
# The cast must not change the shape: still one 768-dimensional row per
# abstract.
b.shape

(23389083, 768)