In [51]:
import pandas as pd
import random
import numpy as np
from random import randint

import torch
from transformers import AutoTokenizer, AutoModel

import scipy as sp
from scipy import sparse
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from openTSNE import TSNE
from openTSNE import affinity, initialization, TSNEEmbedding
from openTSNE.affinity import Affinities

import time

import memory_profiler

%load_ext memory_profiler

from pathlib import Path

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [52]:
import black
import jupyter_black

jupyter_black.load(line_length=79)

In [53]:
%load_ext autoreload
%autoreload 2

from pubmed_landscape_src.metrics import knn_accuracy_ls
from pubmed_landscape_src.data import generate_embeddings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
# Output locations for saved intermediate variables and figures; these are
# relative paths, so they resolve against the notebook's working directory.
variables_path = Path("../../results/variables")
figures_path = Path("../../results/figures")
# NOTE(review): absolute path that looks specific to one machine/cluster —
# adjust it when running the notebook elsewhere.
berenslab_data_path = Path("/gpfs01/berens/data/data/pubmed_processed")

# Import

In [55]:
# Load the labeled-paper subset with a fresh 0..n-1 index and keep the
# abstract texts as a plain Python list (the corpus used further down).
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)
abstracts = df["AbstractText"].tolist()

# For my data

In [28]:
# Pretrained tokenizer loaded by checkpoint name through AutoTokenizer; it is
# the module-level object that `pubmedbert_tokenizer` calls.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
)

In [29]:
def pubmedbert_tokenizer(input_string):
    """Tokenize a string with the module-level `tokenizer`.

    Intended as the `tokenizer` callable of a text vectorizer: it takes one
    string and returns the list of its tokens.

    The input is encoded with truncation to at most 512 token ids, and the
    ids are converted back to token strings with special tokens skipped.

    Parameters
    ----------
    input_string : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens of `input_string`, without special tokens.
    """
    encoded = tokenizer(
        input_string,
        max_length=512,
        padding=True,
        truncation=True,
    )
    token_ids = encoded["input_ids"]

    return tokenizer.convert_ids_to_tokens(
        token_ids, skip_special_tokens=True
    )

In [30]:
%%time
%%memit

# TfidfVectorizer: fit TF-IDF features (with sublinear term-frequency scaling)
# on all abstracts, using `pubmedbert_tokenizer` as the vectorizer's tokenizer.
# The recorded run of this cell took about 45 minutes of wall time.
corpus = abstracts
vectorizer = TfidfVectorizer(sublinear_tf=True, tokenizer=pubmedbert_tokenizer)
tfidf_features_1M_pubmedbert_tokenizer = vectorizer.fit_transform(corpus)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


peak memory: 5948.01 MiB, increment: 1032.31 MiB
CPU times: user 45min 31s, sys: 15.7 s, total: 45min 47s
Wall time: 45min 54s


In [32]:
# old: (1000000, 758111) -- shape recorded from an earlier run, kept for
# comparison with the shape printed below.
# NOTE(review): presumably the earlier run did not use the custom tokenizer;
# confirm before relying on this comparison.
print(tfidf_features_1M_pubmedbert_tokenizer.shape)

(1000000, 29047)


In [None]:
# Persist the sparse TF-IDF matrix so later sections can reload it instead of
# refitting the vectorizer.
sp.sparse.save_npz(
    variables_path / "tfidf_features_1M_pubmedbert_tokenizer",
    tfidf_features_1M_pubmedbert_tokenizer,
)

# Truncated SVD

In [None]:
# load results
#tfidf_features_1M_pubmedbert_tokenizer = sp.sparse.load_npz(variables_path / "tfidf_features_1M_pubmedbert_tokenizer.npz")

tcmalloc: large alloc 18038185984 bytes == 0x5af2000 @ 
tcmalloc: large alloc 18038185984 bytes == 0x43a030000 @ 


In [35]:
%%time
%%memit

# TruncatedSVD: project the TF-IDF matrix onto 300 components using the
# ARPACK solver with a fixed random_state.
svd = TruncatedSVD(n_components=300, random_state=42, algorithm="arpack")
svd_data_1M_pubmedbert_tokenizer = svd.fit_transform(
    tfidf_features_1M_pubmedbert_tokenizer
)

# save results (the reduced matrix is written with np.save into
# `variables_path`, so the t-SNE / kNN sections can reload it)
np.save(
    variables_path / "svd_data_1M_pubmedbert_tokenizer",
    svd_data_1M_pubmedbert_tokenizer,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


peak memory: 13167.90 MiB, increment: 7219.50 MiB
CPU times: user 45min 42s, sys: 1h 21min 44s, total: 2h 7min 27s
Wall time: 11min 6s


# t-SNE

In [None]:
# svd_data_1M_pubmedbert_tokenizer = np.load(variables_path / "svd_data_1M_pubmedbert_tokenizer.npy")

tcmalloc: large alloc 2400002048 bytes == 0x74930000 @ 


In [37]:
def run_tsne(
    embeddings, model_name, variables_path, rs=42, save_intermediates=True
):
    """Run the openTSNE pipeline on `embeddings` and save the result.

    Steps:

    1. uniform kNN affinities with 10 neighbors,
    2. PCA initialization,
    3. early exaggeration: 125 iterations, exaggeration 12, momentum 0.5,
    4. exaggeration annealing: 125 single-iteration steps with the
       exaggeration decreasing linearly from 12 to 1, momentum 0.8,
    5. final optimization: 500 iterations, exaggeration 1, momentum 0.8.

    Parameters
    ----------
    embeddings : array-like
        Input data passed to the affinity and initialization routines.
    model_name : str
        Suffix used in the names of all files written by this function.
    variables_path : pathlib.Path
        Directory in which results are saved (must support the `/` operator).
    rs : int, default 42
        Random seed used for the affinities, the initialization and the
        embedding optimization.
    save_intermediates : bool, default True
        If truthy, also save the affinity matrix as
        ``affinities_P_<model_name>`` and the initialization as
        ``initialization_<model_name>``.

    Returns
    -------
    numpy.ndarray
        The final embedding, which is also saved as ``tsne_<model_name>``.
    """
    # affinities
    A = affinity.Uniform(
        embeddings,
        k_neighbors=10,
        n_jobs=-1,
        verbose=1,
        random_state=rs,  # was hard-coded to 42, which silently ignored `rs`
    )

    # initialization
    I = initialization.pca(embeddings, random_state=rs)

    if save_intermediates:
        affinities_name = "affinities_P_" + model_name
        sp.sparse.save_npz(variables_path / affinities_name, A.P)

        initialization_name = "initialization_" + model_name
        np.save(variables_path / initialization_name, I)

    # t-SNE optimization
    E = TSNEEmbedding(I, A, n_jobs=-1, random_state=rs, verbose=True)

    ## early exaggeration
    E = E.optimize(
        n_iter=125, exaggeration=12, momentum=0.5, n_jobs=-1, verbose=True
    )

    ## exaggeration annealing: one iteration per exaggeration value
    for exaggeration in np.linspace(12, 1, 125):
        E = E.optimize(
            n_iter=1,
            exaggeration=exaggeration,
            momentum=0.8,
            n_jobs=-1,
            verbose=True,
        )

    ## final optimization without exaggeration
    E = E.optimize(
        n_iter=500, exaggeration=1, momentum=0.8, n_jobs=-1, verbose=True
    )

    tsne = np.array(E)

    # save
    tsne_name = "tsne_" + model_name
    np.save(variables_path / tsne_name, tsne)

    return tsne

In [38]:
%%time
# Embed the SVD-reduced TF-IDF features with `run_tsne`; output files in
# `variables_path` are named with the "tfidf_pubmedbert_tokenizer" suffix.
tsne_tfidf_pubmedbert_tokenizer = run_tsne(
    svd_data_1M_pubmedbert_tokenizer,
    "tfidf_pubmedbert_tokenizer",
    variables_path=variables_path,
)

===> Finding 10 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 230.27 seconds




===> Running optimization with exaggeration=12.00, lr=83333.33 for 125 iterations...
Iteration   50, KL divergence 10.9196, 50 iterations in 16.6956 sec
Iteration  100, KL divergence 10.1942, 50 iterations in 16.5724 sec
   --> Time elapsed: 41.70 seconds
===> Running optimization with exaggeration=12.00, lr=83333.33 for 1 iterations...
   --> Time elapsed: 0.33 seconds
===> Running optimization with exaggeration=11.91, lr=83953.96 for 1 iterations...
   --> Time elapsed: 0.35 seconds
===> Running optimization with exaggeration=11.82, lr=84583.90 for 1 iterations...
   --> Time elapsed: 0.39 seconds
===> Running optimization with exaggeration=11.73, lr=85223.37 for 1 iterations...
   --> Time elapsed: 0.36 seconds
===> Running optimization with exaggeration=11.65, lr=85872.58 for 1 iterations...
   --> Time elapsed: 0.37 seconds
===> Running optimization with exaggeration=11.56, lr=86531.75 for 1 iterations...
   --> Time elapsed: 0.36 seconds
===> Running optimization with exaggeratio

# kNN accuracies

## Import

In [39]:
# Reload the labeled-paper subset (index reset to 0..n-1) and take the
# "Colors" column as a NumPy array; it is the second argument of every
# `knn_accuracy_ls` call in the cells that follow.
df = pd.read_pickle(
    berenslab_data_path / "df_labeled_papers_subset"
).reset_index(drop=True)
colors = df["Colors"].to_numpy()

In [None]:
# tfidf_features_1M_pubmedbert_tokenizer = sp.sparse.load_npz(
#     variables_path / "tfidf_features_1M_pubmedbert_tokenizer.npz"
# )

In [None]:
# svd_data_1M_pubmedbert_tokenizer = np.load(variables_path / "svd_data_1M_pubmedbert_tokenizer.npy")

tcmalloc: large alloc 2400002048 bytes == 0x233da6000 @ 


In [None]:
# tsne_tfidf_pubmedbert_tokenizer = np.load(variables_path / "tsne_tfidf_pubmedbert_tokenizer.npy")

## Run

In [40]:
%%time
# Score the sparse TF-IDF features against `colors` with `knn_accuracy_ls`.
# NOTE(review): presumably a kNN classification accuracy with `colors` as the
# labels — see pubmed_landscape_src.metrics to confirm.
knn_accuracy_tfidf_features_1M_pubmedbert_tokenizer = knn_accuracy_ls(
    tfidf_features_1M_pubmedbert_tokenizer, colors
)

CPU times: user 1d 19h 19min 56s, sys: 1min 34s, total: 1d 19h 21min 31s
Wall time: 1h 30min 3s


In [41]:
# Score obtained on the TF-IDF features
print(knn_accuracy_tfidf_features_1M_pubmedbert_tokenizer)

0.6154


In [42]:
%%time
# Same `knn_accuracy_ls` evaluation, now on the 300-component SVD features.
knn_accuracy_svd_data_1M_pubmedbert_tokenizer = knn_accuracy_ls(
    svd_data_1M_pubmedbert_tokenizer, colors
)

CPU times: user 3min 29s, sys: 420 ms, total: 3min 29s
Wall time: 12.6 s


In [43]:
# Score obtained on the SVD-reduced features
print(knn_accuracy_svd_data_1M_pubmedbert_tokenizer)

0.5618


In [44]:
%%time
# Same `knn_accuracy_ls` evaluation, now on the t-SNE embedding.
knn_accuracy_tsne_tfidf_pubmedbert_tokenizer = knn_accuracy_ls(
    tsne_tfidf_pubmedbert_tokenizer, colors
)

CPU times: user 53.3 s, sys: 586 ms, total: 53.9 s
Wall time: 2.93 s


In [45]:
# Score obtained on the t-SNE embedding
print(knn_accuracy_tsne_tfidf_pubmedbert_tokenizer)

0.5007
