In [9]:
load_embeddings_from_file = False

In [10]:
# If using Conda, you need to install this manually by cloning Ariel's repo, cd-ing to it, and running:
#   /Users/ariellubonja/anaconda3/envs/faiss/bin/python (i.e. path to your Conda environment's python) setup.py install
from biobert_embedding.embedding import BiobertEmbedding
from utils import graph_tools
import dask
from dask.distributed import Client
import dask.bag as db

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
# import faiss
import numpy as np


In [11]:
unique_drug_names = list(graph_tools.get_unique_drugs())

In [12]:
client = Client() # Dask client

## Original pip install biobert-embedding does not work (it's too old)

I forked the repo. Install using `pip install git+https://github.com/ariellubonja/biobert_embedding`

In [13]:
biobert = BiobertEmbedding()

Using existing models/pytorch_model.bin
Using existing models/config.json
Using existing models/vocab.txt


### Word Embeddings of Drugs.com drug names (without supporting text)

Find the drug names I've crawled [here](https://drive.google.com/drive/folders/1EO659a-tyjfXjKzHk-M0WreZBR14MgRM?usp=sharing)

<font color="orange">The cell below will try to handle Out of Vocabulary terms. This means that the embedding matrix is not a clear matrix, but will be a list of Tensors</font>

In [14]:
if load_embeddings_from_file:
    biobert_embeddings_oov = torch.load('outputs/biobert_oov_embeddings.pt')
else:
    # This will take a while
    client = Client()
    
    # db - Dask Bag
    b = db.from_sequence(unique_drug_names)

    # Map the function to the data
    biobert_embeddings_oov = b.map(biobert.word_vector).compute()




    # biobert_embeddings_oov = list(map(biobert.word_vector, unique_drug_names)) # Single-threaded
    # Out of vocabulary will be split into smaller. May end up with multiple embeddings for each word. Parallelized

    # Using Python's native multiprocessing library
    # biobert_embeddings_oov = pool.map(biobert.word_vector, unique_drug_names)

    # pool.close()
    # pool.join()

In [15]:
type(biobert_embeddings_oov)

list

In [21]:
torch.save(biobert_embeddings_oov, 'outputs/biobert_oov_embeddings.pt')

In [18]:
df = pd.DataFrame(biobert_embeddings_oov, index=unique_drug_names)

In [19]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
glycron,"[tensor(11.4696), tensor(-0.7767), tensor(0.04...",,,,,,,,,,...,,,,,,,,,,
fioricet-with-codeine,"[tensor(-0.3252), tensor(-2.0265), tensor(0.87...","[tensor(0.0035), tensor(-0.4583), tensor(0.225...","[tensor(0.2675), tensor(0.0342), tensor(-1.016...","[tensor(-0.1575), tensor(-0.5886), tensor(0.13...","[tensor(0.0583), tensor(0.8732), tensor(3.1608...",,,,,,...,,,,,,,,,,
ocella,"[tensor(-1.5729), tensor(-2.2441), tensor(-3.9...",,,,,,,,,,...,,,,,,,,,,
chlorophyllin,"[tensor(6.9570), tensor(-19.1814), tensor(-3.4...",,,,,,,,,,...,,,,,,,,,,
eskata,"[tensor(2.8071), tensor(-2.8373), tensor(-0.72...",,,,,,,,,,...,,,,,,,,,,


In [None]:
np.save("outputs/biobert_oov_embeddings.npy", biobert_embeddings_oov)

In [None]:
len(biobert_embeddings_oov[0])

3

In [None]:
# These are the indices of the drug names that have only one embedding, i.e. the drug name is in the Vocabulary
vector_embd_indices = [i for i, num in enumerate(biobert_embeddings_oov) if len(num) == 1]
len(vector_embd_indices)

3310

In [None]:
len(biobert_embeddings_oov) == len(unique_drug_names)

True

In [None]:
vector_embeddings = torch.stack([biobert_embeddings_oov[i][0] for i in vector_embd_indices]).numpy()
corresponding_drug_names = [unique_drug_names[i] for i in vector_embd_indices]

In [None]:
vector_embeddings.numpy().shape

(3310, 768)

In [None]:
df = pd.DataFrame(vector_embeddings, index=corresponding_drug_names)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
trilafon,-2.494238,-5.982688,3.391733,-3.165889,-3.97255,-3.434661,2.095402,1.464997,6.532821,-1.086071,...,-9.374193,3.749851,1.355658,-2.396497,3.676836,1.897683,0.468625,6.310996,-9.883236,-1.279147
calcipotriene,4.24083,1.769997,-0.795535,-6.929158,5.979644,-0.761292,-1.656379,0.054974,0.432746,12.672398,...,-10.8607,-1.191983,-9.798771,-4.564957,-9.084117,-5.654516,-0.90969,10.11882,-6.920544,3.41189
fosinopril,2.749605,-11.437942,2.700483,-0.5981,9.610103,-13.214931,4.708538,7.986394,-0.904664,-3.148977,...,-8.771662,3.917134,-10.183059,5.208779,6.980632,0.385215,2.746163,-2.396154,-15.207134,-8.165861
nadostine,-2.403095,-5.212749,-2.980472,-4.289715,1.06522,-2.463279,2.731372,3.838523,-2.878551,3.91656,...,-5.626741,4.469468,0.329026,1.49818,-2.434177,-1.255941,0.073906,2.629568,-7.65362,3.064509
duavee,-2.835025,-6.494714,2.831769,-5.364807,4.386308,-0.174934,3.309786,-0.848888,2.427666,7.614682,...,0.924215,1.525765,-0.303652,3.41023,0.926888,-4.464722,2.695533,6.594471,-3.707011,-5.687425


## Evaluations

Top-K similarities

In [None]:
# Instantiate an index with the desired index type and dimensionality
index = faiss.IndexFlatIP(768)  # Cosine similarity

biobert_embeddings_np = df.values

# Add biobert_embeddings_np to the index
index.add(biobert_embeddings_np)

# number of nearest neighbors
k = 5

# similarity search
# D contains the similarities to the nearest neighbors (cosine similarity)
# I contains the indices of the nearest neighbors (excluding self)
D, I = index.search(biobert_embeddings_np, k+1)  # Retrieve k+1 neighbors to exclude self


for i in range(biobert_embeddings_np.shape[0]): # Iterate over each embedding
    embedding = biobert_embeddings_np[i]
    nearest_indices = I[i][1:]  # Exclude self, start from index 1
    nearest_similarities = D[i][1:]

    # Print the top-k nearest neighbors for the current embedding
    print(f"Embedding {df.index[i]}:")
    for j, index in enumerate(nearest_indices):
        similarity = nearest_similarities[j]
        print(f"Nearest Neighbor {j+1}: Drug Name {df.index[index]}, Similarity {similarity}")
    print()


NameError: name 'vector_embeddings' is not defined

## 3194 of the Drugs have vector embeddings, whereas the rest return a Matrix that we have to deal with using Pooling

Let's see the 3194

<font color="red">Most of the drugs give a matrix as their embedding, and not a vector.</font>

Need to try various pooling methods to get a vector from the matrix + see what is best

## TODO try ClinicalBERT https://github.com/EmilyAlsentzer/clinicalBERT

In [4]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
encoded_inputs = tokenizer(unique_drug_names, padding=True, truncation=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Macbook M1 Pro Runtimes

CPU - 1min 17 sec

MPS (torch 1.13) - Uses way too much memory (50GB+ - crashes) whereas CPU is max 13GB

MPS (torch 2.01) - 1min 15s - 18GB memory usage


<font color="red">Conclusion - not useful yet!</font>

In [7]:
with torch.no_grad():
    model_output = model(**encoded_inputs)
    embeddings = model_output.last_hidden_state

1min 15s ± 8.66 s per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [None]:
embeddings.shape

torch.Size([8, 768])