In [14]:
from biobert_embedding.embedding import BiobertEmbedding
from utils import graph_tools
from multiprocessing import Pool

In [15]:
# Number of worker processes passed to multiprocessing.Pool when embedding the drug names.
num_processes = 10

## The original `pip install biobert-embedding` does not work (the PyPI release is too old)

I forked the repo. Install the fork with `pip install git+https://github.com/ariellubonja/biobert_embedding`

In [3]:
## Example 1
# Sample biomedical sentence used by the sanity-check cells.
text = "Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis."

# Class initialization. To use your own fine-tuned BERT model, pass its path as
# `model_path=...`; leaving the default (`model_path=None`) loads the standard model files.
biobert = BiobertEmbedding()

Using existing models/pytorch_model.bin
Using existing models/config.json
Using existing models/vocab.txt


#### Basic example to make sure it works

In [4]:
# Embed the example sentence twice: one vector per token, and one vector for the whole sentence.
word_embeddings = biobert.word_vector(text)
sentence_embedding = biobert.sentence_vector(text)

# Tokens the model produced for `text`.
# Expected: ['breast', 'cancers', 'with', 'her2', 'amplification', 'have', 'a', 'higher',
#            'risk', 'of', 'cns', 'metastasis', 'and', 'poorer', 'prognosis', '.']
print("Text Tokens: ", biobert.tokens)

# (number of token vectors, length of each vector) -- expected 16 x 768.
word_embedding_shape = (len(word_embeddings), len(word_embeddings[0]))
print ('Shape of Word Embeddings: %d x %d' % word_embedding_shape)

# Expected: 768.
print("Shape of Sentence Embedding = ",len(sentence_embedding))

## Example 2
# Two near-identical sentences; their sentence vectors are compared in the next cell.
sentence_vector1, sentence_vector2 = (
    biobert.sentence_vector(sentence)
    for sentence in (
        'Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis.',
        'Breast cancers with HER2 amplification are more aggressive, have a higher risk of CNS metastasis, and poorer prognosis.',
    )
)

Text Tokens:  ['breast', 'cancers', 'with', 'her2', 'amplification', 'have', 'a', 'higher', 'risk', 'of', 'cns', 'metastasis', 'and', 'poorer', 'prognosis', '.']
Shape of Word Embeddings: 16 x 768
Shape of Sentence Embedding =  768


In [5]:
from scipy.spatial.distance import cosine as cosine_distance

# scipy's `cosine` is a distance; subtracting it from 1 turns it back into a similarity.
sentence_distance = cosine_distance(sentence_vector1, sentence_vector2)
cosine_sim = 1 - sentence_distance

# Expected to be roughly 0.99276 for the two example sentences.
print('cosine similarity:', cosine_sim)

cosine similarity: 0.9927560091018677


In [6]:
# Print the signature of word_vector (it exposes handle_oov and filter_extra_tokens options).
help(biobert.word_vector)

Help on method word_vector in module biobert_embedding.embedding:

word_vector(text, handle_oov=True, filter_extra_tokens=True) method of biobert_embedding.embedding.BiobertEmbedding instance



### Word Embeddings of Drugs.com drug names (without supporting text)

Find the drug names I've crawled [here](https://drive.google.com/drive/folders/1EO659a-tyjfXjKzHk-M0WreZBR14MgRM?usp=sharing)

In [7]:
# Collection of drug names to embed, loaded through the project helper in utils.graph_tools.
# NOTE(review): the container type and data source are not visible from this notebook;
# later cells wrap it in list() before indexing -- confirm against graph_tools.
unique_drug_names = graph_tools.get_unique_drugs()

In [16]:
# This will take a while. It is parallelized across `num_processes` worker processes.
#
# Single-process equivalent (handy for debugging):
#     biobert_embeddings_oov = list(map(biobert.word_vector, unique_drug_names))
#
# Out-of-vocabulary names are split into smaller pieces, so a single drug name
# may end up with several embedding vectors instead of one.
#
# The `with` block shuts the workers down even if a task raises, so a failed run
# does not leave orphaned processes behind. `pool.map` only returns once every
# result has been collected, so tearing the pool down on exit loses nothing.
#
# NOTE(review): `biobert.word_vector` is a bound method, so sending it to the workers
# presumably serializes the whole `biobert` object -- confirm this is acceptable for
# the model size, or switch to a per-worker initializer.
with Pool(num_processes) as pool:
    biobert_embeddings_oov = pool.map(biobert.word_vector, unique_drug_names)

In [20]:
# Shape of the first vector of the first drug name (expected: torch.Size([768])).
# Uses `biobert_embeddings_oov`, the result of the parallel embedding cell; the
# previous name `biobert_embeddings` is not defined anywhere in this notebook.
biobert_embeddings_oov[0][0].shape

torch.Size([768])

In [25]:
# First drug name in iteration order (materialised to a list so it can be indexed).
list(unique_drug_names)[0]

'promacot-injection-intravenous'

In [28]:
# Number of vectors word_vector returns for the first drug name
# (presumably one per token after out-of-vocabulary splitting -- TODO confirm).
len(biobert.word_vector(list(unique_drug_names)[0]))

5

In [33]:
# Shape of the vector at index 4 (the fifth one) returned for the first drug name.
biobert.word_vector(list(unique_drug_names)[0])[4].shape

torch.Size([768])

In [21]:
# Number of vectors stored for the first drug name (expected: 5).
# Uses `biobert_embeddings_oov`, the result of the parallel embedding cell; the
# previous name `biobert_embeddings` is not defined anywhere in this notebook.
len(biobert_embeddings_oov[0])

5

In [None]:
# Summarise instead of dumping every tensor: total number of embedded drug names,
# and how many vectors each of the first five names produced.
# Uses `biobert_embeddings_oov`; `biobert_embeddings` is not defined in this notebook.
len(biobert_embeddings_oov), [len(vectors) for vectors in biobert_embeddings_oov[:5]]

## Evaluations

Top-K similarities

In [None]:
import faiss
import numpy as np

# Top-k nearest neighbours between drug-name embeddings, ranked by cosine similarity.
#
# Input: `biobert_embeddings_oov` from the parallel embedding cell -- one entry per drug
# name, each entry being a list of per-token vectors.
# NOTE(review): assumes every vector is a torch tensor of the same length (768 in the
# cells above) -- confirm for the full data set.

# Number of nearest neighbours to report per drug (the drug itself is excluded).
k = 5

# FAISS needs a dense (n_drugs, dim) float32 matrix, but each drug has a variable number
# of token vectors. Collapse them to one vector per drug by averaging.
# NOTE(review): mean pooling is one reasonable choice, not the only one -- revisit if
# another pooling strategy suits the evaluation better.
pooled_vectors = []
for position, token_vectors in enumerate(biobert_embeddings_oov):
    if len(token_vectors) == 0:
        raise ValueError(f"No token vectors for drug at position {position}; cannot pool an empty list")
    token_matrix = np.stack([vector.detach().cpu().numpy() for vector in token_vectors])
    pooled_vectors.append(token_matrix.mean(axis=0))

# normalize_L2 works in place and requires a C-contiguous float32 array.
biobert_embeddings_np = np.ascontiguousarray(np.stack(pooled_vectors), dtype=np.float32)

# IndexFlatIP ranks by raw inner product; that equals cosine similarity only for
# unit-length vectors, so normalise every row first.
faiss.normalize_L2(biobert_embeddings_np)

# Take the dimensionality from the data instead of hard-coding it.
index = faiss.IndexFlatIP(biobert_embeddings_np.shape[1])
index.add(biobert_embeddings_np)

# Ask for one extra neighbour because each vector normally matches itself first;
# never request more neighbours than there are vectors in the index.
num_to_fetch = min(k + 1, index.ntotal)

# D: similarities to the retrieved neighbours, I: their row indices (both include self).
D, I = index.search(biobert_embeddings_np, num_to_fetch)

for i in range(biobert_embeddings_np.shape[0]):
    # Drop the query row by index rather than by position: with duplicate embeddings
    # the self-match is not guaranteed to be ranked first.
    neighbours = [
        (neighbour_index, similarity)
        for neighbour_index, similarity in zip(I[i], D[i])
        if neighbour_index != i
    ][:k]

    # Print the top-k nearest neighbours for the current embedding
    print(f"Embedding {i+1}:")
    for j, (neighbour_index, similarity) in enumerate(neighbours):
        print(f"Nearest Neighbor {j+1}: Index {neighbour_index}, Similarity {similarity}")
    print()


## TODO try ClinicalBERT https://github.com/EmilyAlsentzer/clinicalBERT

In [8]:
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


## TODO try PubChem Embeddings, DrugBERT