In [14]:
from biobert_embedding.embedding import BiobertEmbedding
from utils import graph_tools
from multiprocessing import Pool

In [35]:
# Number of worker processes shared by the parallel embedding cells further down.
num_processes = 10
pool = Pool(processes=num_processes)

## The original `pip install biobert-embedding` package does not work (it is too old)

I forked the repo. Install using `pip install git+https://github.com/ariellubonja/biobert_embedding`

In [3]:
## Example 1
# Sample biomedical sentence reused by the cells below.
text = (
    "Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis."
)

# Create the embedder with no arguments. To use your own fine-tuned BERT model,
# pass its location via `model_path` (which defaults to None).
biobert = BiobertEmbedding()

Using existing models/pytorch_model.bin
Using existing models/config.json
Using existing models/vocab.txt


#### Basic example to make sure it works

In [4]:
# Token-level and sentence-level embeddings for the sample sentence.
word_embeddings = biobert.word_vector(text)
sentence_embedding = biobert.sentence_vector(text)

# Tokens as exposed by the embedder's `tokens` attribute.
print("Text Tokens: ", biobert.tokens)
# Text Tokens:  ['breast', 'cancers', 'with', 'her2', 'amplification', 'have', 'a', 'higher', 'risk', 'of', 'cns', 'metastasis', 'and', 'poorer', 'prognosis', '.']

# One vector per token; every vector has the same width.
token_count = len(word_embeddings)
vector_width = len(word_embeddings[0])
print ('Shape of Word Embeddings: %d x %d' % (token_count, vector_width))
# Shape of Word Embeddings: 16 x 768

sentence_width = len(sentence_embedding)
print("Shape of Sentence Embedding = ",sentence_width)
# Shape of Sentence Embedding =  768

## Example 2
# Embed two near-identical sentences; the next cell compares them.
first_sentence = 'Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis.'
second_sentence = 'Breast cancers with HER2 amplification are more aggressive, have a higher risk of CNS metastasis, and poorer prognosis.'
sentence_vector1 = biobert.sentence_vector(first_sentence)
sentence_vector2 = biobert.sentence_vector(second_sentence)

Text Tokens:  ['breast', 'cancers', 'with', 'her2', 'amplification', 'have', 'a', 'higher', 'risk', 'of', 'cns', 'metastasis', 'and', 'poorer', 'prognosis', '.']
Shape of Word Embeddings: 16 x 768
Shape of Sentence Embedding =  768


In [5]:
from scipy.spatial.distance import cosine as cosine_distance

# SciPy returns the cosine *distance*; similarity is its complement.
sentence_distance = cosine_distance(sentence_vector1, sentence_vector2)
cosine_sim = 1 - sentence_distance
print('cosine similarity:', cosine_sim)
#cosine similarity: 0.992756187915802

cosine similarity: 0.9927560091018677


In [6]:
# Show the signature of `word_vector`; the output below lists its `handle_oov`
# and `filter_extra_tokens` keyword arguments, both defaulting to True.
help(biobert.word_vector)

Help on method word_vector in module biobert_embedding.embedding:

word_vector(text, handle_oov=True, filter_extra_tokens=True) method of biobert_embedding.embedding.BiobertEmbedding instance



### Word Embeddings of Drugs.com drug names (without supporting text)

Find the drug names I've crawled [here](https://drive.google.com/drive/folders/1EO659a-tyjfXjKzHk-M0WreZBR14MgRM?usp=sharing)

In [50]:
# Materialise the drug names as a list so later cells can look them up by position.
# NOTE(review): if get_unique_drugs() returns an unordered collection, positions
# may differ between runs - confirm, and consider sorting.
unique_drug_names = [*graph_tools.get_unique_drugs()]

<font color="orange">The cell below will try to handle out-of-vocabulary (OOV) terms. As a consequence, the result is not a single rectangular embedding matrix but a list of Tensors.</font>

In [16]:
# Slow cell: embeds every drug name.

# Single-process equivalent, kept for reference:
# biobert_embeddings_oov = list(map(biobert.word_vector, unique_drug_names))

# Parallel version. Out-of-vocabulary names are split into smaller pieces,
# so a single name may end up with several embeddings.
biobert_embeddings_oov = pool.map(func=biobert.word_vector, iterable=unique_drug_names)

# No further work for this pool: stop accepting tasks, then wait for the workers to exit.
pool.close()
pool.join()

In [37]:
import functools

# Same embedding call as before, but with out-of-vocabulary handling switched off.
word_vector_no_oov = functools.partial(biobert.word_vector, handle_oov=False)

# The shared `pool` is closed and joined by the previous embedding cell, and a
# closed pool rejects new work ("ValueError: Pool not running"). Use a fresh
# pool scoped to this cell so the notebook runs top to bottom on a clean
# kernel. The `with` block shuts the workers down once the results are back.
with Pool(num_processes) as no_oov_pool:
    biobert_embeddings_no_oov = no_oov_pool.map(word_vector_no_oov, unique_drug_names)

In [41]:
# Inspect the container type of the first drug's result (a plain list, per the output below).
type(biobert_embeddings_no_oov[0])

list

In [43]:
# Number of embeddings returned for the first drug name.
len(biobert_embeddings_no_oov[0])

10

In [47]:
# Each drug name maps to several embeddings; record how many each one got.
each_drug_embedding_len = [len(drug_vectors) for drug_vectors in biobert_embeddings_no_oov]

In [48]:
# Print all per-drug embedding counts on one space-separated line.
counts_line = " ".join(str(count) for count in each_drug_embedding_len)
print(counts_line)

10 2 3 5 5 15 8 6 13 8 6 19 11 8 6 5 6 3 3 7 4 6 8 17 4 9 11 2 6 6 7 9 10 6 3 8 8 3 12 12 5 7 11 7 10 4 5 8 7 3 3 4 3 10 4 5 6 3 4 11 15 6 7 4 4 3 4 19 4 5 10 11 8 8 5 9 4 3 15 4 8 4 8 11 6 8 3 5 2 6 10 3 5 4 11 10 5 5 5 8 7 9 2 6 7 6 11 7 3 8 9 5 10 9 3 9 10 7 7 3 3 3 19 5 10 6 4 5 9 11 7 4 5 7 4 24 2 5 19 6 10 9 2 4 6 2 3 4 5 4 3 6 7 6 4 6 9 8 6 4 3 8 11 10 5 9 2 4 4 10 4 11 4 3 13 13 3 8 8 9 4 4 9 8 4 3 6 2 4 16 6 2 4 10 4 7 3 8 7 15 6 6 4 4 4 16 3 3 4 3 9 4 7 8 4 7 3 2 11 10 12 8 5 10 4 2 18 21 17 8 10 5 5 4 10 11 30 14 5 6 2 4 4 12 11 3 5 9 4 9 7 6 13 11 4 2 6 3 2 4 5 11 13 3 12 7 7 5 7 6 2 7 4 4 3 9 8 7 9 3 7 5 2 11 4 4 4 9 7 4 6 6 10 2 5 6 6 5 3 3 7 6 17 4 17 5 5 6 15 4 9 5 19 10 6 5 5 4 6 4 6 7 11 8 8 7 4 13 4 4 5 9 5 10 4 4 7 3 3 5 14 9 7 17 17 1 8 11 8 7 9 8 14 6 5 3 11 3 4 9 6 3 4 10 6 7 2 4 5 8 9 5 7 7 9 5 24 4 3 8 3 3 9 10 8 3 4 4 3 5 3 13 4 3 4 13 5 13 13 7 4 4 4 3 2 7 10 9 6 4 7 3 3 8 3 15 10 7 14 4 7 8 5 4 5 8 7 4 2 6 5 17 3 5 5 11 9 4 16 8 8 5 4 5 4 2 7 3 6 6 7 5 11 3 

In [49]:
# Positions of the drug names that produced exactly one embedding,
# i.e. names that are themselves in the vocabulary.
one_indices = [
    drug_idx
    for drug_idx, drug_vectors in enumerate(biobert_embeddings_no_oov)
    if len(drug_vectors) == 1
]
one_indices

[345, 1604, 1727, 1831, 2232, 2773, 2855, 5264, 5744, 7011, 7081]

In [52]:
# Translate the single-embedding positions back into drug names.
[unique_drug_names[drug_idx] for drug_idx in one_indices]

['same',
 'remedy',
 'posture',
 'had',
 'cope',
 'alert',
 'potassium',
 'garlic',
 'charcoal',
 'zinc',
 'glucose']

<font color="red">Unfortunately, only the drug names above are in the vocabulary (each maps to a single embedding); every other name is out-of-vocabulary.</font>

## Evaluations

Top-K similarities

In [None]:
import faiss
import numpy as np


def _token_vector_to_numpy(token_vector):
    """Return one token embedding as a 1-D float32 numpy array."""
    # Tensors expose detach()/cpu(); lists and arrays go straight through np.asarray.
    if hasattr(token_vector, "detach"):
        token_vector = token_vector.detach().cpu().numpy()
    return np.asarray(token_vector, dtype=np.float32)


# Each drug name has a variable number of embeddings, so the raw results cannot
# be stacked into a matrix directly. Mean-pool them into one vector per name.
# Mean pooling is a modelling choice made here - revisit if another pooling is preferred.
if any(len(token_vectors) == 0 for token_vectors in biobert_embeddings_no_oov):
    raise ValueError("Every drug name needs at least one embedding to be pooled")

biobert_embeddings_np = np.stack([
    np.mean([_token_vector_to_numpy(vec) for vec in token_vectors], axis=0)
    for token_vectors in biobert_embeddings_no_oov
])
# faiss requires a C-contiguous float32 matrix of shape (n_vectors, dim).
biobert_embeddings_np = np.ascontiguousarray(biobert_embeddings_np, dtype=np.float32)

# Inner product equals cosine similarity only for unit-length vectors,
# so L2-normalise the rows (in place) before indexing.
faiss.normalize_L2(biobert_embeddings_np)

# Size the index from the data instead of hard-coding the dimensionality.
index = faiss.IndexFlatIP(biobert_embeddings_np.shape[1])
index.add(biobert_embeddings_np)

# Specify the number of nearest neighbors to retrieve (k)
k = 5

# Ask for k+1 neighbours because each query vector is also in the index.
# D holds the cosine similarities; I holds the matching row indices (self included).
D, I = index.search(biobert_embeddings_np, k + 1)

for i in range(biobert_embeddings_np.shape[0]):
    # Drop the query itself wherever it ranks (ties/duplicates can push it off
    # rank 0) and the -1 padding faiss returns when there are fewer than k+1 vectors.
    neighbors = [
        (neighbor_idx, similarity)
        for neighbor_idx, similarity in zip(I[i], D[i])
        if neighbor_idx != i and neighbor_idx >= 0
    ][:k]

    # Print the top-k nearest neighbors for the current embedding
    print(f"Embedding {i+1}:")
    for j, (neighbor_idx, similarity) in enumerate(neighbors):
        print(f"Nearest Neighbor {j+1}: Index {neighbor_idx}, Similarity {similarity}")
    print()


## TODO: try ClinicalBERT (https://github.com/EmilyAlsentzer/clinicalBERT)

In [8]:
# Not run yet: sketch for loading Bio_ClinicalBERT through Hugging Face
# `transformers`. Uncomment when working on the ClinicalBERT TODO.
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
