# Comparing Embedding Models for Inclusion Criteria

In [13]:
# Connecting to drive
from google.colab import drive
drive.mount('/content/drive')
import sys

# Add the project folder on Drive to the import path. The membership check keeps
# the cell idempotent: re-running it does not append the same entry again.
PROJECT_DIR = '/content/drive/MyDrive/Colab Notebooks/Clinical Trials Outcomes'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Sample clinical-trial criteria texts used as encoder inputs
sentence_1 = "Subjects requiring treatment with systemic corticosteroids (e.g., oral, parenteral, ocular) - Any significant disease or disorder that may jeopardize a subject's safety"
sentence_2 = "Patient has previously received or is receiving an organ transplant other than a liver. - Patient has received an ABO incompatible donor liver. - Patient or donor is known to be HIV positive. - Patient has a current malignancy or a history of malignancy (within the past 5 years), except non-metastatic basal or squamous cell carcinoma of the skin that has been treated successfully. - Patient is being transplanted for hepatic malignancy with a single nodule greater than 5.0 cm in diameter or 2 or more nodules with at least one > 3.0 cm. - Patient has a serum creatinine >175 µmol/L at baseline. Patient has uncontrolled concomitant infections and/or severe diarrhoea, vomiting, active upper gastro-intestinal tract malabsorption or an active peptic ulcer or any other unstable medical condition that could interfere with the study objectives. - Patient who is receiving or may require warfarin or fluvastatin during the study. - Patient is participating in another clinical trial and/or is taking or has been taking an investigational drug in the 28 days prior to transplant"

# Registry of candidate encoders: short display name -> model identifier string
models = dict(
    BERT='google-bert/bert-base-uncased',
    MedBERT='Charangan/MedBERT',
    BioBERT='dmis-lab/biobert-base-cased-v1.2',
    TinyBioBERT='nlpie/tiny-biobert',
    ClinicalBERT='emilyalsentzer/Bio_ClinicalBERT',
    BlueBERT='bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16',
)

# Load TinyBioBERT ('nlpie/tiny-biobert', a smaller version of BioBERT) by
# looking its identifier up in the registry above
model = SentenceTransformer(models['TinyBioBERT'])

def criteria2vec(criteria, encoder=None):
    """Embed criteria text with a sentence encoder.

    Args:
        criteria: Input passed straight through to ``encoder.encode``.
        encoder: Any object exposing ``encode(criteria)``. When omitted, the
            notebook-level ``model`` is used; it is looked up at call time, so
            the default follows whatever ``model`` is currently bound to.

    Returns:
        Whatever ``encoder.encode(criteria)`` returns.
    """
    if encoder is None:
        # Backward-compatible default: same global the original implementation read.
        encoder = model
    return encoder.encode(criteria)

# Embed the first sample text
s1 = criteria2vec(sentence_1)

# Show the embedding's shape and a short preview rather than printing every
# component of the vector, which floods the cell output.
# NOTE(review): assumes the encoder returns a NumPy array (the default for
# SentenceTransformer.encode) — confirm if the encoder is swapped.
print(s1.shape)
print(s1[:5])

In [36]:
from sentence_transformers import SentenceTransformer, util

# Load the BioSimCSE model
# NOTE(review): this rebinds the notebook-level `model`, which is also the
# default encoder read by criteria2vec — confirm that is intended.
model = SentenceTransformer('kamalkraj/BioSimCSE-BioLinkBERT-BASE')

# Define the sentences
sentence_1 = "Subjects requiring treatment with systemic corticosteroids (e.g., oral, parenteral, ocular). Any significant disease or disorder that may jeopardize a subject's safety"
sentence_2 = "Patient has previously received or is receiving an organ transplant other than a liver. - Patient has received an ABO incompatible donor liver. - Patient or donor is known to be HIV positive. - Patient has a current malignancy or a history of malignancy (within the past 5 years), except non-metastatic basal or squamous cell carcinoma of the skin that has been treated successfully. - Patient is being transplanted for hepatic malignancy with a single nodule greater than 5.0 cm in diameter or 2 or more nodules with at least one > 3.0 cm. - Patient has a serum creatinine >175 µmol/L at baseline. Patient has uncontrolled concomitant infections and/or severe diarrhoea, vomiting, active upper gastro-intestinal tract malabsorption or an active peptic ulcer or any other unstable medical condition that could interfere with the study objectives. - Patient who is receiving or may require warfarin or fluvastatin during the study. - Patient is participating in another clinical trial and/or is taking or has been taking an investigational drug in the 28 days prior to transplant"

# Encode the sentences as tensors so they can be fed to the similarity helper
embedding_1 = model.encode(sentence_1, convert_to_tensor=True)
embedding_2 = model.encode(sentence_2, convert_to_tensor=True)

# Calculate cosine similarity.
# The result gets its own name instead of `cosine_similarity`: that name is
# already bound to the scikit-learn function imported at the top of the
# notebook, and rebinding it to a tensor would break any later call to it.
similarity_score = util.pytorch_cos_sim(embedding_1, embedding_2)

print(f"Cosine similarity: {similarity_score.item():.4f}")

Cosine similarity: 0.0804
