## Semantic Textual Similarity

In [1]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm
2025-05-06 21:04:39.683554: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 21:04:39.694340: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746536679.709001 2772682 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746536679.713472 2772682 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746536679.724898 2772682 computation_placer.cc:177] computation placer already r

#### Load Dataset

In [2]:
biosses_dataset = load_dataset('bigbio/biosses', trust_remote_code = True)

biosses_concatenated = concatenate_datasets([
    biosses_dataset['train'],
    biosses_dataset['validation'],
    biosses_dataset['test']
])

#### Convert to Pandas and Clean

In [3]:
# Concatenate Train, Validation and Test Set
biosses = biosses_concatenated.to_pandas()

# Create Mean Label across all 5 annotations
biosses['mean_label'] = biosses.iloc[:, 4:-1].mean(axis = 1)

# Normalised Mean Label for Cosine Similarity
biosses['mean_label'] = biosses['mean_label'] / 4
biosses.head()

Unnamed: 0,id,document_id,text_1,text_2,annotator_a,annotator_b,annotator_c,annotator_d,annotator_e,mean_label
0,0,1,It has recently been shown that Craf is essent...,It has recently become evident that Craf is es...,4,4,4,4,4,1.0
1,2,3,Previous studies demonstrated that the decreas...,"In addition, genetic and functional studies su...",2,2,3,2,2,0.5625
2,3,4,"More recently, IDH mutations and resultant 2-h...",It has also been recently reported that mutati...,3,3,4,3,3,0.8125
3,4,5,Recent in vitro studies using shRNA-based appr...,Two recent studies used RNAi-mediated Tet2 kno...,3,3,4,3,3,0.8125
4,5,6,"Recently, it was reported that expression of I...",This large-scale study showed that IDH1/IDH2 m...,3,3,4,3,3,0.8125


#### Load ClinicalBERT and BERT

In [4]:
from transformers import AutoTokenizer, AutoModel

ClinicalBERT = 'medicalai/ClinicalBERT'
BERT = 'google-bert/bert-base-uncased'
BioBERT = 'emilyalsentzer/Bio_ClinicalBERT'
BlueBERT = 'bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12'

tokenizer_ClinicalBERT = AutoTokenizer.from_pretrained(ClinicalBERT)
model_ClinicalBERT = AutoModel.from_pretrained(ClinicalBERT)

tokenizer_BERT = AutoTokenizer.from_pretrained(BERT)
model_BERT = AutoModel.from_pretrained(BERT)

tokenizer_BioBERT = AutoTokenizer.from_pretrained(BioBERT)
model_BioBERT = AutoModel.from_pretrained(BioBERT)

tokenizer_BlueBERT = AutoTokenizer.from_pretrained(BlueBERT)
model_BlueBERT = AutoModel.from_pretrained(BlueBERT)

#### Load Setence Encoder

In [31]:
sentence_model_1 = SentenceTransformer('all-MiniLM-L6-v2')
sentence_model_2 = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
sentence_model_3 = SentenceTransformer('neuml/pubmedbert-base-embeddings')
sentence_model_4 = SentenceTransformer('nuvocare/WikiMedical_sent_biobert')
sentence_model_5 = SentenceTransformer('ls-da3m0ns/bge_large_medical')

In [32]:
def sentence_sim(model, row):
    input1 = model.encode(row['text_1'])
    input2 = model.encode(row['text_2'])
    return model.similarity(input1, input2)[0][0].item()

In [33]:
def max_pooling(inputs, model):
    with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            
    attention_mask = inputs['attention_mask']
    mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())

    last_hidden_state[mask_expanded == 0] = -1e9
    max_embedding = torch.max(last_hidden_state, dim = 1).values

    return max_embedding

In [34]:
def mean_pooling(inputs, model):
    with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            
    attention_mask = inputs['attention_mask']
    mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()

    sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
    sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
    
    mean_pooling = sum_embeddings / sum_mask  # shape: [batch_size, hidden_dim]

    return mean_pooling


In [35]:
def cosine_similarity(tokenizer, model, row, pooling = 'mean'):
    input1 = tokenizer(row['text_1'], return_tensors = 'pt', padding = False)
    input2 = tokenizer(row['text_2'], return_tensors = 'pt', padding = False)

    if pooling == 'mean':
        embeddings = [mean_pooling(inputs, model) for inputs in [input1, input2]]
    elif pooling == 'max':
        embeddings = [max_pooling(inputs, model) for inputs in [input1, input2]]

    cos_sim = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1])

    return cos_sim.item()

        



In [36]:
# MEAN POOLING
biosses['ClinicalBERT_SIM'] = biosses.apply(lambda row: cosine_similarity(tokenizer_ClinicalBERT, model_ClinicalBERT, row), axis = 1)
biosses['BioBERT_SIM'] = biosses.apply(lambda row: cosine_similarity(tokenizer_BioBERT, model_BioBERT, row), axis = 1)
biosses['BlueBERT_SIM'] = biosses.apply(lambda row: cosine_similarity(tokenizer_BlueBERT, model_BlueBERT, row), axis = 1)
biosses['BERT_SIM'] = biosses.apply(lambda row: cosine_similarity(tokenizer_BERT, model_BERT, row), axis = 1)
biosses['Sentence_1_SIM'] = biosses.apply(lambda row: sentence_sim(sentence_model_1, row), axis = 1)
biosses['Sentence_2_SIM'] = biosses.apply(lambda row: sentence_sim(sentence_model_2, row), axis = 1)
biosses['Sentence_3_SIM'] = biosses.apply(lambda row: sentence_sim(sentence_model_3, row), axis = 1)
biosses['Sentence_4_SIM'] = biosses.apply(lambda row: sentence_sim(sentence_model_4, row), axis = 1)
biosses['Sentence_5_SIM'] = biosses.apply(lambda row: sentence_sim(sentence_model_5, row), axis = 1)

In [77]:
# MAX POOLING
biosses['ClinicalBERT_SIM_MAX'] = biosses.apply(lambda row: cosine_similarity(tokenizer_ClinicalBERT, model_ClinicalBERT, row, 'max'), axis = 1)
biosses['BioBERT_SIM_MAX'] = biosses.apply(lambda row: cosine_similarity(tokenizer_BioBERT, model_BioBERT, row, 'max'), axis = 1)
biosses['BlueBERT_SIM_MAX'] = biosses.apply(lambda row: cosine_similarity(tokenizer_BlueBERT, model_BlueBERT, row, 'max'), axis = 1)
biosses['BERT_SIM_MAX'] = biosses.apply(lambda row: cosine_similarity(tokenizer_BERT, model_BERT, row, 'max'), axis = 1)

In [37]:
biosses.to_csv('Saved Data/biosses_data.csv')

In [38]:
biosses.head()

Unnamed: 0,id,document_id,text_1,text_2,annotator_a,annotator_b,annotator_c,annotator_d,annotator_e,mean_label,ClinicalBERT_SIM,BioBERT_SIM,BlueBERT_SIM,BERT_SIM,Sentence_1_SIM,Sentence_2_SIM,Sentence_3_SIM,Sentence_4_SIM,Sentence_5_SIM
0,0,1,It has recently been shown that Craf is essent...,It has recently become evident that Craf is es...,4,4,4,4,4,1.0,0.82524,0.921591,0.941057,0.849393,0.66336,0.980989,0.827147,0.776582,0.923207
1,2,3,Previous studies demonstrated that the decreas...,"In addition, genetic and functional studies su...",2,2,3,2,2,0.5625,0.80517,0.930695,0.911174,0.819069,0.494428,0.936641,0.563373,0.659069,0.669502
2,3,4,"More recently, IDH mutations and resultant 2-h...",It has also been recently reported that mutati...,3,3,4,3,3,0.8125,0.926234,0.972272,0.950067,0.946986,0.780003,0.981489,0.809556,0.775908,0.888338
3,4,5,Recent in vitro studies using shRNA-based appr...,Two recent studies used RNAi-mediated Tet2 kno...,3,3,4,3,3,0.8125,0.888762,0.966696,0.954745,0.934582,0.766122,0.973089,0.745908,0.955495,0.880643
4,5,6,"Recently, it was reported that expression of I...",This large-scale study showed that IDH1/IDH2 m...,3,3,4,3,3,0.8125,0.907551,0.940687,0.947424,0.917922,0.788279,0.961574,0.755442,0.714217,0.661724
