# PubMedBERT Embeddings

In [4]:
import os
from pathlib import Path
import sys
# Put the node-specific virtual environment's site-packages at the front of
# sys.path so its packages are the ones imported by this kernel.
# BB_CPU is read from the environment; node_type is None when it is unset.
node_type = os.environ.get('BB_CPU')
venv_dir = f'/rds/homes/t/talayag/talayag-agt-computations/opl_analysis/opl-venv-{node_type}'  # edit this line to match the venv directory format
venv_site_pkgs = Path(venv_dir).joinpath(
    'lib',
    f'python{sys.version_info.major}.{sys.version_info.minor}',
    'site-packages',
)
if not venv_site_pkgs.exists():
    # Nothing is added to sys.path in this case; only a hint is printed.
    print(f"Path '{venv_site_pkgs}' not found. Check that it exists and/or that it exists for node-type '{node_type}'.")
else:
    sys.path.insert(0, str(venv_site_pkgs))

In [5]:
import pandas as pd 

import transformers

from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Article corpus; the describe() output further down shows 'title' and 'abstract' columns.
articles = pd.read_csv(
    '../data/bq-results-20240801-141038-1722521480060.csv',
)
# skiprows=1 discards the first line of the file before the header row is parsed.
question_titles = pd.read_csv(
    "../data/selected-open-problems1.csv",
    skiprows=1,
)


In [7]:
# Summarise both input frames. Each print() call joins its arguments with
# newlines, so the text written is the same as one print() per item.
print(
    "Article data frame description",
    articles.describe(),
    f"Columns: {articles.columns}",
    "------------------------",
    sep="\n",
)
print(
    "Open problems data frame description",
    question_titles.describe(),
    f"Columns: {question_titles.columns}",
    sep="\n",
)

Article data frame description
                               title                abstract
count                         389627                  340932
unique                        254697                  339505
top     Hypertension in the elderly.  Cross-sectional study.
freq                              19                     154
Columns: Index(['title', 'abstract'], dtype='object')
------------------------
Open problems data frame description
       Scores
count   189.0
mean      4.0
std       0.0
min       4.0
25%       4.0
50%       4.0
75%       4.0
max       4.0
Columns: Index(['Titles', 'Scores'], dtype='object')


# Pre-processing

In [8]:
# Pre processing functions
def strip_columns(text):
    """Clean one title/abstract cell.

    Removes surrounding whitespace, then any leading/trailing square brackets,
    then every remaining ``]`` anywhere in the string (so ``"[Title]."``
    becomes ``"Title."``). An interior ``[`` is left in place.

    Parameters
    ----------
    text : str or missing value
        The cell value. Non-string values (e.g. the NaN/None that
        ``Series.apply`` passes for missing cells) are returned unchanged.

    Returns
    -------
    The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        # Missing cells are floats (NaN) or None and have no .strip();
        # pass them through instead of raising AttributeError.
        return text
    trimmed = text.strip().strip("[]")
    return trimmed.replace("]", "")

In [9]:
# Test data
# Small cleaned samples taken from the top of the articles frame.
first_ten_sentences = articles['title'].head(10).apply(strip_columns)
first_ten_abstracts = articles['abstract'].head(10).apply(strip_columns)
first_100_abstracts = articles['abstract'].head(100).apply(strip_columns)

# One-row Series holding the first open-problem title (no cleaning applied).
first_open_problem = question_titles['Titles'].head(1)

In [14]:
# For use
open_problems = question_titles

# Drop rows with a missing title or abstract.
articles_abstracts = articles.dropna()
print(articles_abstracts.describe())

# Strip the titles and abstracts.
# assign() builds a new frame instead of writing through .loc into the result
# of dropna() (which can trigger SettingWithCopyWarning), and the chained
# reset_index() replaces the in-place variant so re-running the cell is safe.
articles_abstracts = (
    articles_abstracts
    .assign(
        title=lambda frame: frame["title"].apply(strip_columns),
        abstract=lambda frame: frame["abstract"].apply(strip_columns),
    )
    .reset_index(drop=True)
)

articles_abstracts.head()

                                                    title  \
count                                              340916   
unique                                             207123   
top     Early life determinants of physical activity i...   
freq                                                   14   

                      abstract  
count                   340916  
unique                  339489  
top     Cross-sectional study.  
freq                       154  


Unnamed: 0,title,abstract
0,Effect of nootropil on the ultrastructure of t...,The brain cortex was studied in 7 Wistar rats ...
1,Effect of thyroid hormones and diacylglycerols...,Sphingomyelin metabolism in liver cell nuclei ...
2,Age-related frailty and its association with b...,The relationship between age-related frailty a...
3,Age-related frailty and its association with b...,We analyzed baseline data and up to 7-year mor...
4,Age-related frailty and its association with b...,"The mean FI-B was 0.35 (SD, 0.08), higher than..."


In [16]:
# Add a combined text column: the title, one space, then the abstract.
articles_abstracts["title_abstract"] = (
    articles_abstracts["title"] + " " + articles_abstracts["abstract"]
)

### Test for semantic text similarity 

In [17]:
def get_embeddings(queries, corpus, model, convert_to_tensor=False):
    """Encode the queries and the corpus with the same model.

    Parameters
    ----------
    queries : texts passed to ``model.encode`` first.
    corpus : texts passed to ``model.encode`` second.
    model : object exposing ``encode(texts, convert_to_tensor=...)``.
    convert_to_tensor : forwarded unchanged to both ``encode`` calls.

    Returns
    -------
    tuple
        ``(query_embeddings, corpus_embeddings)`` exactly as returned by
        ``model.encode``.
    """
    # The generator is consumed left to right, so queries are encoded before the corpus.
    return tuple(
        model.encode(texts, convert_to_tensor=convert_to_tensor)
        for texts in (queries, corpus)
    )

In [18]:
import warnings
# Ignore every warning from this point on so the cell outputs stay readable.
warnings.filterwarnings(action='ignore')

# Load the sentence-embedding model by its identifier.
model = SentenceTransformer("neuml/pubmedbert-base-embeddings")

# Embed the single open problem (query) and the ten sample titles (corpus).
query_embeddings, corpus_embeddings = get_embeddings(
    queries=first_open_problem,
    corpus=first_ten_sentences,
    model=model,
)

In [19]:
# Compute similarities between the query embeddings and the corpus (title)
# embeddings produced by get_embeddings(). The result is read in the next cell
# as similarities[query_index][corpus_index].
similarities = model.similarity(query_embeddings, corpus_embeddings)

In [20]:
# Print each open problem followed by its score against every sample title.
for row, problem in enumerate(first_open_problem):
    print(problem)
    problem_scores = similarities[row]
    for col, title in enumerate(first_ten_sentences):
        # Titles are left-aligned and padded to at least 30 characters.
        print(f" - {title: <30}: {problem_scores[col]:.4f}")


Which changes in model organisms associated with ageing also change in a similar way in humans?
 - Effect of nootropil on the ultrastructure of the cerebral cortex of the aged rat.: 0.1021
 - Effect of thyroid hormones and diacylglycerols on sphingomyelin metabolism in liver cell nuclei in rats of various ages.: 0.1216
 - Age-related frailty and its association with biological markers of ageing.: 0.4401
 - Age-related frailty and its association with biological markers of ageing.: 0.4401
 - Age-related frailty and its association with biological markers of ageing.: 0.4401
 - Age-related frailty and its association with biological markers of ageing.: 0.4401
 - Forensic neuropsychology in the aging and the dementias.: 0.1433
 - Broadening the definition of brain insulin resistance in aging and Alzheimer's disease.: 0.2138
 - Participation of the brain serotoninergic system in creating the stress reactivity of the hypophyseal-adrenal axis.: 0.0207
 - Effects of perfluorooctanoic acid (PFO

## Test for semantic search

In [21]:
# Test data - use abstracts 
corpus = model.encode(first_100_abstracts)

similarity_scores = util.semantic_search(query_embeddings, corpus_embeddings, top_k=10)

In [22]:
print(f"Query: {first_open_problem}")
for hit in similarity_scores[0]:
    print(first_ten_abstracts[hit['corpus_id']])
    print(f"\t Score: {hit['score']}")
    print(f"-----------------------------------------------")
        


Query: 0    Which changes in model organisms associated wi...
Name: Titles, dtype: object
Many biological processes are implicated in ageing. The systemic effects of these processes can be elucidated using the frailty index approach, which showed here that subclinical deficits increased the risk of death. In the future, blood biomarkers may indicate the nature of the underlying causal deficits leading to age-related frailty, thereby helping to expose targets for early preventative interventions.
	 Score: 0.44008374214172363
-----------------------------------------------
The relationship between age-related frailty and the underlying processes that drive changes in health is currently unclear. Considered individually, most blood biomarkers show only weak relationships with frailty and ageing. Here, we examined whether a biomarker-based frailty index (FI-B) allowed examination of their collective effect in predicting mortality compared with individual biomarkers, a clinical deficits fra