In [2]:
!pip install datasets faiss-cpu transformers tensorflow gensim

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m49.7 MB/s[0m eta [36m0:00:

In [20]:
from datasets import load_dataset
from gensim.utils import simple_preprocess

# Load the PubMedQA dataset
ds = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Extract questions and associated abstracts for embedding
questions = ds['train']['question']
abstracts = ds['train']['context']
print(f"Loaded {len(abstracts)} abstracts from PubMedQA.")

# Extract 'contexts' field from each abstract and preprocess it
def preprocess_text(text):
    # Use gensim's simple_preprocess to tokenize and clean text
    tokens = simple_preprocess(text, deacc=True)  # deacc=True removes punctuation
    return ' '.join(tokens)

# Apply preprocessing to each abstract
cleaned_abstracts = []
for abstract in abstracts:
    # Concatenate all sections within 'contexts' into a single string
    full_text = ' '.join(abstract['contexts'])
    cleaned_text = preprocess_text(full_text)
    cleaned_abstracts.append(cleaned_text)

print("Text preprocessing completed.")


Loaded 1000 abstracts from PubMedQA.
Text preprocessing completed.


**Using Pytorch instead of Tensor Flow**

In [24]:
print(cleaned_abstracts[0])

programmed cell death pcd is the regulated death of cells within an organism the lace plant aponogeton produces perforations in its leaves through pcd the leaves of the plant consist of latticework of longitudinal and transverse veins enclosing areoles pcd occurs in the cells at the center of these areoles and progresses outwards stopping approximately five cells from the vasculature the role of mitochondria during pcd has been recognized in animals however it has been less studied during pcd in plants the following paper elucidates the role of mitochondrial dynamics during developmentally regulated pcd in vivo in single areole within window stage leaf pcd is occurring was divided into three areas based on the progression of pcd cells that will not undergo pcd npcd cells in early stages of pcd epcd and cells in late stages of pcd lpcd window stage leaves were stained with the mitochondrial dye mitotracker red cmxros and examined mitochondrial dynamics were delineated into four categori

In [25]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Set PyTorch to use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(device)

# Function to generate embeddings with GPU in batches
def get_embeddings_in_batches(texts, batch_size=16):
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize and move to device
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        # Generate embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling

        # Move embeddings to CPU and convert to numpy
        all_embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(all_embeddings)

# Example usage (replace `cleaned_abstracts` with your actual list of texts)
embeddings = get_embeddings_in_batches(cleaned_abstracts)
print(f"Generated embeddings for {len(embeddings)} abstracts.")


Using device: cuda
Generated embeddings for 1000 abstracts.


In [26]:
print(embeddings[0])

[-1.29553214e-01 -5.12143932e-02 -6.25073314e-02 -1.27265463e-02
  8.29689577e-02  5.34390472e-02 -1.62546441e-01  6.00047670e-02
  2.02693716e-01  1.42728195e-01  2.08357483e-01  9.10859406e-02
  1.57323837e-01 -1.49894938e-01 -1.45714819e-01 -3.24348882e-02
 -1.80800676e-01  2.49275878e-01 -1.49214849e-01  1.07877396e-01
  1.26826867e-01 -1.46040646e-02  7.03575090e-02 -1.92708597e-01
 -4.04870287e-02 -1.51772290e-01 -7.09554553e-02 -9.22578201e-02
 -3.17166485e-02  7.44923204e-02 -1.67982325e-01  6.94317371e-03
 -2.61996329e-01  1.10196151e-01 -5.60705177e-02  2.45159760e-01
  9.52953547e-02 -1.54210076e-01 -2.17150912e-01 -1.53024301e-01
  9.63688493e-02  1.71286672e-01 -8.89786556e-02  4.85177487e-02
  6.05768785e-02  5.91825694e-02 -3.97821665e-02 -6.13752045e-02
 -9.46382433e-02 -1.16104573e-01  4.74357158e-02 -4.59192842e-02
  1.00484267e-02 -4.90188152e-02 -1.35633767e-01  1.32354824e-02
  6.00755140e-02 -3.69408019e-02  3.73073593e-02 -9.09404308e-02
 -8.97273049e-02  3.24514

In [18]:
##Using Pytorch instead of Tensorflow##
# from transformers import TFAutoModel, AutoTokenizer
# import tensorflow as tf
# import numpy as np

# # Set TensorFlow to use GPU if available
# device_name = tf.config.experimental.list_logical_devices('GPU')
# print("Using device:", "GPU" if device_name else "CPU")

# # Load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# model = TFAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# # Function to generate embeddings with GPU in batches
# def get_embeddings_in_batches(texts, batch_size=16):
#     all_embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i:i + batch_size]

#         # Tokenize and process each batch
#         inputs = tokenizer(batch_texts, return_tensors="tf", padding=True, truncation=True, max_length=512)

#         # Move computation to GPU if available
#         with tf.device('/GPU:0' if device_name else '/CPU:0'):
#             outputs = model(inputs).last_hidden_state
#             batch_embeddings = tf.reduce_mean(outputs, axis=1)  # Mean pooling

#         # Collect batch embeddings
#         all_embeddings.extend(batch_embeddings.numpy())

#     return np.array(all_embeddings)

# # Generate embeddings for all abstracts
# embeddings = get_embeddings_in_batches(cleaned_abstracts)
# print(f"Generated embeddings for {len(embeddings)} abstracts.")


Using device: GPU


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


TypeError: 'builtins.safe_open' object is not iterable

In [27]:
import faiss

# Define the dimension of embeddings
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # L2 similarity for retrieval

# Ensure embedding_matrix has the correct shape for FAISS
embedding_matrix = embeddings.reshape(-1, embedding_dim).astype('float32')

# Add embeddings to FAISS index
index.add(embedding_matrix)
print(f"Total embeddings indexed: {index.ntotal}")


Total embeddings indexed: 1000


In [31]:
def retrieve_similar_abstracts(query, top_k=5):
    # Embed the query
    query_embedding = get_embeddings_in_batches([query])

    # Search FAISS for similar documents
    distances, indices = index.search(query_embedding.reshape(1, -1), top_k)

    # Retrieve and print the top similar abstracts
    results = []
    for idx in indices[0]:
        results.append(cleaned_abstracts[idx])
    return results

# Test the function with a sample question
sample_query = "Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?"
similar_abstracts = retrieve_similar_abstracts(sample_query)

print("Top similar abstracts:")
for i, abstract in enumerate(similar_abstracts, 1):
    print(f"{i}. {abstract}")


Top similar abstracts:
1. the technique of induced sputum has allowed to subdivide asthma patients into inflammatory phenotypes according to their level of granulocyte airway infiltration there are very few studies which looked at detailed sputum and blood cell counts in large cohort of asthmatics divided into inflammatory phenotypes the purpose of this study was to analyze sputum cell counts blood leukocytes and systemic inflammatory markers in these phenotypes and investigate how those groups compared with healthy subjects we conducted retrospective cross sectional study on asthmatics recruited from the university asthma clinic of liege and compared them with healthy subjects asthmatics were classified into inflammatory phenotypes the total non squamous cell count per gram of sputum was greater in mixed granulocytic and neutrophilic phenotypes as compared to eosinophilic asthma and healthy subjects sputum eosinophils in absolute values and percentages were increased in all asthma phe

**TEST**

In [33]:
print(questions[0])
print(abstracts[0])

Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
{'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in lat

In [34]:
# Test the function with a sample question
sample_query = "Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?"
similar_abstracts = retrieve_similar_abstracts(sample_query)

print("Top similar abstracts:")
for i, abstract in enumerate(similar_abstracts, 1):
    print(f"{i}. {abstract}")

Top similar abstracts:
1. programmed cell death pcd is the regulated death of cells within an organism the lace plant aponogeton produces perforations in its leaves through pcd the leaves of the plant consist of latticework of longitudinal and transverse veins enclosing areoles pcd occurs in the cells at the center of these areoles and progresses outwards stopping approximately five cells from the vasculature the role of mitochondria during pcd has been recognized in animals however it has been less studied during pcd in plants the following paper elucidates the role of mitochondrial dynamics during developmentally regulated pcd in vivo in single areole within window stage leaf pcd is occurring was divided into three areas based on the progression of pcd cells that will not undergo pcd npcd cells in early stages of pcd epcd and cells in late stages of pcd lpcd window stage leaves were stained with the mitochondrial dye mitotracker red cmxros and examined mitochondrial dynamics were del