In [None]:
import json
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cdist

In [None]:
def load_jsonl_concepts(jsonl_path):
    """
    Read a JSONL concept file and return a list of (text, concept_id) pairs.

    Each non-blank line must be a JSON object with the keys "concept_id" and
    "canonical_name", and optionally "aliases" (a list of strings).

    For every concept the canonical name is emitted first, followed by its
    aliases in file order with duplicates removed. The resulting order is
    deterministic across interpreter runs, which matters when callers map
    row numbers of an embedding matrix or a saved index back to this list
    by position.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list of (text, concept_id) tuples, one per distinct name of each
        concept.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "concept_id" or "canonical_name".
    """
    concepts = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading JSONL"):
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            cui = data["concept_id"]
            # dict.fromkeys de-duplicates while keeping insertion order, unlike
            # a set whose iteration order changes with string hash randomization.
            # `or []` also covers an explicit `"aliases": null`.
            names = dict.fromkeys(
                [data["canonical_name"], *(data.get("aliases") or [])]
            )
            concepts.extend((text, cui) for text in names)
    return concepts


In [None]:
def encode_texts_with_sapbert(
    texts,
    batch_size=128,
    model_name="cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR",
    max_length=25,
    tokenizer=None,
    model=None,
):
    """
    Encode a list of strings with SapBERT and return their embeddings.

    The embedding of a text is the last hidden state at position 0 (the
    first token of the padded sequence).

    Args:
        texts: List of strings to encode.
        batch_size: Number of texts sent through the model per forward pass.
        model_name: Hugging Face checkpoint loaded when `tokenizer` / `model`
            are not supplied.
        max_length: Every text is padded or truncated to this many tokens.
        tokenizer: Optional already-loaded tokenizer. Pass it (with `model`)
            to avoid reloading the checkpoint on every call.
        model: Optional already-loaded model. It is switched to eval mode and
            moved to the GPU when one is available, otherwise to the CPU.

    Returns:
        A NumPy array with one row per input text. For empty `texts` an array
        of shape (0, model.config.hidden_size) is returned.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    if model is None:
        model = AutoModel.from_pretrained(model_name)
    model.eval()  # put model in evaluation mode

    # Run on the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    if len(texts) == 0:
        # np.concatenate rejects an empty list, so return an empty matrix.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding texts"):
            batch_texts = texts[i : i + batch_size]
            toks = tokenizer(
                batch_texts,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )
            # Inputs must live on the same device as the model.
            toks = {key: value.to(device) for key, value in toks.items()}

            output = model(**toks)
            cls_rep = output.last_hidden_state[:, 0, :]  # first-token representation
            # Move the tensor to CPU before converting to NumPy
            all_embeddings.append(cls_rep.cpu().numpy())

    # Concatenate all batches into a single NumPy array
    return np.concatenate(all_embeddings, axis=0)

In [None]:
JSONL_PATH = "/content/output.jsonl"  # <-- Change to your JSONL file path
# Optional cap on how many (text, concept_id) pairs get embedded.
# None keeps every pair; set e.g. 100_000 for a quicker, smaller run.
MAX_CONCEPTS = None

my_concepts = load_jsonl_concepts(JSONL_PATH)
# my_concepts is a list of (text, cui), e.g. [("Neoplasm of abdomen", "C0000735"), ... ]

print(f"Loaded {len(my_concepts)} (text, concept_id) pairs.")
print("Example pairs:", my_concepts[:5])

# Apply the cap. The variable name is historical: with MAX_CONCEPTS = None
# it holds ALL pairs, not the first 100,000.
my_concepts_100k = my_concepts[:MAX_CONCEPTS]

# Separate the texts and IDs. Both lists share the same order, so row i of
# the embedding matrix below belongs to all_names[i] / all_ids[i].
all_names = [p[0] for p in my_concepts_100k]
all_ids = [p[1] for p in my_concepts_100k]

# -------------------------------------------------------------------------
# 2) ENCODE ALL LABELS (TEXTS) WITH SAPBERT
# -------------------------------------------------------------------------
all_reps_emb = encode_texts_with_sapbert(all_names, batch_size=128)
print("Embedding shape:", all_reps_emb.shape)



Reading JSONL: 109255it [00:00, 159294.33it/s]


Loaded 448852 (text, concept_id) pairs.
Example pairs: [('abdomen tumours', 'C0000735'), ('tumor of abdomen', 'C0000735'), ('Tumour of abdomen', 'C0000735'), ('abdomen neoplasm', 'C0000735'), ('abdominal neoplasm', 'C0000735')]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Encoding texts: 100%|██████████| 3507/3507 [09:31<00:00,  6.13it/s]


Embedding shape: (448852, 768)


In [None]:
# -------------------------------------------------------------------------
# 3) ENCODE A QUERY AND FIND NEAREST NEIGHBOR
# -------------------------------------------------------------------------
query = "Asthma in Children"
print(f"\nQuery: {query}")

# Embed the query with the same helper that produced all_reps_emb, so the
# tokenization, padding and pooling are guaranteed to match the corpus.
query_emb = encode_texts_with_sapbert([query], batch_size=1)

# -------------------------------------------------------------------------
# 4) SIMPLE NEAREST NEIGHBOR SEARCH
# -------------------------------------------------------------------------
# dist has shape (1, number_of_concept_texts): one row for the single query.
dist = cdist(query_emb, all_reps_emb, metric="euclidean")
nn_index = int(np.argmin(dist[0]))
print("\n--- Nearest Neighbor Search ---")
print("Nearest concept text:", all_names[nn_index])
print("Nearest concept_id:", all_ids[nn_index])
print("---------------------------------\n")


Query: Asthma in Children

--- Nearest Neighbor Search ---
Nearest concept text: Asthma in Children
Nearest concept_id: C0264408
---------------------------------



## Nearest-neighbor search with FAISS

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
from scipy.spatial.distance import cdist
import faiss

In [None]:
# Embedding dimensionality (number of columns of the embedding matrix).
d = all_reps_emb.shape[1]
index = faiss.IndexFlatL2(d)
# ascontiguousarray only copies when the data is not already a contiguous
# float32 array, whereas astype('float32') always duplicates the matrix.
index.add(np.ascontiguousarray(all_reps_emb, dtype=np.float32))

# NOTE: only the vectors are written here. all_names / all_ids are not saved,
# so a later session needs them rebuilt in the same order to interpret row ids.
faiss.write_index(index, "sapbert_index.faiss")

In [None]:
# -------------------
# 4) Load index & query
# -------------------
index_loaded = faiss.read_index("sapbert_index.faiss")

query = "Asthma Kid"
# Reuse the corpus encoder so the query is embedded exactly like the indexed
# texts; the result is cast to float32 before it is handed to FAISS.
query_emb = encode_texts_with_sapbert([query], batch_size=1).astype("float32")

In [None]:
# -------------------
# 5) Nearest neighbor search
# -------------------
# Ask the loaded index for the single closest stored vector to the query.
k = 1
distances, indices = index_loaded.search(query_emb, k)

# Both result arrays have one row per query and k columns; take the top hit.
best_idx = indices[0, 0]

# NOTE(review): IndexFlatL2 is documented to report squared L2 distances —
# confirm before comparing this value with the cdist result above.
for label, value in (
    ("Query:", query),
    ("Nearest concept text:", all_names[best_idx]),
    ("Nearest concept_id:", all_ids[best_idx]),
    ("Distance:", distances[0, 0]),
):
    print(label, value)

Query: Asthma Kid
Nearest concept text: Asthma in children
Nearest concept_id: C0264408
Distance: 47.910233


In [None]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): presumably the same interpreter the kernel runs on — confirm if it matters.
!python --version


Python 3.11.11
