In [None]:
# Run the following command in a terminal to log in to the Hugging Face Hub:
# huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): ^C
object address  : 0x1047c9fc0
object refcount : 3
object type     : 

In [27]:
import json
import re
from sentence_transformers import SentenceTransformer

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [None]:

# Load the sentence-embedding model; `embedder` is reused by the later cells
# that build the vector store and run searches.
embedder = SentenceTransformer("abhinand/MedEmbed-large-v0.1")

# Smoke test: encode two sample sentences ...
sentences = [
    "Patient shows symptoms of myocardial infarction.",
    "Blood pressure is within normal range."
]
embeddings = embedder.encode(sentences)

# ... then compare the embeddings against themselves and print the shape of
# the resulting similarity matrix.
similarities = embedder.similarity(embeddings, embeddings)
print(similarities.shape)

In [None]:
# Step 1 - build vector store: one record per guideline chunk, holding the
# chunk text, its embedding and the chunk's metadata.

# Read the file as UTF-8 explicitly: the guideline text contains accented
# characters and the platform default encoding is not UTF-8 everywhere.
with open("../data/guideline_db.json", encoding="utf-8") as f:
    db = json.load(f)

# Encode every chunk text in a single batched call instead of one model call
# per chunk; each row of the returned array is the embedding of one text.
chunk_texts = [chunk["text"] for chunk in db]
chunk_embeddings = embedder.encode(chunk_texts) if chunk_texts else []

vector_store = [
    {"text": chunk["text"], "embedding": embedding, "metadata": chunk["metadata"]}
    for chunk, embedding in zip(db, chunk_embeddings)
]

In [22]:
# Load a previously saved vector store from disk. This rebinds `vector_store`,
# replacing whatever an earlier cell may have built in memory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load .emb files that come from a trusted source.
import pickle
with open('../data/guideline.emb', mode='rb') as f: #replace with your file
  vector_store = pickle.load(f)

In [None]:
# Inspect the embedding stored with the first vector-store record:
# print its shape, then display the array itself as the cell output.
arr = vector_store[0]['embedding']

print(arr.shape)
arr


(1024,)


array([ 0.01260174, -0.02737539, -0.06699081, ..., -0.02317602,
       -0.02495784,  0.0003223 ], dtype=float32)

In [30]:
# Display the first record in full to check its keys ('text', 'embedding', 'metadata').
vector_store[0]

{'text': "Canadian Network for Mood and Anxiety Treatments (CANMAT) 2023 Update on Clinical Guidelines for Management of Major Depressive Disorder in Adults: Réseau canadien pour les traitements de l'humeur et de l'anxiété (CANMAT) 2023 : Mise à jour des lignes directrices cliniques pour la prise en charge du trouble dépressif majeur chez les adultes",
 'embedding': array([ 0.01260174, -0.02737539, -0.06699081, ..., -0.02317602,
        -0.02495784,  0.0003223 ], dtype=float32),
 'metadata': {'section': 'title',
  'type': 'title',
  'chunk_index': 1,
  'headings': 'Title',
  'referenced_tables': []}}

In [35]:

def search(embedder, query, vector_store, k, min_similarity):
    """Return the texts of the chunks most similar to ``query``.

    The query is lower-cased and encoded with ``embedder``; every chunk in
    ``vector_store`` is scored by cosine similarity between its stored
    embedding and the query embedding.

    Args:
        embedder: Object exposing ``encode(text)`` that returns a 1-D vector.
        query: Query string.
        vector_store: Sequence of dicts with at least the keys ``"text"`` and
            ``"embedding"``.
        k: Maximum number of texts to return. ``None`` means no limit; zero
            or a negative value yields no matches.
        min_similarity: Chunks scoring below this cosine similarity are
            dropped.

    Returns:
        A list of chunk texts ordered by decreasing similarity (ties keep
        their order in ``vector_store``). If nothing qualifies, or the
        top-ranked text is empty, the single-element list
        ``["No matching documents!"]`` is returned instead.
    """
    no_match = ["No matching documents!"]
    if len(vector_store) == 0 or (k is not None and k <= 0):
        return no_match

    query_vec = np.asarray(embedder.encode(query.lower()), dtype=np.float64).ravel()
    chunk_matrix = np.asarray(
        [np.ravel(chunk["embedding"]) for chunk in vector_store], dtype=np.float64
    )

    # Cosine similarity of every chunk against the query in one matrix product,
    # rather than one pairwise call per chunk.
    norms = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(query_vec)
    # A zero-length vector has no direction: score it 0 instead of dividing by 0.
    norms[norms == 0.0] = 1.0
    scores = (chunk_matrix @ query_vec) / norms

    # Stable sort on the negated scores: best first, ties in store order.
    ranked = np.argsort(-scores, kind="stable")
    top_indices = [int(i) for i in ranked if scores[i] >= min_similarity][:k]
    results = [vector_store[i]["text"] for i in top_indices]

    if not results or not results[0]:
        return no_match
    return results


# Try the search function on a sample clinical question: ask for at most
# 7 passages with a cosine similarity of at least 0.1.
query = "I have a patient who can’t take lithium but is currently acutely manic. Which medication should I try next?"
results = search(embedder, query, vector_store, 7, 0.1)

# Save the retrieved passages as JSON next to the notebook's parent folder.
with open("../results.json", "w") as out_file:
    json.dump(results, out_file, indent=4)

# Echo every passage, numbered from 1 and followed by a blank line.
for rank, passage in enumerate(results, start=1):
    print(f"Result {rank}:")
    print(f"Text: {passage}")
    print()

Result 1:
Text: From section: Question 7. What Should be Done When a Patient is not Better? > Q.7.g. What are the Risks and Benefits of Specific Adjunctive Medications? > Other Medications > paragraph id: 219
Lithium and triiodothyronine were listed as second-line adjunctive agents in the CANMAT (Canadian Network for Mood and Anxiety Treatments) 2016 guidelines. Both medications were included in a network meta-analysis evaluating all adjunctive options. While both lithium and triiodothyronine had evidence for efficacy, the included RCTs predated 2003, had small sample sizes and involved adjunctive use with TCAs only. Lithium also requires serum level monitoring and triiodothyronine requires monitoring of thyroid levels. Hence, lithium and triiodothyronine continue to be second-line recommendations.

Result 2:
Text: From section: Question 7. What Should be Done When a Patient is not Better? > Q.7.g. What are the Risks and Benefits of Specific Adjunctive Medications? > Glutamate Modulato