In [1]:
pip install -U sentence-transformers pandas

import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Toy patient cohort: one free-text clinical summary per patient ID.
# Built column-wise; the i-th entries of both lists describe the same patient.
patient_data = pd.DataFrame(
    {
        "patient_id": ["P001", "P002", "P003"],
        "description": [
            "A 55-year-old with high blood pressure and no history of stroke or kidney issues",
            "A 60-year-old man with Type 2 Diabetes, not on insulin, liver function normal",
            "A 50-year-old woman with chronic kidney disease and hypertension",
        ],
    }
)

# Toy trial catalogue: ID, display name, and free-text inclusion/exclusion criteria.
# Built column-wise; the i-th entries of every list belong to the same trial.
trial_data = pd.DataFrame(
    {
        "trial_id": ["T001", "T002"],
        "name": [
            "Hypertension Treatment Study",
            "Diabetes Type 2 Medication Trial",
        ],
        "inclusion_criteria": [
            "Patients aged 40-65 with diagnosed hypertension",
            "Individuals with Type 2 Diabetes aged between 30 and 70",
        ],
        "exclusion_criteria": [
            "Patients with history of stroke or kidney disease",
            "Patients on insulin therapy or with liver dysfunction",
        ],
    }
)

# Sentence-embedding model used for both the patient and the trial texts.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding per patient description, in patient_data row order.
patient_texts = patient_data["description"].tolist()
patient_embeddings = model.encode(patient_texts, convert_to_tensor=True)

# Each trial is represented by a single string "<inclusion>. <exclusion>",
# so one embedding covers both sets of criteria.
# NOTE(review): because the exclusion text is part of the embedded string, wording
# that matches an exclusion criterion may raise the similarity score as well —
# confirm this is the intended ranking signal.
inclusion_text = trial_data["inclusion_criteria"]
exclusion_text = trial_data["exclusion_criteria"]
trial_texts = (inclusion_text + ". " + exclusion_text).tolist()
trial_embeddings = model.encode(trial_texts, convert_to_tensor=True)

# Pairwise cosine similarity: first axis follows the patients, second axis the trials.
cosine_scores = util.cos_sim(patient_embeddings, trial_embeddings)
# Flatten the patient x trial similarity matrix into one record per (patient, trial) pair.
#
# `cosine_scores` is addressed by position, so the frames are walked with enumerate()
# instead of iterrows(): iterrows() yields index *labels*, which only coincide with
# positions while both frames keep a default 0..n-1 index.
result_columns = ["patient_id", "trial_id", "trial_name", "similarity_score"]
trial_rows = list(zip(trial_data["trial_id"], trial_data["name"]))
results = []

for i, patient_id in enumerate(patient_data["patient_id"]):
    for j, (trial_id, trial_name) in enumerate(trial_rows):
        # .item() converts the single-element score to a plain Python float.
        score = cosine_scores[i][j].item()
        results.append({
            "patient_id": patient_id,
            "trial_id": trial_id,
            "trial_name": trial_name,
            "similarity_score": round(score, 4)
        })

# Create result DataFrame. Explicit columns keep the sort below valid even when
# there are no pairs at all (an empty record list would otherwise have no columns).
match_df = pd.DataFrame(results, columns=result_columns)

# Group by patient, best-scoring trial first. The sorted copy is assigned back
# rather than sorting in place, so the statement can be chained and re-run safely.
match_df = match_df.sort_values(by=["patient_id", "similarity_score"], ascending=[True, False])

print(match_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  patient_id trial_id                        trial_name  similarity_score
0       P001     T001      Hypertension Treatment Study            0.8175
1       P001     T002  Diabetes Type 2 Medication Trial            0.3622
3       P002     T002  Diabetes Type 2 Medication Trial            0.7786
2       P002     T001      Hypertension Treatment Study            0.3924
4       P003     T001      Hypertension Treatment Study            0.6992
5       P003     T002  Diabetes Type 2 Medication Trial            0.3591
