In [None]:
"""
@Author: Magnus Graham
1/3/2025

This notebook serves as a template for matching free text to a set of predefined symptoms.
It uses spaCy to tokenize and lemmatize the text, then uses BioBERT embeddings to map each
clause to its closest matches in the symptoms list.

"""

In [3]:
!pip install transformers
!pip install torch
!pip install spacy
!python -m spacy download en_core_web_sm  # if not already installed

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [17]:
import spacy
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# spaCy pipeline used for tokenization and lemmatization of the input clauses
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# BioBERT encoder and its matching tokenizer, both loaded from the same
# Hugging Face checkpoint so vocabulary and weights stay in sync
BIOBERT_CHECKPOINT = "dmis-lab/biobert-v1.1"
tokenizer = BertTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = BertModel.from_pretrained(BIOBERT_CHECKPOINT)


In [18]:
def normalize_text(text):
    """Return ``text`` lowercased, with leading and trailing whitespace removed.

    Punctuation is left untouched: only the case and the surrounding
    whitespace are changed.
    """
    lowered = text.lower()
    return lowered.strip()

def preprocess_input(user_input):
    """Normalize free text and split it into clauses.

    The text is normalized with ``normalize_text`` and then split on commas
    and on sentence-ending periods. Empty fragments are discarded, and each
    remaining clause is stripped of surrounding whitespace.

    Args:
        user_input: Raw free-text string, e.g. "blue skin, swollen eyes, cough."

    Returns:
        list[str]: The non-empty clauses, in their original order.
    """
    import re  # local import: keeps this cell independent of the import cell

    normalized_input = normalize_text(user_input)

    # Split on "," or "."; a period directly followed by a digit is treated as
    # a decimal point ("38.5") rather than as a clause delimiter.
    fragments = re.split(r",|\.(?!\d)", normalized_input)

    return [fragment.strip() for fragment in fragments if fragment.strip()]

def tokenize_and_lemmatize_clauses(clauses):
    """Reduce each clause to its space-joined content lemmas.

    Every clause is run through the module-level ``nlp`` pipeline. Stop words
    and tokens that are not purely alphabetic are dropped, and the lemmas of
    the remaining tokens are joined with single spaces. The result has one
    string per input clause (possibly empty when every token was dropped).
    """
    def _content_lemmas(clause):
        # Keep only alphabetic, non-stop-word tokens, represented by their lemma
        return [
            token.lemma_
            for token in nlp(clause)
            if not token.is_stop and token.is_alpha
        ]

    return [" ".join(_content_lemmas(clause)) for clause in clauses]


In [19]:
def get_bio_bert_embeddings(tokens):
    """Embed text with BioBERT using attention-mask-aware mean pooling.

    Despite its name, ``tokens`` is raw text: it is passed straight to the
    module-level ``tokenizer``, which is called with ``padding=True`` so that
    a list of texts can be encoded as one batch.

    Args:
        tokens: A string, or a list of strings to embed together.

    Returns:
        torch.Tensor: One pooled vector per input text, obtained by averaging
        ``last_hidden_state`` over the sequence dimension while ignoring
        padded positions.
    """
    # Tokenize the input text (padding makes batched inputs rectangular)
    inputs = tokenizer(tokens, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Token-level embeddings (batch_size x seq_len x embedding_dim)
    hidden_states = outputs.last_hidden_state

    # Padded positions must not contribute to the average, otherwise shorter
    # texts in a batch get embeddings diluted by [PAD] vectors. For a single
    # unpadded text the mask is all ones and this equals a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return summed / token_counts


In [22]:
# Function to compute cosine similarity between two embeddings
import heapq
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding tensors.

    Thin wrapper around ``torch.nn.functional.cosine_similarity`` called with
    its default arguments.
    """
    score = F.cosine_similarity(embedding1, embedding2)
    return score

# Function to find the most similar predefined symptoms for each clause
def compare_clauses_to_symptoms(clauses, predefined_symptoms, top_n=4):
    """Print the ``top_n`` predefined symptoms most similar to each clause.

    Each clause and each symptom is embedded with ``get_bio_bert_embeddings``
    and compared with ``cosine_similarity``. Results are printed, not returned.

    Args:
        clauses: Iterable of clause strings to match. Blank clauses are skipped
            because there is no text to compare.
        predefined_symptoms: Iterable of symptom strings to match against.
        top_n: How many of the best-scoring symptoms to print per clause.

    Returns:
        None
    """
    # Drop blank clauses (e.g. when upstream filtering removed every token)
    clauses = [clause for clause in clauses if clause.strip()]
    if not clauses:
        return

    # Symptom embeddings do not depend on the clause, so compute each one
    # exactly once instead of once per clause.
    symptom_embeddings = [
        (symptom, get_bio_bert_embeddings(symptom)) for symptom in predefined_symptoms
    ]

    for clause in clauses:
        clause_embedding = get_bio_bert_embeddings(clause)

        # (symptom, similarity) pairs for this clause
        similarity_scores = [
            (symptom, cosine_similarity(clause_embedding, symptom_embedding).item())
            for symptom, symptom_embedding in symptom_embeddings
        ]

        # Use heapq to get the top N most similar symptoms
        top_similar_symptoms = heapq.nlargest(top_n, similarity_scores, key=lambda pair: pair[1])

        # Display the top N matched symptoms for the clause
        print(f"Clause: '{clause}'")
        for symptom, similarity in top_similar_symptoms:
            print(f"  - Symptom: '{symptom}' - Similarity: {similarity:.2f}")


In [25]:
# Example free-text input; its clauses are separated by commas
user_input = "blue skin, swollen eyes, cough."
# Candidate symptom vocabulary that every clause is scored against
predefined_symptoms = [
    "headache", "fever", "cough", "runny nose", "sore throat",
    "fatigue", "chills", "muscle aches", "shortness of breath", "nausea",
    "vomiting", "diarrhea", "stomach ache", "dizziness", "chest pain",
    "abdominal pain", "sweating", "rash", "sneezing", "congestion",
    "loss of taste", "loss of smell", "coughing up blood", "wheezing", "painful urination",
    "difficulty swallowing", "bloody stool", "joint pain", "painful joints", "swollen glands",
    "itching", "ear pain", "eye redness", "blurred vision", "skin discoloration", "frequent urination",
    "weight loss", "weight gain", "swelling", "high blood pressure", "rapid heartbeat", "low blood pressure",
    "dry mouth", "mouth sores", "swollen feet", "night sweats", "cold sweats", "shaking", "tremors", "confusion",
    "delirium", "loss of appetite", "irregular heartbeat", "head pressure", "burning sensation", "skin irritation",
    "muscle weakness", "abnormal bleeding", "swollen abdomen", "pale skin", "bluish skin", "bloody cough",
    "yellowing of skin", "yellowing of eyes", "numbness", "tingling", "sore gums", "chest tightness", "coughing blood",
    "hiccups", "persistent cough", "throat clearing", "snoring", "sore tongue", "difficulty breathing", "itchy skin"
]


# Step 1: Normalize the input and split it into clauses
clauses = preprocess_input(user_input)

# Step 2: Lemmatize each clause, dropping stop words and non-alphabetic tokens
tokenized_clauses = tokenize_and_lemmatize_clauses(clauses)

# Step 3: Print the closest predefined symptoms for each clause using BioBERT
# (top_n is left at its default, so four matches are shown per clause)
compare_clauses_to_symptoms(tokenized_clauses, predefined_symptoms)


Clause: 'blue skin'
  - Symptom: 'pale skin' - Similarity: 0.97
  - Symptom: 'yellowing of eyes' - Similarity: 0.90
  - Symptom: 'skin irritation' - Similarity: 0.90
  - Symptom: 'bluish skin' - Similarity: 0.89
Clause: 'swollen eye'
  - Symptom: 'swollen feet' - Similarity: 0.96
  - Symptom: 'swollen glands' - Similarity: 0.95
  - Symptom: 'swollen abdomen' - Similarity: 0.94
  - Symptom: 'sore tongue' - Similarity: 0.92
Clause: 'cough'
  - Symptom: 'cough' - Similarity: 1.00
  - Symptom: 'sweating' - Similarity: 0.96
  - Symptom: 'headache' - Similarity: 0.96
  - Symptom: 'fever' - Similarity: 0.96
