In [1]:
"""
@Author: Magnus Graham
1/3/2025

This notebook matches free text to a set of predefined symptoms.
It uses spaCy to preprocess the text, and uses BioBERT and SBERT
to map its meaning to the closest possible match in the symptoms list.

"""

'\n@Author: Magnus Graham\n1/3/2025\n\nThis notebook matches free text to a set of predefined symptoms.\nIt uses spaCy to preprocess text for, and uses BioBERT and SBERT\nto map their meaning to the closest possible match in the symptoms list.\n\n'

In [96]:
!pip install sentence-transformers
!pip install torch
!pip install spacy
!pip install xgboost
!python -m spacy download en_core_web_sm  # if not already installed

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [98]:
import os

In [99]:
# Show the kernel's working directory; the relative file names used later in
# this notebook (e.g. the symptoms CSV) are resolved against it.
os.getcwd()

'/Users/magnusgraham/NLP'

In [84]:
import spacy
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel

# Pre-trained SBERT model, shared by every embedding call in this notebook
model = SentenceTransformer('all-MiniLM-L6-v2')

# spaCy English pipeline, used for tokenization and lemmatization
nlp = spacy.load("en_core_web_sm")

In [85]:
# Split the raw user input into individual clauses
def preprocess_input(user_input):
    """
    Lower-case the input and split it on commas into clauses.

    Args:
        user_input (str): Free text with clauses separated by commas.

    Returns:
        list of str: Clauses with surrounding whitespace removed; pieces that
        are empty after stripping are dropped.
    """
    normalized = user_input.lower().strip()

    clauses = []
    for piece in normalized.split(","):
        piece = piece.strip()
        if piece:
            clauses.append(piece)
    return clauses

#process clauses of user input
def process_clauses(clauses, create_dict=True,lemmatize=True):
    """
    Normalise clauses with the module-level spaCy pipeline ``nlp``.

    Underscores are replaced by spaces, each clause is run through ``nlp``,
    and only alphabetic, non-stop-word tokens are kept and joined with
    single spaces.

    Args:
        clauses (iterable of str): Clauses to normalise.
        create_dict (bool): When True, also return a dict that maps every
            processed clause back to the clause it was produced from.
        lemmatize (bool): When True (the default) each kept token is replaced
            by its lemma; when False the token text is kept as written.

    Returns:
        list of str when ``create_dict`` is False, otherwise a tuple
        ``(processed_clauses, symptom_correlation)``.
    """
    print("Processing clauses")

    symptom_correlation = {} if create_dict else None
    processed_clauses = []

    for clause in clauses:
        doc = nlp(clause.replace("_", " "))

        kept_tokens = [token for token in doc if token.is_alpha and not token.is_stop]
        # Honour the `lemmatize` flag instead of always lemmatizing.
        words = [token.lemma_ if lemmatize else token.text for token in kept_tokens]
        processed_clause = " ".join(words)

        if create_dict:
            # If two clauses normalise to the same text, the later one wins.
            symptom_correlation[processed_clause] = clause

        processed_clauses.append(processed_clause)

    if create_dict:
        return processed_clauses, symptom_correlation
    return processed_clauses


def process_csv(filepath):
    """
    Collect the distinct symptom strings stored in a CSV file.

    The columns at positions 1 to 18 are scanned; the first column is skipped
    (presumably the disease label -- confirm against the data file).

    Args:
        filepath (str): Path of the CSV file to read.

    Returns:
        list of str: The distinct cell values as strings, sorted so the order
        is the same on every run. Empty cells are ignored.
    """
    df = pd.read_csv(filepath)

    predefined_symptoms = set()

    for col in df.columns[1:19]:
        # Empty cells are read as NaN; without dropna() str(NaN) would add a
        # bogus "nan" entry to the symptom list.
        predefined_symptoms.update(str(value) for value in df[col].dropna().unique())

    # A set has no stable order between runs; sort for reproducible output.
    return sorted(predefined_symptoms)


In [86]:
def get_sbert_embeddings(sentences):
    """
    Encode sentences with the module-level SBERT ``model``.

    Args:
        sentences (str or list of str): One sentence, or a list of sentences.
            A single string is wrapped in a one-element list first.

    Returns:
        torch.Tensor: Embedding tensor of shape (batch_size, hidden_size).
    """
    batch = [sentences] if isinstance(sentences, str) else sentences
    return model.encode(batch, convert_to_tensor=True)


In [97]:
import heapq
import torch.nn.functional as F
import json

# Pairwise cosine similarity between two batches of embeddings
def cosine_similarity_matrix(embeddings1, embeddings2):
    """
    Compute the cosine similarity of every row pair of two 2-D tensors.

    Each input is L2-normalised along dim 1 and the results are multiplied,
    so entry [i, j] is the cosine similarity between row i of ``embeddings1``
    and row j of ``embeddings2``.
    """
    unit_rows_1 = F.normalize(embeddings1, p=2, dim=1)
    unit_rows_2 = F.normalize(embeddings2, p=2, dim=1)
    return torch.mm(unit_rows_1, unit_rows_2.T)

# Function to find the most similar predefined symptoms for each clause
def compare_input_to_symptoms(clauses, predefined_symptoms, correlation, output_file, threshold = 0.6, top_n=4):
    """
    Match each clause to its most similar predefined symptoms and save them.

    Clauses and symptoms are embedded with ``get_sbert_embeddings`` and
    compared with ``cosine_similarity_matrix``. For every clause, the symptoms
    whose similarity is at least ``threshold`` are ranked and the best
    ``top_n`` are printed and collected.

    Args:
        clauses (list of str): Clauses to match.
        predefined_symptoms (list of str): Candidate symptoms, in the form
            that is embedded.
        correlation (dict): Maps an entry of ``predefined_symptoms`` to the
            name that should be reported; entries without a mapping are
            reported unchanged.
        output_file (str): Path of the JSON file to write.
        threshold (float): Minimum similarity for a symptom to be reported.
        top_n (int): Maximum number of symptoms reported per clause.

    Returns:
        list of dict: One ``{"symptom": name}`` entry per reported match, in
        clause order and by descending similarity within a clause. The same
        list is written to ``output_file``.
    """
    print("Comparing...")

    if clauses and predefined_symptoms:
        clause_embeddings = get_sbert_embeddings(clauses)
        symptom_embeddings = get_sbert_embeddings(predefined_symptoms)

        # Compute similarity matrix and convert it to plain Python floats once,
        # so ranking below does not compare tensors element by element.
        similarity_rows = cosine_similarity_matrix(clause_embeddings, symptom_embeddings).tolist()
    else:
        # Nothing to embed: an empty batch would make the embedding and
        # normalisation steps fail, so report no matches instead.
        similarity_rows = [[] for _ in clauses]

    symptom_results = []  # Flat list of every reported symptom

    for clause, similarities in zip(clauses, similarity_rows):
        candidates = [
            (similarity, symptom)
            for similarity, symptom in zip(similarities, predefined_symptoms)
            if similarity >= threshold
        ]

        # Best `top_n` candidates, highest similarity first.
        top_similar_symptoms = heapq.nlargest(top_n, candidates, key=lambda pair: pair[0])

        # Display and collect matched symptoms
        print(f"Clause: '{clause}'")
        for similarity, symptom in top_similar_symptoms:
            raw_symptom = correlation.get(symptom, symptom)
            print(f"  - Symptom: '{raw_symptom}' - Similarity: {similarity:.2f}")
            symptom_results.append({"symptom": raw_symptom})

    # Save the flat list to a JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(symptom_results, f, indent=4)

    print(f"Output written to {output_file}")
    return symptom_results

In [110]:
import pandas as pd

# Input CSV that holds the known symptoms, and the JSON file that receives the matches
data_file = "DiseaseAndSymptoms.csv"
output_file = "output_file.json"

# Ask the user for free-text symptoms, separated by commas
user_input = input('Enter your symptoms separated by commas')

# Step 1: Split the input into clauses and load the predefined symptoms from the CSV
clauses = preprocess_input(user_input)
predefined_symptoms = process_csv(data_file)


# Step 2: Tokenize and lemmatize both the user clauses and the predefined symptoms.
# `correlation` maps each processed symptom back to the original string it came from.
processed_input = process_clauses(clauses,create_dict=False)
processed_symptoms, correlation = process_clauses(predefined_symptoms,create_dict=True)


# Step 3: Compare each clause to predefined symptoms using SBERT
compare_input_to_symptoms(processed_input, processed_symptoms, correlation, output_file, threshold = 0.6, top_n=4)
# Read the JSON file back and print it to show what was written
with open(output_file, "r") as f:
        file_contents = f.read()
        print(file_contents)

Enter your symptoms separated by commas itching, indigestion, sore stomach, diarrhea


Processing clauses
Processing clauses
Comparing...
Clause: 'itch'
  - Symptom: 'itching' - Similarity: 1.00
  - Symptom: ' internal_itching' - Similarity: 0.63
Clause: 'indigestion'
  - Symptom: ' indigestion' - Similarity: 1.00
Clause: 'sore stomach'
  - Symptom: ' stomach_pain' - Similarity: 0.85
  - Symptom: ' abdominal_pain' - Similarity: 0.76
  - Symptom: ' belly_pain' - Similarity: 0.72
  - Symptom: ' swelling_of_stomach' - Similarity: 0.70
Clause: 'diarrhea'
  - Symptom: ' diarrhoea' - Similarity: 0.84
  - Symptom: ' vomiting' - Similarity: 0.62
  - Symptom: ' nausea' - Similarity: 0.60
Output written to output_file.json
[
    {
        "symptom": "itching"
    },
    {
        "symptom": " internal_itching"
    },
    {
        "symptom": " indigestion"
    },
    {
        "symptom": " stomach_pain"
    },
    {
        "symptom": " abdominal_pain"
    },
    {
        "symptom": " belly_pain"
    },
    {
        "symptom": " swelling_of_stomach"
    },
    {
        "symptom