In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
import re
import itertools

In [18]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\akhil\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
data = 'C:/Users/akhil/OneDrive/Documents/School/Summer 2024/MSDS 458/Final Project/Symptom2Disease.csv'
df = pd.read_csv(data)

In [20]:
stop_words = set(stopwords.words('english'))

In [21]:
def extract_symptoms_pos(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    pos_tags = nltk.pos_tag(filtered_words)
    symptoms = [word for word, pos in pos_tags if pos in ('NN', 'NNS', 'JJ', 'JJR', 'JJS')]
    return symptoms

In [22]:
grouped_symptoms = df.groupby('label')['text'].apply(list).to_dict()
compound_symptom_disease_map = {}

for disease, symptoms_list in grouped_symptoms.items():
    all_symptoms = []
    for description in symptoms_list:
        symptoms = extract_symptoms_pos(description)
        all_symptoms.extend(symptoms)
    
    unique_symptoms = set(all_symptoms)
    compound_symptom_disease_map[tuple(unique_symptoms)] = disease

In [23]:
symptom_disease_map = defaultdict(set)

In [24]:
for index, row in df.iterrows():
    symptoms = extract_symptoms_pos(row['text'])
    for symptom in symptoms:
        symptom_disease_map[symptom].add(row['label'])

In [25]:
symptom_disease_map = {k: list(v) for k, v in symptom_disease_map.items()}

In [26]:
def forward_chaining(extracted_symptoms):
    possible_diagnoses = defaultdict(float)
    symptom_combinations = []

    for r in range(1, len(extracted_symptoms) + 1):
        combinations = list(itertools.combinations(extracted_symptoms, r))
        symptom_combinations.extend(combinations)

    for combination in symptom_combinations:
        combination_tuple = tuple(combination)
        if combination_tuple in compound_symptom_disease_map:
            disease = compound_symptom_disease_map[combination_tuple]
            possible_diagnoses[disease] += len(combination) 
    
    total_symptoms = len(extracted_symptoms)
    diagnosis_confidence = {disease: (count / total_symptoms) for disease, count in possible_diagnoses.items()}
    return diagnosis_confidence

In [27]:
def backward_chaining(goal_disease, extracted_symptoms):
    relevant_symptoms = set()

    for symptom in extracted_symptoms:
        if symptom in symptom_disease_map and goal_disease in symptom_disease_map[symptom]:
            relevant_symptoms.add(symptom)
    
    confidence = len(relevant_symptoms) / len(extracted_symptoms)
    return confidence if relevant_symptoms else 0.0

In [28]:
def diagnose(sentence, method="forward", goal_disease=None):
    extracted_symptoms = extract_symptoms_pos(sentence)
    print(f"Extracted Symptoms: {extracted_symptoms}")
    
    if method == "forward":
        diagnosis_confidence = forward_chaining(extracted_symptoms)
    elif method == "backward":
        if not goal_disease:
            goal_disease = input("Enter the disease you want to verify (goal): ")
        confidence = backward_chaining(goal_disease, extracted_symptoms)
        diagnosis_confidence = {goal_disease: confidence}
    else:
        print("Invalid method. Choose 'forward' or 'backward'.")
        return
    
    if diagnosis_confidence:
        sorted_diagnoses = sorted(diagnosis_confidence.items(), key=lambda x: x[1], reverse=True)
        print("\nPossible diagnoses with confidence levels:")
        for diagnosis, confidence in sorted_diagnoses:
            print(f"{diagnosis}: {confidence:.2f} confidence")
    else:
        print("No matching diagnosis found.")

In [31]:
user_input = input('Please input your symptoms: ')
diagnose(user_input)

Extracted Symptoms: ['itchy', 'scalp', 'red', 'skin']
No matching diagnosis found.


In [30]:
diagnose(user_input, method="backward")

Extracted Symptoms: ['massive', 'headache']

Possible diagnoses with confidence levels:
diabetes: 0.00 confidence
