In [41]:
from pymongo import MongoClient
import pandas as pd

# Connect to the local MongoDB server and select the collection to read from.
client = MongoClient("mongodb://127.0.0.1:27017/")
db = client["medical_db"]
collection = db["mayo_diseases"]

cursor = collection.find({})

# One flat record per document; turned into a DataFrame once the loop is done.
data = []

for doc in cursor:

    # `doc.get("sections", {})` only falls back when the key is missing; a
    # document that stores an explicit null would yield None and crash on the
    # `.get` calls below, so fall back on any falsy value instead.
    sections = doc.get("sections") or {}

    data.append({
        "disease_name": doc.get("disease_name", ""),

        "overview": sections.get("overview", ""),
        "symptoms": sections.get("symptoms", ""),
        "causes": sections.get("causes", ""),

        # The risk-factor text may be stored under "risk factors",
        # "risk_factor" or "risk"; the first non-empty value wins.
        "factors": (
            sections.get("risk factors", "") or
            sections.get("risk_factor", "") or
            sections.get("risk", "")
        )
    })

df = pd.DataFrame(data)

print("Dataset size:", df.shape)
df.head()

Dataset size: (718, 5)


Unnamed: 0,disease_name,overview,symptoms,causes,factors
0,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic si...,Common symptoms of chronic sinusitis include: ...,Nasal polyps Nasal polyps Nasal polyps Nasal p...,The following factors raise the risk of gettin...
1,Pneumothorax,Collapsed and normal lung Collapsed and normal...,The main symptoms of a pneumothorax are sudden...,A pneumothorax can be caused by: Chest injury....,"In general, men are far more likely to have a ..."
2,Male infertility,"Nearly 1 in 7 couples is infertile, which mean...",The main sign of male infertility is the inabi...,Male fertility is a complex process. To get yo...,Risk factors linked to male infertility includ...
3,Body lice,,,,
4,Hurthle cell cancer,,,,


In [42]:
# Keep only the rows in which no column is an empty / whitespace-only string.
# The per-column checks are AND-ed into a single mask and applied once.
non_blank = pd.Series(True, index=df.index)
for column_name in df.columns:
    non_blank &= (df[column_name].str.strip() != "")
df = df[non_blank]

In [40]:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet

# Download the NLTK data packages 'wordnet' and 'omw-1.4' before the
# `wordnet` corpus imported above is used for synonym lookups.
# The second call is the cell's last expression, so its return value is what
# the notebook displays for this cell.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ENG_AHMED_AYAD\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ENG_AHMED_AYAD\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [44]:
# Narrow the frame to the label and the symptom text, dropping any row that is
# missing either value. Note that this rebinds `data`, which earlier held the
# list of raw records.
data = df.loc[:, ["disease_name", "symptoms"]].dropna(how="any")
(data.shape, data.head())

((503, 2),
         disease_name                                           symptoms
 0  Chronic sinusitis  Common symptoms of chronic sinusitis include: ...
 1       Pneumothorax  The main symptoms of a pneumothorax are sudden...
 2   Male infertility  The main sign of male infertility is the inabi...
 5           Mastitis  Symptoms of mastitis can appear suddenly. They...
 6    Poison ivy rash  Signs and symptoms of a poison ivy rash includ...)

In [49]:
# Build a new frame from `data` (leaving `data` itself untouched) in which the
# symptom column is coerced to plain strings.
df = data.assign(symptoms=data["symptoms"].astype(str))

def clean_text(text):
    """Return `text` converted to lower case."""
    return text.lower()


# Normalise every symptom description with `clean_text`.
df["symptoms"] = df["symptoms"].map(clean_text)

In [50]:
def get_synonyms(word):
    """Collect WordNet synonyms for `word`.

    Every lemma of every synset returned by `wordnet.synsets(word)` is
    considered. Underscores in lemma names are replaced by spaces, and a lemma
    that is exactly equal to `word` is skipped.

    Args:
        word: The token to look up.

    Returns:
        A sorted list of distinct synonym strings; empty when WordNet yields
        no lemma other than `word` itself.
    """
    synonyms = set()

    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ")
            if synonym != word:
                synonyms.add(synonym)

    # A set of strings has no stable iteration order between interpreter runs,
    # so `list(synonyms)` would make any index-based random pick from the
    # result unrepeatable. Sorting gives callers a deterministic list.
    return sorted(synonyms)


def augment_sentence(sentence, augment_count=3, replace_ratio=0.3):
    """Create variants of `sentence` by swapping random words for synonyms.

    The sentence is split on whitespace. For each variant, a random set of
    word positions is drawn and every drawn word that has at least one synonym
    (per `get_synonyms`) is replaced by a randomly chosen one. Drawn words
    without synonyms are left unchanged, so a variant may differ from the
    input in fewer positions than were drawn.

    Args:
        sentence: Text to augment.
        augment_count: Number of variants to produce.
        replace_ratio: Fraction of the words to draw for replacement in each
            variant. At least one position is drawn for a non-empty sentence.

    Returns:
        A list of `augment_count` strings, each the words re-joined with
        single spaces. An empty or whitespace-only sentence yields empty
        strings.
    """
    words = sentence.split()
    augmented_sentences = []

    # The number of positions to draw depends only on the sentence length, so
    # compute it once; it is capped at len(words) to keep `random.sample`
    # valid for empty input.
    num_replace = min(max(1, int(replace_ratio * len(words))), len(words))

    # The same words are looked up again for every variant; remember the
    # result of each lookup so it happens at most once per distinct word.
    synonym_cache = {}

    for _ in range(augment_count):
        new_words = words.copy()

        for idx in random.sample(range(len(words)), num_replace):
            word = words[idx]
            if word not in synonym_cache:
                synonym_cache[word] = get_synonyms(word)

            candidates = synonym_cache[word]
            if candidates:
                new_words[idx] = random.choice(candidates)

        augmented_sentences.append(" ".join(new_words))

    return augmented_sentences

In [51]:
# Number of synthetic variants generated for every original description.
augmentation_per_sample = 20

# Seed Python's RNG so the random draws made during augmentation start from
# the same state on every run of this cell.
AUGMENTATION_SEED = 42
random.seed(AUGMENTATION_SEED)

# Rows of [symptom text, disease name]: each original description followed by
# its augmented variants.
augmented_rows = []

# Iterating the two columns directly avoids building a Series per row, which
# is what `iterrows` does.
for original_text, disease in zip(df["symptoms"], df["disease_name"]):
    augmented_texts = augment_sentence(
        original_text,
        augment_count=augmentation_per_sample
    )

    augmented_rows.append([original_text, disease])
    augmented_rows.extend([text, disease] for text in augmented_texts)

In [52]:
# Turn the collected [text, label] rows into a frame, report how much the
# dataset grew, and write the result to disk without the index column.
output_columns = ["symptoms", "disease_name"]
expanded_df = pd.DataFrame(data=augmented_rows, columns=output_columns)

print("Original dataset size:", df.shape[0])
print("Expanded dataset size:", expanded_df.shape[0])

expanded_df.to_csv("expanded_medical_dataset.csv", index=False)

Original dataset size: 503
Expanded dataset size: 10563


In [53]:
# Read the expanded dataset back from disk and preview its first five rows.
data = pd.read_csv("expanded_medical_dataset.csv")
data.head(n=5)

Unnamed: 0,symptoms,disease_name
0,common symptoms of chronic sinusitis include: ...,Chronic sinusitis
1,common symptoms of chronic sinusitis include: ...,Chronic sinusitis
2,common symptoms of chronic sinusitis include: ...,Chronic sinusitis
3,common symptoms of chronic sinusitis include: ...,Chronic sinusitis
4,unwashed symptoms of chronic sinusitis include...,Chronic sinusitis


In [55]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# Encode the disease names as integer class labels.
le = LabelEncoder()
y = le.fit_transform(data['disease_name'])

# Split the raw texts first and fit the vectorizer on the training part only;
# fitting TF-IDF on all rows would let the vocabulary and IDF weights see the
# test texts.
# NOTE(review): rows are split individually, so variants derived from the same
# source description can land on both sides of the split; the held-out score
# is therefore likely optimistic. Confirm whether a stricter evaluation is
# needed.
text_train, text_test, y_train, y_test = train_test_split(
    data['symptoms'], y, test_size=0.2, random_state=SPLIT_SEED
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [56]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic; pin its seed so repeated runs on the same
# training data produce the same fitted model.
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

In [57]:
# Mean accuracy on the training split and on the held-out split.
# NOTE(review): the held-out rows come from the same expanded dataset as the
# training rows, so a very high test score may reflect near-duplicate texts
# rather than generalisation; confirm before relying on it.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
(train_accuracy, test_accuracy)

(1.0, 0.9990534784666352)

In [59]:
from sklearn.metrics import accuracy_score

# Predict a class label for every held-out sample.
y_pred = model.predict(X_test)

print("\nModel Performance Summary:")
# The `:.2%` format renders the accuracy fraction as a percentage with two
# decimal places.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")


Model Performance Summary:
Accuracy: 99.91%


In [66]:
def predict_symptom(new_text):
    """Predict the disease name for a free-text symptom description.

    The input is converted to a lower-cased string, vectorized with the fitted
    `vectorizer`, classified by `model`, and the predicted class index is
    mapped back to its disease name through the label encoder `le`.

    Args:
        new_text: Symptom description; any value is passed through `str()`.

    Returns:
        The disease name decoded from the predicted label.
    """
    features = vectorizer.transform([str(new_text).lower()])
    predicted_label = model.predict(features)[0]
    return le.inverse_transform([predicted_label])[0]


# Run the predictor on two sample descriptions and print each result.
print("\nTest Prediction:")
for sample_text in (
    "My chest feels like it is burning",
    "The main symptoms of a pneumothorax are sudden",
):
    print(predict_symptom(sample_text))


Test Prediction:
Burning mouth syndrome
Pneumothorax


In [67]:
import joblib

import os

# `joblib.dump` does not create missing parent directories, so make sure the
# target folder exists before writing the artifacts.
os.makedirs("models/symptoms_models", exist_ok=True)

# Persist the three objects needed for inference: the fitted vectorizer, the
# classifier, and the label encoder that maps class indices back to names.
joblib.dump(vectorizer, "models/symptoms_models/vectorizer.pkl", compress= 3)
joblib.dump(model, "models/symptoms_models/medical_model.pkl", compress= 3)
joblib.dump(le, "models/symptoms_models/label_encoder.pkl", compress= 3)

['models/symptoms_models/label_encoder.pkl']

In [None]:
# Load the artifacts from the same location they were saved to; the bare file
# names used previously pointed at the working directory, where these files
# are not written by this notebook.
# Only load pickle files you produced yourself: unpickling can execute
# arbitrary code.
vectorizer = joblib.load("models/symptoms_models/vectorizer.pkl")
model = joblib.load("models/symptoms_models/medical_model.pkl")
le = joblib.load("models/symptoms_models/label_encoder.pkl")