In [1]:
from pymongo import MongoClient
import pandas as pd

client = MongoClient("mongodb://127.0.0.1:27017/")
db = client["medical_db"]
collection = db["mayo_diseases"]

cursor = collection.find({})

data = []

for doc in cursor:

    sections = doc.get("sections", {})

    data.append({
        "disease_name": doc.get("disease_name", ""),

        "overview": sections.get("overview", ""),
        "symptoms": sections.get("symptoms", ""),
        "causes": sections.get("causes", ""),

        # Sometimes you saved "risk factors" or "risk factors"
        "factors": (
            sections.get("risk factors", "") or
            sections.get("risk_factor", "") or
            sections.get("risk", "")
        )
    })

df = pd.DataFrame(data)

print("Dataset size:", df.shape)
df.head()

Dataset size: (718, 5)


Unnamed: 0,disease_name,overview,symptoms,causes,factors
0,Chronic sinusitis,Chronic sinusitis Chronic sinusitis Chronic si...,Common symptoms of chronic sinusitis include: ...,Nasal polyps Nasal polyps Nasal polyps Nasal p...,The following factors raise the risk of gettin...
1,Pneumothorax,Collapsed and normal lung Collapsed and normal...,The main symptoms of a pneumothorax are sudden...,A pneumothorax can be caused by: Chest injury....,"In general, men are far more likely to have a ..."
2,Male infertility,"Nearly 1 in 7 couples is infertile, which mean...",The main sign of male infertility is the inabi...,Male fertility is a complex process. To get yo...,Risk factors linked to male infertility includ...
3,Body lice,,,,
4,Hurthle cell cancer,,,,


In [2]:
for col in df.columns:
    df = df[df[col].str.strip() != ""]

In [3]:
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet

data = df[["disease_name", "causes"]].dropna()

df = data.copy()

df["causes"] = df["causes"].astype(str)

def clean_text(text):
    text = text.lower()
    return text


df["causes"] = df["causes"].apply(clean_text)

In [4]:
def get_synonyms(word):
    synonyms = set()

    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ")
            if synonym != word:
                synonyms.add(synonym)

    return list(synonyms)


def augment_sentence(sentence, augment_count=3):
    words = sentence.split()
    augmented_sentences = []

    for _ in range(augment_count):
        new_words = words.copy()

        num_replace = max(1, int(0.3 * len(words)))

        replace_indices = random.sample(range(len(words)), 
                                         min(num_replace, len(words)))

        for idx in replace_indices:
            synonyms = get_synonyms(words[idx])

            if synonyms:
                new_words[idx] = random.choice(synonyms)

        augmented_sentences.append(" ".join(new_words))

    return augmented_sentences

In [5]:
augmented_rows = []

augmentation_per_sample = 20   

for _, row in df.iterrows():
    original_text = row["causes"]
    disease = row["disease_name"]

    augmented_texts = augment_sentence(
        original_text,
        augment_count=augmentation_per_sample
    )

    augmented_rows.append([original_text, disease])

    for text in augmented_texts:
        augmented_rows.append([text, disease])

In [6]:
causes_df = pd.DataFrame(augmented_rows, columns= ["causes", "disease_name"])

print("Original dataset size:", len(df))
print("Expanded dataset size:", len(causes_df))

causes_df.to_csv("data/causes_medical_dataset.csv", index= False)

Original dataset size: 503
Expanded dataset size: 10563


In [42]:
import numpy as np
import pandas as pd

data = pd.read_csv("data/causes_medical_dataset.csv")
data.head()

Unnamed: 0,causes,disease_name
0,nasal polyps nasal polyps nasal polyps nasal p...,Chronic sinusitis
1,nasal polyps nasal polyps nasal polyps nasal p...,Chronic sinusitis
2,pinched polypus nasal bone polyps nasal polyps...,Chronic sinusitis
3,adenoidal polyps nasal polyps nasal polyps nas...,Chronic sinusitis
4,nasal polyps nasal polypus os nasale polyps os...,Chronic sinusitis


In [43]:
import numpy as np
import re
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, GlobalMaxPooling1D

In [44]:
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['disease_name'])
num_classes = len(label_encoder.classes_)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text) 
    return text

In [45]:
train_texts = []
train_labels = []

for idx, row in data.iterrows():
    variants = [row['causes'], row['disease_name']]
    for text in variants:
        if isinstance(text, str) and len(text) > 2:
            train_texts.append(clean_text(text))
            train_labels.append(row['label'])

In [46]:
max_words = 10000 
max_len = 100
tokenizer = Tokenizer(num_words= max_words, oov_token= '<OOV>')
tokenizer.fit_on_texts(train_texts)

X = tokenizer.texts_to_sequences(train_texts)
X_padded = pad_sequences(X, maxlen= max_len, padding= 'post')
y = tf.keras.utils.to_categorical(train_labels, num_classes= num_classes)

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size= 0.2, random_state= 42)

In [47]:
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalMaxPooling1D(), 
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

print(f"Training on {num_classes} unique diseases...")
model.fit(X_train, y_train, epochs= 20, validation_data= (X_val, y_val), batch_size= 16, callbacks= [early_stop])

Training on 503 unique diseases...
Epoch 1/20




[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 53ms/step - accuracy: 0.1087 - loss: 4.9427 - val_accuracy: 0.7965 - val_loss: 1.2990
Epoch 2/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 55ms/step - accuracy: 0.7973 - loss: 0.7892 - val_accuracy: 0.9863 - val_loss: 0.0748
Epoch 3/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 55ms/step - accuracy: 0.9467 - loss: 0.2112 - val_accuracy: 0.9870 - val_loss: 0.0427
Epoch 4/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 55ms/step - accuracy: 0.9665 - loss: 0.1241 - val_accuracy: 0.9894 - val_loss: 0.0359
Epoch 5/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 54ms/step - accuracy: 0.9763 - loss: 0.0876 - val_accuracy: 0.9882 - val_loss: 0.0354
Epoch 6/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 55ms/step - accuracy: 0.9776 - loss: 0.0745 - val_accuracy: 0.9901 - val_loss: 0.0342
Epoch 7/20
[1m

<keras.src.callbacks.history.History at 0x1b39f1e1310>

In [50]:
def predict_diseases(text_list):
    if isinstance(text_list, str):
        text_list = [text_list]

    cleaned_texts = [clean_text(t) for t in text_list]

    seq = tokenizer.texts_to_sequences(cleaned_texts)
    padded = pad_sequences(seq, maxlen=max_len, padding='post')

    predictions = model.predict(padded, verbose=0)

    results = []

    for pred in predictions:
        class_idx = np.argmax(pred)
        disease_name = label_encoder.inverse_transform([class_idx])[0]

        try:
            summary = data[data['disease_name'] == disease_name]['causes'].iloc[0]
        except:
            summary = ""

        results.append({
            "disease_name": disease_name,
            "summary": summary
        })

    return results

In [52]:
test_inputs = [
    "chest injury. any blunt or penetrating injury to your chest can cause lung collapse.",
    "transposition of the great arteries, also called tga, happen during maternity when the unborn baby's warmheartedness comprise forming.",
    "nasal polyps nasal bone polyps adenoidal polyps pinched polyps ar balmy growths on the trace of the nose or the spaces inside the nose,"
]

results = predict_diseases(test_inputs)

for i, r in enumerate(results):
    print(test_inputs[i], ": ", end= "")
    print("(", r["disease_name"], ")")
    print("-"*40)

chest injury. any blunt or penetrating injury to your chest can cause lung collapse. : ( Restless legs syndrome )
----------------------------------------
transposition of the great arteries, also called tga, happen during maternity when the unborn baby's warmheartedness comprise forming. : ( Cyclothymia (cyclothymic disorder) )
----------------------------------------
nasal polyps nasal bone polyps adenoidal polyps pinched polyps ar balmy growths on the trace of the nose or the spaces inside the nose, : ( Nasal polyps )
----------------------------------------


In [53]:
import pickle

model.save("models/causes_models/causes_model.keras")

with open('models/causes_models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('models/causes_models/label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("All components saved successfully!")

All components saved successfully!


In [54]:
import tensorflow as tf
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

model = tf.keras.models.load_model("models/causes_models/causes_model.keras")

with open('models/causes_models/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('models/causes_models/label_encoder.pickle', 'rb') as handle:
    label_encoder = pickle.load(handle)

def predict_saved_model(text):
    seq = tokenizer.texts_to_sequences([text.lower()])
    padded = pad_sequences(seq, maxlen=100, padding='post')
    prediction = model.predict(padded, verbose=0)
    class_idx = np.argmax(prediction)
    return label_encoder.inverse_transform([class_idx])[0]

print(predict_saved_model("symptoms of chronic sinusitis"))

Chronic sinusitis


In [55]:
model

<Sequential name=sequential_2, built=True>

In [56]:
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x1b391f2f9b0>

In [57]:
label_encoder

In [58]:
import pandas as pd
from fuzzywuzzy import process

data = pd.read_csv("data/causes_medical_dataset.csv")

if 'disease_name' not in data.columns or 'causes' not in data.columns:
    raise ValueError("Dataset must contain 'disease_name' and 'causes' columns")


def get_causes_from_disease(disease_query):
    query = disease_query.strip().lower()

    match = data[data['disease_name'].str.lower() == query]

    if not match.empty:
        disease_real_name = match.iloc[0]['disease_name']
        causes_text = match.iloc[0]['causes']

        return f"Disease: {disease_real_name}\n\nCauses: {causes_text}"

    return "Disease not found in database. Please check your spelling."


def smart_lookup(user_input):
    unique_diseases = data['disease_name'].dropna().unique().tolist()

    best_match, confidence = process.extractOne(user_input, unique_diseases)

    if confidence is not None and confidence > 80:
        cause = data[data['disease_name'] == best_match]['causes'].iloc[0]

        return f"Results for {best_match}:\n\n{cause}"

    return "Could not find a matching disease."


print(get_causes_from_disease("Pneumothorax"))
print(smart_lookup("sinusits"))

Disease: Pneumothorax

Causes: a pneumothorax can be caused by: chest injury. any blunt or penetrating injury to your chest can cause lung collapse. some injuries may happen during physical assaults or car crashes, while others may inadvertently occur during medical procedures that involve the insertion of a needle into the chest. lung disease. damaged lung tissue is more likely to collapse. lung damage can be caused by many types of underlying diseases, such as chronic obstructive pulmonary disease (copd), cystic fibrosis, lung cancer or pneumonia. cystic lung diseases, such as lymphangioleiomyomatosis and birt-hogg-dube syndrome, cause round, thin-walled air sacs in the lung tissue that can rupture, resulting in pneumothorax. ruptured air blisters. small air blisters (blebs) can develop on the top of the lungs. these air blisters sometimes burst — allowing air to leak into the space that surrounds the lungs. mechanical ventilation. a severe type of pneumothorax can occur in people wh