In [2]:
import pandas as pd

# Load your dataset
df = pd.read_csv('D:/diseasepredictor/data/Symptom2Disease.csv')
df = df[['label', 'text']]  # ensure correct columns
df.dropna(inplace=True)

df.head()


Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [3]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

df['clean_text'] = df['text'].apply(clean_text)
df.head()


Unnamed: 0,label,text,clean_text
0,Psoriasis,I have been experiencing a skin rash on my arm...,i have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne...",my skin has been peeling especially on my knee...
2,Psoriasis,I have been experiencing joint pain in my fing...,i have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp...",there is a silver like dusting on my skin espe...
4,Psoriasis,"My nails have small dents or pits in them, and...",my nails have small dents or pits in them and ...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)  # you can increase this later
X = tfidf.fit_transform(df['clean_text'])


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['label'])


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       0.91      1.00      0.95        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical spondylosis       1.00      1.00      1.00         7
                    Chicken pox       0.91      0.83      0.87        12
                    Common Cold       1.00      0.92      0.96        12
                         Dengue       0.90      0.75      0.82        12
          Dimorphic Hemorrhoids       1.00      1.00      1.00         7
               Fungal infection       1.00      1.00      1.00        13
                   Hypertension       1.00      1.00      1.00        10
                       Impetigo       1.00      1.00      1.00        11
                       Jaundice       1.00      1.00      1.00        11
                        Malaria       1.00      1.

In [10]:
import os
import joblib

# Make sure directory exists
model_dir = 'D:/diseasepredictor/models/'
os.makedirs(model_dir, exist_ok=True)

# Save the models
joblib.dump(clf, os.path.join(model_dir, 'text_disease_model.pkl'))
joblib.dump(tfidf, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
joblib.dump(le, os.path.join(model_dir, 'label_encoder.pkl'))


['D:/diseasepredictor/models/label_encoder.pkl']

In [11]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text


In [14]:
import joblib

# Load the saved model, TF-IDF vectorizer, and label encoder
clf = joblib.load('D:/diseasepredictor/models/text_disease_model.pkl')
tfidf = joblib.load('D:/diseasepredictor/models/tfidf_vectorizer.pkl')
le = joblib.load('D:/diseasepredictor/models/label_encoder.pkl')

# User input (symptom description)
new_text = "I feel tired all the time and have pain in my joints"


# Preprocess input
cleaned = clean_text(new_text)
X_input = tfidf.transform([cleaned])

# Predict
predicted = clf.predict(X_input)
predicted_label = le.inverse_transform(predicted)

# Output
print("Predicted Disease:", predicted_label[0])


Predicted Disease: Psoriasis


In [1]:
from transformers import pipeline
pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")





model.safetensors:  30%|##9       | 157M/526M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cpu


<transformers.pipelines.zero_shot_classification.ZeroShotClassificationPipeline at 0x26a4df98a40>

In [1]:
import os
import json
import wikipedia
from transformers import pipeline
from tqdm.auto import tqdm
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import time

# ----------------------------
# ✅ Configuration
# ----------------------------
torch.set_num_threads(os.cpu_count())
print(f"🛠️ PyTorch using {torch.get_num_threads()} CPU threads")

DISEASES = [
    "Psoriasis", "Diabetes mellitus", "Asthma", "Hypertension", 
    "Tuberculosis", "Malaria", "Arthritis", "Migraine", 
    "Pneumonia", "Hepatitis", "Major depressive disorder",
    "Anxiety disorder", "Obesity", "Epilepsy", "Stroke", 
    "Anemia", "Bronchitis", "COVID-19", "Chickenpox", 
    "Dengue fever", "Cholera", "Thyroid disease",
    "Sinusitis", "Allergy"
]

BATCH_SIZE = 4
MAX_WORKERS = min(4, os.cpu_count())
MIN_SENTENCE_LENGTH = 10
SCORE_THRESHOLD = 0.8
WIKI_BATCH_SIZE = 2
CACHE_DIR = "./wiki_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

SENTENCE_PATTERN = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')
WHITESPACE_PATTERN = re.compile(r'\s+')

def clean_text(text):
    return WHITESPACE_PATTERN.sub(' ', text).strip()

# ----------------------------
# 📄 Wikipedia Fetching
# ----------------------------
def fetch_single_wiki(disease):
    cache_path = os.path.join(CACHE_DIR, f"{disease.lower()}.txt")
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            return f.read()
    try:
        page = wikipedia.page(disease, auto_suggest=False)
        content = clean_text(page.content)
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return content
    except Exception as e:
        print(f"⚠️ Failed fetching {disease}: {str(e)}")
        return ""

def fetch_wikipedia_batch(diseases):
    results = {}
    with ThreadPoolExecutor(MAX_WORKERS) as executor:
        futures = {executor.submit(fetch_single_wiki, d): d for d in diseases}
        for future in as_completed(futures):
            disease = futures[future]
            results[disease] = future.result()
    return results

def split_sentences(text):
    if not text:
        return []
    sentences = SENTENCE_PATTERN.split(text)
    return [s.strip() for s in sentences if len(s.split()) >= MIN_SENTENCE_LENGTH]

# ----------------------------
# 🧠 Treatment Extraction
# ----------------------------
def process_sentence_batch(batch, classifier):
    try:
        results = classifier(batch, candidate_labels=["treatment", "therapy", "medication"], multi_label=True)
        return [
            batch[i] for i, res in enumerate(results)
            if any(score > SCORE_THRESHOLD for score in res['scores'])
        ]
    except Exception as e:
        print(f"⚠️ Batch error: {e}")
        return []

def process_disease_batch(disease_batch, classifier):
    wiki_contents = fetch_wikipedia_batch(disease_batch)
    treatments = {}
    for disease in disease_batch:
        text = wiki_contents.get(disease, "")
        sentences = split_sentences(text)
        filtered = process_sentence_batch(sentences, classifier)
        treatments[disease.lower()] = filtered
    return treatments

def build_treatments(diseases, classifier):
    all_treatments = {}
    batches = [diseases[i:i + WIKI_BATCH_SIZE] for i in range(0, len(diseases), WIKI_BATCH_SIZE)]
    for batch in tqdm(batches, desc="🔍 Processing"):
        try:
            batch_results = process_disease_batch(batch, classifier)
            all_treatments.update(batch_results)
        except Exception as e:
            print(f"⚠️ Error in batch: {e}")
            for d in batch:
                all_treatments[d.lower()] = []
    return all_treatments

# ----------------------------
# 🚀 Main Execution
# ----------------------------
def main():
    print("🚀 Starting extraction with CPU model...")

    classifier = pipeline(
        "zero-shot-classification",
        model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",  # ✅ Light and CPU-friendly
        device=-1,
        batch_size=BATCH_SIZE,
        framework="pt"
    )

    start_time = time.time()
    treatment_data = build_treatments(DISEASES, classifier)

    output_file = "treatement/cpu_optimized_treatments.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(treatment_data, f, indent=4, ensure_ascii=False)

    print(f"✅ Completed in {time.time() - start_time:.2f} seconds")
    print(f"📁 Results saved to: {output_file}")

if __name__ == "__main__":
    main()


🛠️ PyTorch using 8 CPU threads
🚀 Starting extraction with CPU model...


Device set to use cpu


🔍 Processing:   0%|          | 0/12 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


✅ Completed in 2259.64 seconds
📁 Results saved to: treatement/cpu_optimized_treatments.json


In [None]:
import os, re, json, time, joblib, base64
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
from transformers import pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# ==== STYLING ====
sns.set_style("whitegrid")
plt.style.use('default')
plt.rcParams['figure.facecolor'] = 'white'

primary_color = '#1a5276'
secondary_color = '#2980b9'
accent_color = '#e74c3c'
background_color = '#f5f9ff'

display(HTML("""
<style>
textarea, input, select {
    font-family: 'Segoe UI', sans-serif;
    font-size: 16px;
}
h3, h4 {
    font-weight: 600;
}
button:hover {
    opacity: 0.9;
    transition: 0.2s ease-in-out;
}
</style>
"""))

# ==== LOAD MODELS AND DATA ====
MODEL_DIR = 'D:/diseasepredictor/models'
TREATMENT_FILE = 'treatement/cpu_optimized_treatments.json'

try:
    clf = joblib.load(os.path.join(MODEL_DIR, 'text_disease_model.pkl'))
    tfidf = joblib.load(os.path.join(MODEL_DIR, 'tfidf_vectorizer.pkl'))
    le = joblib.load(os.path.join(MODEL_DIR, 'label_encoder.pkl'))
    with open(TREATMENT_FILE, 'r', encoding='utf-8') as f:
        treatment_data = json.load(f)
except Exception as e:
    raise RuntimeError(f"❌ Failed to load models or data: {str(e)}")

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)

# ==== EXAMPLES ====
EXAMPLE_SYMPTOMS = {
    "Psoriasis": "Red patches of skin covered with thick, silvery scales, dry cracked skin that may bleed, itching and burning",
    "Diabetes": "Increased thirst, frequent urination, extreme hunger, unexplained weight loss, fatigue",
    "Eczema": "Itchy, inflamed skin, rough leathery patches, oozing or crusting, swelling"
}

# ==== UI COMPONENTS ====
header = widgets.HTML(
    value=f"""
    <div style="background-color:{primary_color}; padding:20px; border-radius:10px; text-align:center; color:white">
        <h1><i class="fa fa-heartbeat" style="margin-right:10px"></i>AI Disease & Treatment Advisor(careBOT)</h1>
        <p style="margin:0">Clinical Decision Support System</p>
    </div>"""
)

symptom_input = widgets.Textarea(
    placeholder="Describe your symptoms in detail (e.g., 'red itchy rash on elbows')",
    layout=widgets.Layout(width='90%', height='150px')
)

char_counter = widgets.HTML("<p style='text-align:right; margin:0'>0/500 characters</p>")

def update_counter(change):
    char_counter.value = f"<p style='text-align:right; margin:0'>{len(change.new)}/500 characters</p>"

symptom_input.observe(update_counter, names='value')

example_dropdown = widgets.Dropdown(
    options=[("Select example...", "")] + list(EXAMPLE_SYMPTOMS.items()),
    description="Load example:",
    style={'description_width': 'initial'}
)
example_dropdown.observe(lambda change: setattr(symptom_input, 'value', change.new if change.new else ""), names='value')

predict_button = widgets.Button(description="Analyze Symptoms", button_style='success', icon='stethoscope')
clear_button = widgets.Button(description="Clear", button_style='warning', icon='eraser')
output_area = widgets.Output()

# ==== HELPERS ====
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip().lower()

def create_severity_indicator(level):
    colors = ['#2ecc71', '#f39c12', '#e74c3c']
    label = ["Low", "Medium", "High"][level]
    return widgets.HTML(
        f"""<div style="display:flex; align-items:center">
            <div style="width:20px; height:20px; background-color:{colors[level]}; border-radius:50%; margin-right:10px"></div>
            <span>{label} severity</span>
        </div>"""
    )

def group_by_category(treatments):
    categories = {
        "Topical treatments": ["cream", "ointment", "topical", "tar", "moisturizer", "corticosteroid"],
        "Systemic treatments": ["oral", "systemic", "methotrexate", "DMARD", "steroid", "insulin"],
        "Biologics": ["biologic", "TNF", "interleukin", "antibody", "mab"],
        "Lifestyle & Diet tips": ["diet", "exercise", "stress", "lifestyle", "avoid", "trigger"]
    }
    grouped = {k: [] for k in categories}
    grouped["Other"] = []
    for treatment in treatments:
        lower = treatment.lower()
        for cat, keywords in categories.items():
            if any(kw in lower for kw in keywords):
                grouped[cat].append(treatment)
                break
        else:
            grouped["Other"].append(treatment)
    return grouped

def predict_disease(symptoms):
    cleaned = clean_text(symptoms)
    X_input = tfidf.transform([cleaned])
    pred_idx = clf.predict(X_input)
    probs = clf.predict_proba(X_input)
    confidence = round(max(probs[0]) * 100, 2)
    disease = le.inverse_transform(pred_idx)[0]
    treatments = treatment_data.get(disease.lower(), [])
    severity = 2 if "severe" in cleaned else 0 if "mild" in cleaned else 1
    return disease, severity, treatments, confidence

def generate_report(symptoms, disease, treatments):
    return f"""--- AI Diagnosis Report ---
Reported Symptoms:
{symptoms}

Predicted Disease: {disease}
Top 5 Treatments:
{', '.join(treatments[:5]) if treatments else 'No treatments found.'}

Disclaimer:
This tool does not replace professional medical advice.
"""

# ==== EVENTS ====
def on_predict_click(_):
    with output_area:
        clear_output()
        if not symptom_input.value.strip():
            print("❗ Please describe your symptoms.")
            return
        if len(symptom_input.value) < 15:
            print("❗ Please provide at least 15 characters.")
            return
        display(widgets.HTML("<p style='text-align:center'><i class='fa fa-spinner fa-spin fa-2x'></i><br>Analyzing...</p>"))
        time.sleep(1)
        disease, severity, treatments, confidence = predict_disease(symptom_input.value)
        clear_output()
        display(widgets.HTML(
            f"""<div style="background-color:{background_color}; padding:20px; border-radius:10px;">
                <h2 style="color:{primary_color}">Diagnosis Results</h2>
                <div style="display:flex; justify-content:space-between">
                    <div>
                        <p><strong>Reported Symptoms:</strong><br><em>{symptom_input.value}</em></p>
                    </div>
                    <div style="text-align:right">
                        <h3 style="color:{accent_color}; margin:0">{disease}</h3>
                        <p>Confidence: <strong>{confidence}%</strong></p>
                        {create_severity_indicator(severity).value}
                    </div>
                </div>
            </div>"""
        ))
        if treatments:
            try:
                summary = summarizer(" ".join(treatments[:20])[:1024], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                display(widgets.HTML(f"""
                <div style="background-color:#f5f5f5; padding:15px; border-radius:8px; margin:15px 0">
                    <h3 style="color:{primary_color}"><i class="fa fa-list-ul" style="margin-right:10px"></i>Treatment Overview</h3>
                    <p>{summary}</p>
                </div>
                """))
            except Exception as e:
                print("⚠️ Could not summarize treatment info:", e)
            grouped = group_by_category(treatments)
            for cat, items in grouped.items():
                if items:
                    accordion = widgets.Accordion(children=[
                        widgets.HTML("<ul>" + "".join([f"<li>{t}</li>" for t in items]) + "</ul>")
                    ])
                    accordion.set_title(0, cat)
                    display(accordion)

            # === DOWNLOAD REPORT BUTTON ===
            report = generate_report(symptom_input.value, disease, treatments)
            b64_report = base64.b64encode(report.encode()).decode()
            href = f'<a download="diagnosis_report.txt" href="data:text/plain;base64,{b64_report}" target="_blank"><button style="padding:10px 20px; font-size:16px; background-color:#3498db; color:white; border:none; border-radius:5px; cursor:pointer;">📄 Download Report</button></a>'
            display(HTML(href))
        else:
            display(widgets.HTML(
                f"""<div style="border:1px solid #ccc; border-radius:10px; padding:15px; background-color:#fff8e1">
                    <h3 style="color:{accent_color}"><i class="fa fa-info-circle"></i> Note</h3>
                    <p>No treatments found for this disease. Please consult a doctor.</p>
                </div>"""
            ))
        display(widgets.HTML(
            f"""<div style="border:1px solid #ccc; border-radius:10px; padding:15px; margin-top:15px">
                <h4 style="margin-top:0; color:{accent_color}">
                    <i class="fa fa-exclamation-triangle"></i> Important Disclaimer
                </h4>
                <p>This tool provides suggestions based on AI models and should not replace consultation with licensed healthcare professionals.</p>
            </div>"""
        ))

def on_clear_click(_):
    symptom_input.value = ""
    example_dropdown.value = ""
    with output_area: clear_output()

predict_button.on_click(on_predict_click)
clear_button.on_click(on_clear_click)

# ==== LAYOUT ====
input_box = widgets.VBox([
    widgets.HTML(f"<h3 style='color:{primary_color}'>Describe Your Symptoms</h3>"),
    symptom_input,
    char_counter,
    widgets.HBox([example_dropdown], layout=widgets.Layout(justify_content='flex-end')),
    widgets.HBox([predict_button, clear_button], layout=widgets.Layout(justify_content='center'))
], layout=widgets.Layout(width='90%', margin='20px 0'))

app = widgets.VBox([
    header,
    widgets.HTML("<div style='height:20px'></div>"),
    input_box,
    output_area
], layout=widgets.Layout(align_items='center'))

# ==== DISPLAY ====
display(app)


Device set to use cpu


VBox(children=(HTML(value='\n    <div style="background-color:#1a5276; padding:20px; border-radius:10px; text-…