In [1]:
!pip install joblib scikit-learn
!pip install pandas numpy



In [7]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("combined_augmented_dataset.csv")  # Replace with path

# Setup: every column except the target column is treated as a symptom feature.
disease_col = "diseases"
symptom_cols = [col for col in df.columns if col != disease_col]

import re

# Sanitize symptom column names: strip surrounding whitespace and collapse every
# run of non-word characters into a single underscore.
# Two different raw names can collapse to the same sanitized name (for example
# "skin rash" and "skin-rash"), which would leave duplicate feature columns, so
# a numeric suffix is appended whenever a sanitized name is already taken.
sanitized_mapping = {}
used_names = {disease_col}  # the target column name must not be reused either
for col in symptom_cols:
    clean_col = re.sub(r'\W+', '_', col.strip())  # Replace non-alphanum with _
    unique_col = clean_col
    suffix = 2
    while unique_col in used_names:
        unique_col = f"{clean_col}_{suffix}"
        suffix += 1
    used_names.add(unique_col)
    sanitized_mapping[col] = unique_col

# Rename columns (returns a new frame instead of mutating in place)
df = df.rename(columns=sanitized_mapping)
symptom_cols = [sanitized_mapping[col] for col in symptom_cols]

# Drop diseases that occur only once (presumably because the stratified split
# below cannot place a single-row class in both partitions).
class_counts = df[disease_col].value_counts()
valid_classes = class_counts.loc[class_counts >= 2].index
df = df.loc[df[disease_col].isin(valid_classes)].reset_index(drop=True)

# Target labels and float32 feature matrix
y = df[disease_col]
X = df[symptom_cols].astype(np.float32)

# Encode the disease names as integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Fit the scaler on the training partition only, then apply it to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=50, max_features='sqrt', random_state=42)
}

# Encoded ids of every class known to the label encoder. Passing them explicitly
# keeps the report rows aligned with `le.classes_` even when a class is missing
# from both `y_test` and `y_pred`; without `labels`, classification_report
# raises when the number of observed classes differs from len(target_names).
all_labels = np.arange(len(le.classes_))

# Train and evaluate
for name, model in models.items():
    print(f"\n🔍 Training {name}...")

    # The model is fitted on the unscaled features; the prediction cells in this
    # notebook feed unscaled 0/1 vectors as well, so the two stay consistent.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test,
        y_pred,
        labels=all_labels,
        target_names=le.classes_,
        zero_division=0,
    )

    print(f"{name} Accuracy: {acc:.4f}")
    print(report)

    joblib.dump(model, f"{name}_model.pkl")

# Save shared artifacts
joblib.dump(le, "label_encoder.pkl")
joblib.dump(symptom_cols, "symptom_cols.pkl")
joblib.dump(scaler, "scaler.pkl")


🔍 Training RandomForest...
RandomForest Accuracy: 0.7640
                                                          precision    recall  f1-score   support

                 (vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         2
                                                    AIDS       1.00      1.00      1.00         2
                                                    Acne       1.00      1.00      1.00         2
                                     Alcoholic hepatitis       1.00      1.00      1.00         3
                                                 Allergy       1.00      1.00      1.00         2
                                               Arthritis       1.00      1.00      1.00         2
                                        Bronchial Asthma       1.00      0.50      0.67         2
                                    Cervical spondylosis       1.00      1.00      1.00         2
                                             Chicken pox   

['scaler.pkl']

In [5]:
import joblib
import numpy as np
import pandas as pd
import os

# Load the artifacts needed for inference: the list of symptom column names and
# the label encoder used to decode predicted class ids.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
symptom_cols = joblib.load("symptom_cols.pkl")
# class_names = joblib.load("class_names.pkl") # Original line
# The scaler is not loaded here; nothing in this cell applies one to the input.
# scaler = joblib.load("scaler.pkl")
# Label encoder: used below to turn predicted class ids back into disease names.
label_encoder = joblib.load("label_encoder.pkl")


# Function to get user input for symptoms
def get_user_symptoms():
    """Prompt for a comma-separated list of symptoms and return them cleaned.

    Returns:
        list[str]: the entries in input order, each stripped of surrounding
        whitespace and lower-cased; blank entries are dropped.
    """
    print("Enter the symptoms you are experiencing, separated by commas:")
    raw_line = input()

    cleaned = []
    for piece in raw_line.split(','):
        trimmed = piece.strip()
        if not trimmed:
            continue  # skip empty entries such as ",," or trailing commas
        cleaned.append(trimmed.lower())
    return cleaned

# Function to preprocess user symptoms
def preprocess_user_symptoms(user_symptoms, symptom_cols):
    """Turn a list of symptom names into a one-row 0/1 feature frame.

    Each user symptom is first matched exactly (spaces replaced by
    underscores). If that fails, it is matched after normalisation: runs of
    non-word characters become a single underscore, leading/trailing
    underscores are dropped and case is ignored. This mirrors the column
    sanitisation done at training time, so inputs such as "joint-pain" or
    "high fever" still find columns named "joint_pain_" or "High_Fever".

    Args:
        user_symptoms: iterable of symptom strings entered by the user.
        symptom_cols: column names the model was trained on, in order.

    Returns:
        pandas.DataFrame: one row, one column per entry of ``symptom_cols``,
        holding 1 for recognised symptoms and 0 otherwise. A warning is
        printed for every symptom that cannot be matched.
    """
    import re  # local import: this cell does not import `re` at the top

    def _normalize(name):
        # Same `\W+` -> `_` rule as the training cell, plus case folding.
        return re.sub(r"\W+", "_", str(name).strip()).strip("_").lower()

    # Create a zero vector with the same columns as the model's training data
    input_vector = pd.DataFrame(0, index=[0], columns=symptom_cols)

    # Normalised name -> real column name; the first column wins on collisions.
    normalized_lookup = {}
    for column in input_vector.columns:
        normalized_lookup.setdefault(_normalize(column), column)

    for symptom in user_symptoms:
        exact_name = symptom.replace(" ", "_")
        if exact_name in input_vector.columns:
            # Exact matches keep their original behaviour.
            input_vector[exact_name] = 1
            continue

        key = _normalize(symptom)
        column = normalized_lookup.get(key) if key else None
        if column is not None:
            input_vector[column] = 1
        else:
            print(f"Warning: Symptom '{exact_name}' not found in the known symptoms.")

    return input_vector


# Get symptoms from the user
user_symptoms = get_user_symptoms()

# Preprocess the user symptoms
input_data = preprocess_user_symptoms(user_symptoms, symptom_cols)

# Define the list of trained models
model_files = [
    "RandomForest_model.pkl"
]

# An all-zero vector means no entered symptom was recognised. The model would
# still return a confident-looking disease for it, so skip prediction instead.
has_known_symptom = bool(np.asarray(input_data).any())
if has_known_symptom:
    model_files_to_run = model_files
else:
    print("⚠️ No recognizable symptoms found in your input.")
    model_files_to_run = []

# Iterate through each model and make a prediction
for model_file in model_files_to_run:
    if not os.path.exists(model_file):
        print(f"\nWarning: Model file '{model_file}' not found. Skipping.")
        continue

    print(f"\n--- Predictions using {model_file.replace('_model.pkl', '')} ---")
    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    model = joblib.load(model_file)

    # No scaling is applied: the input is passed to the model as built above.
    input_data_processed = input_data

    # Make a prediction
    predicted_disease_encoded = model.predict(input_data_processed)

    # Decode the prediction if a label encoder is available
    if 'label_encoder' in globals():
        predicted_disease = label_encoder.inverse_transform(predicted_disease_encoded)[0]
    else:
        predicted_disease = predicted_disease_encoded[0]  # Fallback if no label encoder

    print(f"Predicted Disease: {predicted_disease}")

    # Optional: show the top N diseases by predicted probability
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(input_data_processed)[0]
        top_n = 5
        top_indices = np.argsort(probabilities)[::-1][:top_n]
        print(f"Top {top_n} possible diseases:")

        # predict_proba columns follow `model.classes_`, so translate column
        # positions to class ids first and only then decode them to names.
        if hasattr(model, 'classes_'):
            top_labels = np.asarray(model.classes_)[top_indices]
            if 'label_encoder' in globals():
                top_labels = label_encoder.inverse_transform(top_labels)
        elif 'label_encoder' in globals():
            top_labels = label_encoder.classes_[top_indices]
        else:
            top_labels = None

        if top_labels is None:
            print("Cannot retrieve class names for top predictions.")
        else:
            for label, i in zip(top_labels, top_indices):
                print(f"- {label}: {probabilities[i]*100:.2f}%")

Enter the symptoms you are experiencing, separated by commas:

--- Predictions using RandomForest ---
Predicted Disease: erythema multiforme
Top 5 possible diseases:
- erythema multiforme: 61.00%
- chickenpox: 12.00%
- diaper rash: 5.00%
- Drug Reaction: 4.00%
- mononucleosis: 3.00%


In [8]:
import joblib
import numpy as np
import pandas as pd
import re

# Load assets: the symptom column names, the label encoder used to decode class
# ids, and the saved random-forest model file.
# NOTE(review): joblib.load unpickles the files, so only load artifacts you trust.
symptom_cols = joblib.load("symptom_cols.pkl")
label_encoder = joblib.load("label_encoder.pkl")
model = joblib.load("RandomForest_model.pkl")

# Extract symptoms from free-form user input
def extract_symptoms_from_text(text, known_symptoms):
    """Return the known symptoms that are mentioned in free-form text.

    Both the text and each symptom name are lower-cased and every run of
    characters other than letters and digits (punctuation, underscores,
    whitespace) is collapsed to a single space. A symptom counts as mentioned
    only when its whole phrase appears on word boundaries, so "itch" is not
    found inside "kitchen".

    Args:
        text: the user's description of their symptoms.
        known_symptoms: iterable of symptom names (e.g. the model's columns).

    Returns:
        list: the matching entries of ``known_symptoms``, unchanged and in
        their original order. Names that normalise to an empty phrase are
        never reported.
    """
    def _as_phrase(value):
        # Lower-case, then squeeze anything that is not a letter/digit to one space.
        return " ".join(re.sub(r"[^a-z0-9]+", " ", str(value).lower()).split())

    # Pad with spaces so that a plain substring test enforces word boundaries.
    padded_text = f" {_as_phrase(text)} "

    found_symptoms = []
    for symptom in known_symptoms:
        phrase = _as_phrase(symptom)
        if phrase and f" {phrase} " in padded_text:
            found_symptoms.append(symptom)
    return found_symptoms

# Convert extracted symptoms to binary input vector
def symptoms_to_input_vector(symptoms, all_symptom_cols):
    """Build a one-row frame of zeros with 1 set for each given symptom.

    Args:
        symptoms: symptom names to flag.
        all_symptom_cols: column names of the resulting frame, in order.

    Returns:
        pandas.DataFrame: a single row (index 0); names in ``symptoms`` that
        are not among the columns are ignored.
    """
    vector = pd.DataFrame(0, index=[0], columns=all_symptom_cols)
    recognised = [name for name in symptoms if name in vector.columns]
    for name in recognised:
        vector.at[0, name] = 1
    return vector

# ==== USER INPUT EXAMPLE ====
user_input = input("Describe your symptoms: ")  # e.g. "I have a fever and severe headache"

# Extract and vectorize
extracted_symptoms = extract_symptoms_from_text(user_input, symptom_cols)
input_vector = symptoms_to_input_vector(extracted_symptoms, symptom_cols)

# Predict only when at least one known symptom was recognised
if extracted_symptoms:
    probs = model.predict_proba(input_vector)[0]
    top_n = 5
    top_indices = np.argsort(probs)[::-1][:top_n]

    # predict_proba columns are ordered like `model.classes_` (the encoded class
    # ids seen during training), so map column positions to class ids before
    # decoding them into disease names.
    top_diseases = label_encoder.inverse_transform(np.asarray(model.classes_)[top_indices])

    print("\nTop probable diseases:")
    for disease_name, i in zip(top_diseases, top_indices):
        print(f"- {disease_name}: {probs[i]*100:.2f}%")
else:
    print("⚠️ No recognizable symptoms found in your input.")



Top probable diseases:
- pharyngitis: 22.53%
- otitis media: 20.13%
- noninfectious gastroenteritis: 11.00%
- strep throat: 5.00%
- infectious gastroenteritis: 4.00%


In [3]:
import pandas as pd

df = pd.read_csv("combined_dataset.csv")
df.columns = df.columns.str.strip()  # drop stray whitespace around header names

# All symptom columns (excluding 'diseases')
symptom_cols = [name for name in df.columns if name != "diseases"]

# Mapping: disease -> every symptom column whose sum over that disease's rows
# is greater than zero.
disease_symptom_map = {
    disease: group[symptom_cols].sum().loc[lambda totals: totals > 0].index.tolist()
    for disease, group in df.groupby("diseases")
}


import joblib
joblib.dump(disease_symptom_map, "disease_symptom_map.pkl")

['disease_symptom_map.pkl']

In [15]:
import pandas as pd
import google.generativeai as genai
import json

import os

# Configure Gemini API key.
# The key is read from the environment so it never ends up in the notebook or
# its history. Any key that was previously hard-coded in this cell must be
# treated as leaked and revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=api_key)
# NOTE: this rebinds the notebook-global name `model`; the prediction cells
# reload their classifier from disk before using it.
model = genai.GenerativeModel("gemini-2.5-pro")

# Load dataset and get first 5 diseases
df = pd.read_csv("combined_dataset.csv")
unique_diseases = sorted(df["diseases"].dropna().unique())[:5]

# disease name -> generated description; only successful responses are stored
disease_info = {}

# Loop through and get info from Gemini
for disease in unique_diseases:
    prompt = (
        f"Give a brief description and 2-3 key precautions for the disease: {disease}. "
        "Make it concise. Just write the disease name, its description and Precautions in points."
    )
    print(f"\n🦠 {disease}")
    try:
        response = model.generate_content(prompt)
        # `.text` itself can raise when the response carries no content parts,
        # so it is read inside the try block.
        text = response.text
    except Exception as e:
        # Best effort: report the failure and continue with the next disease.
        print(f"[Error] {e}")
        continue

    print(text)
    disease_info[str(disease)] = text

with open("disease_info.json", "w", encoding="utf-8") as f:
    json.dump(disease_info, f, indent=2, ensure_ascii=False)

print(f"\n✅ Saved {len(disease_info)} of {len(unique_diseases)} results to disease_info.json")


🦠 (vertigo) Paroymsal  Positional Vertigo
[Error] 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting

🦠 AIDS
[Error] Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 1.

🦠 Acne
[Error] Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 1.

🦠 Alcoholic hepatitis
[Error] 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePe