In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
import warnings

In [4]:
print("=== Disease Prediction Menu ===")
print("1 - XGboost (eXtreme Gradient Boosting)")
print("2 - Random Forest")
print("3 - Logistic Regression")
print("4 - Naive Bayes")
print("5 - Run all models")

choice = int(input("Enter your choice (1-5): "))

if choice == 1:
    train_df = pd.read_csv("Training.csv")
    test_df = pd.read_csv("Testing.csv")

    warnings.filterwarnings("ignore") 
    

    # Drop Unnamed columns (if present)
    train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
    test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
    
    X_train = train_df.drop(columns=["prognosis"])
    y_train = train_df["prognosis"]

    X_test = test_df.drop(columns=["prognosis"])
    y_test = test_df["prognosis"]

# Encode target labels
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)   # same encoder use karna zaroori hai


    # Initialize XGBoost model
    xgb_model = XGBClassifier(
    n_estimators=200,       # number of trees
    learning_rate=0.1,      # step size shrinkage
    max_depth=6,            # tree depth
    random_state=42,
    use_label_encoder=False,
    eval_metric="mlogloss"  # avoids warning
)

# Train on training data
    xgb_model.fit(X_train, y_train)

# Predictions
    y_pred = xgb_model.predict(X_test)

# Evaluation
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


    # ==== USER INPUT PART ====

# Symptoms list from training dataset
    symptoms = X_train.columns.tolist()

    print("Available symptoms:")
    print(symptoms)

# User se symptoms lena
    user_input = input("\nEnter your symptoms separated by commas: ").split(",")

# Strip whitespace 
    user_input = [s.strip() for s in user_input]

# Convert user input into input vector
    input_vector = [0] * len(symptoms)
    for s in user_input:
        if s in symptoms:
            input_vector[symptoms.index(s)] = 1
        else:
            print(f"⚠️ Warning: '{s}' not found in symptom list.")

# Predict probabilities
    probs = xgb_model.predict_proba([input_vector])[0]

# Top 3 disease indices
    top_indices = probs.argsort()[-3:][::-1]

    print("\n✅ Top 3 probable diseases with probability:")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")
    
    pass
    
elif choice == 2:
    
    # Load dataset
    train_df = pd.read_csv("Training.csv")
    test_df = pd.read_csv("Testing.csv")

    warnings.filterwarnings("ignore") 
    
# Drop unnamed columns
    train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
    test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]

# Features and target
    X_train = train_df.drop(columns=["prognosis"])
    y_train = train_df["prognosis"]

    X_test = test_df.drop(columns=["prognosis"])
    y_test = test_df["prognosis"]

# Encode target labels
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

# Train Random Forest
    rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
    rf_model.fit(X_train, y_train)

# Evaluate
    y_pred = rf_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==== USER INPUT ====
    symptoms = X_train.columns.tolist()
# Mapping: lowercase + remove spaces for easier input
    symptom_map = {s.lower().replace(" ", ""): s for s in symptoms}

    print("\nAvailable symptoms (type similar to this):")
    print(list(symptom_map.keys()))

# Take user input
    user_input = input("Enter your symptoms separated by commas: ").split(",")
    user_input = [s.strip().lower().replace(" ", "") for s in user_input]

# Convert user input to vector
    input_vector = [0] * len(symptoms)
    for s in user_input:
        if s in symptom_map:
            input_vector[symptoms.index(symptom_map[s])] = 1
        else:
            print(f"⚠️ Warning: '{s}' not found in symptom list.")

# Convert to DataFrame
    input_df = pd.DataFrame([input_vector], columns=symptoms)

# Predict probabilities
    probs = rf_model.predict_proba(input_df)[0]
    top_indices = probs.argsort()[-3:][::-1]  # top 3

    print("\n✅ Top 3 probable diseases with probability:")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")

    
    pass
elif choice == 3:


    # Load data
    train_df = pd.read_csv("Training.csv")
    test_df = pd.read_csv("Testing.csv")

    warnings.filterwarnings("ignore") 
    
# Drop unnamed columns
    train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
    test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]

# Features and target
    X_train = train_df.drop(columns=["prognosis"])
    y_train = train_df["prognosis"]

    X_test = test_df.drop(columns=["prognosis"])
    y_test = test_df["prognosis"]

# Encode target
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

# Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
    lr_model = LogisticRegression(max_iter=1000, random_state=42)
    lr_model.fit(X_train_scaled, y_train)

# Predict & evaluate
    y_pred = lr_model.predict(X_test_scaled)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==== USER INPUT ====
    symptoms = X_train.columns.tolist()
    print("Available symptoms:", symptoms)

    user_input = input("Enter your symptoms separated by commas: ").split(",")
    user_input = [s.strip() for s in user_input]

# Convert user input to vector
    input_vector = [0] * len(symptoms)
    for s in user_input:
        if s in symptoms:
            input_vector[symptoms.index(s)] = 1
        else:
            print(f"⚠️ Warning: '{s}' not found in symptom list.")

# Convert to DataFrame, scale, predict
    input_df = pd.DataFrame([input_vector], columns=symptoms)
    input_vector_scaled = scaler.transform(input_df)
    probs = lr_model.predict_proba(input_vector_scaled)[0]

# Top 3 disease indices
    top_indices = probs.argsort()[-3:][::-1]

    print("\n✅ Top 3 probable diseases with probability (Logistic Regression):")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")

    pass
elif choice == 4:

    # === Load dataset ===
    train_df = pd.read_csv("Training.csv")
    test_df = pd.read_csv("Testing.csv")

    warnings.filterwarnings("ignore") 
    
# Drop unnamed columns
    train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
    test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]

# Features and target
    X_train = train_df.drop(columns=["prognosis"])
    y_train = train_df["prognosis"]
    X_test = test_df.drop(columns=["prognosis"])
    y_test = test_df["prognosis"]

# Encode target labels
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

# Train Multinomial Naive Bayes
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)  # No scaling needed for MultinomialNB

# Evaluate
    y_pred = nb_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==== USER INPUT ====
    symptoms = X_train.columns.tolist()
    symptom_map = {s.lower().replace(" ", ""): s for s in symptoms}

    print("\nAvailable symptoms (lowercase & no spaces):")
    print(list(symptom_map.keys()))

# Take user input
    user_input = input("\nEnter your symptoms separated by commas: ").split(",")
    user_input = [s.strip().lower().replace(" ", "") for s in user_input]

# Convert input to vector
    input_vector = [0] * len(symptoms)
    for s in user_input:
        if s in symptom_map:
            input_vector[symptoms.index(symptom_map[s])] = 1
        else:
            print(f"⚠️ Warning: '{s}' not found in symptom list.")

# Predict probabilities
    probs = nb_model.predict_proba([input_vector])[0]

# Top 3 probable diseases
    top_indices = probs.argsort()[-3:][::-1]
    print("\n✅ Top 3 probable diseases with probability (Naive Bayes):")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")

    pass
elif choice == 5:

    train_df = pd.read_csv("Training.csv")
    test_df = pd.read_csv("Testing.csv")

    warnings.filterwarnings("ignore") 
# Check first 5 rows
    # Drop Unnamed columns (if present)
    train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
    test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
    
    X_train = train_df.drop(columns=["prognosis"])
    y_train = train_df["prognosis"]

    X_test = test_df.drop(columns=["prognosis"])
    y_test = test_df["prognosis"]

# Encode target labels
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)   # same encoder use karna zaroori hai



    # ==== USER INPUT PART ====

# Symptoms list from training dataset
    symptoms = X_train.columns.tolist()

    print("Available symptoms:")
    print(symptoms)

# User se symptoms lena
    user_input = input("\nEnter your symptoms separated by commas: ").split(",")

# Strip whitespace 
    user_input = [s.strip() for s in user_input]

# Convert user input into input vector
    input_vector = [0] * len(symptoms)
    for s in user_input:
        if s in symptoms:
            input_vector[symptoms.index(s)] = 1
        else:
            print(f"⚠️ Warning: '{s}' not found in symptom list.")


    

       # Initialize XGBoost model
    xgb_model = XGBClassifier(
    n_estimators=200,       # number of trees
    learning_rate=0.1,      # step size shrinkage
    max_depth=6,            # tree depth
    random_state=42,
    use_label_encoder=False,
    eval_metric="mlogloss"  # avoids warning
)

# Train on training data
    xgb_model.fit(X_train, y_train)

# Predictions
    y_pred = xgb_model.predict(X_test)

# Evaluation
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))      

    # Predict probabilities
    probs = xgb_model.predict_proba([input_vector])[0]

# Top 3 disease indices
    top_indices = probs.argsort()[-3:][::-1]

    print("\n✅ Top 3 probable diseases with probability ():")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")



    # Train Random Forest
    rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
    rf_model.fit(X_train, y_train)

# Evaluate
    y_pred = rf_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


    # Convert to DataFrame
    input_df = pd.DataFrame([input_vector], columns=symptoms)

# Predict probabilities
    probs = rf_model.predict_proba(input_df)[0]
    top_indices = probs.argsort()[-3:][::-1]  # top 3

    print("\n✅ Top 3 probable diseases with probability (Random Forest):")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")

    
    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression
    lr_model = LogisticRegression(max_iter=1000, random_state=42)
    lr_model.fit(X_train_scaled, y_train)

# Predict & evaluate
    y_pred = lr_model.predict(X_test_scaled)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Convert to DataFrame, scale, predict
    input_df = pd.DataFrame([input_vector], columns=symptoms)
    input_vector_scaled = scaler.transform(input_df)
    probs = lr_model.predict_proba(input_vector_scaled)[0]

# Top 3 disease indices
    top_indices = probs.argsort()[-3:][::-1]

    print("\n✅ Top 3 probable diseases with probability (Logistic Regression):")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")

    # Train Multinomial Naive Bayes
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)  # No scaling needed for MultinomialNB

# Evaluate
    y_pred = nb_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("\nClassification Report:\n", classification_report(y_test, y_pred))
    # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


    # Predict probabilities
    probs = nb_model.predict_proba([input_vector])[0]

# Top 3 probable diseases
    top_indices = probs.argsort()[-3:][::-1]
    print("\n✅ Top 3 probable diseases with probability (Naive Bayes):")
    for i in top_indices:
        disease = le.inverse_transform([i])[0]
        print(f"{disease} -> {round(probs[i]*100, 2)}%")

    
    pass
else:
    print("❌ Invalid choice")

=== Disease Prediction Menu ===
1 - XGboost (eXtreme Gradient Boosting)
2 - Random Forest
3 - Logistic Regression
4 - Naive Bayes
5 - Run all models


Enter your choice (1-5):  5


Available symptoms:
['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting', 'vomiting', 'burning_micturition', 'spotting_ urination', 'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets', 'mood_swings', 'weight_loss', 'restlessness', 'lethargy', 'patches_in_throat', 'irregular_sugar_level', 'cough', 'high_fever', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes', 'back_pain', 'constipation', 'abdominal_pain', 'diarrhoea', 'mild_fever', 'yellow_urine', 'yellowing_of_eyes', 'acute_liver_failure', 'fluid_overload', 'swelling_of_stomach', 'swelled_lymph_nodes', 'malaise', 'blurred_and_distorted_vision', 'phlegm', 'throat_irritation', 'redness_of_eyes', 'sinus_pressure', 'runny_nose', 'congestion', 'chest_pain', 'weakness_in_limbs', 'fast_heart_rat


Enter your symptoms separated by commas:  mild_fever,yellow_urine,yellowing_of_eyes, acute_liver_failure


Accuracy: 0.9761904761904762

✅ Top 3 probable diseases with probability ():
hepatitis A -> 97.3%
Hepatitis C -> 0.76%
Chronic cholestasis -> 0.37%
Accuracy: 0.9761904761904762

✅ Top 3 probable diseases with probability (Random Forest):
Hepatitis C -> 16.5%
hepatitis A -> 16.0%
Chicken pox -> 12.5%
Accuracy: 0.9761904761904762

✅ Top 3 probable diseases with probability (Logistic Regression):
hepatitis A -> 38.61%
Hepatitis B -> 9.7%
Hepatitis C -> 8.26%
Accuracy: 1.0

✅ Top 3 probable diseases with probability (Naive Bayes):
hepatitis A -> 38.51%
Hepatitis B -> 26.36%
Hepatitis E -> 19.47%


In [18]:
import warnings


In [7]:
X_train.columns


Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=132)