In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [8]:
# Location of the dataset, relative to the notebook's working directory.
BINARY_DATA_CSV = '../data/binary_dataset.csv'

# Read the whole CSV into a DataFrame; later cells use its 'condition' column as the target.
binary_data = pd.read_csv(BINARY_DATA_CSV)

In [9]:
# Target (y) is the 'condition' column; every remaining column is a feature (X).
y = binary_data['condition']
X = binary_data.drop(columns='condition')

# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on y so both parts keep the same class proportions.
# NOTE(review): every model evaluated in this notebook reports 1.0 test accuracy.
# Worth confirming that binary_data has no duplicate rows landing in both the
# train and test parts (e.g. inspect binary_data.duplicated().sum()).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# === Models and their Hyperparameter Grids ===
# Maps a display name to a spec with two entries:
#   "model"  - an unfitted estimator
#   "params" - the hyperparameter grid handed to GridSearchCV for that estimator
# Insertion order is the order in which the tuning loop visits the models.
models_and_params = {
    "Decision Tree": dict(
        model=DecisionTreeClassifier(random_state=42),
        params=dict(
            max_depth=[5, 10, 20, None],
            min_samples_split=[2, 5, 10],
        ),
    ),
    "Random Forest": dict(
        model=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[10, 20, None],
            min_samples_split=[2, 5],
        ),
    ),
    "Gradient Boosting": dict(
        model=GradientBoostingClassifier(random_state=42),
        params=dict(
            learning_rate=[0.01, 0.1, 0.2],
            n_estimators=[100, 150],
            max_depth=[3, 5, 10],
        ),
    ),
    "Naive Bayes": dict(
        model=MultinomialNB(),
        params=dict(
            alpha=[0.5, 1.0, 1.5],
        ),
    ),
    # NOTE(review): 'liblinear' treats multiclass targets one-vs-rest, and recent
    # scikit-learn releases deprecate that combination - confirm against the
    # installed version before relying on this grid entry.
    "Logistic Regression": dict(
        model=LogisticRegression(max_iter=1000),
        params=dict(
            C=[0.1, 1.0, 10],
            solver=['lbfgs', 'liblinear'],
        ),
    ),
}

In [12]:
# === Model Training and Evaluation ===
# For every entry in models_and_params: run a 5-fold grid search on the training
# split, then report the tuned estimator's accuracy and per-class metrics on the
# held-out test split.
#
# Names left in the namespace for later cells:
#   results    - {model name: test accuracy of that model's tuned estimator}
#   best_name  - name of the model with the highest mean cross-validated accuracy
#   best_model - the tuned estimator belonging to best_name
#   best_score - test accuracy of best_model
best_model = None
best_score = 0
best_name = ""
results = {}

# The winner is picked on cross-validated accuracy, which only uses training
# data. Picking on test accuracy would let the test split steer model selection
# and make the reported score of the chosen model optimistically biased.
best_cv_score = float("-inf")

for name, mp in models_and_params.items():
    print(f"\n🔍 Tuning {name}...")
    grid = GridSearchCV(mp["model"], mp["params"], cv=5, n_jobs=-1, scoring='accuracy')
    grid.fit(X_train, y_train)

    # Estimator rebuilt by the search with the best parameter combination.
    best_estimator = grid.best_estimator_
    y_pred = best_estimator.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print(f"✅ Best Params: {grid.best_params_}")
    print(f"📈 Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Strict '>' keeps the earliest model when cross-validated scores tie.
    if grid.best_score_ > best_cv_score:
        best_cv_score = grid.best_score_
        best_score = acc
        best_model = best_estimator
        best_name = name


🔍 Tuning Decision Tree...
✅ Best Params: {'max_depth': None, 'min_samples_split': 2}
📈 Accuracy: 1.0000
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1.00        24
   

KeyboardInterrupt: 

In [13]:
# Standalone grid search for the decision tree: 4 depths x 3 split sizes
# (12 candidates), each scored by accuracy with 5-fold cross-validation.
dt_model = DecisionTreeClassifier(random_state=42)
dt_params = dict(
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
)

grid_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_params,
    scoring='accuracy',
    cv=5,
)
grid_dt.fit(X_train, y_train)

# Winning settings first, then accuracy and per-class metrics on the test split.
print("Best Decision Tree Parameters:", grid_dt.best_params_)
print("Decision Tree Accuracy:", grid_dt.score(X_test, y_test))
dt_test_pred = grid_dt.predict(X_test)
print(classification_report(y_test, dt_test_pred))


Best Decision Tree Parameters: {'max_depth': None, 'min_samples_split': 2}
Decision Tree Accuracy: 1.0
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1.00        24
     

In [14]:
# Standalone grid search for the random forest: 2 forest sizes x 3 depths x
# 2 split sizes (12 candidates), each scored by accuracy with 5-fold cross-validation.
rf_model = RandomForestClassifier(random_state=42)
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)

grid_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    scoring='accuracy',
    cv=5,
)
grid_rf.fit(X_train, y_train)

# Winning settings first, then accuracy and per-class metrics on the test split.
print("Best Random Forest Parameters:", grid_rf.best_params_)
print("Random Forest Accuracy:", grid_rf.score(X_test, y_test))
rf_test_pred = grid_rf.predict(X_test)
print(classification_report(y_test, rf_test_pred))


Best Random Forest Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 1.0
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00      1

In [15]:
# Standalone grid search for gradient boosting: 3 learning rates x 2 ensemble
# sizes x 3 depths (18 candidates), each scored by accuracy with 5-fold
# cross-validation.
gb_model = GradientBoostingClassifier(random_state=42)
gb_params = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 150],
    max_depth=[3, 5, 10],
)

grid_gb = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_params,
    scoring='accuracy',
    cv=5,
)
grid_gb.fit(X_train, y_train)

# Winning settings first, then accuracy and per-class metrics on the test split.
print("Best Gradient Boosting Parameters:", grid_gb.best_params_)
print("Gradient Boosting Accuracy:", grid_gb.score(X_test, y_test))
gb_test_pred = grid_gb.predict(X_test)
print(classification_report(y_test, gb_test_pred))


Best Gradient Boosting Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Gradient Boosting Accuracy: 1.0
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1.00        24
                            Chicken pox       1.00      1.00      1.00        24
                    Chronic cholestasis       1.00      1.00 